| 1 | /*! |
| 2 | Types and routines specific to lazy DFAs. |
| 3 | |
| 4 | This module is the home of [`hybrid::dfa::DFA`](DFA). |
| 5 | |
| 6 | This module also contains a [`hybrid::dfa::Builder`](Builder) and a |
| 7 | [`hybrid::dfa::Config`](Config) for configuring and building a lazy DFA. |
| 8 | */ |
| 9 | |
| 10 | use core::{iter, mem::size_of}; |
| 11 | |
| 12 | use alloc::vec::Vec; |
| 13 | |
| 14 | use crate::{ |
| 15 | hybrid::{ |
| 16 | error::{BuildError, CacheError, StartError}, |
| 17 | id::{LazyStateID, LazyStateIDError}, |
| 18 | search, |
| 19 | }, |
| 20 | nfa::thompson, |
| 21 | util::{ |
| 22 | alphabet::{self, ByteClasses, ByteSet}, |
| 23 | determinize::{self, State, StateBuilderEmpty, StateBuilderNFA}, |
| 24 | empty, |
| 25 | prefilter::Prefilter, |
| 26 | primitives::{PatternID, StateID as NFAStateID}, |
| 27 | search::{ |
| 28 | Anchored, HalfMatch, Input, MatchError, MatchKind, PatternSet, |
| 29 | }, |
| 30 | sparse_set::SparseSets, |
| 31 | start::{self, Start, StartByteMap}, |
| 32 | }, |
| 33 | }; |
| 34 | |
/// The minimum number of states that a lazy DFA's cache size must support.
///
/// This is checked at time of construction to ensure that at least some small
/// number of states can fit in the given capacity allotment. If we can't fit
/// at least this number of states, then the thinking is that it's pretty
/// senseless to use the lazy DFA. More to the point, parts of the code do
/// assume that the cache can fit at least some small number of states.
///
/// Concretely, this is the 3 sentinel states plus room for at least 2
/// "real" (lazily determinized) states.
const MIN_STATES: usize = SENTINEL_STATES + 2;

/// The number of "sentinel" states that get added to every lazy DFA.
///
/// These are special states indicating status conditions of a search: unknown,
/// dead and quit. These states in particular also use zero NFA states, so
/// their memory usage is quite small. This is relevant for computing the
/// minimum memory needed for a lazy DFA cache.
const SENTINEL_STATES: usize = 3;
| 51 | |
/// A hybrid NFA/DFA (also called a "lazy DFA") for regex searching.
///
/// A lazy DFA is a DFA that builds itself at search time. It otherwise has
/// very similar characteristics as a [`dense::DFA`](crate::dfa::dense::DFA).
/// Indeed, both support precisely the same regex features with precisely the
/// same semantics.
///
/// Whereas a `dense::DFA` must be completely built to handle any input before
/// it may be used for search, a lazy DFA starts off effectively empty. During
/// a search, a lazy DFA will build itself depending on whether it has already
/// computed the next transition or not. If it has, then it looks a lot like
/// a `dense::DFA` internally: it does a very fast table based access to find
/// the next transition. Otherwise, if the state hasn't been computed, then it
/// does determinization _for that specific transition_ to compute the next DFA
/// state.
///
/// The main selling point of a lazy DFA is that, in practice, it has
/// the performance profile of a `dense::DFA` without the weakness of it
/// taking worst case exponential time to build. Indeed, for each byte of
/// input, the lazy DFA will construct at most one new DFA state. Thus, a
/// lazy DFA achieves worst case `O(mn)` time for regex search (where `m ~
/// pattern.len()` and `n ~ haystack.len()`).
///
/// The main downsides of a lazy DFA are:
///
/// 1. It requires mutable "cache" space during search. This is where the
/// transition table, among other things, is stored.
/// 2. In pathological cases (e.g., if the cache is too small), it will run
/// out of room and either require a bigger cache capacity or will repeatedly
/// clear the cache and thus repeatedly regenerate DFA states. Overall, this
/// will tend to be slower than a typical NFA simulation.
///
/// # Capabilities
///
/// Like a `dense::DFA`, a single lazy DFA fundamentally supports the following
/// operations:
///
/// 1. Detection of a match.
/// 2. Location of the end of a match.
/// 3. In the case of a lazy DFA with multiple patterns, which pattern matched
/// is reported as well.
///
/// A notable absence from the above list of capabilities is the location of
/// the *start* of a match. In order to provide both the start and end of
/// a match, *two* lazy DFAs are required. This functionality is provided by a
/// [`Regex`](crate::hybrid::regex::Regex).
///
/// # Example
///
/// This shows how to build a lazy DFA with the default configuration and
/// execute a search. Notice how, in contrast to a `dense::DFA`, we must create
/// a cache and pass it to our search routine.
///
/// ```
/// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
///
/// let dfa = DFA::new("foo[0-9]+")?;
/// let mut cache = dfa.create_cache();
///
/// let expected = Some(HalfMatch::must(0, 8));
/// assert_eq!(expected, dfa.try_search_fwd(
///     &mut cache, &Input::new("foo12345"))?,
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(Clone, Debug)]
pub struct DFA {
    /// The configuration this lazy DFA was built with.
    config: Config,
    /// The Thompson NFA from which DFA states are lazily determinized.
    nfa: thompson::NFA,
    /// The stride as a base-2 exponent; see `DFA::stride2`.
    stride2: usize,
    /// Map used for selecting start states; see `StartByteMap`.
    start_map: StartByteMap,
    /// The byte equivalence classes forming this DFA's alphabet.
    classes: ByteClasses,
    /// The set of bytes that cause a search to enter a quit state.
    quitset: ByteSet,
    /// The cache capacity this DFA was configured with.
    cache_capacity: usize,
}
| 127 | |
impl DFA {
    /// Parse the given regular expression using a default configuration and
    /// return the corresponding lazy DFA.
    ///
    /// If you want a non-default configuration, then use the [`Builder`] to
    /// set your own configuration.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
    ///
    /// let dfa = DFA::new("foo[0-9]+bar")?;
    /// let mut cache = dfa.create_cache();
    ///
    /// let expected = HalfMatch::must(0, 11);
    /// assert_eq!(
    ///     Some(expected),
    ///     dfa.try_search_fwd(&mut cache, &Input::new("foo12345bar"))?,
    /// );
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[cfg(feature = "syntax")]
    pub fn new(pattern: &str) -> Result<DFA, BuildError> {
        DFA::builder().build(pattern)
    }

    /// Parse the given regular expressions using a default configuration and
    /// return the corresponding lazy multi-DFA.
    ///
    /// If you want a non-default configuration, then use the [`Builder`] to
    /// set your own configuration.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
    ///
    /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+"])?;
    /// let mut cache = dfa.create_cache();
    ///
    /// let expected = HalfMatch::must(1, 3);
    /// assert_eq!(
    ///     Some(expected),
    ///     dfa.try_search_fwd(&mut cache, &Input::new("foo12345bar"))?,
    /// );
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[cfg(feature = "syntax")]
    pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<DFA, BuildError> {
        DFA::builder().build_many(patterns)
    }

    /// Create a new lazy DFA that matches every input.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
    ///
    /// let dfa = DFA::always_match()?;
    /// let mut cache = dfa.create_cache();
    ///
    /// let expected = HalfMatch::must(0, 0);
    /// assert_eq!(Some(expected), dfa.try_search_fwd(
    ///     &mut cache, &Input::new(""))?,
    /// );
    /// assert_eq!(Some(expected), dfa.try_search_fwd(
    ///     &mut cache, &Input::new("foo"))?,
    /// );
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn always_match() -> Result<DFA, BuildError> {
        let nfa = thompson::NFA::always_match();
        Builder::new().build_from_nfa(nfa)
    }

    /// Create a new lazy DFA that never matches any input.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{hybrid::dfa::DFA, Input};
    ///
    /// let dfa = DFA::never_match()?;
    /// let mut cache = dfa.create_cache();
    ///
    /// assert_eq!(None, dfa.try_search_fwd(&mut cache, &Input::new(""))?);
    /// assert_eq!(None, dfa.try_search_fwd(&mut cache, &Input::new("foo"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn never_match() -> Result<DFA, BuildError> {
        let nfa = thompson::NFA::never_match();
        Builder::new().build_from_nfa(nfa)
    }

    /// Return a default configuration for a `DFA`.
    ///
    /// This is a convenience routine to avoid needing to import the [`Config`]
    /// type when customizing the construction of a lazy DFA.
    ///
    /// # Example
    ///
    /// This example shows how to build a lazy DFA that heuristically supports
    /// Unicode word boundaries.
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, MatchError, Input};
    ///
    /// let re = DFA::builder()
    ///     .configure(DFA::config().unicode_word_boundary(true))
    ///     .build(r"\b\w+\b")?;
    /// let mut cache = re.create_cache();
    ///
    /// // Since our haystack is all ASCII, the DFA search sees this and knows
    /// // it is legal to interpret Unicode word boundaries as ASCII word
    /// // boundaries.
    /// let input = Input::new("!!foo!!");
    /// let expected = HalfMatch::must(0, 5);
    /// assert_eq!(Some(expected), re.try_search_fwd(&mut cache, &input)?);
    ///
    /// // But if our haystack contains non-ASCII, then the search will fail
    /// // with an error.
    /// let input = Input::new("!!βββ!!");
    /// let expected = MatchError::quit(b'\xCE', 2);
    /// assert_eq!(Err(expected), re.try_search_fwd(&mut cache, &input));
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn config() -> Config {
        Config::new()
    }

    /// Return a builder for configuring the construction of a `Regex`.
    ///
    /// This is a convenience routine to avoid needing to import the
    /// [`Builder`] type in common cases.
    ///
    /// # Example
    ///
    /// This example shows how to use the builder to disable UTF-8 mode
    /// everywhere for lazy DFAs.
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::{hybrid::dfa::DFA, util::syntax, HalfMatch, Input};
    ///
    /// let re = DFA::builder()
    ///     .syntax(syntax::Config::new().utf8(false))
    ///     .build(r"foo(?-u:[^b])ar.*")?;
    /// let mut cache = re.create_cache();
    ///
    /// let input = Input::new(b"\xFEfoo\xFFarzz\xE2\x98\xFF\n");
    /// let expected = Some(HalfMatch::must(0, 9));
    /// let got = re.try_search_fwd(&mut cache, &input)?;
    /// assert_eq!(expected, got);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn builder() -> Builder {
        Builder::new()
    }

    /// Create a new cache for this lazy DFA.
    ///
    /// The cache returned should only be used for searches for this
    /// lazy DFA. If you want to reuse the cache for another DFA, then
    /// you must call [`Cache::reset`] with that DFA (or, equivalently,
    /// [`DFA::reset_cache`]).
    pub fn create_cache(&self) -> Cache {
        Cache::new(self)
    }

    /// Reset the given cache such that it can be used for searching with the
    /// this lazy DFA (and only this DFA).
    ///
    /// A cache reset permits reusing memory already allocated in this cache
    /// with a different lazy DFA.
    ///
    /// Resetting a cache sets its "clear count" to 0. This is relevant if the
    /// lazy DFA has been configured to "give up" after it has cleared the
    /// cache a certain number of times.
    ///
    /// Any lazy state ID generated by the cache prior to resetting it is
    /// invalid after the reset.
    ///
    /// # Example
    ///
    /// This shows how to re-purpose a cache for use with a different DFA.
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
    ///
    /// let dfa1 = DFA::new(r"\w")?;
    /// let dfa2 = DFA::new(r"\W")?;
    ///
    /// let mut cache = dfa1.create_cache();
    /// assert_eq!(
    ///     Some(HalfMatch::must(0, 2)),
    ///     dfa1.try_search_fwd(&mut cache, &Input::new("Δ"))?,
    /// );
    ///
    /// // Using 'cache' with dfa2 is not allowed. It may result in panics or
    /// // incorrect results. In order to re-purpose the cache, we must reset
    /// // it with the DFA we'd like to use it with.
    /// //
    /// // Similarly, after this reset, using the cache with 'dfa1' is also not
    /// // allowed.
    /// dfa2.reset_cache(&mut cache);
    /// assert_eq!(
    ///     Some(HalfMatch::must(0, 3)),
    ///     dfa2.try_search_fwd(&mut cache, &Input::new("☃"))?,
    /// );
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn reset_cache(&self, cache: &mut Cache) {
        Lazy::new(self, cache).reset_cache()
    }

    /// Returns the total number of patterns compiled into this lazy DFA.
    ///
    /// In the case of a DFA that contains no patterns, this returns `0`.
    ///
    /// # Example
    ///
    /// This example shows the pattern length for a DFA that never matches:
    ///
    /// ```
    /// use regex_automata::hybrid::dfa::DFA;
    ///
    /// let dfa = DFA::never_match()?;
    /// assert_eq!(dfa.pattern_len(), 0);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// And another example for a DFA that matches at every position:
    ///
    /// ```
    /// use regex_automata::hybrid::dfa::DFA;
    ///
    /// let dfa = DFA::always_match()?;
    /// assert_eq!(dfa.pattern_len(), 1);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// And finally, a DFA that was constructed from multiple patterns:
    ///
    /// ```
    /// use regex_automata::hybrid::dfa::DFA;
    ///
    /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
    /// assert_eq!(dfa.pattern_len(), 3);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn pattern_len(&self) -> usize {
        self.nfa.pattern_len()
    }

    /// Returns the equivalence classes that make up the alphabet for this DFA.
    ///
    /// Unless [`Config::byte_classes`] was disabled, it is possible that
    /// multiple distinct bytes are grouped into the same equivalence class
    /// if it is impossible for them to discriminate between a match and a
    /// non-match. This has the effect of reducing the overall alphabet size
    /// and in turn potentially substantially reducing the size of the DFA's
    /// transition table.
    ///
    /// The downside of using equivalence classes like this is that every state
    /// transition will automatically use this map to convert an arbitrary
    /// byte to its corresponding equivalence class. In practice this has a
    /// negligible impact on performance.
    pub fn byte_classes(&self) -> &ByteClasses {
        &self.classes
    }

    /// Returns this lazy DFA's configuration.
    pub fn get_config(&self) -> &Config {
        &self.config
    }

    /// Returns a reference to the underlying NFA.
    pub fn get_nfa(&self) -> &thompson::NFA {
        &self.nfa
    }

    /// Returns the stride, as a base-2 exponent, required for these
    /// equivalence classes.
    ///
    /// The stride is always the smallest power of 2 that is greater than or
    /// equal to the alphabet length. This is done so that converting between
    /// state IDs and indices can be done with shifts alone, which is much
    /// faster than integer division.
    fn stride2(&self) -> usize {
        self.stride2
    }

    /// Returns the total stride for every state in this lazy DFA. This
    /// corresponds to the total number of transitions used by each state in
    /// this DFA's transition table.
    fn stride(&self) -> usize {
        1 << self.stride2()
    }

    /// Returns the memory usage, in bytes, of this lazy DFA.
    ///
    /// This does **not** include the stack size used up by this lazy DFA. To
    /// compute that, use `std::mem::size_of::<DFA>()`. This also does not
    /// include the size of the `Cache` used.
    ///
    /// This also does not include any heap memory used by the NFA inside of
    /// this hybrid NFA/DFA. This is because the NFA's ownership is shared, and
    /// thus not owned by this hybrid NFA/DFA. More practically, several regex
    /// engines in this crate embed an NFA, and reporting the NFA's memory
    /// usage in all of them would likely result in reporting higher heap
    /// memory than is actually used.
    pub fn memory_usage(&self) -> usize {
        // The only thing that uses heap memory in a DFA is the NFA. But the
        // NFA has shared ownership, so reporting its memory as part of the
        // hybrid DFA is likely to lead to double-counting the NFA memory
        // somehow. In particular, this DFA does not really own an NFA, so
        // including it in the DFA's memory usage doesn't seem semantically
        // correct.
        0
    }
}
| 456 | |
| 457 | impl DFA { |
| 458 | /// Executes a forward search and returns the end position of the leftmost |
| 459 | /// match that is found. If no match exists, then `None` is returned. |
| 460 | /// |
| 461 | /// In particular, this method continues searching even after it enters |
| 462 | /// a match state. The search only terminates once it has reached the |
| 463 | /// end of the input or when it has entered a dead or quit state. Upon |
| 464 | /// termination, the position of the last byte seen while still in a match |
| 465 | /// state is returned. |
| 466 | /// |
| 467 | /// # Errors |
| 468 | /// |
| 469 | /// This routine errors if the search could not complete. This can occur |
| 470 | /// in a number of circumstances: |
| 471 | /// |
| 472 | /// * The configuration of the lazy DFA may permit it to "quit" the search. |
| 473 | /// For example, setting quit bytes or enabling heuristic support for |
| 474 | /// Unicode word boundaries. The default configuration does not enable any |
| 475 | /// option that could result in the lazy DFA quitting. |
| 476 | /// * The configuration of the lazy DFA may also permit it to "give up" |
| 477 | /// on a search if it makes ineffective use of its transition table |
| 478 | /// cache. The default configuration does not enable this by default, |
| 479 | /// although it is typically a good idea to. |
| 480 | /// * When the provided `Input` configuration is not supported. For |
| 481 | /// example, by providing an unsupported anchor mode. |
| 482 | /// |
| 483 | /// When a search returns an error, callers cannot know whether a match |
| 484 | /// exists or not. |
| 485 | /// |
| 486 | /// # Example |
| 487 | /// |
| 488 | /// This example shows how to run a basic search. |
| 489 | /// |
| 490 | /// ``` |
| 491 | /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input}; |
| 492 | /// |
| 493 | /// let dfa = DFA::new("foo[0-9]+" )?; |
| 494 | /// let mut cache = dfa.create_cache(); |
| 495 | /// let expected = HalfMatch::must(0, 8); |
| 496 | /// assert_eq!(Some(expected), dfa.try_search_fwd( |
| 497 | /// &mut cache, &Input::new("foo12345" ))?, |
| 498 | /// ); |
| 499 | /// |
| 500 | /// // Even though a match is found after reading the first byte (`a`), |
| 501 | /// // the leftmost first match semantics demand that we find the earliest |
| 502 | /// // match that prefers earlier parts of the pattern over later parts. |
| 503 | /// let dfa = DFA::new("abc|a" )?; |
| 504 | /// let mut cache = dfa.create_cache(); |
| 505 | /// let expected = HalfMatch::must(0, 3); |
| 506 | /// assert_eq!(Some(expected), dfa.try_search_fwd( |
| 507 | /// &mut cache, &Input::new("abc" ))?, |
| 508 | /// ); |
| 509 | /// |
| 510 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 511 | /// ``` |
| 512 | /// |
| 513 | /// # Example: specific pattern search |
| 514 | /// |
| 515 | /// This example shows how to build a lazy multi-DFA that permits searching |
| 516 | /// for specific patterns. |
| 517 | /// |
| 518 | /// ``` |
| 519 | /// use regex_automata::{ |
| 520 | /// hybrid::dfa::DFA, |
| 521 | /// Anchored, HalfMatch, PatternID, Input, |
| 522 | /// }; |
| 523 | /// |
| 524 | /// let dfa = DFA::builder() |
| 525 | /// .configure(DFA::config().starts_for_each_pattern(true)) |
| 526 | /// .build_many(&["[a-z0-9]{6}" , "[a-z][a-z0-9]{5}" ])?; |
| 527 | /// let mut cache = dfa.create_cache(); |
| 528 | /// let haystack = "foo123" ; |
| 529 | /// |
| 530 | /// // Since we are using the default leftmost-first match and both |
| 531 | /// // patterns match at the same starting position, only the first pattern |
| 532 | /// // will be returned in this case when doing a search for any of the |
| 533 | /// // patterns. |
| 534 | /// let expected = Some(HalfMatch::must(0, 6)); |
| 535 | /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?; |
| 536 | /// assert_eq!(expected, got); |
| 537 | /// |
| 538 | /// // But if we want to check whether some other pattern matches, then we |
| 539 | /// // can provide its pattern ID. |
| 540 | /// let expected = Some(HalfMatch::must(1, 6)); |
| 541 | /// let input = Input::new(haystack) |
| 542 | /// .anchored(Anchored::Pattern(PatternID::must(1))); |
| 543 | /// let got = dfa.try_search_fwd(&mut cache, &input)?; |
| 544 | /// assert_eq!(expected, got); |
| 545 | /// |
| 546 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 547 | /// ``` |
| 548 | /// |
| 549 | /// # Example: specifying the bounds of a search |
| 550 | /// |
| 551 | /// This example shows how providing the bounds of a search can produce |
| 552 | /// different results than simply sub-slicing the haystack. |
| 553 | /// |
| 554 | /// ``` |
| 555 | /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input}; |
| 556 | /// |
| 557 | /// // N.B. We disable Unicode here so that we use a simple ASCII word |
| 558 | /// // boundary. Alternatively, we could enable heuristic support for |
| 559 | /// // Unicode word boundaries since our haystack is pure ASCII. |
| 560 | /// let dfa = DFA::new(r"(?-u)\b[0-9]{3}\b" )?; |
| 561 | /// let mut cache = dfa.create_cache(); |
| 562 | /// let haystack = "foo123bar" ; |
| 563 | /// |
| 564 | /// // Since we sub-slice the haystack, the search doesn't know about the |
| 565 | /// // larger context and assumes that `123` is surrounded by word |
| 566 | /// // boundaries. And of course, the match position is reported relative |
| 567 | /// // to the sub-slice as well, which means we get `3` instead of `6`. |
| 568 | /// let expected = Some(HalfMatch::must(0, 3)); |
| 569 | /// let got = dfa.try_search_fwd( |
| 570 | /// &mut cache, |
| 571 | /// &Input::new(&haystack[3..6]), |
| 572 | /// )?; |
| 573 | /// assert_eq!(expected, got); |
| 574 | /// |
| 575 | /// // But if we provide the bounds of the search within the context of the |
| 576 | /// // entire haystack, then the search can take the surrounding context |
| 577 | /// // into account. (And if we did find a match, it would be reported |
| 578 | /// // as a valid offset into `haystack` instead of its sub-slice.) |
| 579 | /// let expected = None; |
| 580 | /// let got = dfa.try_search_fwd( |
| 581 | /// &mut cache, |
| 582 | /// &Input::new(haystack).range(3..6), |
| 583 | /// )?; |
| 584 | /// assert_eq!(expected, got); |
| 585 | /// |
| 586 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 587 | /// ``` |
| 588 | #[inline ] |
| 589 | pub fn try_search_fwd( |
| 590 | &self, |
| 591 | cache: &mut Cache, |
| 592 | input: &Input<'_>, |
| 593 | ) -> Result<Option<HalfMatch>, MatchError> { |
| 594 | let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); |
| 595 | let hm = match search::find_fwd(self, cache, input)? { |
| 596 | None => return Ok(None), |
| 597 | Some(hm) if !utf8empty => return Ok(Some(hm)), |
| 598 | Some(hm) => hm, |
| 599 | }; |
| 600 | // We get to this point when we know our DFA can match the empty string |
| 601 | // AND when UTF-8 mode is enabled. In this case, we skip any matches |
| 602 | // whose offset splits a codepoint. Such a match is necessarily a |
| 603 | // zero-width match, because UTF-8 mode requires the underlying NFA |
| 604 | // to be built such that all non-empty matches span valid UTF-8. |
| 605 | // Therefore, any match that ends in the middle of a codepoint cannot |
| 606 | // be part of a span of valid UTF-8 and thus must be an empty match. |
| 607 | // In such cases, we skip it, so as not to report matches that split a |
| 608 | // codepoint. |
| 609 | // |
| 610 | // Note that this is not a checked assumption. Callers *can* provide an |
| 611 | // NFA with UTF-8 mode enabled but produces non-empty matches that span |
| 612 | // invalid UTF-8. But doing so is documented to result in unspecified |
| 613 | // behavior. |
| 614 | empty::skip_splits_fwd(input, hm, hm.offset(), |input| { |
| 615 | let got = search::find_fwd(self, cache, input)?; |
| 616 | Ok(got.map(|hm| (hm, hm.offset()))) |
| 617 | }) |
| 618 | } |
| 619 | |
| 620 | /// Executes a reverse search and returns the start of the position of the |
| 621 | /// leftmost match that is found. If no match exists, then `None` is |
| 622 | /// returned. |
| 623 | /// |
| 624 | /// # Errors |
| 625 | /// |
| 626 | /// This routine errors if the search could not complete. This can occur |
| 627 | /// in a number of circumstances: |
| 628 | /// |
| 629 | /// * The configuration of the lazy DFA may permit it to "quit" the search. |
| 630 | /// For example, setting quit bytes or enabling heuristic support for |
| 631 | /// Unicode word boundaries. The default configuration does not enable any |
| 632 | /// option that could result in the lazy DFA quitting. |
| 633 | /// * The configuration of the lazy DFA may also permit it to "give up" |
| 634 | /// on a search if it makes ineffective use of its transition table |
| 635 | /// cache. The default configuration does not enable this by default, |
| 636 | /// although it is typically a good idea to. |
| 637 | /// * When the provided `Input` configuration is not supported. For |
| 638 | /// example, by providing an unsupported anchor mode. |
| 639 | /// |
| 640 | /// When a search returns an error, callers cannot know whether a match |
| 641 | /// exists or not. |
| 642 | /// |
| 643 | /// # Example |
| 644 | /// |
| 645 | /// This routine is principally useful when used in |
| 646 | /// conjunction with the |
| 647 | /// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::reverse) |
| 648 | /// configuration. In general, it's unlikely to be correct to use both |
| 649 | /// `try_search_fwd` and `try_search_rev` with the same DFA since any |
| 650 | /// particular DFA will only support searching in one direction with |
| 651 | /// respect to the pattern. |
| 652 | /// |
| 653 | /// ``` |
| 654 | /// use regex_automata::{ |
| 655 | /// nfa::thompson, |
| 656 | /// hybrid::dfa::DFA, |
| 657 | /// HalfMatch, Input, |
| 658 | /// }; |
| 659 | /// |
| 660 | /// let dfa = DFA::builder() |
| 661 | /// .thompson(thompson::Config::new().reverse(true)) |
| 662 | /// .build("foo[0-9]+" )?; |
| 663 | /// let mut cache = dfa.create_cache(); |
| 664 | /// let expected = HalfMatch::must(0, 0); |
| 665 | /// assert_eq!( |
| 666 | /// Some(expected), |
| 667 | /// dfa.try_search_rev(&mut cache, &Input::new("foo12345" ))?, |
| 668 | /// ); |
| 669 | /// |
| 670 | /// // Even though a match is found after reading the last byte (`c`), |
| 671 | /// // the leftmost first match semantics demand that we find the earliest |
| 672 | /// // match that prefers earlier parts of the pattern over latter parts. |
| 673 | /// let dfa = DFA::builder() |
| 674 | /// .thompson(thompson::Config::new().reverse(true)) |
| 675 | /// .build("abc|c" )?; |
| 676 | /// let mut cache = dfa.create_cache(); |
| 677 | /// let expected = HalfMatch::must(0, 0); |
| 678 | /// assert_eq!(Some(expected), dfa.try_search_rev( |
| 679 | /// &mut cache, &Input::new("abc" ))?, |
| 680 | /// ); |
| 681 | /// |
| 682 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 683 | /// ``` |
| 684 | /// |
| 685 | /// # Example: UTF-8 mode |
| 686 | /// |
| 687 | /// This examples demonstrates that UTF-8 mode applies to reverse |
| 688 | /// DFAs. When UTF-8 mode is enabled in the underlying NFA, then all |
| 689 | /// matches reported must correspond to valid UTF-8 spans. This includes |
| 690 | /// prohibiting zero-width matches that split a codepoint. |
| 691 | /// |
| 692 | /// UTF-8 mode is enabled by default. Notice below how the only zero-width |
| 693 | /// matches reported are those at UTF-8 boundaries: |
| 694 | /// |
| 695 | /// ``` |
| 696 | /// use regex_automata::{ |
| 697 | /// hybrid::dfa::DFA, |
| 698 | /// nfa::thompson, |
| 699 | /// HalfMatch, Input, MatchKind, |
| 700 | /// }; |
| 701 | /// |
| 702 | /// let dfa = DFA::builder() |
| 703 | /// .thompson(thompson::Config::new().reverse(true)) |
| 704 | /// .build(r"" )?; |
| 705 | /// let mut cache = dfa.create_cache(); |
| 706 | /// |
| 707 | /// // Run the reverse DFA to collect all matches. |
| 708 | /// let mut input = Input::new("☃" ); |
| 709 | /// let mut matches = vec![]; |
| 710 | /// loop { |
| 711 | /// match dfa.try_search_rev(&mut cache, &input)? { |
| 712 | /// None => break, |
| 713 | /// Some(hm) => { |
| 714 | /// matches.push(hm); |
| 715 | /// if hm.offset() == 0 || input.end() == 0 { |
| 716 | /// break; |
| 717 | /// } else if hm.offset() < input.end() { |
| 718 | /// input.set_end(hm.offset()); |
| 719 | /// } else { |
| 720 | /// // This is only necessary to handle zero-width |
| 721 | /// // matches, which of course occur in this example. |
| 722 | /// // Without this, the search would never advance |
| 723 | /// // backwards beyond the initial match. |
| 724 | /// input.set_end(input.end() - 1); |
| 725 | /// } |
| 726 | /// } |
| 727 | /// } |
| 728 | /// } |
| 729 | /// |
| 730 | /// // No matches split a codepoint. |
| 731 | /// let expected = vec![ |
| 732 | /// HalfMatch::must(0, 3), |
| 733 | /// HalfMatch::must(0, 0), |
| 734 | /// ]; |
| 735 | /// assert_eq!(expected, matches); |
| 736 | /// |
| 737 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 738 | /// ``` |
| 739 | /// |
| 740 | /// Now let's look at the same example, but with UTF-8 mode on the |
| 741 | /// underlying NFA disabled: |
| 742 | /// |
| 743 | /// ``` |
| 744 | /// use regex_automata::{ |
| 745 | /// hybrid::dfa::DFA, |
| 746 | /// nfa::thompson, |
| 747 | /// HalfMatch, Input, MatchKind, |
| 748 | /// }; |
| 749 | /// |
| 750 | /// let dfa = DFA::builder() |
| 751 | /// .thompson(thompson::Config::new().reverse(true).utf8(false)) |
| 752 | /// .build(r"" )?; |
| 753 | /// let mut cache = dfa.create_cache(); |
| 754 | /// |
| 755 | /// // Run the reverse DFA to collect all matches. |
| 756 | /// let mut input = Input::new("☃" ); |
| 757 | /// let mut matches = vec![]; |
| 758 | /// loop { |
| 759 | /// match dfa.try_search_rev(&mut cache, &input)? { |
| 760 | /// None => break, |
| 761 | /// Some(hm) => { |
| 762 | /// matches.push(hm); |
| 763 | /// if hm.offset() == 0 || input.end() == 0 { |
| 764 | /// break; |
| 765 | /// } else if hm.offset() < input.end() { |
| 766 | /// input.set_end(hm.offset()); |
| 767 | /// } else { |
| 768 | /// // This is only necessary to handle zero-width |
| 769 | /// // matches, which of course occur in this example. |
| 770 | /// // Without this, the search would never advance |
| 771 | /// // backwards beyond the initial match. |
| 772 | /// input.set_end(input.end() - 1); |
| 773 | /// } |
| 774 | /// } |
| 775 | /// } |
| 776 | /// } |
| 777 | /// |
    /// // Now *all* positions match, even within a codepoint, because we
    /// // lifted the requirement that matches correspond to valid UTF-8
    /// // spans.
| 779 | /// let expected = vec![ |
| 780 | /// HalfMatch::must(0, 3), |
| 781 | /// HalfMatch::must(0, 2), |
| 782 | /// HalfMatch::must(0, 1), |
| 783 | /// HalfMatch::must(0, 0), |
| 784 | /// ]; |
| 785 | /// assert_eq!(expected, matches); |
| 786 | /// |
| 787 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 788 | /// ``` |
| 789 | #[inline ] |
| 790 | pub fn try_search_rev( |
| 791 | &self, |
| 792 | cache: &mut Cache, |
| 793 | input: &Input<'_>, |
| 794 | ) -> Result<Option<HalfMatch>, MatchError> { |
| 795 | let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); |
| 796 | let hm = match search::find_rev(self, cache, input)? { |
| 797 | None => return Ok(None), |
| 798 | Some(hm) if !utf8empty => return Ok(Some(hm)), |
| 799 | Some(hm) => hm, |
| 800 | }; |
| 801 | empty::skip_splits_rev(input, hm, hm.offset(), |input| { |
| 802 | let got = search::find_rev(self, cache, input)?; |
| 803 | Ok(got.map(|hm| (hm, hm.offset()))) |
| 804 | }) |
| 805 | } |
| 806 | |
| 807 | /// Executes an overlapping forward search and returns the end position of |
| 808 | /// matches as they are found. If no match exists, then `None` is returned. |
| 809 | /// |
| 810 | /// This routine is principally only useful when searching for multiple |
| 811 | /// patterns on inputs where multiple patterns may match the same regions |
| 812 | /// of text. In particular, callers must preserve the automaton's search |
| 813 | /// state from prior calls so that the implementation knows where the last |
| 814 | /// match occurred. |
| 815 | /// |
| 816 | /// When using this routine to implement an iterator of overlapping |
| 817 | /// matches, the `start` of the search should remain invariant throughout |
| 818 | /// iteration. The `OverlappingState` given to the search will keep track |
| 819 | /// of the current position of the search. (This is because multiple |
| 820 | /// matches may be reported at the same position, so only the search |
| 821 | /// implementation itself knows when to advance the position.) |
| 822 | /// |
| 823 | /// If for some reason you want the search to forget about its previous |
| 824 | /// state and restart the search at a particular position, then setting the |
| 825 | /// state to [`OverlappingState::start`] will accomplish that. |
| 826 | /// |
| 827 | /// # Errors |
| 828 | /// |
| 829 | /// This routine errors if the search could not complete. This can occur |
| 830 | /// in a number of circumstances: |
| 831 | /// |
| 832 | /// * The configuration of the lazy DFA may permit it to "quit" the search. |
| 833 | /// For example, setting quit bytes or enabling heuristic support for |
| 834 | /// Unicode word boundaries. The default configuration does not enable any |
| 835 | /// option that could result in the lazy DFA quitting. |
| 836 | /// * The configuration of the lazy DFA may also permit it to "give up" |
| 837 | /// on a search if it makes ineffective use of its transition table |
| 838 | /// cache. The default configuration does not enable this by default, |
| 839 | /// although it is typically a good idea to. |
| 840 | /// * When the provided `Input` configuration is not supported. For |
| 841 | /// example, by providing an unsupported anchor mode. |
| 842 | /// |
| 843 | /// When a search returns an error, callers cannot know whether a match |
| 844 | /// exists or not. |
| 845 | /// |
| 846 | /// # Example |
| 847 | /// |
| 848 | /// This example shows how to run a basic overlapping search. Notice |
| 849 | /// that we build the automaton with a `MatchKind::All` configuration. |
| 850 | /// Overlapping searches are unlikely to work as one would expect when |
| 851 | /// using the default `MatchKind::LeftmostFirst` match semantics, since |
| 852 | /// leftmost-first matching is fundamentally incompatible with overlapping |
| 853 | /// searches. Namely, overlapping searches need to report matches as they |
    /// are seen, whereas leftmost-first searches will continue searching even
| 855 | /// after a match has been observed in order to find the conventional end |
| 856 | /// position of the match. More concretely, leftmost-first searches use |
| 857 | /// dead states to terminate a search after a specific match can no longer |
| 858 | /// be extended. Overlapping searches instead do the opposite by continuing |
| 859 | /// the search to find totally new matches (potentially of other patterns). |
| 860 | /// |
| 861 | /// ``` |
| 862 | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
| 863 | /// use regex_automata::{ |
| 864 | /// hybrid::dfa::{DFA, OverlappingState}, |
| 865 | /// HalfMatch, Input, MatchKind, |
| 866 | /// }; |
| 867 | /// |
| 868 | /// let dfa = DFA::builder() |
| 869 | /// .configure(DFA::config().match_kind(MatchKind::All)) |
| 870 | /// .build_many(&[r"\w+$" , r"\S+$" ])?; |
| 871 | /// let mut cache = dfa.create_cache(); |
| 872 | /// |
| 873 | /// let haystack = "@foo" ; |
| 874 | /// let mut state = OverlappingState::start(); |
| 875 | /// |
| 876 | /// let expected = Some(HalfMatch::must(1, 4)); |
| 877 | /// dfa.try_search_overlapping_fwd( |
| 878 | /// &mut cache, &Input::new(haystack), &mut state, |
| 879 | /// )?; |
| 880 | /// assert_eq!(expected, state.get_match()); |
| 881 | /// |
| 882 | /// // The first pattern also matches at the same position, so re-running |
| 883 | /// // the search will yield another match. Notice also that the first |
| 884 | /// // pattern is returned after the second. This is because the second |
| 885 | /// // pattern begins its match before the first, is therefore an earlier |
| 886 | /// // match and is thus reported first. |
| 887 | /// let expected = Some(HalfMatch::must(0, 4)); |
| 888 | /// dfa.try_search_overlapping_fwd( |
| 889 | /// &mut cache, &Input::new(haystack), &mut state, |
| 890 | /// )?; |
| 891 | /// assert_eq!(expected, state.get_match()); |
| 892 | /// |
| 893 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 894 | /// ``` |
| 895 | #[inline ] |
| 896 | pub fn try_search_overlapping_fwd( |
| 897 | &self, |
| 898 | cache: &mut Cache, |
| 899 | input: &Input<'_>, |
| 900 | state: &mut OverlappingState, |
| 901 | ) -> Result<(), MatchError> { |
| 902 | let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); |
| 903 | search::find_overlapping_fwd(self, cache, input, state)?; |
| 904 | match state.get_match() { |
| 905 | None => Ok(()), |
| 906 | Some(_) if !utf8empty => Ok(()), |
| 907 | Some(_) => skip_empty_utf8_splits_overlapping( |
| 908 | input, |
| 909 | state, |
| 910 | |input, state| { |
| 911 | search::find_overlapping_fwd(self, cache, input, state) |
| 912 | }, |
| 913 | ), |
| 914 | } |
| 915 | } |
| 916 | |
| 917 | /// Executes a reverse overlapping search and returns the start of the |
| 918 | /// position of the leftmost match that is found. If no match exists, then |
| 919 | /// `None` is returned. |
| 920 | /// |
| 921 | /// When using this routine to implement an iterator of overlapping |
| 922 | /// matches, the `start` of the search should remain invariant throughout |
| 923 | /// iteration. The `OverlappingState` given to the search will keep track |
| 924 | /// of the current position of the search. (This is because multiple |
| 925 | /// matches may be reported at the same position, so only the search |
| 926 | /// implementation itself knows when to advance the position.) |
| 927 | /// |
| 928 | /// If for some reason you want the search to forget about its previous |
| 929 | /// state and restart the search at a particular position, then setting the |
| 930 | /// state to [`OverlappingState::start`] will accomplish that. |
| 931 | /// |
| 932 | /// # Errors |
| 933 | /// |
| 934 | /// This routine errors if the search could not complete. This can occur |
| 935 | /// in a number of circumstances: |
| 936 | /// |
| 937 | /// * The configuration of the lazy DFA may permit it to "quit" the search. |
| 938 | /// For example, setting quit bytes or enabling heuristic support for |
| 939 | /// Unicode word boundaries. The default configuration does not enable any |
| 940 | /// option that could result in the lazy DFA quitting. |
| 941 | /// * The configuration of the lazy DFA may also permit it to "give up" |
| 942 | /// on a search if it makes ineffective use of its transition table |
| 943 | /// cache. The default configuration does not enable this by default, |
| 944 | /// although it is typically a good idea to. |
| 945 | /// * When the provided `Input` configuration is not supported. For |
| 946 | /// example, by providing an unsupported anchor mode. |
| 947 | /// |
| 948 | /// When a search returns an error, callers cannot know whether a match |
| 949 | /// exists or not. |
| 950 | /// |
| 951 | /// # Example: UTF-8 mode |
| 952 | /// |
    /// This example demonstrates that UTF-8 mode applies to reverse
| 954 | /// DFAs. When UTF-8 mode is enabled in the underlying NFA, then all |
| 955 | /// matches reported must correspond to valid UTF-8 spans. This includes |
| 956 | /// prohibiting zero-width matches that split a codepoint. |
| 957 | /// |
| 958 | /// UTF-8 mode is enabled by default. Notice below how the only zero-width |
| 959 | /// matches reported are those at UTF-8 boundaries: |
| 960 | /// |
| 961 | /// ``` |
| 962 | /// use regex_automata::{ |
| 963 | /// hybrid::dfa::{DFA, OverlappingState}, |
| 964 | /// nfa::thompson, |
| 965 | /// HalfMatch, Input, MatchKind, |
| 966 | /// }; |
| 967 | /// |
| 968 | /// let dfa = DFA::builder() |
| 969 | /// .configure(DFA::config().match_kind(MatchKind::All)) |
| 970 | /// .thompson(thompson::Config::new().reverse(true)) |
| 971 | /// .build_many(&[r"" , r"☃" ])?; |
| 972 | /// let mut cache = dfa.create_cache(); |
| 973 | /// |
| 974 | /// // Run the reverse DFA to collect all matches. |
| 975 | /// let input = Input::new("☃" ); |
| 976 | /// let mut state = OverlappingState::start(); |
| 977 | /// let mut matches = vec![]; |
| 978 | /// loop { |
| 979 | /// dfa.try_search_overlapping_rev(&mut cache, &input, &mut state)?; |
| 980 | /// match state.get_match() { |
| 981 | /// None => break, |
| 982 | /// Some(hm) => matches.push(hm), |
| 983 | /// } |
| 984 | /// } |
| 985 | /// |
| 986 | /// // No matches split a codepoint. |
| 987 | /// let expected = vec![ |
| 988 | /// HalfMatch::must(0, 3), |
| 989 | /// HalfMatch::must(1, 0), |
| 990 | /// HalfMatch::must(0, 0), |
| 991 | /// ]; |
| 992 | /// assert_eq!(expected, matches); |
| 993 | /// |
| 994 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 995 | /// ``` |
| 996 | /// |
| 997 | /// Now let's look at the same example, but with UTF-8 mode on the |
| 998 | /// underlying NFA disabled: |
| 999 | /// |
| 1000 | /// ``` |
| 1001 | /// use regex_automata::{ |
| 1002 | /// hybrid::dfa::{DFA, OverlappingState}, |
| 1003 | /// nfa::thompson, |
| 1004 | /// HalfMatch, Input, MatchKind, |
| 1005 | /// }; |
| 1006 | /// |
| 1007 | /// let dfa = DFA::builder() |
| 1008 | /// .configure(DFA::config().match_kind(MatchKind::All)) |
| 1009 | /// .thompson(thompson::Config::new().reverse(true).utf8(false)) |
| 1010 | /// .build_many(&[r"" , r"☃" ])?; |
| 1011 | /// let mut cache = dfa.create_cache(); |
| 1012 | /// |
| 1013 | /// // Run the reverse DFA to collect all matches. |
| 1014 | /// let input = Input::new("☃" ); |
| 1015 | /// let mut state = OverlappingState::start(); |
| 1016 | /// let mut matches = vec![]; |
| 1017 | /// loop { |
| 1018 | /// dfa.try_search_overlapping_rev(&mut cache, &input, &mut state)?; |
| 1019 | /// match state.get_match() { |
| 1020 | /// None => break, |
| 1021 | /// Some(hm) => matches.push(hm), |
| 1022 | /// } |
| 1023 | /// } |
| 1024 | /// |
| 1025 | /// // Now *all* positions match, even within a codepoint, |
| 1026 | /// // because we lifted the requirement that matches |
| 1027 | /// // correspond to valid UTF-8 spans. |
| 1028 | /// let expected = vec![ |
| 1029 | /// HalfMatch::must(0, 3), |
| 1030 | /// HalfMatch::must(0, 2), |
| 1031 | /// HalfMatch::must(0, 1), |
| 1032 | /// HalfMatch::must(1, 0), |
| 1033 | /// HalfMatch::must(0, 0), |
| 1034 | /// ]; |
| 1035 | /// assert_eq!(expected, matches); |
| 1036 | /// |
| 1037 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 1038 | /// ``` |
| 1039 | #[inline ] |
| 1040 | pub fn try_search_overlapping_rev( |
| 1041 | &self, |
| 1042 | cache: &mut Cache, |
| 1043 | input: &Input<'_>, |
| 1044 | state: &mut OverlappingState, |
| 1045 | ) -> Result<(), MatchError> { |
| 1046 | let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); |
| 1047 | search::find_overlapping_rev(self, cache, input, state)?; |
| 1048 | match state.get_match() { |
| 1049 | None => Ok(()), |
| 1050 | Some(_) if !utf8empty => Ok(()), |
| 1051 | Some(_) => skip_empty_utf8_splits_overlapping( |
| 1052 | input, |
| 1053 | state, |
| 1054 | |input, state| { |
| 1055 | search::find_overlapping_rev(self, cache, input, state) |
| 1056 | }, |
| 1057 | ), |
| 1058 | } |
| 1059 | } |
| 1060 | |
| 1061 | /// Writes the set of patterns that match anywhere in the given search |
| 1062 | /// configuration to `patset`. If multiple patterns match at the same |
| 1063 | /// position and the underlying DFA supports overlapping matches, then all |
| 1064 | /// matching patterns are written to the given set. |
| 1065 | /// |
| 1066 | /// Unless all of the patterns in this DFA are anchored, then generally |
| 1067 | /// speaking, this will visit every byte in the haystack. |
| 1068 | /// |
| 1069 | /// This search routine *does not* clear the pattern set. This gives some |
| 1070 | /// flexibility to the caller (e.g., running multiple searches with the |
| 1071 | /// same pattern set), but does make the API bug-prone if you're reusing |
| 1072 | /// the same pattern set for multiple searches but intended them to be |
| 1073 | /// independent. |
| 1074 | /// |
| 1075 | /// If a pattern ID matched but the given `PatternSet` does not have |
| 1076 | /// sufficient capacity to store it, then it is not inserted and silently |
| 1077 | /// dropped. |
| 1078 | /// |
| 1079 | /// # Errors |
| 1080 | /// |
| 1081 | /// This routine errors if the search could not complete. This can occur |
| 1082 | /// in a number of circumstances: |
| 1083 | /// |
| 1084 | /// * The configuration of the lazy DFA may permit it to "quit" the search. |
| 1085 | /// For example, setting quit bytes or enabling heuristic support for |
| 1086 | /// Unicode word boundaries. The default configuration does not enable any |
| 1087 | /// option that could result in the lazy DFA quitting. |
| 1088 | /// * The configuration of the lazy DFA may also permit it to "give up" |
| 1089 | /// on a search if it makes ineffective use of its transition table |
| 1090 | /// cache. The default configuration does not enable this by default, |
| 1091 | /// although it is typically a good idea to. |
| 1092 | /// * When the provided `Input` configuration is not supported. For |
| 1093 | /// example, by providing an unsupported anchor mode. |
| 1094 | /// |
| 1095 | /// When a search returns an error, callers cannot know whether a match |
| 1096 | /// exists or not. |
| 1097 | /// |
| 1098 | /// # Example |
| 1099 | /// |
| 1100 | /// This example shows how to find all matching patterns in a haystack, |
| 1101 | /// even when some patterns match at the same position as other patterns. |
| 1102 | /// |
| 1103 | /// ``` |
| 1104 | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
| 1105 | /// use regex_automata::{ |
| 1106 | /// hybrid::dfa::DFA, |
| 1107 | /// Input, MatchKind, PatternSet, |
| 1108 | /// }; |
| 1109 | /// |
| 1110 | /// let patterns = &[ |
| 1111 | /// r"\w+" , r"\d+" , r"\pL+" , r"foo" , r"bar" , r"barfoo" , r"foobar" , |
| 1112 | /// ]; |
| 1113 | /// let dfa = DFA::builder() |
| 1114 | /// .configure(DFA::config().match_kind(MatchKind::All)) |
| 1115 | /// .build_many(patterns)?; |
| 1116 | /// let mut cache = dfa.create_cache(); |
| 1117 | /// |
| 1118 | /// let input = Input::new("foobar" ); |
| 1119 | /// let mut patset = PatternSet::new(dfa.pattern_len()); |
| 1120 | /// dfa.try_which_overlapping_matches(&mut cache, &input, &mut patset)?; |
| 1121 | /// let expected = vec![0, 2, 3, 4, 6]; |
| 1122 | /// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect(); |
| 1123 | /// assert_eq!(expected, got); |
| 1124 | /// |
| 1125 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 1126 | /// ``` |
| 1127 | #[inline ] |
| 1128 | pub fn try_which_overlapping_matches( |
| 1129 | &self, |
| 1130 | cache: &mut Cache, |
| 1131 | input: &Input<'_>, |
| 1132 | patset: &mut PatternSet, |
| 1133 | ) -> Result<(), MatchError> { |
| 1134 | let mut state = OverlappingState::start(); |
| 1135 | while let Some(m) = { |
| 1136 | self.try_search_overlapping_fwd(cache, input, &mut state)?; |
| 1137 | state.get_match() |
| 1138 | } { |
| 1139 | let _ = patset.try_insert(m.pattern()); |
| 1140 | // There's nothing left to find, so we can stop. Or the caller |
| 1141 | // asked us to. |
| 1142 | if patset.is_full() || input.get_earliest() { |
| 1143 | break; |
| 1144 | } |
| 1145 | } |
| 1146 | Ok(()) |
| 1147 | } |
| 1148 | } |
| 1149 | |
| 1150 | impl DFA { |
| 1151 | /// Transitions from the current state to the next state, given the next |
| 1152 | /// byte of input. |
| 1153 | /// |
| 1154 | /// The given cache is used to either reuse pre-computed state |
| 1155 | /// transitions, or to store this newly computed transition for future |
| 1156 | /// reuse. Thus, this routine guarantees that it will never return a state |
| 1157 | /// ID that has an "unknown" tag. |
| 1158 | /// |
| 1159 | /// # State identifier validity |
| 1160 | /// |
| 1161 | /// The only valid value for `current` is the lazy state ID returned |
| 1162 | /// by the most recent call to `next_state`, `next_state_untagged`, |
| 1163 | /// `next_state_untagged_unchecked`, `start_state_forward` or |
    /// `start_state_reverse` for the given `cache`. Any state ID returned from
| 1165 | /// prior calls to these routines (with the same `cache`) is considered |
| 1166 | /// invalid (even if it gives an appearance of working). State IDs returned |
| 1167 | /// from _any_ prior call for different `cache` values are also always |
| 1168 | /// invalid. |
| 1169 | /// |
| 1170 | /// The returned ID is always a valid ID when `current` refers to a valid |
| 1171 | /// ID. Moreover, this routine is defined for all possible values of |
| 1172 | /// `input`. |
| 1173 | /// |
| 1174 | /// These validity rules are not checked, even in debug mode. Callers are |
| 1175 | /// required to uphold these rules themselves. |
| 1176 | /// |
| 1177 | /// Violating these state ID validity rules will not sacrifice memory |
| 1178 | /// safety, but _may_ produce an incorrect result or a panic. |
| 1179 | /// |
| 1180 | /// # Panics |
| 1181 | /// |
| 1182 | /// If the given ID does not refer to a valid state, then this routine |
| 1183 | /// may panic but it also may not panic and instead return an invalid or |
| 1184 | /// incorrect ID. |
| 1185 | /// |
| 1186 | /// # Example |
| 1187 | /// |
| 1188 | /// This shows a simplistic example for walking a lazy DFA for a given |
| 1189 | /// haystack by using the `next_state` method. |
| 1190 | /// |
| 1191 | /// ``` |
| 1192 | /// use regex_automata::{hybrid::dfa::DFA, Input}; |
| 1193 | /// |
| 1194 | /// let dfa = DFA::new(r"[a-z]+r" )?; |
| 1195 | /// let mut cache = dfa.create_cache(); |
| 1196 | /// let haystack = "bar" .as_bytes(); |
| 1197 | /// |
| 1198 | /// // The start state is determined by inspecting the position and the |
| 1199 | /// // initial bytes of the haystack. |
| 1200 | /// let mut sid = dfa.start_state_forward( |
| 1201 | /// &mut cache, &Input::new(haystack), |
| 1202 | /// )?; |
| 1203 | /// // Walk all the bytes in the haystack. |
| 1204 | /// for &b in haystack { |
| 1205 | /// sid = dfa.next_state(&mut cache, sid, b)?; |
| 1206 | /// } |
| 1207 | /// // Matches are always delayed by 1 byte, so we must explicitly walk the |
| 1208 | /// // special "EOI" transition at the end of the search. |
| 1209 | /// sid = dfa.next_eoi_state(&mut cache, sid)?; |
| 1210 | /// assert!(sid.is_match()); |
| 1211 | /// |
| 1212 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 1213 | /// ``` |
| 1214 | #[inline ] |
| 1215 | pub fn next_state( |
| 1216 | &self, |
| 1217 | cache: &mut Cache, |
| 1218 | current: LazyStateID, |
| 1219 | input: u8, |
| 1220 | ) -> Result<LazyStateID, CacheError> { |
| 1221 | let class = usize::from(self.classes.get(input)); |
| 1222 | let offset = current.as_usize_untagged() + class; |
| 1223 | let sid = cache.trans[offset]; |
| 1224 | if !sid.is_unknown() { |
| 1225 | return Ok(sid); |
| 1226 | } |
| 1227 | let unit = alphabet::Unit::u8(input); |
| 1228 | Lazy::new(self, cache).cache_next_state(current, unit) |
| 1229 | } |
| 1230 | |
| 1231 | /// Transitions from the current state to the next state, given the next |
| 1232 | /// byte of input and a state ID that is not tagged. |
| 1233 | /// |
| 1234 | /// The only reason to use this routine is performance. In particular, the |
| 1235 | /// `next_state` method needs to do some additional checks, among them is |
| 1236 | /// to account for identifiers to states that are not yet computed. In |
| 1237 | /// such a case, the transition is computed on the fly. However, if it is |
| 1238 | /// known that the `current` state ID is untagged, then these checks can be |
| 1239 | /// omitted. |
| 1240 | /// |
| 1241 | /// Since this routine does not compute states on the fly, it does not |
| 1242 | /// modify the cache and thus cannot return an error. Consequently, `cache` |
| 1243 | /// does not need to be mutable and it is possible for this routine to |
| 1244 | /// return a state ID corresponding to the special "unknown" state. In |
| 1245 | /// this case, it is the caller's responsibility to use the prior state |
| 1246 | /// ID and `input` with `next_state` in order to force the computation of |
| 1247 | /// the unknown transition. Otherwise, trying to use the "unknown" state |
| 1248 | /// ID will just result in transitioning back to itself, and thus never |
| 1249 | /// terminating. (This is technically a special exemption to the state ID |
    /// validity rules, but is permissible since this routine is guaranteed to
| 1251 | /// never mutate the given `cache`, and thus the identifier is guaranteed |
| 1252 | /// to remain valid.) |
| 1253 | /// |
| 1254 | /// See [`LazyStateID`] for more details on what it means for a state ID |
| 1255 | /// to be tagged. Also, see |
| 1256 | /// [`next_state_untagged_unchecked`](DFA::next_state_untagged_unchecked) |
| 1257 | /// for this same idea, but with bounds checks forcefully elided. |
| 1258 | /// |
| 1259 | /// # State identifier validity |
| 1260 | /// |
| 1261 | /// The only valid value for `current` is an **untagged** lazy |
| 1262 | /// state ID returned by the most recent call to `next_state`, |
| 1263 | /// `next_state_untagged`, `next_state_untagged_unchecked`, |
    /// `start_state_forward` or `start_state_reverse` for the given `cache`.
| 1265 | /// Any state ID returned from prior calls to these routines (with the |
| 1266 | /// same `cache`) is considered invalid (even if it gives an appearance |
| 1267 | /// of working). State IDs returned from _any_ prior call for different |
| 1268 | /// `cache` values are also always invalid. |
| 1269 | /// |
| 1270 | /// The returned ID is always a valid ID when `current` refers to a valid |
| 1271 | /// ID, although it may be tagged. Moreover, this routine is defined for |
| 1272 | /// all possible values of `input`. |
| 1273 | /// |
| 1274 | /// Not all validity rules are checked, even in debug mode. Callers are |
| 1275 | /// required to uphold these rules themselves. |
| 1276 | /// |
| 1277 | /// Violating these state ID validity rules will not sacrifice memory |
| 1278 | /// safety, but _may_ produce an incorrect result or a panic. |
| 1279 | /// |
| 1280 | /// # Panics |
| 1281 | /// |
| 1282 | /// If the given ID does not refer to a valid state, then this routine |
| 1283 | /// may panic but it also may not panic and instead return an invalid or |
| 1284 | /// incorrect ID. |
| 1285 | /// |
| 1286 | /// # Example |
| 1287 | /// |
| 1288 | /// This shows a simplistic example for walking a lazy DFA for a given |
| 1289 | /// haystack by using the `next_state_untagged` method where possible. |
| 1290 | /// |
| 1291 | /// ``` |
| 1292 | /// use regex_automata::{hybrid::dfa::DFA, Input}; |
| 1293 | /// |
| 1294 | /// let dfa = DFA::new(r"[a-z]+r" )?; |
| 1295 | /// let mut cache = dfa.create_cache(); |
| 1296 | /// let haystack = "bar" .as_bytes(); |
| 1297 | /// |
| 1298 | /// // The start state is determined by inspecting the position and the |
| 1299 | /// // initial bytes of the haystack. |
| 1300 | /// let mut sid = dfa.start_state_forward( |
| 1301 | /// &mut cache, &Input::new(haystack), |
| 1302 | /// )?; |
| 1303 | /// // Walk all the bytes in the haystack. |
| 1304 | /// let mut at = 0; |
| 1305 | /// while at < haystack.len() { |
| 1306 | /// if sid.is_tagged() { |
| 1307 | /// sid = dfa.next_state(&mut cache, sid, haystack[at])?; |
| 1308 | /// } else { |
| 1309 | /// let mut prev_sid = sid; |
| 1310 | /// // We attempt to chew through as much as we can while moving |
| 1311 | /// // through untagged state IDs. Thus, the transition function |
| 1312 | /// // does less work on average per byte. (Unrolling this loop |
| 1313 | /// // may help even more.) |
| 1314 | /// while at < haystack.len() { |
| 1315 | /// prev_sid = sid; |
| 1316 | /// sid = dfa.next_state_untagged( |
| 1317 | /// &mut cache, sid, haystack[at], |
| 1318 | /// ); |
| 1319 | /// at += 1; |
| 1320 | /// if sid.is_tagged() { |
| 1321 | /// break; |
| 1322 | /// } |
| 1323 | /// } |
| 1324 | /// // We must ensure that we never proceed to the next iteration |
| 1325 | /// // with an unknown state ID. If we don't account for this |
| 1326 | /// // case, then search isn't guaranteed to terminate since all |
| 1327 | /// // transitions on unknown states loop back to itself. |
| 1328 | /// if sid.is_unknown() { |
| 1329 | /// sid = dfa.next_state( |
| 1330 | /// &mut cache, prev_sid, haystack[at - 1], |
| 1331 | /// )?; |
| 1332 | /// } |
| 1333 | /// } |
| 1334 | /// } |
| 1335 | /// // Matches are always delayed by 1 byte, so we must explicitly walk the |
| 1336 | /// // special "EOI" transition at the end of the search. |
| 1337 | /// sid = dfa.next_eoi_state(&mut cache, sid)?; |
| 1338 | /// assert!(sid.is_match()); |
| 1339 | /// |
| 1340 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 1341 | /// ``` |
| 1342 | #[inline ] |
| 1343 | pub fn next_state_untagged( |
| 1344 | &self, |
| 1345 | cache: &Cache, |
| 1346 | current: LazyStateID, |
| 1347 | input: u8, |
| 1348 | ) -> LazyStateID { |
| 1349 | debug_assert!(!current.is_tagged()); |
| 1350 | let class = usize::from(self.classes.get(input)); |
| 1351 | let offset = current.as_usize_unchecked() + class; |
| 1352 | cache.trans[offset] |
| 1353 | } |
| 1354 | |
| 1355 | /// Transitions from the current state to the next state, eliding bounds |
| 1356 | /// checks, given the next byte of input and a state ID that is not tagged. |
| 1357 | /// |
| 1358 | /// The only reason to use this routine is performance. In particular, the |
| 1359 | /// `next_state` method needs to do some additional checks, among them is |
| 1360 | /// to account for identifiers to states that are not yet computed. In |
| 1361 | /// such a case, the transition is computed on the fly. However, if it is |
| 1362 | /// known that the `current` state ID is untagged, then these checks can be |
| 1363 | /// omitted. |
| 1364 | /// |
| 1365 | /// Since this routine does not compute states on the fly, it does not |
| 1366 | /// modify the cache and thus cannot return an error. Consequently, `cache` |
| 1367 | /// does not need to be mutable and it is possible for this routine to |
| 1368 | /// return a state ID corresponding to the special "unknown" state. In |
| 1369 | /// this case, it is the caller's responsibility to use the prior state |
| 1370 | /// ID and `input` with `next_state` in order to force the computation of |
| 1371 | /// the unknown transition. Otherwise, trying to use the "unknown" state |
| 1372 | /// ID will just result in transitioning back to itself, and thus never |
| 1373 | /// terminating. (This is technically a special exemption to the state ID |
    /// validity rules, but is permissible since this routine is guaranteed to
| 1375 | /// never mutate the given `cache`, and thus the identifier is guaranteed |
| 1376 | /// to remain valid.) |
| 1377 | /// |
| 1378 | /// See [`LazyStateID`] for more details on what it means for a state ID |
| 1379 | /// to be tagged. Also, see |
| 1380 | /// [`next_state_untagged`](DFA::next_state_untagged) |
| 1381 | /// for this same idea, but with memory safety guaranteed by retaining |
| 1382 | /// bounds checks. |
| 1383 | /// |
| 1384 | /// # State identifier validity |
| 1385 | /// |
| 1386 | /// The only valid value for `current` is an **untagged** lazy |
| 1387 | /// state ID returned by the most recent call to `next_state`, |
| 1388 | /// `next_state_untagged`, `next_state_untagged_unchecked`, |
    /// `start_state_forward` or `start_state_reverse` for the given `cache`.
| 1390 | /// Any state ID returned from prior calls to these routines (with the |
| 1391 | /// same `cache`) is considered invalid (even if it gives an appearance |
| 1392 | /// of working). State IDs returned from _any_ prior call for different |
| 1393 | /// `cache` values are also always invalid. |
| 1394 | /// |
| 1395 | /// The returned ID is always a valid ID when `current` refers to a valid |
| 1396 | /// ID, although it may be tagged. Moreover, this routine is defined for |
| 1397 | /// all possible values of `input`. |
| 1398 | /// |
| 1399 | /// Not all validity rules are checked, even in debug mode. Callers are |
| 1400 | /// required to uphold these rules themselves. |
| 1401 | /// |
| 1402 | /// Violating these state ID validity rules will not sacrifice memory |
| 1403 | /// safety, but _may_ produce an incorrect result or a panic. |
| 1404 | /// |
| 1405 | /// # Safety |
| 1406 | /// |
| 1407 | /// Callers of this method must guarantee that `current` refers to a valid |
| 1408 | /// state ID according to the rules described above. If `current` is not a |
| 1409 | /// valid state ID for this automaton, then calling this routine may result |
| 1410 | /// in undefined behavior. |
| 1411 | /// |
| 1412 | /// If `current` is valid, then the ID returned is valid for all possible |
| 1413 | /// values of `input`. |
| 1414 | #[inline ] |
| 1415 | pub unsafe fn next_state_untagged_unchecked( |
| 1416 | &self, |
| 1417 | cache: &Cache, |
| 1418 | current: LazyStateID, |
| 1419 | input: u8, |
| 1420 | ) -> LazyStateID { |
| 1421 | debug_assert!(!current.is_tagged()); |
| 1422 | let class = usize::from(self.classes.get(input)); |
| 1423 | let offset = current.as_usize_unchecked() + class; |
| 1424 | *cache.trans.get_unchecked(offset) |
| 1425 | } |
| 1426 | |
| 1427 | /// Transitions from the current state to the next state for the special |
| 1428 | /// EOI symbol. |
| 1429 | /// |
| 1430 | /// The given cache is used to either reuse pre-computed state |
| 1431 | /// transitions, or to store this newly computed transition for future |
| 1432 | /// reuse. Thus, this routine guarantees that it will never return a state |
| 1433 | /// ID that has an "unknown" tag. |
| 1434 | /// |
| 1435 | /// This routine must be called at the end of every search in a correct |
| 1436 | /// implementation of search. Namely, lazy DFAs in this crate delay matches |
| 1437 | /// by one byte in order to support look-around operators. Thus, after |
| 1438 | /// reaching the end of a haystack, a search implementation must follow one |
| 1439 | /// last EOI transition. |
| 1440 | /// |
| 1441 | /// It is best to think of EOI as an additional symbol in the alphabet of a |
| 1442 | /// DFA that is distinct from every other symbol. That is, the alphabet of |
| 1443 | /// lazy DFAs in this crate has a logical size of 257 instead of 256, where |
| 1444 | /// 256 corresponds to every possible inhabitant of `u8`. (In practice, the |
| 1445 | /// physical alphabet size may be smaller because of alphabet compression |
| 1446 | /// via equivalence classes, but EOI is always represented somehow in the |
| 1447 | /// alphabet.) |
| 1448 | /// |
| 1449 | /// # State identifier validity |
| 1450 | /// |
| 1451 | /// The only valid value for `current` is the lazy state ID returned |
| 1452 | /// by the most recent call to `next_state`, `next_state_untagged`, |
| 1453 | /// `next_state_untagged_unchecked`, `start_state_forward` or |
    /// `start_state_reverse` for the given `cache`. Any state ID returned from
| 1455 | /// prior calls to these routines (with the same `cache`) is considered |
| 1456 | /// invalid (even if it gives an appearance of working). State IDs returned |
| 1457 | /// from _any_ prior call for different `cache` values are also always |
| 1458 | /// invalid. |
| 1459 | /// |
| 1460 | /// The returned ID is always a valid ID when `current` refers to a valid |
| 1461 | /// ID. |
| 1462 | /// |
| 1463 | /// These validity rules are not checked, even in debug mode. Callers are |
| 1464 | /// required to uphold these rules themselves. |
| 1465 | /// |
| 1466 | /// Violating these state ID validity rules will not sacrifice memory |
| 1467 | /// safety, but _may_ produce an incorrect result or a panic. |
| 1468 | /// |
| 1469 | /// # Panics |
| 1470 | /// |
| 1471 | /// If the given ID does not refer to a valid state, then this routine |
| 1472 | /// may panic but it also may not panic and instead return an invalid or |
| 1473 | /// incorrect ID. |
| 1474 | /// |
| 1475 | /// # Example |
| 1476 | /// |
| 1477 | /// This shows a simplistic example for walking a DFA for a given haystack, |
| 1478 | /// and then finishing the search with the final EOI transition. |
| 1479 | /// |
| 1480 | /// ``` |
| 1481 | /// use regex_automata::{hybrid::dfa::DFA, Input}; |
| 1482 | /// |
| 1483 | /// let dfa = DFA::new(r"[a-z]+r" )?; |
| 1484 | /// let mut cache = dfa.create_cache(); |
| 1485 | /// let haystack = "bar" .as_bytes(); |
| 1486 | /// |
| 1487 | /// // The start state is determined by inspecting the position and the |
| 1488 | /// // initial bytes of the haystack. |
| 1489 | /// let mut sid = dfa.start_state_forward( |
| 1490 | /// &mut cache, &Input::new(haystack), |
| 1491 | /// )?; |
| 1492 | /// // Walk all the bytes in the haystack. |
| 1493 | /// for &b in haystack { |
| 1494 | /// sid = dfa.next_state(&mut cache, sid, b)?; |
| 1495 | /// } |
| 1496 | /// // Matches are always delayed by 1 byte, so we must explicitly walk |
| 1497 | /// // the special "EOI" transition at the end of the search. Without this |
| 1498 | /// // final transition, the assert below will fail since the DFA will not |
| 1499 | /// // have entered a match state yet! |
| 1500 | /// sid = dfa.next_eoi_state(&mut cache, sid)?; |
| 1501 | /// assert!(sid.is_match()); |
| 1502 | /// |
| 1503 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 1504 | /// ``` |
| 1505 | #[inline ] |
| 1506 | pub fn next_eoi_state( |
| 1507 | &self, |
| 1508 | cache: &mut Cache, |
| 1509 | current: LazyStateID, |
| 1510 | ) -> Result<LazyStateID, CacheError> { |
| 1511 | let eoi = self.classes.eoi().as_usize(); |
| 1512 | let offset = current.as_usize_untagged() + eoi; |
| 1513 | let sid = cache.trans[offset]; |
| 1514 | if !sid.is_unknown() { |
| 1515 | return Ok(sid); |
| 1516 | } |
| 1517 | let unit = self.classes.eoi(); |
| 1518 | Lazy::new(self, cache).cache_next_state(current, unit) |
| 1519 | } |
| 1520 | |
| 1521 | /// Return the ID of the start state for this lazy DFA for the given |
| 1522 | /// starting configuration. |
| 1523 | /// |
| 1524 | /// Unlike typical DFA implementations, the start state for DFAs in this |
| 1525 | /// crate is dependent on a few different factors: |
| 1526 | /// |
| 1527 | /// * The [`Anchored`] mode of the search. Unanchored, anchored and |
| 1528 | /// anchored searches for a specific [`PatternID`] all use different start |
| 1529 | /// states. |
| 1530 | /// * Whether a "look-behind" byte exists. For example, the `^` anchor |
| 1531 | /// matches if and only if there is no look-behind byte. |
| 1532 | /// * The specific value of that look-behind byte. For example, a `(?m:^)` |
| 1533 | /// assertion only matches when there is either no look-behind byte, or |
| 1534 | /// when the look-behind byte is a line terminator. |
| 1535 | /// |
| 1536 | /// The [starting configuration](start::Config) provides the above |
| 1537 | /// information. |
| 1538 | /// |
| 1539 | /// This routine can be used for either forward or reverse searches. |
| 1540 | /// Although, as a convenience, if you have an [`Input`], then it |
| 1541 | /// may be more succinct to use [`DFA::start_state_forward`] or |
| 1542 | /// [`DFA::start_state_reverse`]. Note, for example, that the convenience |
| 1543 | /// routines return a [`MatchError`] on failure where as this routine |
| 1544 | /// returns a [`StartError`]. |
| 1545 | /// |
| 1546 | /// # Errors |
| 1547 | /// |
| 1548 | /// This may return a [`StartError`] if the search needs to give up when |
| 1549 | /// determining the start state (for example, if it sees a "quit" byte |
| 1550 | /// or if the cache has become inefficient). This can also return an |
| 1551 | /// error if the given configuration contains an unsupported [`Anchored`] |
| 1552 | /// configuration. |
| 1553 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 1554 | pub fn start_state( |
| 1555 | &self, |
| 1556 | cache: &mut Cache, |
| 1557 | config: &start::Config, |
| 1558 | ) -> Result<LazyStateID, StartError> { |
| 1559 | let lazy = LazyRef::new(self, cache); |
| 1560 | let anchored = config.get_anchored(); |
| 1561 | let start = match config.get_look_behind() { |
| 1562 | None => Start::Text, |
| 1563 | Some(byte) => { |
| 1564 | if !self.quitset.is_empty() && self.quitset.contains(byte) { |
| 1565 | return Err(StartError::quit(byte)); |
| 1566 | } |
| 1567 | self.start_map.get(byte) |
| 1568 | } |
| 1569 | }; |
| 1570 | let start_id = lazy.get_cached_start_id(anchored, start)?; |
| 1571 | if !start_id.is_unknown() { |
| 1572 | return Ok(start_id); |
| 1573 | } |
| 1574 | Lazy::new(self, cache).cache_start_group(anchored, start) |
| 1575 | } |
| 1576 | |
| 1577 | /// Return the ID of the start state for this lazy DFA when executing a |
| 1578 | /// forward search. |
| 1579 | /// |
| 1580 | /// This is a convenience routine for calling [`DFA::start_state`] that |
| 1581 | /// converts the given [`Input`] to a [start configuration](start::Config). |
| 1582 | /// Additionally, if an error occurs, it is converted from a [`StartError`] |
| 1583 | /// to a [`MatchError`] using the offset information in the given |
| 1584 | /// [`Input`]. |
| 1585 | /// |
| 1586 | /// # Errors |
| 1587 | /// |
| 1588 | /// This may return a [`MatchError`] if the search needs to give up when |
| 1589 | /// determining the start state (for example, if it sees a "quit" byte or |
| 1590 | /// if the cache has become inefficient). This can also return an error if |
| 1591 | /// the given `Input` contains an unsupported [`Anchored`] configuration. |
| 1592 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 1593 | pub fn start_state_forward( |
| 1594 | &self, |
| 1595 | cache: &mut Cache, |
| 1596 | input: &Input<'_>, |
| 1597 | ) -> Result<LazyStateID, MatchError> { |
| 1598 | let config = start::Config::from_input_forward(input); |
| 1599 | self.start_state(cache, &config).map_err(|err| match err { |
| 1600 | StartError::Cache { .. } => MatchError::gave_up(input.start()), |
| 1601 | StartError::Quit { byte } => { |
| 1602 | let offset = input |
| 1603 | .start() |
| 1604 | .checked_sub(1) |
| 1605 | .expect("no quit in start without look-behind" ); |
| 1606 | MatchError::quit(byte, offset) |
| 1607 | } |
| 1608 | StartError::UnsupportedAnchored { mode } => { |
| 1609 | MatchError::unsupported_anchored(mode) |
| 1610 | } |
| 1611 | }) |
| 1612 | } |
| 1613 | |
| 1614 | /// Return the ID of the start state for this lazy DFA when executing a |
| 1615 | /// reverse search. |
| 1616 | /// |
| 1617 | /// This is a convenience routine for calling [`DFA::start_state`] that |
| 1618 | /// converts the given [`Input`] to a [start configuration](start::Config). |
| 1619 | /// Additionally, if an error occurs, it is converted from a [`StartError`] |
| 1620 | /// to a [`MatchError`] using the offset information in the given |
| 1621 | /// [`Input`]. |
| 1622 | /// |
| 1623 | /// # Errors |
| 1624 | /// |
| 1625 | /// This may return a [`MatchError`] if the search needs to give up when |
| 1626 | /// determining the start state (for example, if it sees a "quit" byte or |
| 1627 | /// if the cache has become inefficient). This can also return an error if |
| 1628 | /// the given `Input` contains an unsupported [`Anchored`] configuration. |
| 1629 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 1630 | pub fn start_state_reverse( |
| 1631 | &self, |
| 1632 | cache: &mut Cache, |
| 1633 | input: &Input<'_>, |
| 1634 | ) -> Result<LazyStateID, MatchError> { |
| 1635 | let config = start::Config::from_input_reverse(input); |
| 1636 | self.start_state(cache, &config).map_err(|err| match err { |
| 1637 | StartError::Cache { .. } => MatchError::gave_up(input.end()), |
| 1638 | StartError::Quit { byte } => { |
| 1639 | let offset = input.end(); |
| 1640 | MatchError::quit(byte, offset) |
| 1641 | } |
| 1642 | StartError::UnsupportedAnchored { mode } => { |
| 1643 | MatchError::unsupported_anchored(mode) |
| 1644 | } |
| 1645 | }) |
| 1646 | } |
| 1647 | |
| 1648 | /// Returns the total number of patterns that match in this state. |
| 1649 | /// |
| 1650 | /// If the lazy DFA was compiled with one pattern, then this must |
| 1651 | /// necessarily always return `1` for all match states. |
| 1652 | /// |
| 1653 | /// A lazy DFA guarantees that [`DFA::match_pattern`] can be called with |
| 1654 | /// indices up to (but not including) the length returned by this routine |
| 1655 | /// without panicking. |
| 1656 | /// |
| 1657 | /// # Panics |
| 1658 | /// |
| 1659 | /// If the given state is not a match state, then this may either panic |
| 1660 | /// or return an incorrect result. |
| 1661 | /// |
| 1662 | /// # Example |
| 1663 | /// |
| 1664 | /// This example shows a simple instance of implementing overlapping |
| 1665 | /// matches. In particular, it shows not only how to determine how many |
| 1666 | /// patterns have matched in a particular state, but also how to access |
| 1667 | /// which specific patterns have matched. |
| 1668 | /// |
| 1669 | /// Notice that we must use [`MatchKind::All`] when building the DFA. If we |
| 1670 | /// used [`MatchKind::LeftmostFirst`] instead, then the DFA would not be |
| 1671 | /// constructed in a way that supports overlapping matches. (It would only |
| 1672 | /// report a single pattern that matches at any particular point in time.) |
| 1673 | /// |
| 1674 | /// Another thing to take note of is the patterns used and the order in |
| 1675 | /// which the pattern IDs are reported. In the example below, pattern `3` |
| 1676 | /// is yielded first. Why? Because it corresponds to the match that |
| 1677 | /// appears first. Namely, the `@` symbol is part of `\S+` but not part |
| 1678 | /// of any of the other patterns. Since the `\S+` pattern has a match that |
| 1679 | /// starts to the left of any other pattern, its ID is returned before any |
| 1680 | /// other. |
| 1681 | /// |
| 1682 | /// ``` |
| 1683 | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
| 1684 | /// use regex_automata::{hybrid::dfa::DFA, Input, MatchKind}; |
| 1685 | /// |
| 1686 | /// let dfa = DFA::builder() |
| 1687 | /// .configure(DFA::config().match_kind(MatchKind::All)) |
| 1688 | /// .build_many(&[ |
| 1689 | /// r"\w+" , r"[a-z]+" , r"[A-Z]+" , r"\S+" , |
| 1690 | /// ])?; |
| 1691 | /// let mut cache = dfa.create_cache(); |
| 1692 | /// let haystack = "@bar" .as_bytes(); |
| 1693 | /// |
| 1694 | /// // The start state is determined by inspecting the position and the |
| 1695 | /// // initial bytes of the haystack. |
| 1696 | /// let mut sid = dfa.start_state_forward( |
| 1697 | /// &mut cache, &Input::new(haystack), |
| 1698 | /// )?; |
| 1699 | /// // Walk all the bytes in the haystack. |
| 1700 | /// for &b in haystack { |
| 1701 | /// sid = dfa.next_state(&mut cache, sid, b)?; |
| 1702 | /// } |
| 1703 | /// sid = dfa.next_eoi_state(&mut cache, sid)?; |
| 1704 | /// |
| 1705 | /// assert!(sid.is_match()); |
| 1706 | /// assert_eq!(dfa.match_len(&mut cache, sid), 3); |
| 1707 | /// // The following calls are guaranteed to not panic since `match_len` |
| 1708 | /// // returned `3` above. |
| 1709 | /// assert_eq!(dfa.match_pattern(&mut cache, sid, 0).as_usize(), 3); |
| 1710 | /// assert_eq!(dfa.match_pattern(&mut cache, sid, 1).as_usize(), 0); |
| 1711 | /// assert_eq!(dfa.match_pattern(&mut cache, sid, 2).as_usize(), 1); |
| 1712 | /// |
| 1713 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 1714 | /// ``` |
| 1715 | #[inline ] |
| 1716 | pub fn match_len(&self, cache: &Cache, id: LazyStateID) -> usize { |
| 1717 | assert!(id.is_match()); |
| 1718 | LazyRef::new(self, cache).get_cached_state(id).match_len() |
| 1719 | } |
| 1720 | |
| 1721 | /// Returns the pattern ID corresponding to the given match index in the |
| 1722 | /// given state. |
| 1723 | /// |
| 1724 | /// See [`DFA::match_len`] for an example of how to use this method |
| 1725 | /// correctly. Note that if you know your lazy DFA is configured with a |
| 1726 | /// single pattern, then this routine is never necessary since it will |
| 1727 | /// always return a pattern ID of `0` for an index of `0` when `id` |
| 1728 | /// corresponds to a match state. |
| 1729 | /// |
| 1730 | /// Typically, this routine is used when implementing an overlapping |
| 1731 | /// search, as the example for `DFA::match_len` does. |
| 1732 | /// |
| 1733 | /// # Panics |
| 1734 | /// |
| 1735 | /// If the state ID is not a match state or if the match index is out |
| 1736 | /// of bounds for the given state, then this routine may either panic |
| 1737 | /// or produce an incorrect result. If the state ID is correct and the |
| 1738 | /// match index is correct, then this routine always produces a valid |
| 1739 | /// `PatternID`. |
| 1740 | #[inline ] |
| 1741 | pub fn match_pattern( |
| 1742 | &self, |
| 1743 | cache: &Cache, |
| 1744 | id: LazyStateID, |
| 1745 | match_index: usize, |
| 1746 | ) -> PatternID { |
| 1747 | // This is an optimization for the very common case of a DFA with a |
| 1748 | // single pattern. This conditional avoids a somewhat more costly path |
| 1749 | // that finds the pattern ID from the corresponding `State`, which |
| 1750 | // requires a bit of slicing/pointer-chasing. This optimization tends |
| 1751 | // to only matter when matches are frequent. |
| 1752 | if self.pattern_len() == 1 { |
| 1753 | return PatternID::ZERO; |
| 1754 | } |
| 1755 | LazyRef::new(self, cache) |
| 1756 | .get_cached_state(id) |
| 1757 | .match_pattern(match_index) |
| 1758 | } |
| 1759 | } |
| 1760 | |
/// A cache represents a partially computed DFA.
///
/// A cache is the key component that differentiates a classical DFA and a
/// hybrid NFA/DFA (also called a "lazy DFA"). Where a classical DFA builds a
/// complete transition table that can handle all possible inputs, a hybrid
/// NFA/DFA starts with an empty transition table and builds only the parts
/// required during search. The parts that are built are stored in a cache. For
/// this reason, a cache is a required parameter for nearly every operation on
/// a [`DFA`].
///
/// Caches can be created from their corresponding DFA via
/// [`DFA::create_cache`]. A cache can only be used with either the DFA that
/// created it, or the DFA that was most recently used to reset it with
/// [`Cache::reset`]. Using a cache with any other DFA may result in panics
/// or incorrect results.
#[derive(Clone, Debug)]
pub struct Cache {
    // N.B. If you're looking to understand how determinization works, it
    // is probably simpler to first grok src/dfa/determinize.rs, since that
    // doesn't have the "laziness" component.
    /// The transition table.
    ///
    /// Given a `current` LazyStateID and an `input` byte, the next state can
    /// be computed via `trans[untagged(current) + equiv_class(input)]`. Notice
    /// that no multiplication is used. That's because state identifiers are
    /// "premultiplied."
    ///
    /// Note that the next state may be the "unknown" state. In this case, the
    /// next state is not known and determinization for `current` on `input`
    /// must be performed.
    trans: Vec<LazyStateID>,
    /// The starting states for this DFA.
    ///
    /// These are computed lazily. Initially, these are all set to "unknown"
    /// lazy state IDs.
    ///
    /// When 'starts_for_each_pattern' is disabled (the default), then the size
    /// of this is constrained to the possible starting configurations based
    /// on the search parameters. (At time of writing, that's 4.) However,
    /// when starting states for each pattern is enabled, then there are N
    /// additional groups of starting states, where each group reflects the
    /// different possible configurations and N is the number of patterns.
    starts: Vec<LazyStateID>,
    /// A sequence of NFA/DFA powerset states that have been computed for this
    /// lazy DFA. This sequence is indexable by untagged LazyStateIDs. (Every
    /// tagged LazyStateID can be used to index this sequence by converting it
    /// to its untagged form.)
    states: Vec<State>,
    /// A map from states to their corresponding IDs. This map may be accessed
    /// via the raw byte representation of a state, which means that a `State`
    /// does not need to be allocated to determine whether it already exists
    /// in this map. Indeed, the existence of such a state is what determines
    /// whether we allocate a new `State` or not.
    ///
    /// The higher level idea here is that we do just enough determinization
    /// for a state to check whether we've already computed it. If we have,
    /// then we can save a little (albeit not much) work. The real savings is
    /// in memory usage. If we never checked for trivially duplicate states,
    /// then our memory usage would explode to unreasonable levels.
    states_to_id: StateMap,
    /// Sparse sets used to track which NFA states have been visited during
    /// various traversals.
    sparses: SparseSets,
    /// Scratch space for traversing the NFA graph. (We use space on the heap
    /// instead of the call stack.)
    stack: Vec<NFAStateID>,
    /// Scratch space for building a NFA/DFA powerset state. This is used to
    /// help amortize allocation since not every powerset state generated is
    /// added to the cache. In particular, if it already exists in the cache,
    /// then there is no need to allocate a new `State` for it.
    scratch_state_builder: StateBuilderEmpty,
    /// A simple abstraction for handling the saving of at most a single state
    /// across a cache clearing. This is required for correctness. Namely, if
    /// adding a new state after clearing the cache fails, then the caller
    /// must retain the ability to continue using the state ID given. The
    /// state corresponding to the state ID is what we preserve across cache
    /// clearings.
    state_saver: StateSaver,
    /// The memory usage, in bytes, used by 'states' and 'states_to_id'. We
    /// track this as new states are added since states use a variable amount
    /// of heap. Tracking this as we add states makes it possible to compute
    /// the total amount of memory used by the determinizer in constant time.
    memory_usage_state: usize,
    /// The number of times the cache has been cleared. When a minimum cache
    /// clear count is set, then the cache will return an error instead of
    /// clearing the cache if the count has been exceeded.
    clear_count: usize,
    /// The total number of bytes searched since the last time this cache was
    /// cleared, not including the current search.
    ///
    /// This can be added to the length of the current search to get the true
    /// total number of bytes searched.
    ///
    /// This is generally only non-zero when the
    /// `Cache::search_{start,update,finish}` APIs are used to track search
    /// progress.
    bytes_searched: usize,
    /// The progress of the current search.
    ///
    /// This is only non-`None` when callers utilize the `Cache::search_start`,
    /// `Cache::search_update` and `Cache::search_finish` APIs.
    ///
    /// The purpose of recording search progress is to be able to make a
    /// determination about the efficiency of the cache. Namely, by keeping
    /// track of the number of bytes searched, it's possible to compare it
    /// against the number of states computed and thus decide (via knobs like
    /// `Config::minimum_bytes_per_state`) whether the lazy DFA should give up.
    progress: Option<SearchProgress>,
}
| 1868 | |
| 1869 | impl Cache { |
| 1870 | /// Create a new cache for the given lazy DFA. |
| 1871 | /// |
| 1872 | /// The cache returned should only be used for searches for the given DFA. |
| 1873 | /// If you want to reuse the cache for another DFA, then you must call |
| 1874 | /// [`Cache::reset`] with that DFA. |
| 1875 | pub fn new(dfa: &DFA) -> Cache { |
| 1876 | let mut cache = Cache { |
| 1877 | trans: alloc::vec![], |
| 1878 | starts: alloc::vec![], |
| 1879 | states: alloc::vec![], |
| 1880 | states_to_id: StateMap::new(), |
| 1881 | sparses: SparseSets::new(dfa.get_nfa().states().len()), |
| 1882 | stack: alloc::vec![], |
| 1883 | scratch_state_builder: StateBuilderEmpty::new(), |
| 1884 | state_saver: StateSaver::none(), |
| 1885 | memory_usage_state: 0, |
| 1886 | clear_count: 0, |
| 1887 | bytes_searched: 0, |
| 1888 | progress: None, |
| 1889 | }; |
| 1890 | debug!("pre-init lazy DFA cache size: {}" , cache.memory_usage()); |
| 1891 | Lazy { dfa, cache: &mut cache }.init_cache(); |
| 1892 | debug!("post-init lazy DFA cache size: {}" , cache.memory_usage()); |
| 1893 | cache |
| 1894 | } |
| 1895 | |
| 1896 | /// Reset this cache such that it can be used for searching with the given |
| 1897 | /// lazy DFA (and only that DFA). |
| 1898 | /// |
| 1899 | /// A cache reset permits reusing memory already allocated in this cache |
| 1900 | /// with a different lazy DFA. |
| 1901 | /// |
| 1902 | /// Resetting a cache sets its "clear count" to 0. This is relevant if the |
| 1903 | /// lazy DFA has been configured to "give up" after it has cleared the |
| 1904 | /// cache a certain number of times. |
| 1905 | /// |
| 1906 | /// Any lazy state ID generated by the cache prior to resetting it is |
| 1907 | /// invalid after the reset. |
| 1908 | /// |
| 1909 | /// # Example |
| 1910 | /// |
| 1911 | /// This shows how to re-purpose a cache for use with a different DFA. |
| 1912 | /// |
| 1913 | /// ``` |
| 1914 | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
| 1915 | /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input}; |
| 1916 | /// |
| 1917 | /// let dfa1 = DFA::new(r"\w" )?; |
| 1918 | /// let dfa2 = DFA::new(r"\W" )?; |
| 1919 | /// |
| 1920 | /// let mut cache = dfa1.create_cache(); |
| 1921 | /// assert_eq!( |
| 1922 | /// Some(HalfMatch::must(0, 2)), |
| 1923 | /// dfa1.try_search_fwd(&mut cache, &Input::new("Δ" ))?, |
| 1924 | /// ); |
| 1925 | /// |
| 1926 | /// // Using 'cache' with dfa2 is not allowed. It may result in panics or |
| 1927 | /// // incorrect results. In order to re-purpose the cache, we must reset |
| 1928 | /// // it with the DFA we'd like to use it with. |
| 1929 | /// // |
| 1930 | /// // Similarly, after this reset, using the cache with 'dfa1' is also not |
| 1931 | /// // allowed. |
| 1932 | /// cache.reset(&dfa2); |
| 1933 | /// assert_eq!( |
| 1934 | /// Some(HalfMatch::must(0, 3)), |
| 1935 | /// dfa2.try_search_fwd(&mut cache, &Input::new("☃" ))?, |
| 1936 | /// ); |
| 1937 | /// |
| 1938 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 1939 | /// ``` |
| 1940 | pub fn reset(&mut self, dfa: &DFA) { |
| 1941 | Lazy::new(dfa, self).reset_cache() |
| 1942 | } |
| 1943 | |
| 1944 | /// Initializes a new search starting at the given position. |
| 1945 | /// |
| 1946 | /// If a previous search was unfinished, then it is finished automatically |
| 1947 | /// and a new search is begun. |
| 1948 | /// |
| 1949 | /// Note that keeping track of search progress is _not necessary_ |
| 1950 | /// for correct implementations of search using a lazy DFA. Keeping |
| 1951 | /// track of search progress is only necessary if you want the |
| 1952 | /// [`Config::minimum_bytes_per_state`] configuration knob to work. |
| 1953 | #[inline ] |
| 1954 | pub fn search_start(&mut self, at: usize) { |
| 1955 | // If a previous search wasn't marked as finished, then finish it |
| 1956 | // now automatically. |
| 1957 | if let Some(p) = self.progress.take() { |
| 1958 | self.bytes_searched += p.len(); |
| 1959 | } |
| 1960 | self.progress = Some(SearchProgress { start: at, at }); |
| 1961 | } |
| 1962 | |
| 1963 | /// Updates the current search to indicate that it has search to the |
| 1964 | /// current position. |
| 1965 | /// |
| 1966 | /// No special care needs to be taken for reverse searches. Namely, the |
| 1967 | /// position given may be _less than_ the starting position of the search. |
| 1968 | /// |
| 1969 | /// # Panics |
| 1970 | /// |
| 1971 | /// This panics if no search has been started by [`Cache::search_start`]. |
| 1972 | #[inline ] |
| 1973 | pub fn search_update(&mut self, at: usize) { |
| 1974 | let p = |
| 1975 | self.progress.as_mut().expect("no in-progress search to update" ); |
| 1976 | p.at = at; |
| 1977 | } |
| 1978 | |
| 1979 | /// Indicates that a search has finished at the given position. |
| 1980 | /// |
| 1981 | /// # Panics |
| 1982 | /// |
| 1983 | /// This panics if no search has been started by [`Cache::search_start`]. |
| 1984 | #[inline ] |
| 1985 | pub fn search_finish(&mut self, at: usize) { |
| 1986 | let mut p = |
| 1987 | self.progress.take().expect("no in-progress search to finish" ); |
| 1988 | p.at = at; |
| 1989 | self.bytes_searched += p.len(); |
| 1990 | } |
| 1991 | |
| 1992 | /// Returns the total number of bytes that have been searched since this |
| 1993 | /// cache was last cleared. |
| 1994 | /// |
| 1995 | /// This is useful for determining the efficiency of the cache. For |
| 1996 | /// example, the lazy DFA uses this value in conjunction with the |
| 1997 | /// [`Config::minimum_bytes_per_state`] knob to help determine whether it |
| 1998 | /// should quit searching. |
| 1999 | /// |
| 2000 | /// This always returns `0` if search progress isn't being tracked. Note |
| 2001 | /// that the lazy DFA search routines in this crate always track search |
| 2002 | /// progress. |
| 2003 | pub fn search_total_len(&self) -> usize { |
| 2004 | self.bytes_searched + self.progress.as_ref().map_or(0, |p| p.len()) |
| 2005 | } |
| 2006 | |
| 2007 | /// Returns the total number of times this cache has been cleared since it |
| 2008 | /// was either created or last reset. |
| 2009 | /// |
| 2010 | /// This is useful for informational purposes or if you want to change |
| 2011 | /// search strategies based on the number of times the cache has been |
| 2012 | /// cleared. |
| 2013 | pub fn clear_count(&self) -> usize { |
| 2014 | self.clear_count |
| 2015 | } |
| 2016 | |
| 2017 | /// Returns the heap memory usage, in bytes, of this cache. |
| 2018 | /// |
| 2019 | /// This does **not** include the stack size used up by this cache. To |
| 2020 | /// compute that, use `std::mem::size_of::<Cache>()`. |
| 2021 | pub fn memory_usage(&self) -> usize { |
| 2022 | const ID_SIZE: usize = size_of::<LazyStateID>(); |
| 2023 | const STATE_SIZE: usize = size_of::<State>(); |
| 2024 | |
| 2025 | // NOTE: If you make changes to the below, then |
| 2026 | // 'minimum_cache_capacity' should be updated correspondingly. |
| 2027 | |
| 2028 | self.trans.len() * ID_SIZE |
| 2029 | + self.starts.len() * ID_SIZE |
| 2030 | + self.states.len() * STATE_SIZE |
| 2031 | // Maps likely use more memory than this, but it's probably close. |
| 2032 | + self.states_to_id.len() * (STATE_SIZE + ID_SIZE) |
| 2033 | + self.sparses.memory_usage() |
| 2034 | + self.stack.capacity() * ID_SIZE |
| 2035 | + self.scratch_state_builder.capacity() |
| 2036 | // Heap memory used by 'State' in both 'states' and 'states_to_id'. |
| 2037 | + self.memory_usage_state |
| 2038 | } |
| 2039 | } |
| 2040 | |
/// Keeps track of the progress of the current search.
///
/// This is updated via the `Cache::search_{start,update,finish}` APIs to
/// record how many bytes have been searched. This permits computing a
/// heuristic that represents the efficiency of a cache, and thus helps inform
/// whether the lazy DFA should give up or not.
#[derive(Clone, Debug)]
struct SearchProgress {
    /// The haystack offset at which the current search started.
    start: usize,
    /// The haystack offset the current search has reached so far. For
    /// reverse searches, this may be less than `start`.
    at: usize,
}
| 2052 | |
| 2053 | impl SearchProgress { |
| 2054 | /// Returns the length, in bytes, of this search so far. |
| 2055 | /// |
| 2056 | /// This automatically handles the case of a reverse search, where `at` |
| 2057 | /// is likely to be less than `start`. |
| 2058 | fn len(&self) -> usize { |
| 2059 | if self.start <= self.at { |
| 2060 | self.at - self.start |
| 2061 | } else { |
| 2062 | self.start - self.at |
| 2063 | } |
| 2064 | } |
| 2065 | } |
| 2066 | |
/// A map from states to state identifiers. When using std, we use a standard
/// hashmap, since it's a bit faster for this use case. (Other maps, like
/// one's based on FNV, have not yet been benchmarked.)
///
/// The main purpose of this map is to reuse states where possible. This won't
/// fully minimize the DFA, but it works well in a lot of cases.
#[cfg(feature = "std")]
type StateMap = std::collections::HashMap<State, LazyStateID>;
// Without std there is no default hasher available, so fall back to an
// ordered map from 'alloc'.
#[cfg(not(feature = "std"))]
type StateMap = alloc::collections::BTreeMap<State, LazyStateID>;
| 2077 | |
/// A type that groups methods that require the base NFA/DFA and writable
/// access to the cache.
#[derive(Debug)]
struct Lazy<'i, 'c> {
    /// The lazy DFA whose transitions are being determinized on demand.
    dfa: &'i DFA,
    /// Exclusive access to the cache that stores computed states.
    cache: &'c mut Cache,
}
| 2085 | |
| 2086 | impl<'i, 'c> Lazy<'i, 'c> { |
    /// Creates a new 'Lazy' wrapper for a DFA and its corresponding cache.
    ///
    /// The cache is presumed to have been created for (or reset with) the
    /// given DFA.
    fn new(dfa: &'i DFA, cache: &'c mut Cache) -> Lazy<'i, 'c> {
        Lazy { dfa, cache }
    }
| 2091 | |
    /// Return an immutable view by downgrading a writable cache to a read-only
    /// cache.
    ///
    /// This is how the read-only helpers on 'LazyRef' (sentinel IDs, validity
    /// and capacity checks) are reached from within 'Lazy' methods.
    fn as_ref<'a>(&'a self) -> LazyRef<'i, 'a> {
        LazyRef::new(self.dfa, self.cache)
    }
| 2097 | |
    /// This is marked as 'inline(never)' to avoid bloating methods on 'DFA'
    /// like 'next_state' and 'next_eoi_state' that are called in critical
    /// areas. The idea is to let the optimizer focus on the other areas of
    /// those methods as the hot path.
    ///
    /// Here's an example that justifies 'inline(never)'
    ///
    /// ```ignore
    /// regex-cli find match hybrid \
    ///     --cache-capacity 100000000 \
    ///     -p '\pL{100}'
    ///     all-codepoints-utf8-100x
    /// ```
    ///
    /// Where 'all-codepoints-utf8-100x' is the UTF-8 encoding of every
    /// codepoint, in sequence, repeated 100 times.
    ///
    /// With 'inline(never)' hyperfine reports 1.1s per run. With
    /// 'inline(always)', hyperfine reports 1.23s. So that's a 10% improvement.
    #[cold]
    #[inline(never)]
    fn cache_next_state(
        &mut self,
        mut current: LazyStateID,
        unit: alphabet::Unit,
    ) -> Result<LazyStateID, CacheError> {
        let stride2 = self.dfa.stride2();
        let empty_builder = self.get_state_builder();
        // Run one step of determinization: compute the powerset state
        // reachable from 'current' on 'unit'.
        let builder = determinize::next(
            self.dfa.get_nfa(),
            self.dfa.get_config().get_match_kind(),
            &mut self.cache.sparses,
            &mut self.cache.stack,
            &self.cache.states[current.as_usize_untagged() >> stride2],
            unit,
            empty_builder,
        );
        // If the new state won't fit, 'add_builder_state' below may clear
        // the cache (via 'add_state'). Clearing invalidates 'current', so
        // save it first and re-resolve its new ID afterward.
        let save_state = !self.as_ref().state_builder_fits_in_cache(&builder);
        if save_state {
            self.save_state(current);
        }
        let next = self.add_builder_state(builder, |sid| sid)?;
        if save_state {
            current = self.saved_state_id();
        }
        // This is the payoff. The next time 'next_state' is called with this
        // state and alphabet unit, it will find this transition and avoid
        // having to re-determinize this transition.
        self.set_transition(current, unit, next);
        Ok(next)
    }
| 2149 | |
    /// Compute and cache the starting state for the given pattern ID (if
    /// present) and the starting configuration.
    ///
    /// This returns an error if a pattern ID is given and the DFA isn't
    /// configured to build anchored start states for each pattern.
    ///
    /// This will never return an unknown lazy state ID.
    ///
    /// If caching this state would otherwise result in a cache that has been
    /// cleared too many times, then an error is returned.
    #[cold]
    #[inline(never)]
    fn cache_start_group(
        &mut self,
        anchored: Anchored,
        start: Start,
    ) -> Result<LazyStateID, StartError> {
        let nfa_start_id = match anchored {
            Anchored::No => self.dfa.get_nfa().start_unanchored(),
            Anchored::Yes => self.dfa.get_nfa().start_anchored(),
            Anchored::Pattern(pid) => {
                if !self.dfa.get_config().get_starts_for_each_pattern() {
                    return Err(StartError::unsupported_anchored(anchored));
                }
                // A pattern ID with no corresponding NFA start state can
                // never match anything, so it maps to the dead state.
                match self.dfa.get_nfa().start_pattern(pid) {
                    None => return Ok(self.as_ref().dead_id()),
                    Some(sid) => sid,
                }
            }
        };

        let id = self
            .cache_start_one(nfa_start_id, start)
            .map_err(StartError::cache)?;
        // Record the computed ID in the start table so subsequent searches
        // find it without re-determinizing.
        self.set_start_state(anchored, start, id);
        Ok(id)
    }
| 2187 | |
    /// Compute and cache the starting state for the given NFA state ID and the
    /// starting configuration. The NFA state ID might be one of the following:
    ///
    /// 1) An unanchored start state to match any pattern.
    /// 2) An anchored start state to match any pattern.
    /// 3) An anchored start state for a particular pattern.
    ///
    /// This will never return an unknown lazy state ID.
    ///
    /// If caching this state would otherwise result in a cache that has been
    /// cleared too many times, then an error is returned.
    fn cache_start_one(
        &mut self,
        nfa_start_id: NFAStateID,
        start: Start,
    ) -> Result<LazyStateID, CacheError> {
        // Seed the builder with the look-behind assertions satisfied by this
        // starting configuration before computing the epsilon closure.
        let mut builder_matches = self.get_state_builder().into_matches();
        determinize::set_lookbehind_from_start(
            self.dfa.get_nfa(),
            &start,
            &mut builder_matches,
        );
        self.cache.sparses.set1.clear();
        determinize::epsilon_closure(
            self.dfa.get_nfa(),
            nfa_start_id,
            builder_matches.look_have(),
            &mut self.cache.stack,
            &mut self.cache.sparses.set1,
        );
        // Convert the closure's NFA state set into a powerset state builder.
        let mut builder = builder_matches.into_nfa();
        determinize::add_nfa_states(
            &self.dfa.get_nfa(),
            &self.cache.sparses.set1,
            &mut builder,
        );
        // Start states are only tagged as such when configured, since
        // specialized start states carry extra handling in search loops.
        let tag_starts = self.dfa.get_config().get_specialize_start_states();
        self.add_builder_state(builder, |id| {
            if tag_starts {
                id.to_start()
            } else {
                id
            }
        })
    }
| 2233 | |
| 2234 | /// Either add the given builder state to this cache, or return an ID to an |
| 2235 | /// equivalent state already in this cache. |
| 2236 | /// |
| 2237 | /// In the case where no equivalent state exists, the idmap function given |
| 2238 | /// may be used to transform the identifier allocated. This is useful if |
| 2239 | /// the caller needs to tag the ID with additional information. |
| 2240 | /// |
| 2241 | /// This will never return an unknown lazy state ID. |
| 2242 | /// |
| 2243 | /// If caching this state would otherwise result in a cache that has been |
| 2244 | /// cleared too many times, then an error is returned. |
| 2245 | fn add_builder_state( |
| 2246 | &mut self, |
| 2247 | builder: StateBuilderNFA, |
| 2248 | idmap: impl Fn(LazyStateID) -> LazyStateID, |
| 2249 | ) -> Result<LazyStateID, CacheError> { |
| 2250 | if let Some(&cached_id) = |
| 2251 | self.cache.states_to_id.get(builder.as_bytes()) |
| 2252 | { |
| 2253 | // Since we have a cached state, put the constructed state's |
| 2254 | // memory back into our scratch space, so that it can be reused. |
| 2255 | self.put_state_builder(builder); |
| 2256 | return Ok(cached_id); |
| 2257 | } |
| 2258 | let result = self.add_state(builder.to_state(), idmap); |
| 2259 | self.put_state_builder(builder); |
| 2260 | result |
| 2261 | } |
| 2262 | |
    /// Allocate a new state ID and add the given state to this cache.
    ///
    /// The idmap function given may be used to transform the identifier
    /// allocated. This is useful if the caller needs to tag the ID with
    /// additional information.
    ///
    /// This will never return an unknown lazy state ID.
    ///
    /// If caching this state would otherwise result in a cache that has been
    /// cleared too many times, then an error is returned.
    fn add_state(
        &mut self,
        state: State,
        idmap: impl Fn(LazyStateID) -> LazyStateID,
    ) -> Result<LazyStateID, CacheError> {
        // Make room first, which may clear the cache (or error out if it has
        // been cleared too often).
        if !self.as_ref().state_fits_in_cache(&state) {
            self.try_clear_cache()?;
        }
        // It's important for this to come second, since the above may clear
        // the cache. If we clear the cache after ID generation, then the ID
        // is likely bunk since it would have been generated based on a larger
        // transition table.
        let mut id = idmap(self.next_state_id()?);
        if state.is_match() {
            id = id.to_match();
        }
        // Add room in the transition table. Since this is a fresh state, all
        // of its transitions are unknown.
        self.cache.trans.extend(
            iter::repeat(self.as_ref().unknown_id()).take(self.dfa.stride()),
        );
        // When we add a sentinel state, we never want to set any quit
        // transitions. Technically, this is harmless, since sentinel states
        // have all of their transitions set to loop back to themselves. But
        // when creating sentinel states before the quit sentinel state,
        // this will try to call 'set_transition' on a state ID that doesn't
        // actually exist yet, which isn't allowed. So we just skip doing so
        // entirely.
        if !self.dfa.quitset.is_empty() && !self.as_ref().is_sentinel(id) {
            let quit_id = self.as_ref().quit_id();
            for b in self.dfa.quitset.iter() {
                self.set_transition(id, alphabet::Unit::u8(b), quit_id);
            }
        }
        // Keep the heap tally in sync, since the capacity check above
        // depends on it.
        self.cache.memory_usage_state += state.memory_usage();
        self.cache.states.push(state.clone());
        self.cache.states_to_id.insert(state, id);
        Ok(id)
    }
| 2312 | |
| 2313 | /// Allocate a new state ID. |
| 2314 | /// |
| 2315 | /// This will never return an unknown lazy state ID. |
| 2316 | /// |
| 2317 | /// If caching this state would otherwise result in a cache that has been |
| 2318 | /// cleared too many times, then an error is returned. |
| 2319 | fn next_state_id(&mut self) -> Result<LazyStateID, CacheError> { |
| 2320 | let sid = match LazyStateID::new(self.cache.trans.len()) { |
| 2321 | Ok(sid) => sid, |
| 2322 | Err(_) => { |
| 2323 | self.try_clear_cache()?; |
| 2324 | // This has to pass since we check that ID capacity at |
| 2325 | // construction time can fit at least MIN_STATES states. |
| 2326 | LazyStateID::new(self.cache.trans.len()).unwrap() |
| 2327 | } |
| 2328 | }; |
| 2329 | Ok(sid) |
| 2330 | } |
| 2331 | |
    /// Attempt to clear the cache used by this lazy DFA.
    ///
    /// If clearing the cache exceeds the minimum number of required cache
    /// clearings, then this will return a cache error. In this case,
    /// callers should bubble this up as the cache can't be used until it is
    /// reset. Implementations of search should convert this error into a
    /// [`MatchError::gave_up`].
    ///
    /// If 'self.state_saver' is set to save a state, then this state is
    /// persisted through cache clearing. Otherwise, the cache is returned to
    /// its state after initialization with two exceptions: its clear count
    /// is incremented and some of its memory likely has additional capacity.
    /// That is, clearing a cache does _not_ release memory.
    ///
    /// Otherwise, any lazy state ID generated by the cache prior to resetting
    /// it is invalid after the reset.
    fn try_clear_cache(&mut self) -> Result<(), CacheError> {
        let c = self.dfa.get_config();
        // The "give up" heuristics only kick in when a minimum clear count
        // is configured; otherwise clearing is always permitted.
        if let Some(min_count) = c.get_minimum_cache_clear_count() {
            if self.cache.clear_count >= min_count {
                if let Some(min_bytes_per) = c.get_minimum_bytes_per_state() {
                    let len = self.cache.search_total_len();
                    let min_bytes =
                        min_bytes_per.saturating_mul(self.cache.states.len());
                    // If we've searched 0 bytes then probably something has
                    // gone wrong and the lazy DFA search implementation isn't
                    // correctly updating the search progress state.
                    if len == 0 {
                        trace!(
                            "number of bytes searched is 0, but \
                             a minimum bytes per state searched ({}) is \
                             enabled, maybe Cache::search_update \
                             is not being used?",
                            min_bytes_per,
                        );
                    }
                    if len < min_bytes {
                        trace!(
                            "lazy DFA cache has been cleared {} times, \
                             which exceeds the limit of {}, \
                             AND its bytes searched per state is less \
                             than the configured minimum of {}, \
                             therefore lazy DFA is giving up \
                             (bytes searched since cache clear = {}, \
                             number of states = {})",
                            self.cache.clear_count,
                            min_count,
                            min_bytes_per,
                            len,
                            self.cache.states.len(),
                        );
                        return Err(CacheError::bad_efficiency());
                    } else {
                        trace!(
                            "lazy DFA cache has been cleared {} times, \
                             which exceeds the limit of {}, \
                             AND its bytes searched per state is greater \
                             than the configured minimum of {}, \
                             therefore lazy DFA is continuing! \
                             (bytes searched since cache clear = {}, \
                             number of states = {})",
                            self.cache.clear_count,
                            min_count,
                            min_bytes_per,
                            len,
                            self.cache.states.len(),
                        );
                    }
                } else {
                    trace!(
                        "lazy DFA cache has been cleared {} times, \
                         which exceeds the limit of {}, \
                         since there is no configured bytes per state \
                         minimum, lazy DFA is giving up",
                        self.cache.clear_count,
                        min_count,
                    );
                    return Err(CacheError::too_many_cache_clears());
                }
            }
        }
        self.clear_cache();
        Ok(())
    }
| 2416 | |
    /// Clears _and_ resets the cache. Resetting the cache means that no
    /// states are persisted and the clear count is reset to 0. No heap memory
    /// is released.
    ///
    /// Note that the caller may reset a cache with a different DFA than what
    /// it was created from. In which case, the cache can now be used with the
    /// new DFA (and not the old DFA).
    fn reset_cache(&mut self) {
        // Drop any pending state-saver request so that 'clear_cache' below
        // doesn't try to carry a state over from the previous DFA.
        self.cache.state_saver = StateSaver::none();
        self.clear_cache();
        // If a new DFA is used, it might have a different number of NFA
        // states, so we need to make sure our sparse sets have the appropriate
        // size.
        self.cache.sparses.resize(self.dfa.get_nfa().states().len());
        // Reset the count *after* clearing, since 'clear_cache' increments it.
        self.cache.clear_count = 0;
        self.cache.progress = None;
    }
| 2434 | |
    /// Clear the cache used by this lazy DFA.
    ///
    /// If 'self.state_saver' is set to save a state, then this state is
    /// persisted through cache clearing. Otherwise, the cache is returned to
    /// its state after initialization with two exceptions: its clear count
    /// is incremented and some of its memory likely has additional capacity.
    /// That is, clearing a cache does _not_ release memory.
    ///
    /// Otherwise, any lazy state ID generated by the cache prior to resetting
    /// it is invalid after the reset.
    fn clear_cache(&mut self) {
        self.cache.trans.clear();
        self.cache.starts.clear();
        self.cache.states.clear();
        self.cache.states_to_id.clear();
        self.cache.memory_usage_state = 0;
        self.cache.clear_count += 1;
        self.cache.bytes_searched = 0;
        // Restart the byte accounting for the efficiency heuristic at the
        // current search position.
        if let Some(ref mut progress) = self.cache.progress {
            progress.start = progress.at;
        }
        trace!(
            "lazy DFA cache has been cleared (count: {})",
            self.cache.clear_count
        );
        self.init_cache();
        // If the state we want to save is one of the sentinel
        // (unknown/dead/quit) states, then 'init_cache' adds those back, and
        // their identifier values remains invariant. So there's no need to add
        // it again. (And indeed, doing so would be incorrect!)
        if let Some((old_id, state)) = self.cache.state_saver.take_to_save() {
            // If the state is one of the special sentinel states, then it is
            // automatically added by cache initialization and its ID always
            // remains the same. With that said, this should never occur since
            // the sentinel states are all loop states back to themselves. So
            // we should never be in a position where we're attempting to save
            // a sentinel state since we never compute transitions out of a
            // sentinel state.
            assert!(
                !self.as_ref().is_sentinel(old_id),
                "cannot save sentinel state"
            );
            let new_id = self
                .add_state(state, |id| {
                    if old_id.is_start() {
                        // We don't need to consult the
                        // 'specialize_start_states' config knob here, because
                        // if it's disabled, old_id.is_start() will never
                        // return true.
                        id.to_start()
                    } else {
                        id
                    }
                })
                // The unwrap here is OK because lazy DFA creation ensures that
                // we have room in the cache to add MIN_STATES states. Since
                // 'init_cache' above adds 3, this adds a 4th.
                .expect("adding one state after cache clear must work");
            self.cache.state_saver = StateSaver::Saved(new_id);
        }
    }
| 2496 | |
    /// Initialize this cache from emptiness to a place where it can be used
    /// for search.
    ///
    /// This is called both at cache creation time and after the cache has been
    /// cleared.
    ///
    /// Primarily, this adds the three sentinel states and allocates some
    /// initial memory.
    fn init_cache(&mut self) {
        // Why multiply by 2 here? Because we make room for both the unanchored
        // and anchored start states. Unanchored is first and then anchored.
        let mut starts_len = Start::len().checked_mul(2).unwrap();
        // ... but if we also want start states for every pattern, we make room
        // for that too.
        if self.dfa.get_config().get_starts_for_each_pattern() {
            starts_len += Start::len() * self.dfa.pattern_len();
        }
        // Every start slot begins as "unknown", i.e., not yet computed.
        self.cache
            .starts
            .extend(iter::repeat(self.as_ref().unknown_id()).take(starts_len));
        // This is the set of NFA states that corresponds to each of our three
        // sentinel states: the empty set.
        let dead = State::dead();
        // This sets up some states that we use as sentinels that are present
        // in every DFA. While it would be technically possible to implement
        // this DFA without explicitly putting these states in the transition
        // table, this is convenient to do to make `next_state` correct for all
        // valid state IDs without needing explicit conditionals to special
        // case these sentinel states.
        //
        // All three of these states are "dead" states. That is, all of
        // them transition only to themselves. So once you enter one of
        // these states, it's impossible to leave them. Thus, any correct
        // search routine must explicitly check for these state types. (Sans
        // `unknown`, since that is only used internally to represent missing
        // states.)
        let unk_id =
            self.add_state(dead.clone(), |id| id.to_unknown()).unwrap();
        let dead_id = self.add_state(dead.clone(), |id| id.to_dead()).unwrap();
        let quit_id = self.add_state(dead.clone(), |id| id.to_quit()).unwrap();
        // The IDs of the three sentinels are fixed by construction order;
        // assert that they line up with the canonical accessors.
        assert_eq!(unk_id, self.as_ref().unknown_id());
        assert_eq!(dead_id, self.as_ref().dead_id());
        assert_eq!(quit_id, self.as_ref().quit_id());
        // The idea here is that if you start in an unknown/dead/quit state and
        // try to transition on them, then you should end up where you started.
        self.set_all_transitions(unk_id, unk_id);
        self.set_all_transitions(dead_id, dead_id);
        self.set_all_transitions(quit_id, quit_id);
        // All of these states are technically equivalent from the FSM
        // perspective, so putting all three of them in the cache isn't
        // possible. (They are distinct merely because we use their
        // identifiers as sentinels to mean something, as indicated by the
        // names.) Moreover, we wouldn't want to do that. Unknown and quit
        // states are special in that they are artificial constructions
        // this implementation. But dead states are a natural part of
        // determinization. When you reach a point in the NFA where you cannot
        // go anywhere else, a dead state will naturally arise and we MUST
        // reuse the canonical dead state that we've created here. Why? Because
        // it is the state ID that tells the search routine whether a state is
        // dead or not, and thus, whether to stop the search. Having a bunch of
        // distinct dead states would be quite wasteful!
        self.cache.states_to_id.insert(dead, dead_id);
    }
| 2560 | |
| 2561 | /// Save the state corresponding to the ID given such that the state |
| 2562 | /// persists through a cache clearing. |
| 2563 | /// |
| 2564 | /// While the state may persist, the ID may not. In order to discover the |
| 2565 | /// new state ID, one must call 'saved_state_id' after a cache clearing. |
| 2566 | fn save_state(&mut self, id: LazyStateID) { |
| 2567 | let state = self.as_ref().get_cached_state(id).clone(); |
| 2568 | self.cache.state_saver = StateSaver::ToSave { id, state }; |
| 2569 | } |
| 2570 | |
| 2571 | /// Returns the updated lazy state ID for a state that was persisted |
| 2572 | /// through a cache clearing. |
| 2573 | /// |
| 2574 | /// It is only correct to call this routine when both a state has been |
| 2575 | /// saved and the cache has just been cleared. Otherwise, this panics. |
| 2576 | fn saved_state_id(&mut self) -> LazyStateID { |
| 2577 | self.cache |
| 2578 | .state_saver |
| 2579 | .take_saved() |
| 2580 | .expect("state saver does not have saved state ID" ) |
| 2581 | } |
| 2582 | |
| 2583 | /// Set all transitions on the state 'from' to 'to'. |
| 2584 | fn set_all_transitions(&mut self, from: LazyStateID, to: LazyStateID) { |
| 2585 | for unit in self.dfa.classes.representatives(..) { |
| 2586 | self.set_transition(from, unit, to); |
| 2587 | } |
| 2588 | } |
| 2589 | |
| 2590 | /// Set the transition on 'from' for 'unit' to 'to'. |
| 2591 | /// |
| 2592 | /// This panics if either 'from' or 'to' is invalid. |
| 2593 | /// |
| 2594 | /// All unit values are OK. |
| 2595 | fn set_transition( |
| 2596 | &mut self, |
| 2597 | from: LazyStateID, |
| 2598 | unit: alphabet::Unit, |
| 2599 | to: LazyStateID, |
| 2600 | ) { |
| 2601 | assert!(self.as_ref().is_valid(from), "invalid 'from' id: {:?}" , from); |
| 2602 | assert!(self.as_ref().is_valid(to), "invalid 'to' id: {:?}" , to); |
| 2603 | let offset = |
| 2604 | from.as_usize_untagged() + self.dfa.classes.get_by_unit(unit); |
| 2605 | self.cache.trans[offset] = to; |
| 2606 | } |
| 2607 | |
| 2608 | /// Set the start ID for the given pattern ID (if given) and starting |
| 2609 | /// configuration to the ID given. |
| 2610 | /// |
| 2611 | /// This panics if 'id' is not valid or if a pattern ID is given and |
| 2612 | /// 'starts_for_each_pattern' is not enabled. |
| 2613 | fn set_start_state( |
| 2614 | &mut self, |
| 2615 | anchored: Anchored, |
| 2616 | start: Start, |
| 2617 | id: LazyStateID, |
| 2618 | ) { |
| 2619 | assert!(self.as_ref().is_valid(id)); |
| 2620 | let start_index = start.as_usize(); |
| 2621 | let index = match anchored { |
| 2622 | Anchored::No => start_index, |
| 2623 | Anchored::Yes => Start::len() + start_index, |
| 2624 | Anchored::Pattern(pid) => { |
| 2625 | assert!( |
| 2626 | self.dfa.get_config().get_starts_for_each_pattern(), |
| 2627 | "attempted to search for a specific pattern \ |
| 2628 | without enabling starts_for_each_pattern" , |
| 2629 | ); |
| 2630 | let pid = pid.as_usize(); |
| 2631 | (2 * Start::len()) + (Start::len() * pid) + start_index |
| 2632 | } |
| 2633 | }; |
| 2634 | self.cache.starts[index] = id; |
| 2635 | } |
| 2636 | |
| 2637 | /// Returns a state builder from this DFA that might have existing |
| 2638 | /// capacity. This helps avoid allocs in cases where a state is built that |
| 2639 | /// turns out to already be cached. |
| 2640 | /// |
| 2641 | /// Callers must put the state builder back with 'put_state_builder', |
| 2642 | /// otherwise the allocation reuse won't work. |
| 2643 | fn get_state_builder(&mut self) -> StateBuilderEmpty { |
| 2644 | core::mem::replace( |
| 2645 | &mut self.cache.scratch_state_builder, |
| 2646 | StateBuilderEmpty::new(), |
| 2647 | ) |
| 2648 | } |
| 2649 | |
| 2650 | /// Puts the given state builder back into this DFA for reuse. |
| 2651 | /// |
| 2652 | /// Note that building a 'State' from a builder always creates a new alloc, |
| 2653 | /// so callers should always put the builder back. |
| 2654 | fn put_state_builder(&mut self, builder: StateBuilderNFA) { |
| 2655 | let _ = core::mem::replace( |
| 2656 | &mut self.cache.scratch_state_builder, |
| 2657 | builder.clear(), |
| 2658 | ); |
| 2659 | } |
| 2660 | } |
| 2661 | |
/// A type that groups methods that require the base NFA/DFA and read-only
/// access to the cache.
#[derive(Debug)]
struct LazyRef<'i, 'c> {
    /// The lazy DFA being queried.
    dfa: &'i DFA,
    /// Shared (read-only) access to the DFA's cache.
    cache: &'c Cache,
}
| 2669 | |
| 2670 | impl<'i, 'c> LazyRef<'i, 'c> { |
    /// Creates a new 'Lazy' wrapper for a DFA and its corresponding cache.
    ///
    /// Unlike 'Lazy', this only needs shared access to the cache.
    fn new(dfa: &'i DFA, cache: &'c Cache) -> LazyRef<'i, 'c> {
        LazyRef { dfa, cache }
    }
| 2675 | |
| 2676 | /// Return the ID of the start state for the given configuration. |
| 2677 | /// |
| 2678 | /// If the start state has not yet been computed, then this returns an |
| 2679 | /// unknown lazy state ID. |
| 2680 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 2681 | fn get_cached_start_id( |
| 2682 | &self, |
| 2683 | anchored: Anchored, |
| 2684 | start: Start, |
| 2685 | ) -> Result<LazyStateID, StartError> { |
| 2686 | let start_index = start.as_usize(); |
| 2687 | let index = match anchored { |
| 2688 | Anchored::No => start_index, |
| 2689 | Anchored::Yes => Start::len() + start_index, |
| 2690 | Anchored::Pattern(pid) => { |
| 2691 | if !self.dfa.get_config().get_starts_for_each_pattern() { |
| 2692 | return Err(StartError::unsupported_anchored(anchored)); |
| 2693 | } |
| 2694 | if pid.as_usize() >= self.dfa.pattern_len() { |
| 2695 | return Ok(self.dead_id()); |
| 2696 | } |
| 2697 | (2 * Start::len()) |
| 2698 | + (Start::len() * pid.as_usize()) |
| 2699 | + start_index |
| 2700 | } |
| 2701 | }; |
| 2702 | Ok(self.cache.starts[index]) |
| 2703 | } |
| 2704 | |
| 2705 | /// Return the cached NFA/DFA powerset state for the given ID. |
| 2706 | /// |
| 2707 | /// This panics if the given ID does not address a valid state. |
| 2708 | fn get_cached_state(&self, sid: LazyStateID) -> &State { |
| 2709 | let index = sid.as_usize_untagged() >> self.dfa.stride2(); |
| 2710 | &self.cache.states[index] |
| 2711 | } |
| 2712 | |
| 2713 | /// Returns true if and only if the given ID corresponds to a "sentinel" |
| 2714 | /// state. |
| 2715 | /// |
| 2716 | /// A sentinel state is a state that signifies a special condition of |
| 2717 | /// search, and where every transition maps back to itself. See LazyStateID |
| 2718 | /// for more details. Note that start and match states are _not_ sentinels |
| 2719 | /// since they may otherwise be real states with non-trivial transitions. |
| 2720 | /// The purposes of sentinel states is purely to indicate something. Their |
| 2721 | /// transitions are not meant to be followed. |
| 2722 | fn is_sentinel(&self, id: LazyStateID) -> bool { |
| 2723 | id == self.unknown_id() || id == self.dead_id() || id == self.quit_id() |
| 2724 | } |
| 2725 | |
    /// Returns the ID of the unknown state for this lazy DFA.
    ///
    /// This is always the first state added by 'init_cache', so its untagged
    /// value is 0.
    fn unknown_id(&self) -> LazyStateID {
        // This unwrap is OK since 0 is always a valid state ID.
        LazyStateID::new(0).unwrap().to_unknown()
    }
| 2731 | |
    /// Returns the ID of the dead state for this lazy DFA.
    ///
    /// This is always the second state added by 'init_cache', so its untagged
    /// value is exactly one stride past the unknown state.
    fn dead_id(&self) -> LazyStateID {
        // This unwrap is OK since the maximum value here is 1 * 512 = 512,
        // which is <= 2047 (the maximum state ID on 16-bit systems). Where
        // 512 is the worst case for our equivalence classes (every byte is a
        // distinct class).
        LazyStateID::new(1 << self.dfa.stride2()).unwrap().to_dead()
    }
| 2740 | |
    /// Returns the ID of the quit state for this lazy DFA.
    ///
    /// This is always the third state added by 'init_cache', so its untagged
    /// value is exactly two strides past the unknown state.
    fn quit_id(&self) -> LazyStateID {
        // This unwrap is OK since the maximum value here is 2 * 512 = 1024,
        // which is <= 2047 (the maximum state ID on 16-bit systems). Where
        // 512 is the worst case for our equivalence classes (every byte is a
        // distinct class).
        LazyStateID::new(2 << self.dfa.stride2()).unwrap().to_quit()
    }
| 2749 | |
| 2750 | /// Returns true if and only if the given ID is valid. |
| 2751 | /// |
| 2752 | /// An ID is valid if it is both a valid index into the transition table |
| 2753 | /// and is a multiple of the DFA's stride. |
| 2754 | fn is_valid(&self, id: LazyStateID) -> bool { |
| 2755 | let id = id.as_usize_untagged(); |
| 2756 | id < self.cache.trans.len() && id % self.dfa.stride() == 0 |
| 2757 | } |
| 2758 | |
| 2759 | /// Returns true if adding the state given would fit in this cache. |
| 2760 | fn state_fits_in_cache(&self, state: &State) -> bool { |
| 2761 | let needed = self.cache.memory_usage() |
| 2762 | + self.memory_usage_for_one_more_state(state.memory_usage()); |
| 2763 | trace!( |
| 2764 | "lazy DFA cache capacity check: {:?} ?<=? {:?}" , |
| 2765 | needed, |
| 2766 | self.dfa.cache_capacity |
| 2767 | ); |
| 2768 | needed <= self.dfa.cache_capacity |
| 2769 | } |
| 2770 | |
| 2771 | /// Returns true if adding the state to be built by the given builder would |
| 2772 | /// fit in this cache. |
| 2773 | fn state_builder_fits_in_cache(&self, state: &StateBuilderNFA) -> bool { |
| 2774 | let needed = self.cache.memory_usage() |
| 2775 | + self.memory_usage_for_one_more_state(state.as_bytes().len()); |
| 2776 | needed <= self.dfa.cache_capacity |
| 2777 | } |
| 2778 | |
| 2779 | /// Returns the additional memory usage, in bytes, required to add one more |
| 2780 | /// state to this cache. The given size should be the heap size, in bytes, |
| 2781 | /// that would be used by the new state being added. |
| 2782 | fn memory_usage_for_one_more_state( |
| 2783 | &self, |
| 2784 | state_heap_size: usize, |
| 2785 | ) -> usize { |
| 2786 | const ID_SIZE: usize = size_of::<LazyStateID>(); |
| 2787 | const STATE_SIZE: usize = size_of::<State>(); |
| 2788 | |
| 2789 | self.dfa.stride() * ID_SIZE // additional space needed in trans table |
| 2790 | + STATE_SIZE // space in cache.states |
| 2791 | + (STATE_SIZE + ID_SIZE) // space in cache.states_to_id |
| 2792 | + state_heap_size // heap memory used by state itself |
| 2793 | } |
| 2794 | } |
| 2795 | |
| 2796 | /// A simple type that encapsulates the saving of a state ID through a cache |
| 2797 | /// clearing. |
| 2798 | /// |
| 2799 | /// A state ID can be marked for saving with ToSave, while a state ID can be |
| 2800 | /// saved itself with Saved. |
| 2801 | #[derive (Clone, Debug)] |
| 2802 | enum StateSaver { |
| 2803 | /// An empty state saver. In this case, no states (other than the special |
| 2804 | /// sentinel states) are preserved after clearing the cache. |
| 2805 | None, |
| 2806 | /// An ID of a state (and the state itself) that should be preserved after |
| 2807 | /// the lazy DFA's cache has been cleared. After clearing, the updated ID |
| 2808 | /// is stored in 'Saved' since it may have changed. |
| 2809 | ToSave { id: LazyStateID, state: State }, |
| 2810 | /// An ID that of a state that has been persisted through a lazy DFA |
| 2811 | /// cache clearing. The ID recorded here corresponds to an ID that was |
| 2812 | /// once marked as ToSave. The IDs are likely not equivalent even though |
| 2813 | /// the states they point to are. |
| 2814 | Saved(LazyStateID), |
| 2815 | } |
| 2816 | |
| 2817 | impl StateSaver { |
| 2818 | /// Create an empty state saver. |
| 2819 | fn none() -> StateSaver { |
| 2820 | StateSaver::None |
| 2821 | } |
| 2822 | |
| 2823 | /// Replace this state saver with an empty saver, and if this saver is a |
| 2824 | /// request to save a state, return that request. |
| 2825 | fn take_to_save(&mut self) -> Option<(LazyStateID, State)> { |
| 2826 | match core::mem::replace(self, StateSaver::None) { |
| 2827 | StateSaver::None | StateSaver::Saved(_) => None, |
| 2828 | StateSaver::ToSave { id, state } => Some((id, state)), |
| 2829 | } |
| 2830 | } |
| 2831 | |
| 2832 | /// Replace this state saver with an empty saver, and if this saver is a |
| 2833 | /// saved state (or a request to save a state), return that state's ID. |
| 2834 | /// |
| 2835 | /// The idea here is that a request to save a state isn't necessarily |
| 2836 | /// honored because it might not be needed. e.g., Some higher level code |
| 2837 | /// might request a state to be saved on the off chance that the cache gets |
| 2838 | /// cleared when a new state is added at a lower level. But if that new |
| 2839 | /// state is never added, then the cache is never cleared and the state and |
| 2840 | /// its ID remain unchanged. |
| 2841 | fn take_saved(&mut self) -> Option<LazyStateID> { |
| 2842 | match core::mem::replace(self, StateSaver::None) { |
| 2843 | StateSaver::None => None, |
| 2844 | StateSaver::Saved(id) | StateSaver::ToSave { id, .. } => Some(id), |
| 2845 | } |
| 2846 | } |
| 2847 | } |
| 2848 | |
| 2849 | /// The configuration used for building a lazy DFA. |
| 2850 | /// |
| 2851 | /// As a convenience, [`DFA::config`] is an alias for [`Config::new`]. The |
| 2852 | /// advantage of the former is that it often lets you avoid importing the |
| 2853 | /// `Config` type directly. |
| 2854 | /// |
| 2855 | /// A lazy DFA configuration is a simple data object that is typically used |
| 2856 | /// with [`Builder::configure`]. |
| 2857 | /// |
| 2858 | /// The default configuration guarantees that a search will never return a |
| 2859 | /// "gave up" or "quit" error, although it is possible for a search to fail |
| 2860 | /// if [`Config::starts_for_each_pattern`] wasn't enabled (which it is not by |
| 2861 | /// default) and an [`Anchored::Pattern`] mode is requested via [`Input`]. |
| 2862 | #[derive (Clone, Debug, Default)] |
| 2863 | pub struct Config { |
| 2864 | // As with other configuration types in this crate, we put all our knobs |
| 2865 | // in options so that we can distinguish between "default" and "not set." |
| 2866 | // This makes it possible to easily combine multiple configurations |
| 2867 | // without default values overwriting explicitly specified values. See the |
| 2868 | // 'overwrite' method. |
| 2869 | // |
| 2870 | // For docs on the fields below, see the corresponding method setters. |
| 2871 | match_kind: Option<MatchKind>, |
| 2872 | pre: Option<Option<Prefilter>>, |
| 2873 | starts_for_each_pattern: Option<bool>, |
| 2874 | byte_classes: Option<bool>, |
| 2875 | unicode_word_boundary: Option<bool>, |
| 2876 | quitset: Option<ByteSet>, |
| 2877 | specialize_start_states: Option<bool>, |
| 2878 | cache_capacity: Option<usize>, |
| 2879 | skip_cache_capacity_check: Option<bool>, |
| 2880 | minimum_cache_clear_count: Option<Option<usize>>, |
| 2881 | minimum_bytes_per_state: Option<Option<usize>>, |
| 2882 | } |
| 2883 | |
| 2884 | impl Config { |
| 2885 | /// Return a new default lazy DFA builder configuration. |
| 2886 | pub fn new() -> Config { |
| 2887 | Config::default() |
| 2888 | } |
| 2889 | |
| 2890 | /// Set the desired match semantics. |
| 2891 | /// |
| 2892 | /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the |
| 2893 | /// match semantics of Perl-like regex engines. That is, when multiple |
| 2894 | /// patterns would match at the same leftmost position, the pattern that |
| 2895 | /// appears first in the concrete syntax is chosen. |
| 2896 | /// |
| 2897 | /// Currently, the only other kind of match semantics supported is |
| 2898 | /// [`MatchKind::All`]. This corresponds to classical DFA construction |
| 2899 | /// where all possible matches are added to the lazy DFA. |
| 2900 | /// |
| 2901 | /// Typically, `All` is used when one wants to execute an overlapping |
| 2902 | /// search and `LeftmostFirst` otherwise. In particular, it rarely makes |
| 2903 | /// sense to use `All` with the various "leftmost" find routines, since the |
| 2904 | /// leftmost routines depend on the `LeftmostFirst` automata construction |
| 2905 | /// strategy. Specifically, `LeftmostFirst` adds dead states to the |
| 2906 | /// lazy DFA as a way to terminate the search and report a match. |
| 2907 | /// `LeftmostFirst` also supports non-greedy matches using this strategy |
| 2908 | /// where as `All` does not. |
| 2909 | /// |
| 2910 | /// # Example: overlapping search |
| 2911 | /// |
| 2912 | /// This example shows the typical use of `MatchKind::All`, which is to |
| 2913 | /// report overlapping matches. |
| 2914 | /// |
| 2915 | /// ``` |
| 2916 | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
| 2917 | /// use regex_automata::{ |
| 2918 | /// hybrid::dfa::{DFA, OverlappingState}, |
| 2919 | /// HalfMatch, Input, MatchKind, |
| 2920 | /// }; |
| 2921 | /// |
| 2922 | /// let dfa = DFA::builder() |
| 2923 | /// .configure(DFA::config().match_kind(MatchKind::All)) |
| 2924 | /// .build_many(&[r"\w+$" , r"\S+$" ])?; |
| 2925 | /// let mut cache = dfa.create_cache(); |
| 2926 | /// let haystack = "@foo" ; |
| 2927 | /// let mut state = OverlappingState::start(); |
| 2928 | /// |
| 2929 | /// let expected = Some(HalfMatch::must(1, 4)); |
| 2930 | /// dfa.try_search_overlapping_fwd( |
| 2931 | /// &mut cache, &Input::new(haystack), &mut state, |
| 2932 | /// )?; |
| 2933 | /// assert_eq!(expected, state.get_match()); |
| 2934 | /// |
| 2935 | /// // The first pattern also matches at the same position, so re-running |
| 2936 | /// // the search will yield another match. Notice also that the first |
| 2937 | /// // pattern is returned after the second. This is because the second |
| 2938 | /// // pattern begins its match before the first, is therefore an earlier |
| 2939 | /// // match and is thus reported first. |
| 2940 | /// let expected = Some(HalfMatch::must(0, 4)); |
| 2941 | /// dfa.try_search_overlapping_fwd( |
| 2942 | /// &mut cache, &Input::new(haystack), &mut state, |
| 2943 | /// )?; |
| 2944 | /// assert_eq!(expected, state.get_match()); |
| 2945 | /// |
| 2946 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 2947 | /// ``` |
| 2948 | /// |
| 2949 | /// # Example: reverse automaton to find start of match |
| 2950 | /// |
| 2951 | /// Another example for using `MatchKind::All` is for constructing a |
| 2952 | /// reverse automaton to find the start of a match. `All` semantics are |
| 2953 | /// used for this in order to find the longest possible match, which |
| 2954 | /// corresponds to the leftmost starting position. |
| 2955 | /// |
| 2956 | /// Note that if you need the starting position then |
| 2957 | /// [`hybrid::regex::Regex`](crate::hybrid::regex::Regex) will handle this |
| 2958 | /// for you, so it's usually not necessary to do this yourself. |
| 2959 | /// |
| 2960 | /// ``` |
| 2961 | /// use regex_automata::{ |
| 2962 | /// hybrid::dfa::DFA, |
| 2963 | /// nfa::thompson::NFA, |
| 2964 | /// Anchored, HalfMatch, Input, MatchKind, |
| 2965 | /// }; |
| 2966 | /// |
| 2967 | /// let input = Input::new("123foobar456" ); |
| 2968 | /// let pattern = r"[a-z]+r" ; |
| 2969 | /// |
| 2970 | /// let dfa_fwd = DFA::new(pattern)?; |
| 2971 | /// let dfa_rev = DFA::builder() |
| 2972 | /// .thompson(NFA::config().reverse(true)) |
| 2973 | /// .configure(DFA::config().match_kind(MatchKind::All)) |
| 2974 | /// .build(pattern)?; |
| 2975 | /// let mut cache_fwd = dfa_fwd.create_cache(); |
| 2976 | /// let mut cache_rev = dfa_rev.create_cache(); |
| 2977 | /// |
| 2978 | /// let expected_fwd = HalfMatch::must(0, 9); |
| 2979 | /// let expected_rev = HalfMatch::must(0, 3); |
| 2980 | /// let got_fwd = dfa_fwd.try_search_fwd(&mut cache_fwd, &input)?.unwrap(); |
| 2981 | /// // Here we don't specify the pattern to search for since there's only |
| 2982 | /// // one pattern and we're doing a leftmost search. But if this were an |
| 2983 | /// // overlapping search, you'd need to specify the pattern that matched |
| 2984 | /// // in the forward direction. (Otherwise, you might wind up finding the |
| 2985 | /// // starting position of a match of some other pattern.) That in turn |
| 2986 | /// // requires building the reverse automaton with starts_for_each_pattern |
| 2987 | /// // enabled. |
| 2988 | /// let input = input |
| 2989 | /// .clone() |
| 2990 | /// .range(..got_fwd.offset()) |
| 2991 | /// .anchored(Anchored::Yes); |
| 2992 | /// let got_rev = dfa_rev.try_search_rev(&mut cache_rev, &input)?.unwrap(); |
| 2993 | /// assert_eq!(expected_fwd, got_fwd); |
| 2994 | /// assert_eq!(expected_rev, got_rev); |
| 2995 | /// |
| 2996 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 2997 | /// ``` |
| 2998 | pub fn match_kind(mut self, kind: MatchKind) -> Config { |
| 2999 | self.match_kind = Some(kind); |
| 3000 | self |
| 3001 | } |
| 3002 | |
| 3003 | /// Set a prefilter to be used whenever a start state is entered. |
| 3004 | /// |
| 3005 | /// A [`Prefilter`] in this context is meant to accelerate searches by |
| 3006 | /// looking for literal prefixes that every match for the corresponding |
| 3007 | /// pattern (or patterns) must start with. Once a prefilter produces a |
| 3008 | /// match, the underlying search routine continues on to try and confirm |
| 3009 | /// the match. |
| 3010 | /// |
| 3011 | /// Be warned that setting a prefilter does not guarantee that the search |
| 3012 | /// will be faster. While it's usually a good bet, if the prefilter |
| 3013 | /// produces a lot of false positive candidates (i.e., positions matched |
| 3014 | /// by the prefilter but not by the regex), then the overall result can |
| 3015 | /// be slower than if you had just executed the regex engine without any |
| 3016 | /// prefilters. |
| 3017 | /// |
| 3018 | /// Note that unless [`Config::specialize_start_states`] has been |
| 3019 | /// explicitly set, then setting this will also enable (when `pre` is |
| 3020 | /// `Some`) or disable (when `pre` is `None`) start state specialization. |
| 3021 | /// This occurs because without start state specialization, a prefilter |
| 3022 | /// is likely to be less effective. And without a prefilter, start state |
| 3023 | /// specialization is usually pointless. |
| 3024 | /// |
| 3025 | /// By default no prefilter is set. |
| 3026 | /// |
| 3027 | /// # Example |
| 3028 | /// |
| 3029 | /// ``` |
| 3030 | /// use regex_automata::{ |
| 3031 | /// hybrid::dfa::DFA, |
| 3032 | /// util::prefilter::Prefilter, |
| 3033 | /// Input, HalfMatch, MatchKind, |
| 3034 | /// }; |
| 3035 | /// |
| 3036 | /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo" , "bar" ]); |
| 3037 | /// let re = DFA::builder() |
| 3038 | /// .configure(DFA::config().prefilter(pre)) |
| 3039 | /// .build(r"(foo|bar)[a-z]+" )?; |
| 3040 | /// let mut cache = re.create_cache(); |
| 3041 | /// let input = Input::new("foo1 barfox bar" ); |
| 3042 | /// assert_eq!( |
| 3043 | /// Some(HalfMatch::must(0, 11)), |
| 3044 | /// re.try_search_fwd(&mut cache, &input)?, |
| 3045 | /// ); |
| 3046 | /// |
| 3047 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 3048 | /// ``` |
| 3049 | /// |
| 3050 | /// Be warned though that an incorrect prefilter can lead to incorrect |
| 3051 | /// results! |
| 3052 | /// |
| 3053 | /// ``` |
| 3054 | /// use regex_automata::{ |
| 3055 | /// hybrid::dfa::DFA, |
| 3056 | /// util::prefilter::Prefilter, |
| 3057 | /// Input, HalfMatch, MatchKind, |
| 3058 | /// }; |
| 3059 | /// |
| 3060 | /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo" , "car" ]); |
| 3061 | /// let re = DFA::builder() |
| 3062 | /// .configure(DFA::config().prefilter(pre)) |
| 3063 | /// .build(r"(foo|bar)[a-z]+" )?; |
| 3064 | /// let mut cache = re.create_cache(); |
| 3065 | /// let input = Input::new("foo1 barfox bar" ); |
| 3066 | /// assert_eq!( |
| 3067 | /// // No match reported even though there clearly is one! |
| 3068 | /// None, |
| 3069 | /// re.try_search_fwd(&mut cache, &input)?, |
| 3070 | /// ); |
| 3071 | /// |
| 3072 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 3073 | /// ``` |
| 3074 | pub fn prefilter(mut self, pre: Option<Prefilter>) -> Config { |
| 3075 | self.pre = Some(pre); |
| 3076 | if self.specialize_start_states.is_none() { |
| 3077 | self.specialize_start_states = |
| 3078 | Some(self.get_prefilter().is_some()); |
| 3079 | } |
| 3080 | self |
| 3081 | } |
| 3082 | |
| 3083 | /// Whether to compile a separate start state for each pattern in the |
| 3084 | /// lazy DFA. |
| 3085 | /// |
| 3086 | /// When enabled, a separate **anchored** start state is added for each |
| 3087 | /// pattern in the lazy DFA. When this start state is used, then the DFA |
| 3088 | /// will only search for matches for the pattern specified, even if there |
| 3089 | /// are other patterns in the DFA. |
| 3090 | /// |
| 3091 | /// The main downside of this option is that it can potentially increase |
| 3092 | /// the size of the DFA and/or increase the time it takes to build the |
| 3093 | /// DFA at search time. However, since this is configuration for a lazy |
| 3094 | /// DFA, these states aren't actually built unless they're used. Enabling |
| 3095 | /// this isn't necessarily free, however, as it may result in higher cache |
| 3096 | /// usage. |
| 3097 | /// |
| 3098 | /// There are a few reasons one might want to enable this (it's disabled |
| 3099 | /// by default): |
| 3100 | /// |
| 3101 | /// 1. When looking for the start of an overlapping match (using a reverse |
| 3102 | /// DFA), doing it correctly requires starting the reverse search using the |
| 3103 | /// starting state of the pattern that matched in the forward direction. |
| 3104 | /// Indeed, when building a [`Regex`](crate::hybrid::regex::Regex), it |
| 3105 | /// will automatically enable this option when building the reverse DFA |
| 3106 | /// internally. |
| 3107 | /// 2. When you want to use a DFA with multiple patterns to both search |
| 3108 | /// for matches of any pattern or to search for anchored matches of one |
| 3109 | /// particular pattern while using the same DFA. (Otherwise, you would need |
| 3110 | /// to compile a new DFA for each pattern.) |
| 3111 | /// |
| 3112 | /// By default this is disabled. |
| 3113 | /// |
| 3114 | /// # Example |
| 3115 | /// |
| 3116 | /// This example shows how to use this option to permit the same lazy DFA |
| 3117 | /// to run both general searches for any pattern and anchored searches for |
| 3118 | /// a specific pattern. |
| 3119 | /// |
| 3120 | /// ``` |
| 3121 | /// use regex_automata::{ |
| 3122 | /// hybrid::dfa::DFA, |
| 3123 | /// Anchored, HalfMatch, Input, PatternID, |
| 3124 | /// }; |
| 3125 | /// |
| 3126 | /// let dfa = DFA::builder() |
| 3127 | /// .configure(DFA::config().starts_for_each_pattern(true)) |
| 3128 | /// .build_many(&[r"[a-z0-9]{6}" , r"[a-z][a-z0-9]{5}" ])?; |
| 3129 | /// let mut cache = dfa.create_cache(); |
| 3130 | /// let haystack = "bar foo123" ; |
| 3131 | /// |
| 3132 | /// // Here's a normal unanchored search that looks for any pattern. |
| 3133 | /// let expected = HalfMatch::must(0, 10); |
| 3134 | /// let input = Input::new(haystack); |
| 3135 | /// assert_eq!(Some(expected), dfa.try_search_fwd(&mut cache, &input)?); |
| 3136 | /// // We can also do a normal anchored search for any pattern. Since it's |
| 3137 | /// // an anchored search, we position the start of the search where we |
| 3138 | /// // know the match will begin. |
| 3139 | /// let expected = HalfMatch::must(0, 10); |
| 3140 | /// let input = Input::new(haystack).range(4..); |
| 3141 | /// assert_eq!(Some(expected), dfa.try_search_fwd(&mut cache, &input)?); |
| 3142 | /// // Since we compiled anchored start states for each pattern, we can |
| 3143 | /// // also look for matches of other patterns explicitly, even if a |
| 3144 | /// // different pattern would have normally matched. |
| 3145 | /// let expected = HalfMatch::must(1, 10); |
| 3146 | /// let input = Input::new(haystack) |
| 3147 | /// .range(4..) |
| 3148 | /// .anchored(Anchored::Pattern(PatternID::must(1))); |
| 3149 | /// assert_eq!(Some(expected), dfa.try_search_fwd(&mut cache, &input)?); |
| 3150 | /// |
| 3151 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 3152 | /// ``` |
| 3153 | pub fn starts_for_each_pattern(mut self, yes: bool) -> Config { |
| 3154 | self.starts_for_each_pattern = Some(yes); |
| 3155 | self |
| 3156 | } |
| 3157 | |
| 3158 | /// Whether to attempt to shrink the size of the lazy DFA's alphabet or |
| 3159 | /// not. |
| 3160 | /// |
| 3161 | /// This option is enabled by default and should never be disabled unless |
| 3162 | /// one is debugging the lazy DFA. |
| 3163 | /// |
| 3164 | /// When enabled, the lazy DFA will use a map from all possible bytes |
| 3165 | /// to their corresponding equivalence class. Each equivalence class |
| 3166 | /// represents a set of bytes that does not discriminate between a match |
| 3167 | /// and a non-match in the DFA. For example, the pattern `[ab]+` has at |
| 3168 | /// least two equivalence classes: a set containing `a` and `b` and a set |
| 3169 | /// containing every byte except for `a` and `b`. `a` and `b` are in the |
| 3170 | /// same equivalence classes because they never discriminate between a |
| 3171 | /// match and a non-match. |
| 3172 | /// |
| 3173 | /// The advantage of this map is that the size of the transition table |
| 3174 | /// can be reduced drastically from `#states * 256 * sizeof(LazyStateID)` |
| 3175 | /// to `#states * k * sizeof(LazyStateID)` where `k` is the number of |
| 3176 | /// equivalence classes (rounded up to the nearest power of 2). As a |
| 3177 | /// result, total space usage can decrease substantially. Moreover, since a |
| 3178 | /// smaller alphabet is used, DFA compilation during search becomes faster |
| 3179 | /// as well since it will potentially be able to reuse a single transition |
| 3180 | /// for multiple bytes. |
| 3181 | /// |
| 3182 | /// **WARNING:** This is only useful for debugging lazy DFAs. Disabling |
| 3183 | /// this does not yield any speed advantages. Namely, even when this is |
| 3184 | /// disabled, a byte class map is still used while searching. The only |
| 3185 | /// difference is that every byte will be forced into its own distinct |
| 3186 | /// equivalence class. This is useful for debugging the actual generated |
| 3187 | /// transitions because it lets one see the transitions defined on actual |
| 3188 | /// bytes instead of the equivalence classes. |
| 3189 | pub fn byte_classes(mut self, yes: bool) -> Config { |
| 3190 | self.byte_classes = Some(yes); |
| 3191 | self |
| 3192 | } |
| 3193 | |
| 3194 | /// Heuristically enable Unicode word boundaries. |
| 3195 | /// |
| 3196 | /// When set, this will attempt to implement Unicode word boundaries as if |
| 3197 | /// they were ASCII word boundaries. This only works when the search input |
| 3198 | /// is ASCII only. If a non-ASCII byte is observed while searching, then a |
| 3199 | /// [`MatchError::quit`] error is returned. |
| 3200 | /// |
| 3201 | /// A possible alternative to enabling this option is to simply use an |
| 3202 | /// ASCII word boundary, e.g., via `(?-u:\b)`. The main reason to use this |
| 3203 | /// option is if you absolutely need Unicode support. This option lets one |
| 3204 | /// use a fast search implementation (a DFA) for some potentially very |
| 3205 | /// common cases, while providing the option to fall back to some other |
| 3206 | /// regex engine to handle the general case when an error is returned. |
| 3207 | /// |
| 3208 | /// If the pattern provided has no Unicode word boundary in it, then this |
| 3209 | /// option has no effect. (That is, quitting on a non-ASCII byte only |
| 3210 | /// occurs when this option is enabled _and_ a Unicode word boundary is |
| 3211 | /// present in the pattern.) |
| 3212 | /// |
| 3213 | /// This is almost equivalent to setting all non-ASCII bytes to be quit |
| 3214 | /// bytes. The only difference is that this will cause non-ASCII bytes to |
| 3215 | /// be quit bytes _only_ when a Unicode word boundary is present in the |
| 3216 | /// pattern. |
| 3217 | /// |
| 3218 | /// When enabling this option, callers _must_ be prepared to |
| 3219 | /// handle a [`MatchError`] error during search. When using a |
| 3220 | /// [`Regex`](crate::hybrid::regex::Regex), this corresponds to using the |
| 3221 | /// `try_` suite of methods. Alternatively, if callers can guarantee that |
| 3222 | /// their input is ASCII only, then a [`MatchError::quit`] error will never |
| 3223 | /// be returned while searching. |
| 3224 | /// |
| 3225 | /// This is disabled by default. |
| 3226 | /// |
| 3227 | /// # Example |
| 3228 | /// |
| 3229 | /// This example shows how to heuristically enable Unicode word boundaries |
| 3230 | /// in a pattern. It also shows what happens when a search comes across a |
| 3231 | /// non-ASCII byte. |
| 3232 | /// |
| 3233 | /// ``` |
| 3234 | /// use regex_automata::{ |
| 3235 | /// hybrid::dfa::DFA, |
| 3236 | /// HalfMatch, Input, MatchError, |
| 3237 | /// }; |
| 3238 | /// |
| 3239 | /// let dfa = DFA::builder() |
| 3240 | /// .configure(DFA::config().unicode_word_boundary(true)) |
| 3241 | /// .build(r"\b[0-9]+\b" )?; |
| 3242 | /// let mut cache = dfa.create_cache(); |
| 3243 | /// |
| 3244 | /// // The match occurs before the search ever observes the snowman |
| 3245 | /// // character, so no error occurs. |
| 3246 | /// let haystack = "foo 123 ☃" ; |
| 3247 | /// let expected = Some(HalfMatch::must(0, 7)); |
| 3248 | /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?; |
| 3249 | /// assert_eq!(expected, got); |
| 3250 | /// |
| 3251 | /// // Notice that this search fails, even though the snowman character |
| 3252 | /// // occurs after the ending match offset. This is because search |
| 3253 | /// // routines read one byte past the end of the search to account for |
| 3254 | /// // look-around, and indeed, this is required here to determine whether |
| 3255 | /// // the trailing \b matches. |
| 3256 | /// let haystack = "foo 123 ☃" ; |
| 3257 | /// let expected = MatchError::quit(0xE2, 8); |
| 3258 | /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack)); |
| 3259 | /// assert_eq!(Err(expected), got); |
| 3260 | /// |
| 3261 | /// // Another example is executing a search where the span of the haystack |
| 3262 | /// // we specify is all ASCII, but there is non-ASCII just before it. This |
| 3263 | /// // correctly also reports an error. |
| 3264 | /// let input = Input::new("β123" ).range(2..); |
| 3265 | /// let expected = MatchError::quit(0xB2, 1); |
| 3266 | /// let got = dfa.try_search_fwd(&mut cache, &input); |
| 3267 | /// assert_eq!(Err(expected), got); |
| 3268 | /// |
| 3269 | /// // And similarly for the trailing word boundary. |
| 3270 | /// let input = Input::new("123β" ).range(..3); |
| 3271 | /// let expected = MatchError::quit(0xCE, 3); |
| 3272 | /// let got = dfa.try_search_fwd(&mut cache, &input); |
| 3273 | /// assert_eq!(Err(expected), got); |
| 3274 | /// |
| 3275 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 3276 | /// ``` |
| 3277 | pub fn unicode_word_boundary(mut self, yes: bool) -> Config { |
| 3278 | // We have a separate option for this instead of just setting the |
| 3279 | // appropriate quit bytes here because we don't want to set quit bytes |
| 3280 | // for every regex. We only want to set them when the regex contains a |
| 3281 | // Unicode word boundary. |
| 3282 | self.unicode_word_boundary = Some(yes); |
| 3283 | self |
| 3284 | } |
| 3285 | |
| 3286 | /// Add a "quit" byte to the lazy DFA. |
| 3287 | /// |
| 3288 | /// When a quit byte is seen during search time, then search will return a |
| 3289 | /// [`MatchError::quit`] error indicating the offset at which the search |
| 3290 | /// stopped. |
| 3291 | /// |
| 3292 | /// A quit byte will always overrule any other aspects of a regex. For |
| 3293 | /// example, if the `x` byte is added as a quit byte and the regex `\w` is |
| 3294 | /// used, then observing `x` will cause the search to quit immediately |
| 3295 | /// despite the fact that `x` is in the `\w` class. |
| 3296 | /// |
| 3297 | /// This mechanism is primarily useful for heuristically enabling certain |
| 3298 | /// features like Unicode word boundaries in a DFA. Namely, if the input |
| 3299 | /// to search is ASCII, then a Unicode word boundary can be implemented |
| 3300 | /// via an ASCII word boundary with no change in semantics. Thus, a DFA |
| 3301 | /// can attempt to match a Unicode word boundary but give up as soon as it |
| 3302 | /// observes a non-ASCII byte. Indeed, if callers set all non-ASCII bytes |
| 3303 | /// to be quit bytes, then Unicode word boundaries will be permitted when |
| 3304 | /// building lazy DFAs. Of course, callers should enable |
| 3305 | /// [`Config::unicode_word_boundary`] if they want this behavior instead. |
| 3306 | /// (The advantage being that non-ASCII quit bytes will only be added if a |
| 3307 | /// Unicode word boundary is in the pattern.) |
| 3308 | /// |
| 3309 | /// When enabling this option, callers _must_ be prepared to |
| 3310 | /// handle a [`MatchError`] error during search. When using a |
| 3311 | /// [`Regex`](crate::hybrid::regex::Regex), this corresponds to using the |
| 3312 | /// `try_` suite of methods. |
| 3313 | /// |
| 3314 | /// By default, there are no quit bytes set. |
| 3315 | /// |
| 3316 | /// # Panics |
| 3317 | /// |
| 3318 | /// This panics if heuristic Unicode word boundaries are enabled and any |
| 3319 | /// non-ASCII byte is removed from the set of quit bytes. Namely, enabling |
| 3320 | /// Unicode word boundaries requires setting every non-ASCII byte to a quit |
| 3321 | /// byte. So if the caller attempts to undo any of that, then this will |
| 3322 | /// panic. |
| 3323 | /// |
| 3324 | /// # Example |
| 3325 | /// |
| 3326 | /// This example shows how to cause a search to terminate if it sees a |
| 3327 | /// `\n` byte. This could be useful if, for example, you wanted to prevent |
| 3328 | /// a user supplied pattern from matching across a line boundary. |
| 3329 | /// |
| 3330 | /// ``` |
| 3331 | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
| 3332 | /// use regex_automata::{hybrid::dfa::DFA, MatchError, Input}; |
| 3333 | /// |
| 3334 | /// let dfa = DFA::builder() |
| 3335 | /// .configure(DFA::config().quit(b' \n' , true)) |
| 3336 | /// .build(r"foo\p{any}+bar" )?; |
| 3337 | /// let mut cache = dfa.create_cache(); |
| 3338 | /// |
| 3339 | /// let haystack = "foo \nbar" ; |
| 3340 | /// // Normally this would produce a match, since \p{any} contains '\n'. |
| 3341 | /// // But since we instructed the automaton to enter a quit state if a |
| 3342 | /// // '\n' is observed, this produces a match error instead. |
| 3343 | /// let expected = MatchError::quit(b' \n' , 3); |
| 3344 | /// let got = dfa.try_search_fwd( |
| 3345 | /// &mut cache, |
| 3346 | /// &Input::new(haystack), |
| 3347 | /// ).unwrap_err(); |
| 3348 | /// assert_eq!(expected, got); |
| 3349 | /// |
| 3350 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 3351 | /// ``` |
| 3352 | pub fn quit(mut self, byte: u8, yes: bool) -> Config { |
| 3353 | if self.get_unicode_word_boundary() && !byte.is_ascii() && !yes { |
| 3354 | panic!( |
| 3355 | "cannot set non-ASCII byte to be non-quit when \ |
| 3356 | Unicode word boundaries are enabled" |
| 3357 | ); |
| 3358 | } |
| 3359 | if self.quitset.is_none() { |
| 3360 | self.quitset = Some(ByteSet::empty()); |
| 3361 | } |
| 3362 | if yes { |
| 3363 | self.quitset.as_mut().unwrap().add(byte); |
| 3364 | } else { |
| 3365 | self.quitset.as_mut().unwrap().remove(byte); |
| 3366 | } |
| 3367 | self |
| 3368 | } |
| 3369 | |
| 3370 | /// Enable specializing start states in the lazy DFA. |
| 3371 | /// |
| 3372 | /// When start states are specialized, an implementor of a search routine |
| 3373 | /// using a lazy DFA can tell when the search has entered a starting state. |
| 3374 | /// When start states aren't specialized, then it is impossible to know |
| 3375 | /// whether the search has entered a start state. |
| 3376 | /// |
| 3377 | /// Ideally, this option wouldn't need to exist and we could always |
| 3378 | /// specialize start states. The problem is that start states can be quite |
| 3379 | /// active. This in turn means that an efficient search routine is likely |
| 3380 | /// to ping-pong between a heavily optimized hot loop that handles most |
| 3381 | /// states and to a less optimized specialized handling of start states. |
| 3382 | /// This causes branches to get heavily mispredicted and overall can |
| 3383 | /// materially decrease throughput. Therefore, specializing start states |
| 3384 | /// should only be enabled when it is needed. |
| 3385 | /// |
| 3386 | /// Knowing whether a search is in a start state is typically useful when a |
| 3387 | /// prefilter is active for the search. A prefilter is typically only run |
| 3388 | /// when in a start state and a prefilter can greatly accelerate a search. |
| 3389 | /// Therefore, the possible cost of specializing start states is worth it |
| 3390 | /// in this case. Otherwise, if you have no prefilter, there is likely no |
| 3391 | /// reason to specialize start states. |
| 3392 | /// |
| 3393 | /// This is disabled by default, but note that it is automatically |
| 3394 | /// enabled (or disabled) if [`Config::prefilter`] is set. Namely, unless |
| 3395 | /// `specialize_start_states` has already been set, [`Config::prefilter`] |
| 3396 | /// will automatically enable or disable it based on whether a prefilter |
| 3397 | /// is present or not, respectively. This is done because a prefilter's |
| 3398 | /// effectiveness is rooted in being executed whenever the DFA is in a |
| 3399 | /// start state, and that's only possible to do when they are specialized. |
| 3400 | /// |
| 3401 | /// Note that it is plausibly reasonable to _disable_ this option |
| 3402 | /// explicitly while _enabling_ a prefilter. In that case, a prefilter |
| 3403 | /// will still be run at the beginning of a search, but never again. This |
| 3404 | /// in theory could strike a good balance if you're in a situation where a |
| 3405 | /// prefilter is likely to produce many false positive candidates. |
| 3406 | /// |
| 3407 | /// # Example |
| 3408 | /// |
| 3409 | /// This example shows how to enable start state specialization and then |
| 3410 | /// shows how to check whether a state is a start state or not. |
| 3411 | /// |
| 3412 | /// ``` |
| 3413 | /// use regex_automata::{hybrid::dfa::DFA, MatchError, Input}; |
| 3414 | /// |
| 3415 | /// let dfa = DFA::builder() |
| 3416 | /// .configure(DFA::config().specialize_start_states(true)) |
| 3417 | /// .build(r"[a-z]+" )?; |
| 3418 | /// let mut cache = dfa.create_cache(); |
| 3419 | /// |
| 3420 | /// let haystack = "123 foobar 4567" .as_bytes(); |
| 3421 | /// let sid = dfa.start_state_forward(&mut cache, &Input::new(haystack))?; |
| 3422 | /// // The ID returned by 'start_state_forward' will always be tagged as |
| 3423 | /// // a start state when start state specialization is enabled. |
| 3424 | /// assert!(sid.is_tagged()); |
| 3425 | /// assert!(sid.is_start()); |
| 3426 | /// |
| 3427 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 3428 | /// ``` |
| 3429 | /// |
| 3430 | /// Compare the above with the default lazy DFA configuration where |
| 3431 | /// start states are _not_ specialized. In this case, the start state |
| 3432 | /// is not tagged and `sid.is_start()` returns false. |
| 3433 | /// |
| 3434 | /// ``` |
| 3435 | /// use regex_automata::{hybrid::dfa::DFA, MatchError, Input}; |
| 3436 | /// |
| 3437 | /// let dfa = DFA::new(r"[a-z]+" )?; |
| 3438 | /// let mut cache = dfa.create_cache(); |
| 3439 | /// |
| 3440 | /// let haystack = "123 foobar 4567" .as_bytes(); |
| 3441 | /// let sid = dfa.start_state_forward(&mut cache, &Input::new(haystack))?; |
| 3442 | /// // Start states are not tagged in the default configuration! |
| 3443 | /// assert!(!sid.is_tagged()); |
| 3444 | /// assert!(!sid.is_start()); |
| 3445 | /// |
| 3446 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 3447 | /// ``` |
| 3448 | pub fn specialize_start_states(mut self, yes: bool) -> Config { |
| 3449 | self.specialize_start_states = Some(yes); |
| 3450 | self |
| 3451 | } |
| 3452 | |
| 3453 | /// Sets the maximum amount of heap memory, in bytes, to allocate to the |
| 3454 | /// cache for use during a lazy DFA search. If the lazy DFA would otherwise |
| 3455 | /// use more heap memory, then, depending on other configuration knobs, |
| 3456 | /// either stop the search and return an error or clear the cache and |
| 3457 | /// continue the search. |
| 3458 | /// |
| 3459 | /// The default cache capacity is some "reasonable" number that will |
| 3460 | /// accommodate most regular expressions. You may find that if you need |
| 3461 | /// to build a large DFA then it may be necessary to increase the cache |
| 3462 | /// capacity. |
| 3463 | /// |
| 3464 | /// Note that while building a lazy DFA will do a "minimum" check to ensure |
| 3465 | /// the capacity is big enough, this is more or less about correctness. |
| 3466 | /// If the cache is bigger than the minimum but still "too small," then the |
| 3467 | /// lazy DFA could wind up spending a lot of time clearing the cache and |
| 3468 | /// recomputing transitions, thus negating the performance benefits of a |
| 3469 | /// lazy DFA. Thus, setting the cache capacity is mostly an experimental |
| 3470 | /// endeavor. For most common patterns, however, the default should be |
| 3471 | /// sufficient. |
| 3472 | /// |
| 3473 | /// For more details on how the lazy DFA's cache is used, see the |
| 3474 | /// documentation for [`Cache`]. |
| 3475 | /// |
| 3476 | /// # Example |
| 3477 | /// |
| 3478 | /// This example shows what happens if the configured cache capacity is |
| 3479 | /// too small. In such cases, one can override the cache capacity to make |
| 3480 | /// it bigger. Alternatively, one might want to use less memory by setting |
| 3481 | /// a smaller cache capacity. |
| 3482 | /// |
| 3483 | /// ``` |
| 3484 | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
| 3485 | /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input}; |
| 3486 | /// |
| 3487 | /// let pattern = r"\p{L}{1000}" ; |
| 3488 | /// |
| 3489 | /// // The default cache capacity is likely too small to deal with regexes |
| 3490 | /// // that are very large. Large repetitions of large Unicode character |
| 3491 | /// // classes are a common way to make very large regexes. |
| 3492 | /// let _ = DFA::new(pattern).unwrap_err(); |
| 3493 | /// // Bump up the capacity to something bigger. |
| 3494 | /// let dfa = DFA::builder() |
| 3495 | /// .configure(DFA::config().cache_capacity(100 * (1<<20))) // 100 MB |
| 3496 | /// .build(pattern)?; |
| 3497 | /// let mut cache = dfa.create_cache(); |
| 3498 | /// |
| 3499 | /// let haystack = "ͰͲͶͿΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙ" .repeat(50); |
| 3500 | /// let expected = Some(HalfMatch::must(0, 2000)); |
| 3501 | /// let got = dfa.try_search_fwd(&mut cache, &Input::new(&haystack))?; |
| 3502 | /// assert_eq!(expected, got); |
| 3503 | /// |
| 3504 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 3505 | /// ``` |
| 3506 | pub fn cache_capacity(mut self, bytes: usize) -> Config { |
| 3507 | self.cache_capacity = Some(bytes); |
| 3508 | self |
| 3509 | } |
| 3510 | |
| 3511 | /// Configures construction of a lazy DFA to use the minimum cache capacity |
| 3512 | /// if the configured capacity is otherwise too small for the provided NFA. |
| 3513 | /// |
| 3514 | /// This is useful if you never want lazy DFA construction to fail because |
| 3515 | /// of a capacity that is too small. |
| 3516 | /// |
| 3517 | /// In general, this option is typically not a good idea. In particular, |
| 3518 | /// while a minimum cache capacity does permit the lazy DFA to function |
| 3519 | /// where it otherwise couldn't, it's plausible that it may not function |
| 3520 | /// well if it's constantly running out of room. In that case, the speed |
| 3521 | /// advantages of the lazy DFA may be negated. On the other hand, the |
| 3522 | /// "minimum" cache capacity computed may not be completely accurate and |
| 3523 | /// could actually be bigger than what is really necessary. Therefore, it |
| 3524 | /// is plausible that using the minimum cache capacity could still result |
| 3525 | /// in very good performance. |
| 3526 | /// |
| 3527 | /// This is disabled by default. |
| 3528 | /// |
| 3529 | /// # Example |
| 3530 | /// |
| 3531 | /// This example shows what happens if the configured cache capacity is |
| 3532 | /// too small. In such cases, one could override the capacity explicitly. |
/// An alternative, demonstrated here, lets us force construction to use
| 3534 | /// the minimum cache capacity if the configured capacity is otherwise |
| 3535 | /// too small. |
| 3536 | /// |
| 3537 | /// ``` |
| 3538 | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
| 3539 | /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input}; |
| 3540 | /// |
| 3541 | /// let pattern = r"\p{L}{1000}" ; |
| 3542 | /// |
| 3543 | /// // The default cache capacity is likely too small to deal with regexes |
| 3544 | /// // that are very large. Large repetitions of large Unicode character |
| 3545 | /// // classes are a common way to make very large regexes. |
| 3546 | /// let _ = DFA::new(pattern).unwrap_err(); |
/// // Configure construction such that it automatically selects the
| 3548 | /// // cache capacity if it would otherwise be too small. |
| 3549 | /// let dfa = DFA::builder() |
| 3550 | /// .configure(DFA::config().skip_cache_capacity_check(true)) |
| 3551 | /// .build(pattern)?; |
| 3552 | /// let mut cache = dfa.create_cache(); |
| 3553 | /// |
| 3554 | /// let haystack = "ͰͲͶͿΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙ" .repeat(50); |
| 3555 | /// let expected = Some(HalfMatch::must(0, 2000)); |
| 3556 | /// let got = dfa.try_search_fwd(&mut cache, &Input::new(&haystack))?; |
| 3557 | /// assert_eq!(expected, got); |
| 3558 | /// |
| 3559 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 3560 | /// ``` |
| 3561 | pub fn skip_cache_capacity_check(mut self, yes: bool) -> Config { |
| 3562 | self.skip_cache_capacity_check = Some(yes); |
| 3563 | self |
| 3564 | } |
| 3565 | |
| 3566 | /// Configure a lazy DFA search to quit after a certain number of cache |
| 3567 | /// clearings. |
| 3568 | /// |
| 3569 | /// When a minimum is set, then a lazy DFA search will *possibly* "give |
| 3570 | /// up" after the minimum number of cache clearings has occurred. This is |
| 3571 | /// typically useful in scenarios where callers want to detect whether the |
| 3572 | /// lazy DFA search is "efficient" or not. If the cache is cleared too many |
| 3573 | /// times, this is a good indicator that it is not efficient, and thus, the |
| 3574 | /// caller may wish to use some other regex engine. |
| 3575 | /// |
| 3576 | /// Note that the number of times a cache is cleared is a property of |
| 3577 | /// the cache itself. Thus, if a cache is used in a subsequent search |
| 3578 | /// with a similarly configured lazy DFA, then it could cause the |
| 3579 | /// search to "give up" if the cache needed to be cleared, depending |
| 3580 | /// on its internal count and configured minimum. The cache clear |
| 3581 | /// count can only be reset to `0` via [`DFA::reset_cache`] (or |
| 3582 | /// [`Regex::reset_cache`](crate::hybrid::regex::Regex::reset_cache) if |
| 3583 | /// you're using the `Regex` API). |
| 3584 | /// |
| 3585 | /// By default, no minimum is configured. Thus, a lazy DFA search will |
| 3586 | /// never give up due to cache clearings. If you do set this option, you |
| 3587 | /// might consider also setting [`Config::minimum_bytes_per_state`] in |
| 3588 | /// order for the lazy DFA to take efficiency into account before giving |
| 3589 | /// up. |
| 3590 | /// |
| 3591 | /// # Example |
| 3592 | /// |
| 3593 | /// This example uses a somewhat pathological configuration to demonstrate |
| 3594 | /// the _possible_ behavior of cache clearing and how it might result |
| 3595 | /// in a search that returns an error. |
| 3596 | /// |
| 3597 | /// It is important to note that the precise mechanics of how and when |
| 3598 | /// a cache gets cleared is an implementation detail. |
| 3599 | /// |
| 3600 | /// ``` |
| 3601 | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
| 3602 | /// use regex_automata::{hybrid::dfa::DFA, Input, MatchError, MatchErrorKind}; |
| 3603 | /// |
| 3604 | /// // This is a carefully chosen regex. The idea is to pick one |
| 3605 | /// // that requires some decent number of states (hence the bounded |
| 3606 | /// // repetition). But we specifically choose to create a class with an |
| 3607 | /// // ASCII letter and a non-ASCII letter so that we can check that no new |
| 3608 | /// // states are created once the cache is full. Namely, if we fill up the |
| 3609 | /// // cache on a haystack of 'a's, then in order to match one 'β', a new |
| 3610 | /// // state will need to be created since a 'β' is encoded with multiple |
| 3611 | /// // bytes. Since there's no room for this state, the search should quit |
| 3612 | /// // at the very first position. |
| 3613 | /// let pattern = r"[aβ]{100}" ; |
| 3614 | /// let dfa = DFA::builder() |
| 3615 | /// .configure( |
| 3616 | /// // Configure it so that we have the minimum cache capacity |
| 3617 | /// // possible. And that if any clearings occur, the search quits. |
| 3618 | /// DFA::config() |
| 3619 | /// .skip_cache_capacity_check(true) |
| 3620 | /// .cache_capacity(0) |
| 3621 | /// .minimum_cache_clear_count(Some(0)), |
| 3622 | /// ) |
| 3623 | /// .build(pattern)?; |
| 3624 | /// let mut cache = dfa.create_cache(); |
| 3625 | /// |
| 3626 | /// // Our search will give up before reaching the end! |
| 3627 | /// let haystack = "a" .repeat(101).into_bytes(); |
| 3628 | /// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack)); |
| 3629 | /// assert!(matches!( |
| 3630 | /// *result.unwrap_err().kind(), |
| 3631 | /// MatchErrorKind::GaveUp { .. }, |
| 3632 | /// )); |
| 3633 | /// |
| 3634 | /// // Now that we know the cache is full, if we search a haystack that we |
| 3635 | /// // know will require creating at least one new state, it should not |
| 3636 | /// // be able to make much progress. |
| 3637 | /// let haystack = "β" .repeat(101).into_bytes(); |
| 3638 | /// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack)); |
| 3639 | /// assert!(matches!( |
| 3640 | /// *result.unwrap_err().kind(), |
| 3641 | /// MatchErrorKind::GaveUp { .. }, |
| 3642 | /// )); |
| 3643 | /// |
| 3644 | /// // If we reset the cache, then we should be able to create more states |
| 3645 | /// // and make more progress with searching for betas. |
| 3646 | /// cache.reset(&dfa); |
| 3647 | /// let haystack = "β" .repeat(101).into_bytes(); |
| 3648 | /// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack)); |
| 3649 | /// assert!(matches!( |
| 3650 | /// *result.unwrap_err().kind(), |
| 3651 | /// MatchErrorKind::GaveUp { .. }, |
| 3652 | /// )); |
| 3653 | /// |
| 3654 | /// // ... switching back to ASCII still makes progress since it just needs |
| 3655 | /// // to set transitions on existing states! |
| 3656 | /// let haystack = "a" .repeat(101).into_bytes(); |
| 3657 | /// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack)); |
| 3658 | /// assert!(matches!( |
| 3659 | /// *result.unwrap_err().kind(), |
| 3660 | /// MatchErrorKind::GaveUp { .. }, |
| 3661 | /// )); |
| 3662 | /// |
| 3663 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 3664 | /// ``` |
| 3665 | pub fn minimum_cache_clear_count(mut self, min: Option<usize>) -> Config { |
| 3666 | self.minimum_cache_clear_count = Some(min); |
| 3667 | self |
| 3668 | } |
| 3669 | |
| 3670 | /// Configure a lazy DFA search to quit only when its efficiency drops |
| 3671 | /// below the given minimum. |
| 3672 | /// |
| 3673 | /// The efficiency of the cache is determined by the number of DFA states |
| 3674 | /// compiled per byte of haystack searched. For example, if the efficiency |
| 3675 | /// is 2, then it means the lazy DFA is creating a new DFA state after |
| 3676 | /// searching approximately 2 bytes in a haystack. Generally speaking, 2 |
| 3677 | /// is quite bad and it's likely that even a slower regex engine like the |
| 3678 | /// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) would be faster. |
| 3679 | /// |
| 3680 | /// This has no effect if [`Config::minimum_cache_clear_count`] is not set. |
| 3681 | /// Namely, this option only kicks in when the cache has been cleared more |
| 3682 | /// than the minimum number. If no minimum is set, then the cache is simply |
| 3683 | /// cleared whenever it fills up and it is impossible for the lazy DFA to |
| 3684 | /// quit due to ineffective use of the cache. |
| 3685 | /// |
| 3686 | /// In general, if one is setting [`Config::minimum_cache_clear_count`], |
| 3687 | /// then one should probably also set this knob as well. The reason is |
| 3688 | /// that the absolute number of times the cache is cleared is generally |
| 3689 | /// not a great predictor of efficiency. For example, if a new DFA state |
| 3690 | /// is created for every 1,000 bytes searched, then it wouldn't be hard |
| 3691 | /// for the cache to get cleared more than `N` times and then cause the |
| 3692 | /// lazy DFA to quit. But a new DFA state every 1,000 bytes is likely quite |
| 3693 | /// good from a performance perspective, and it's likely that the lazy |
| 3694 | /// DFA should continue searching, even if it requires clearing the cache |
| 3695 | /// occasionally. |
| 3696 | /// |
| 3697 | /// Finally, note that if you're implementing your own lazy DFA search |
| 3698 | /// routine and also want this efficiency check to work correctly, then |
| 3699 | /// you'll need to use the following routines to record search progress: |
| 3700 | /// |
| 3701 | /// * Call [`Cache::search_start`] at the beginning of every search. |
| 3702 | /// * Call [`Cache::search_update`] whenever [`DFA::next_state`] is |
| 3703 | /// called. |
| 3704 | /// * Call [`Cache::search_finish`] before completing a search. (It is |
| 3705 | /// not strictly necessary to call this when an error is returned, as |
| 3706 | /// `Cache::search_start` will automatically finish the previous search |
| 3707 | /// for you. But calling it where possible before returning helps improve |
| 3708 | /// the accuracy of how many bytes have actually been searched.) |
| 3709 | pub fn minimum_bytes_per_state(mut self, min: Option<usize>) -> Config { |
| 3710 | self.minimum_bytes_per_state = Some(min); |
| 3711 | self |
| 3712 | } |
| 3713 | |
| 3714 | /// Returns the match semantics set in this configuration. |
| 3715 | pub fn get_match_kind(&self) -> MatchKind { |
| 3716 | self.match_kind.unwrap_or(MatchKind::LeftmostFirst) |
| 3717 | } |
| 3718 | |
| 3719 | /// Returns the prefilter set in this configuration, if one at all. |
| 3720 | pub fn get_prefilter(&self) -> Option<&Prefilter> { |
| 3721 | self.pre.as_ref().unwrap_or(&None).as_ref() |
| 3722 | } |
| 3723 | |
| 3724 | /// Returns whether this configuration has enabled anchored starting states |
| 3725 | /// for every pattern in the DFA. |
| 3726 | pub fn get_starts_for_each_pattern(&self) -> bool { |
| 3727 | self.starts_for_each_pattern.unwrap_or(false) |
| 3728 | } |
| 3729 | |
| 3730 | /// Returns whether this configuration has enabled byte classes or not. |
| 3731 | /// This is typically a debugging oriented option, as disabling it confers |
| 3732 | /// no speed benefit. |
| 3733 | pub fn get_byte_classes(&self) -> bool { |
| 3734 | self.byte_classes.unwrap_or(true) |
| 3735 | } |
| 3736 | |
| 3737 | /// Returns whether this configuration has enabled heuristic Unicode word |
| 3738 | /// boundary support. When enabled, it is possible for a search to return |
| 3739 | /// an error. |
| 3740 | pub fn get_unicode_word_boundary(&self) -> bool { |
| 3741 | self.unicode_word_boundary.unwrap_or(false) |
| 3742 | } |
| 3743 | |
| 3744 | /// Returns whether this configuration will instruct the lazy DFA to enter |
| 3745 | /// a quit state whenever the given byte is seen during a search. When at |
| 3746 | /// least one byte has this enabled, it is possible for a search to return |
| 3747 | /// an error. |
| 3748 | pub fn get_quit(&self, byte: u8) -> bool { |
| 3749 | self.quitset.map_or(false, |q| q.contains(byte)) |
| 3750 | } |
| 3751 | |
| 3752 | /// Returns whether this configuration will instruct the lazy DFA to |
| 3753 | /// "specialize" start states. When enabled, the lazy DFA will tag start |
| 3754 | /// states so that search routines using the lazy DFA can detect when |
| 3755 | /// it's in a start state and do some kind of optimization (like run a |
| 3756 | /// prefilter). |
| 3757 | pub fn get_specialize_start_states(&self) -> bool { |
| 3758 | self.specialize_start_states.unwrap_or(false) |
| 3759 | } |
| 3760 | |
| 3761 | /// Returns the cache capacity set on this configuration. |
| 3762 | pub fn get_cache_capacity(&self) -> usize { |
| 3763 | self.cache_capacity.unwrap_or(2 * (1 << 20)) |
| 3764 | } |
| 3765 | |
| 3766 | /// Returns whether the cache capacity check should be skipped. |
| 3767 | pub fn get_skip_cache_capacity_check(&self) -> bool { |
| 3768 | self.skip_cache_capacity_check.unwrap_or(false) |
| 3769 | } |
| 3770 | |
| 3771 | /// Returns, if set, the minimum number of times the cache must be cleared |
| 3772 | /// before a lazy DFA search can give up. When no minimum is set, then a |
| 3773 | /// search will never quit and will always clear the cache whenever it |
| 3774 | /// fills up. |
| 3775 | pub fn get_minimum_cache_clear_count(&self) -> Option<usize> { |
| 3776 | self.minimum_cache_clear_count.unwrap_or(None) |
| 3777 | } |
| 3778 | |
| 3779 | /// Returns, if set, the minimum number of bytes per state that need to be |
| 3780 | /// processed in order for the lazy DFA to keep going. If the minimum falls |
| 3781 | /// below this number (and the cache has been cleared a minimum number of |
| 3782 | /// times), then the lazy DFA will return a "gave up" error. |
| 3783 | pub fn get_minimum_bytes_per_state(&self) -> Option<usize> { |
| 3784 | self.minimum_bytes_per_state.unwrap_or(None) |
| 3785 | } |
| 3786 | |
| 3787 | /// Returns the minimum lazy DFA cache capacity required for the given NFA. |
| 3788 | /// |
| 3789 | /// The cache capacity required for a particular NFA may change without |
| 3790 | /// notice. Callers should not rely on it being stable. |
| 3791 | /// |
| 3792 | /// This is useful for informational purposes, but can also be useful for |
| 3793 | /// other reasons. For example, if one wants to check the minimum cache |
| 3794 | /// capacity themselves or if one wants to set the capacity based on the |
| 3795 | /// minimum. |
| 3796 | /// |
| 3797 | /// This may return an error if this configuration does not support all of |
| 3798 | /// the instructions used in the given NFA. For example, if the NFA has a |
| 3799 | /// Unicode word boundary but this configuration does not enable heuristic |
| 3800 | /// support for Unicode word boundaries. |
| 3801 | pub fn get_minimum_cache_capacity( |
| 3802 | &self, |
| 3803 | nfa: &thompson::NFA, |
| 3804 | ) -> Result<usize, BuildError> { |
| 3805 | let quitset = self.quit_set_from_nfa(nfa)?; |
| 3806 | let classes = self.byte_classes_from_nfa(nfa, &quitset); |
| 3807 | let starts = self.get_starts_for_each_pattern(); |
| 3808 | Ok(minimum_cache_capacity(nfa, &classes, starts)) |
| 3809 | } |
| 3810 | |
| 3811 | /// Returns the byte class map used during search from the given NFA. |
| 3812 | /// |
| 3813 | /// If byte classes are disabled on this configuration, then a map is |
| 3814 | /// returned that puts each byte in its own equivalent class. |
| 3815 | fn byte_classes_from_nfa( |
| 3816 | &self, |
| 3817 | nfa: &thompson::NFA, |
| 3818 | quit: &ByteSet, |
| 3819 | ) -> ByteClasses { |
| 3820 | if !self.get_byte_classes() { |
| 3821 | // The lazy DFA will always use the equivalence class map, but |
| 3822 | // enabling this option is useful for debugging. Namely, this will |
| 3823 | // cause all transitions to be defined over their actual bytes |
| 3824 | // instead of an opaque equivalence class identifier. The former is |
| 3825 | // much easier to grok as a human. |
| 3826 | ByteClasses::singletons() |
| 3827 | } else { |
| 3828 | let mut set = nfa.byte_class_set().clone(); |
| 3829 | // It is important to distinguish any "quit" bytes from all other |
| 3830 | // bytes. Otherwise, a non-quit byte may end up in the same class |
| 3831 | // as a quit byte, and thus cause the DFA stop when it shouldn't. |
| 3832 | // |
| 3833 | // Test case: |
| 3834 | // |
| 3835 | // regex-cli find match hybrid --unicode-word-boundary \ |
| 3836 | // -p '^#' -p '\b10\.55\.182\.100\b' -y @conn.json.1000x.log |
| 3837 | if !quit.is_empty() { |
| 3838 | set.add_set(&quit); |
| 3839 | } |
| 3840 | set.byte_classes() |
| 3841 | } |
| 3842 | } |
| 3843 | |
| 3844 | /// Return the quit set for this configuration and the given NFA. |
| 3845 | /// |
| 3846 | /// This may return an error if the NFA is incompatible with this |
| 3847 | /// configuration's quit set. For example, if the NFA has a Unicode word |
| 3848 | /// boundary and the quit set doesn't include non-ASCII bytes. |
| 3849 | fn quit_set_from_nfa( |
| 3850 | &self, |
| 3851 | nfa: &thompson::NFA, |
| 3852 | ) -> Result<ByteSet, BuildError> { |
| 3853 | let mut quit = self.quitset.unwrap_or(ByteSet::empty()); |
| 3854 | if nfa.look_set_any().contains_word_unicode() { |
| 3855 | if self.get_unicode_word_boundary() { |
| 3856 | for b in 0x80..=0xFF { |
| 3857 | quit.add(b); |
| 3858 | } |
| 3859 | } else { |
| 3860 | // If heuristic support for Unicode word boundaries wasn't |
| 3861 | // enabled, then we can still check if our quit set is correct. |
| 3862 | // If the caller set their quit bytes in a way that causes the |
| 3863 | // DFA to quit on at least all non-ASCII bytes, then that's all |
| 3864 | // we need for heuristic support to work. |
| 3865 | if !quit.contains_range(0x80, 0xFF) { |
| 3866 | return Err( |
| 3867 | BuildError::unsupported_dfa_word_boundary_unicode(), |
| 3868 | ); |
| 3869 | } |
| 3870 | } |
| 3871 | } |
| 3872 | Ok(quit) |
| 3873 | } |
| 3874 | |
| 3875 | /// Overwrite the default configuration such that the options in `o` are |
| 3876 | /// always used. If an option in `o` is not set, then the corresponding |
| 3877 | /// option in `self` is used. If it's not set in `self` either, then it |
| 3878 | /// remains not set. |
| 3879 | fn overwrite(&self, o: Config) -> Config { |
| 3880 | Config { |
| 3881 | match_kind: o.match_kind.or(self.match_kind), |
| 3882 | pre: o.pre.or_else(|| self.pre.clone()), |
| 3883 | starts_for_each_pattern: o |
| 3884 | .starts_for_each_pattern |
| 3885 | .or(self.starts_for_each_pattern), |
| 3886 | byte_classes: o.byte_classes.or(self.byte_classes), |
| 3887 | unicode_word_boundary: o |
| 3888 | .unicode_word_boundary |
| 3889 | .or(self.unicode_word_boundary), |
| 3890 | quitset: o.quitset.or(self.quitset), |
| 3891 | specialize_start_states: o |
| 3892 | .specialize_start_states |
| 3893 | .or(self.specialize_start_states), |
| 3894 | cache_capacity: o.cache_capacity.or(self.cache_capacity), |
| 3895 | skip_cache_capacity_check: o |
| 3896 | .skip_cache_capacity_check |
| 3897 | .or(self.skip_cache_capacity_check), |
| 3898 | minimum_cache_clear_count: o |
| 3899 | .minimum_cache_clear_count |
| 3900 | .or(self.minimum_cache_clear_count), |
| 3901 | minimum_bytes_per_state: o |
| 3902 | .minimum_bytes_per_state |
| 3903 | .or(self.minimum_bytes_per_state), |
| 3904 | } |
| 3905 | } |
| 3906 | } |
| 3907 | |
| 3908 | /// A builder for constructing a lazy deterministic finite automaton from |
| 3909 | /// regular expressions. |
| 3910 | /// |
| 3911 | /// As a convenience, [`DFA::builder`] is an alias for [`Builder::new`]. The |
| 3912 | /// advantage of the former is that it often lets you avoid importing the |
| 3913 | /// `Builder` type directly. |
| 3914 | /// |
| 3915 | /// This builder provides two main things: |
| 3916 | /// |
| 3917 | /// 1. It provides a few different `build` routines for actually constructing |
| 3918 | /// a DFA from different kinds of inputs. The most convenient is |
| 3919 | /// [`Builder::build`], which builds a DFA directly from a pattern string. The |
| 3920 | /// most flexible is [`Builder::build_from_nfa`], which builds a DFA straight |
| 3921 | /// from an NFA. |
| 3922 | /// 2. The builder permits configuring a number of things. |
| 3923 | /// [`Builder::configure`] is used with [`Config`] to configure aspects of |
| 3924 | /// the DFA and the construction process itself. [`Builder::syntax`] and |
| 3925 | /// [`Builder::thompson`] permit configuring the regex parser and Thompson NFA |
| 3926 | /// construction, respectively. The syntax and thompson configurations only |
| 3927 | /// apply when building from a pattern string. |
| 3928 | /// |
| 3929 | /// This builder always constructs a *single* lazy DFA. As such, this builder |
| 3930 | /// can only be used to construct regexes that either detect the presence |
| 3931 | /// of a match or find the end location of a match. A single DFA cannot |
| 3932 | /// produce both the start and end of a match. For that information, use a |
| 3933 | /// [`Regex`](crate::hybrid::regex::Regex), which can be similarly configured |
| 3934 | /// using [`regex::Builder`](crate::hybrid::regex::Builder). The main reason |
| 3935 | /// to use a DFA directly is if the end location of a match is enough for your |
| 3936 | /// use case. Namely, a `Regex` will construct two lazy DFAs instead of one, |
| 3937 | /// since a second reverse DFA is needed to find the start of a match. |
| 3938 | /// |
| 3939 | /// # Example |
| 3940 | /// |
| 3941 | /// This example shows how to build a lazy DFA that uses a tiny cache capacity |
| 3942 | /// and completely disables Unicode. That is: |
| 3943 | /// |
| 3944 | /// * Things such as `\w`, `.` and `\b` are no longer Unicode-aware. `\w` |
| 3945 | /// and `\b` are ASCII-only while `.` matches any byte except for `\n` |
| 3946 | /// (instead of any UTF-8 encoding of a Unicode scalar value except for |
| 3947 | /// `\n`). Things that are Unicode only, such as `\pL`, are not allowed. |
| 3948 | /// * The pattern itself is permitted to match invalid UTF-8. For example, |
| 3949 | /// things like `[^a]` that match any byte except for `a` are permitted. |
| 3950 | /// |
| 3951 | /// ``` |
| 3952 | /// use regex_automata::{ |
| 3953 | /// hybrid::dfa::DFA, |
| 3954 | /// nfa::thompson, |
| 3955 | /// util::syntax, |
| 3956 | /// HalfMatch, Input, |
| 3957 | /// }; |
| 3958 | /// |
| 3959 | /// let dfa = DFA::builder() |
| 3960 | /// .configure(DFA::config().cache_capacity(5_000)) |
| 3961 | /// .thompson(thompson::Config::new().utf8(false)) |
| 3962 | /// .syntax(syntax::Config::new().unicode(false).utf8(false)) |
| 3963 | /// .build(r"foo[^b]ar.*" )?; |
| 3964 | /// let mut cache = dfa.create_cache(); |
| 3965 | /// |
| 3966 | /// let haystack = b" \xFEfoo \xFFar \xE2\x98\xFF\n" ; |
| 3967 | /// let expected = Some(HalfMatch::must(0, 10)); |
| 3968 | /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?; |
| 3969 | /// assert_eq!(expected, got); |
| 3970 | /// |
| 3971 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 3972 | /// ``` |
| 3973 | #[derive (Clone, Debug)] |
| 3974 | pub struct Builder { |
| 3975 | config: Config, |
| 3976 | #[cfg (feature = "syntax" )] |
| 3977 | thompson: thompson::Compiler, |
| 3978 | } |
| 3979 | |
| 3980 | impl Builder { |
/// Create a new lazy DFA builder with the default configuration.
pub fn new() -> Builder {
    Builder {
        config: Config::default(),
        // The NFA compiler only exists when the `syntax` feature is
        // enabled, since it depends on the regex parser.
        #[cfg(feature = "syntax")]
        thompson: thompson::Compiler::new(),
    }
}
| 3989 | |
| 3990 | /// Build a lazy DFA from the given pattern. |
| 3991 | /// |
| 3992 | /// If there was a problem parsing or compiling the pattern, then an error |
| 3993 | /// is returned. |
| 3994 | #[cfg (feature = "syntax" )] |
| 3995 | pub fn build(&self, pattern: &str) -> Result<DFA, BuildError> { |
| 3996 | self.build_many(&[pattern]) |
| 3997 | } |
| 3998 | |
| 3999 | /// Build a lazy DFA from the given patterns. |
| 4000 | /// |
| 4001 | /// When matches are returned, the pattern ID corresponds to the index of |
| 4002 | /// the pattern in the slice given. |
| 4003 | #[cfg (feature = "syntax" )] |
| 4004 | pub fn build_many<P: AsRef<str>>( |
| 4005 | &self, |
| 4006 | patterns: &[P], |
| 4007 | ) -> Result<DFA, BuildError> { |
| 4008 | let nfa = self |
| 4009 | .thompson |
| 4010 | .clone() |
| 4011 | // We can always forcefully disable captures because DFAs do not |
| 4012 | // support them. |
| 4013 | .configure( |
| 4014 | thompson::Config::new() |
| 4015 | .which_captures(thompson::WhichCaptures::None), |
| 4016 | ) |
| 4017 | .build_many(patterns) |
| 4018 | .map_err(BuildError::nfa)?; |
| 4019 | self.build_from_nfa(nfa) |
| 4020 | } |
| 4021 | |
| 4022 | /// Build a DFA from the given NFA. |
| 4023 | /// |
| 4024 | /// Note that this requires owning a `thompson::NFA`. While this may force |
| 4025 | /// you to clone the NFA, such a clone is not a deep clone. Namely, NFAs |
| 4026 | /// are defined internally to support shared ownership such that cloning is |
| 4027 | /// very cheap. |
| 4028 | /// |
| 4029 | /// # Example |
| 4030 | /// |
| 4031 | /// This example shows how to build a lazy DFA if you already have an NFA |
| 4032 | /// in hand. |
| 4033 | /// |
| 4034 | /// ``` |
| 4035 | /// use regex_automata::{ |
| 4036 | /// hybrid::dfa::DFA, |
| 4037 | /// nfa::thompson, |
| 4038 | /// HalfMatch, Input, |
| 4039 | /// }; |
| 4040 | /// |
| 4041 | /// let haystack = "foo123bar" ; |
| 4042 | /// |
| 4043 | /// // This shows how to set non-default options for building an NFA. |
| 4044 | /// let nfa = thompson::Compiler::new() |
| 4045 | /// .configure(thompson::Config::new().shrink(true)) |
| 4046 | /// .build(r"[0-9]+" )?; |
| 4047 | /// let dfa = DFA::builder().build_from_nfa(nfa)?; |
| 4048 | /// let mut cache = dfa.create_cache(); |
| 4049 | /// let expected = Some(HalfMatch::must(0, 6)); |
| 4050 | /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?; |
| 4051 | /// assert_eq!(expected, got); |
| 4052 | /// |
| 4053 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 4054 | /// ``` |
| 4055 | pub fn build_from_nfa( |
| 4056 | &self, |
| 4057 | nfa: thompson::NFA, |
| 4058 | ) -> Result<DFA, BuildError> { |
| 4059 | let quitset = self.config.quit_set_from_nfa(&nfa)?; |
| 4060 | let classes = self.config.byte_classes_from_nfa(&nfa, &quitset); |
| 4061 | // Check that we can fit at least a few states into our cache, |
| 4062 | // otherwise it's pretty senseless to use the lazy DFA. This does have |
| 4063 | // a possible failure mode though. This assumes the maximum size of a |
| 4064 | // state in powerset space (so, the total number of NFA states), which |
| 4065 | // may never actually materialize, and could be quite a bit larger |
| 4066 | // than the actual biggest state. If this turns out to be a problem, |
| 4067 | // we could expose a knob that disables this check. But if so, we have |
| 4068 | // to be careful not to panic in other areas of the code (the cache |
| 4069 | // clearing and init code) that tend to assume some minimum useful |
| 4070 | // cache capacity. |
| 4071 | let min_cache = minimum_cache_capacity( |
| 4072 | &nfa, |
| 4073 | &classes, |
| 4074 | self.config.get_starts_for_each_pattern(), |
| 4075 | ); |
| 4076 | let mut cache_capacity = self.config.get_cache_capacity(); |
| 4077 | if cache_capacity < min_cache { |
| 4078 | // When the caller has asked us to skip the cache capacity check, |
| 4079 | // then we simply force the cache capacity to its minimum amount |
| 4080 | // and mush on. |
| 4081 | if self.config.get_skip_cache_capacity_check() { |
| 4082 | debug!( |
| 4083 | "given capacity ({}) is too small, \ |
| 4084 | since skip_cache_capacity_check is enabled, \ |
| 4085 | setting cache capacity to minimum ({})" , |
| 4086 | cache_capacity, min_cache, |
| 4087 | ); |
| 4088 | cache_capacity = min_cache; |
| 4089 | } else { |
| 4090 | return Err(BuildError::insufficient_cache_capacity( |
| 4091 | min_cache, |
| 4092 | cache_capacity, |
| 4093 | )); |
| 4094 | } |
| 4095 | } |
| 4096 | // We also need to check that we can fit at least some small number |
| 4097 | // of states in our state ID space. This is unlikely to trigger in |
| 4098 | // >=32-bit systems, but 16-bit systems have a pretty small state ID |
| 4099 | // space since a number of bits are used up as sentinels. |
| 4100 | if let Err(err) = minimum_lazy_state_id(&classes) { |
| 4101 | return Err(BuildError::insufficient_state_id_capacity(err)); |
| 4102 | } |
| 4103 | let stride2 = classes.stride2(); |
| 4104 | let start_map = StartByteMap::new(nfa.look_matcher()); |
| 4105 | Ok(DFA { |
| 4106 | config: self.config.clone(), |
| 4107 | nfa, |
| 4108 | stride2, |
| 4109 | start_map, |
| 4110 | classes, |
| 4111 | quitset, |
| 4112 | cache_capacity, |
| 4113 | }) |
| 4114 | } |
| 4115 | |
| 4116 | /// Apply the given lazy DFA configuration options to this builder. |
| 4117 | pub fn configure(&mut self, config: Config) -> &mut Builder { |
| 4118 | self.config = self.config.overwrite(config); |
| 4119 | self |
| 4120 | } |
| 4121 | |
| 4122 | /// Set the syntax configuration for this builder using |
| 4123 | /// [`syntax::Config`](crate::util::syntax::Config). |
| 4124 | /// |
| 4125 | /// This permits setting things like case insensitivity, Unicode and multi |
| 4126 | /// line mode. |
| 4127 | /// |
| 4128 | /// These settings only apply when constructing a lazy DFA directly from a |
| 4129 | /// pattern. |
| 4130 | #[cfg (feature = "syntax" )] |
| 4131 | pub fn syntax( |
| 4132 | &mut self, |
| 4133 | config: crate::util::syntax::Config, |
| 4134 | ) -> &mut Builder { |
| 4135 | self.thompson.syntax(config); |
| 4136 | self |
| 4137 | } |
| 4138 | |
| 4139 | /// Set the Thompson NFA configuration for this builder using |
| 4140 | /// [`nfa::thompson::Config`](crate::nfa::thompson::Config). |
| 4141 | /// |
| 4142 | /// This permits setting things like whether the DFA should match the regex |
| 4143 | /// in reverse or if additional time should be spent shrinking the size of |
| 4144 | /// the NFA. |
| 4145 | /// |
| 4146 | /// These settings only apply when constructing a DFA directly from a |
| 4147 | /// pattern. |
| 4148 | #[cfg (feature = "syntax" )] |
| 4149 | pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { |
| 4150 | self.thompson.configure(config); |
| 4151 | self |
| 4152 | } |
| 4153 | } |
| 4154 | |
/// Represents the current state of an overlapping search.
///
/// This is used for overlapping searches since they need to know something
/// about the previous search. For example, when multiple patterns match at the
/// same position, this state tracks the last reported pattern so that the next
/// search knows whether to report another matching pattern or continue with
/// the search at the next position. Additionally, it also tracks which state
/// the last search call terminated in.
///
/// This type provides little introspection capabilities. The only thing a
/// caller can do is construct it and pass it around to permit search routines
/// to use it to track state, and also ask whether a match has been found.
///
/// Callers should always provide a fresh state constructed via
/// [`OverlappingState::start`] when starting a new search. Reusing state from
/// a previous search may result in incorrect results.
#[derive (Clone, Debug, Eq, PartialEq)]
pub struct OverlappingState {
    /// The match reported by the most recent overlapping search to use this
    /// state.
    ///
    /// If a search does not find any matches, then it is expected to clear
    /// this value. (This is how `get_match` can correctly report `None`
    /// without any extra bookkeeping.)
    pub(crate) mat: Option<HalfMatch>,
    /// The state ID of the state at which the search was in when the call
    /// terminated. When this is a match state, `last_match` must be set to a
    /// non-None value.
    ///
    /// A `None` value indicates the start state of the corresponding
    /// automaton. We cannot use the actual ID, since any one automaton may
    /// have many start states, and which one is in use depends on several
    /// search-time factors.
    pub(crate) id: Option<LazyStateID>,
    /// The position of the search.
    ///
    /// When `id` is None (i.e., we are starting a search), this is set to
    /// the beginning of the search as given by the caller regardless of its
    /// current value. Subsequent calls to an overlapping search pick up at
    /// this offset.
    pub(crate) at: usize,
    /// The index into the matching patterns of the next match to report if the
    /// current state is a match state. Note that this may be 1 greater than
    /// the total number of matches to report for the current match state. (In
    /// which case, no more matches should be reported at the current position
    /// and the search should advance to the next position.)
    pub(crate) next_match_index: Option<usize>,
    /// This is set to true when a reverse overlapping search has entered its
    /// EOI transitions.
    ///
    /// This isn't used in a forward search because it knows to stop once the
    /// position exceeds the end of the search range. In a reverse search,
    /// since we use unsigned offsets, we don't "know" once we've gone past
    /// `0`. So the only way to detect it is with this extra flag. The reverse
    /// overlapping search knows to terminate specifically after it has
    /// reported all matches after following the EOI transition.
    pub(crate) rev_eoi: bool,
}
| 4212 | |
| 4213 | impl OverlappingState { |
| 4214 | /// Create a new overlapping state that begins at the start state of any |
| 4215 | /// automaton. |
| 4216 | pub fn start() -> OverlappingState { |
| 4217 | OverlappingState { |
| 4218 | mat: None, |
| 4219 | id: None, |
| 4220 | at: 0, |
| 4221 | next_match_index: None, |
| 4222 | rev_eoi: false, |
| 4223 | } |
| 4224 | } |
| 4225 | |
| 4226 | /// Return the match result of the most recent search to execute with this |
| 4227 | /// state. |
| 4228 | /// |
| 4229 | /// A searches will clear this result automatically, such that if no |
| 4230 | /// match is found, this will correctly report `None`. |
| 4231 | pub fn get_match(&self) -> Option<HalfMatch> { |
| 4232 | self.mat |
| 4233 | } |
| 4234 | } |
| 4235 | |
| 4236 | /// Runs the given overlapping `search` function (forwards or backwards) until |
| 4237 | /// a match is found whose offset does not split a codepoint. |
| 4238 | /// |
| 4239 | /// This is *not* always correct to call. It should only be called when the |
| 4240 | /// underlying NFA has UTF-8 mode enabled *and* it can produce zero-width |
| 4241 | /// matches. Calling this when both of those things aren't true might result |
| 4242 | /// in legitimate matches getting skipped. |
| 4243 | #[cold ] |
| 4244 | #[inline (never)] |
| 4245 | fn skip_empty_utf8_splits_overlapping<F>( |
| 4246 | input: &Input<'_>, |
| 4247 | state: &mut OverlappingState, |
| 4248 | mut search: F, |
| 4249 | ) -> Result<(), MatchError> |
| 4250 | where |
| 4251 | F: FnMut(&Input<'_>, &mut OverlappingState) -> Result<(), MatchError>, |
| 4252 | { |
| 4253 | // Note that this routine works for forwards and reverse searches |
| 4254 | // even though there's no code here to handle those cases. That's |
| 4255 | // because overlapping searches drive themselves to completion via |
| 4256 | // `OverlappingState`. So all we have to do is push it until no matches are |
| 4257 | // found. |
| 4258 | |
| 4259 | let mut hm = match state.get_match() { |
| 4260 | None => return Ok(()), |
| 4261 | Some(hm) => hm, |
| 4262 | }; |
| 4263 | if input.get_anchored().is_anchored() { |
| 4264 | if !input.is_char_boundary(hm.offset()) { |
| 4265 | state.mat = None; |
| 4266 | } |
| 4267 | return Ok(()); |
| 4268 | } |
| 4269 | while !input.is_char_boundary(hm.offset()) { |
| 4270 | search(input, state)?; |
| 4271 | hm = match state.get_match() { |
| 4272 | None => return Ok(()), |
| 4273 | Some(hm) => hm, |
| 4274 | }; |
| 4275 | } |
| 4276 | Ok(()) |
| 4277 | } |
| 4278 | |
| 4279 | /// Based on the minimum number of states required for a useful lazy DFA cache, |
| 4280 | /// this returns the minimum lazy state ID that must be representable. |
| 4281 | /// |
| 4282 | /// It's not likely for this to have any impact 32-bit systems (or higher), but |
| 4283 | /// on 16-bit systems, the lazy state ID space is quite constrained and thus |
| 4284 | /// may be insufficient if our MIN_STATES value is (for some reason) too high. |
| 4285 | fn minimum_lazy_state_id( |
| 4286 | classes: &ByteClasses, |
| 4287 | ) -> Result<LazyStateID, LazyStateIDError> { |
| 4288 | let stride: usize = 1 << classes.stride2(); |
| 4289 | let min_state_index: usize = MIN_STATES.checked_sub(1).unwrap(); |
| 4290 | LazyStateID::new(id:min_state_index * stride) |
| 4291 | } |
| 4292 | |
| 4293 | /// Based on the minimum number of states required for a useful lazy DFA cache, |
| 4294 | /// this returns a heuristic minimum number of bytes of heap space required. |
| 4295 | /// |
| 4296 | /// This is a "heuristic" because the minimum it returns is likely bigger than |
| 4297 | /// the true minimum. Namely, it assumes that each powerset NFA/DFA state uses |
| 4298 | /// the maximum number of NFA states (all of them). This is likely bigger |
| 4299 | /// than what is required in practice. Computing the true minimum effectively |
| 4300 | /// requires determinization, which is probably too much work to do for a |
| 4301 | /// simple check like this. |
| 4302 | /// |
| 4303 | /// One of the issues with this approach IMO is that it requires that this |
| 4304 | /// be in sync with the calculation above for computing how much heap memory |
| 4305 | /// the DFA cache uses. If we get it wrong, it's possible for example for the |
| 4306 | /// minimum to be smaller than the computed heap memory, and thus, it may be |
| 4307 | /// the case that we can't add the required minimum number of states. That in |
| 4308 | /// turn will make lazy DFA panic because we assume that we can add at least a |
| 4309 | /// minimum number of states. |
| 4310 | /// |
| 4311 | /// Another approach would be to always allow the minimum number of states to |
| 4312 | /// be added to the lazy DFA cache, even if it exceeds the configured cache |
| 4313 | /// limit. This does mean that the limit isn't really a limit in all cases, |
| 4314 | /// which is unfortunate. But it does at least guarantee that the lazy DFA can |
| 4315 | /// always make progress, even if it is slow. (This approach is very similar to |
| 4316 | /// enabling the 'skip_cache_capacity_check' config knob, except it wouldn't |
| 4317 | /// rely on cache size calculation. Instead, it would just always permit a |
| 4318 | /// minimum number of states to be added.) |
| 4319 | fn minimum_cache_capacity( |
| 4320 | nfa: &thompson::NFA, |
| 4321 | classes: &ByteClasses, |
| 4322 | starts_for_each_pattern: bool, |
| 4323 | ) -> usize { |
| 4324 | const ID_SIZE: usize = size_of::<LazyStateID>(); |
| 4325 | const STATE_SIZE: usize = size_of::<State>(); |
| 4326 | |
| 4327 | let stride = 1 << classes.stride2(); |
| 4328 | let states_len = nfa.states().len(); |
| 4329 | let sparses = 2 * states_len * NFAStateID::SIZE; |
| 4330 | let trans = MIN_STATES * stride * ID_SIZE; |
| 4331 | |
| 4332 | let mut starts = Start::len() * ID_SIZE; |
| 4333 | if starts_for_each_pattern { |
| 4334 | starts += (Start::len() * nfa.pattern_len()) * ID_SIZE; |
| 4335 | } |
| 4336 | |
| 4337 | // The min number of states HAS to be at least 4: we have 3 sentinel states |
| 4338 | // and then we need space for one more when we save a state after clearing |
| 4339 | // the cache. We also need space for one more, otherwise we get stuck in a |
| 4340 | // loop where we try to add a 5th state, which gets rejected, which clears |
| 4341 | // the cache, which adds back a saved state (4th total state) which then |
| 4342 | // tries to add the 5th state again. |
| 4343 | assert!(MIN_STATES >= 5, "minimum number of states has to be at least 5" ); |
| 4344 | // The minimum number of non-sentinel states. We consider this separately |
| 4345 | // because sentinel states are much smaller in that they contain no NFA |
| 4346 | // states. Given our aggressive calculation here, it's worth being more |
| 4347 | // precise with the number of states we need. |
| 4348 | let non_sentinel = MIN_STATES.checked_sub(SENTINEL_STATES).unwrap(); |
| 4349 | |
| 4350 | // Every `State` has 5 bytes for flags, 4 bytes (max) for the number of |
| 4351 | // patterns, followed by 32-bit encodings of patterns and then delta |
| 4352 | // varint encodings of NFA state IDs. We use the worst case (which isn't |
| 4353 | // technically possible) of 5 bytes for each NFA state ID. |
| 4354 | // |
| 4355 | // HOWEVER, three of the states needed by a lazy DFA are just the sentinel |
| 4356 | // unknown, dead and quit states. Those states have a known size and it is |
| 4357 | // small. |
| 4358 | let dead_state_size = State::dead().memory_usage(); |
| 4359 | let max_state_size = 5 + 4 + (nfa.pattern_len() * 4) + (states_len * 5); |
| 4360 | let states = (SENTINEL_STATES * (STATE_SIZE + dead_state_size)) |
| 4361 | + (non_sentinel * (STATE_SIZE + max_state_size)); |
| 4362 | // NOTE: We don't double count heap memory used by State for this map since |
| 4363 | // we use reference counting to avoid doubling memory usage. (This tends to |
| 4364 | // be where most memory is allocated in the cache.) |
| 4365 | let states_to_sid = (MIN_STATES * STATE_SIZE) + (MIN_STATES * ID_SIZE); |
| 4366 | let stack = states_len * NFAStateID::SIZE; |
| 4367 | let scratch_state_builder = max_state_size; |
| 4368 | |
| 4369 | trans |
| 4370 | + starts |
| 4371 | + states |
| 4372 | + states_to_sid |
| 4373 | + sparses |
| 4374 | + stack |
| 4375 | + scratch_state_builder |
| 4376 | } |
| 4377 | |
#[cfg(all(test, feature = "syntax"))]
mod tests {
    use super::*;

    // Tests that heuristic Unicode word boundary support is handled correctly
    // in reverse DFAs for the specific case of contextual searches.
    //
    // This test was written upon discovering a bug in how heuristic word
    // boundaries were handled: starting state selection didn't consult the
    // DFA's quit byte set when looking at the byte immediately before the
    // start of the search (or immediately after the end, for a reverse
    // search). As a result, '\bfoo\b' could match 'β123' because the trailing
    // \xB2 byte of the 'β' codepoint was treated as a non-word character,
    // when of course the DFA should quit upon seeing any non-ASCII byte.
    //
    // The fix was to make 'start_state_{forward,reverse}' check the quit byte
    // set when it is non-empty. The forward case is covered by the doc test
    // for the Config::unicode_word_boundary API; the reverse case is covered
    // here since it's too niche for a doc test.
    #[test]
    fn heuristic_unicode_reverse() {
        let dfa = DFA::builder()
            .configure(DFA::config().unicode_word_boundary(true))
            .thompson(thompson::Config::new().reverse(true))
            .build(r"\b[0-9]+\b")
            .unwrap();
        let mut cache = dfa.create_cache();

        // The search range starts in the middle of 'β' (\xCE\xB2), so the
        // byte just before the range (\xB2) must trigger a quit.
        let got = dfa
            .try_search_rev(&mut cache, &Input::new("β123").range(2..));
        assert_eq!(got, Err(MatchError::quit(0xB2, 1)));

        // Likewise, the byte just after the range end (\xCE) must trigger a
        // quit when scanning in reverse.
        let got = dfa
            .try_search_rev(&mut cache, &Input::new("123β").range(..3));
        assert_eq!(got, Err(MatchError::quit(0xCE, 3)));
    }
}
| 4419 | |