1 | /*! |
2 | An NFA backed Pike VM for executing regex searches with capturing groups. |
3 | |
4 | This module provides a [`PikeVM`] that works by simulating an NFA and |
5 | resolving all spans of capturing groups that participate in a match. |
6 | */ |
7 | |
8 | #[cfg(feature = "internal-instrument-pikevm")] |
9 | use core::cell::RefCell; |
10 | |
11 | use alloc::{vec, vec::Vec}; |
12 | |
13 | use crate::{ |
14 | nfa::thompson::{self, BuildError, State, NFA}, |
15 | util::{ |
16 | captures::Captures, |
17 | empty, iter, |
18 | prefilter::Prefilter, |
19 | primitives::{NonMaxUsize, PatternID, SmallIndex, StateID}, |
20 | search::{ |
21 | Anchored, HalfMatch, Input, Match, MatchKind, PatternSet, Span, |
22 | }, |
23 | sparse_set::SparseSet, |
24 | }, |
25 | }; |
26 | |
27 | /// A simple macro for conditionally executing instrumentation logic when |
28 | /// the 'trace' log level is enabled. This is a compile-time no-op when the |
29 | /// 'internal-instrument-pikevm' feature isn't enabled. The intent here is that |
30 | /// this makes it easier to avoid doing extra work when instrumentation isn't |
31 | /// enabled. |
32 | /// |
33 | /// This macro accepts a closure of type `|&mut Counters|`. The closure can |
34 | /// then increment counters (or whatever) in accordance with what one wants |
35 | /// to track. |
36 | macro_rules! instrument { |
37 | ($fun:expr) => { |
38 | #[cfg(feature = "internal-instrument-pikevm")] |
39 | { |
40 | let fun: &mut dyn FnMut(&mut Counters) = &mut $fun; |
41 | COUNTERS.with(|c: &RefCell<Counters>| fun(&mut *c.borrow_mut())); |
42 | } |
43 | }; |
44 | } |
45 | |
46 | #[cfg(feature = "internal-instrument-pikevm")] |
47 | std::thread_local! { |
48 | /// Effectively global state used to keep track of instrumentation |
49 | /// counters. The "proper" way to do this is to thread it through the |
50 | /// PikeVM, but it makes the code quite icky. Since this is just a |
51 | /// debugging feature, we're content to relegate it to thread local |
52 | /// state. When instrumentation is enabled, the counters are reset at the |
53 | /// beginning of every search and printed (with the 'trace' log level) at |
54 | /// the end of every search. |
55 | static COUNTERS: RefCell<Counters> = RefCell::new(Counters::empty()); |
56 | } |
57 | |
58 | /// The configuration used for building a [`PikeVM`]. |
59 | /// |
60 | /// A PikeVM configuration is a simple data object that is typically used with |
61 | /// [`Builder::configure`]. It can be cheaply cloned. |
62 | /// |
63 | /// A default configuration can be created either with `Config::new`, or |
64 | /// perhaps more conveniently, with [`PikeVM::config`]. |
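///
/// # Example
///
/// A brief sketch of threading a configuration through [`Builder::configure`],
/// using only APIs shown elsewhere in this module:
///
/// ```
/// use regex_automata::{nfa::thompson::pikevm::PikeVM, MatchKind};
///
/// let config = PikeVM::config().match_kind(MatchKind::All);
/// let re = PikeVM::builder().configure(config).build(r"[a-z]+")?;
/// let mut cache = re.create_cache();
/// assert!(re.is_match(&mut cache, "abc"));
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```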
65 | #[derive(Clone, Debug, Default)] |
66 | pub struct Config { |
67 | match_kind: Option<MatchKind>, |
68 | pre: Option<Option<Prefilter>>, |
69 | } |
70 | |
71 | impl Config { |
72 | /// Return a new default PikeVM configuration. |
73 | pub fn new() -> Config { |
74 | Config::default() |
75 | } |
76 | |
77 | /// Set the desired match semantics. |
78 | /// |
79 | /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the |
80 | /// match semantics of Perl-like regex engines. That is, when multiple |
81 | /// patterns would match at the same leftmost position, the pattern that |
82 | /// appears first in the concrete syntax is chosen. |
83 | /// |
84 | /// Currently, the only other kind of match semantics supported is |
85 | /// [`MatchKind::All`]. This corresponds to "classical DFA" construction |
86 | /// where all possible matches are visited in the NFA by the `PikeVM`. |
87 | /// |
88 | /// Typically, `All` is used when one wants to execute an overlapping |
89 | /// search and `LeftmostFirst` otherwise. In particular, it rarely makes |
90 | /// sense to use `All` with the various "leftmost" find routines, since the |
91 | /// leftmost routines depend on the `LeftmostFirst` automata construction |
92 | /// strategy. Specifically, `LeftmostFirst` results in the `PikeVM` |
93 | /// simulating dead states as a way to terminate the search and report a |
94 | /// match. `LeftmostFirst` also supports non-greedy matches using this |
95 | /// strategy, whereas `All` does not. |
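///
/// # Example
///
/// A short sketch pairing `All` with the overlapping search routine
/// [`PikeVM::which_overlapping_matches`] shown later in this module:
///
/// ```
/// use regex_automata::{
///     nfa::thompson::pikevm::PikeVM,
///     Input, MatchKind, PatternSet,
/// };
///
/// let re = PikeVM::builder()
///     .configure(PikeVM::config().match_kind(MatchKind::All))
///     .build_many(&["[a-z]+", "foo"])?;
/// let mut cache = re.create_cache();
///
/// let mut patset = PatternSet::new(re.pattern_len());
/// re.which_overlapping_matches(&mut cache, &Input::new("foo"), &mut patset);
/// // Both patterns match 'foo', and 'All' semantics report both of them.
/// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect();
/// assert_eq!(vec![0, 1], got);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```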
96 | pub fn match_kind(mut self, kind: MatchKind) -> Config { |
97 | self.match_kind = Some(kind); |
98 | self |
99 | } |
100 | |
101 | /// Set a prefilter to be used whenever a start state is entered. |
102 | /// |
103 | /// A [`Prefilter`] in this context is meant to accelerate searches by |
104 | /// looking for literal prefixes that every match for the corresponding |
105 | /// pattern (or patterns) must start with. Once a prefilter produces a |
106 | /// match, the underlying search routine continues on to try and confirm |
107 | /// the match. |
108 | /// |
109 | /// Be warned that setting a prefilter does not guarantee that the search |
110 | /// will be faster. While it's usually a good bet, if the prefilter |
111 | /// produces a lot of false positive candidates (i.e., positions matched |
112 | /// by the prefilter but not by the regex), then the overall result can |
113 | /// be slower than if you had just executed the regex engine without any |
114 | /// prefilters. |
115 | /// |
116 | /// By default no prefilter is set. |
117 | /// |
118 | /// # Example |
119 | /// |
120 | /// ``` |
121 | /// use regex_automata::{ |
122 | /// nfa::thompson::pikevm::PikeVM, |
123 | /// util::prefilter::Prefilter, |
124 | /// Input, Match, MatchKind, |
125 | /// }; |
126 | /// |
127 | /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "bar"]); |
128 | /// let re = PikeVM::builder() |
129 | /// .configure(PikeVM::config().prefilter(pre)) |
130 | /// .build(r"(foo|bar)[a-z]+")?; |
131 | /// let mut cache = re.create_cache(); |
132 | /// let input = Input::new("foo1 barfox bar"); |
133 | /// assert_eq!(Some(Match::must(0, 5..11)), re.find(&mut cache, input)); |
134 | /// |
135 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
136 | /// ``` |
137 | /// |
138 | /// Be warned though that an incorrect prefilter can lead to incorrect |
139 | /// results! |
140 | /// |
141 | /// ``` |
142 | /// use regex_automata::{ |
143 | /// nfa::thompson::pikevm::PikeVM, |
144 | /// util::prefilter::Prefilter, |
145 | /// Input, HalfMatch, MatchKind, |
146 | /// }; |
147 | /// |
148 | /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "car"]); |
149 | /// let re = PikeVM::builder() |
150 | /// .configure(PikeVM::config().prefilter(pre)) |
151 | /// .build(r"(foo|bar)[a-z]+")?; |
152 | /// let mut cache = re.create_cache(); |
153 | /// let input = Input::new("foo1 barfox bar"); |
154 | /// // No match reported even though there clearly is one! |
155 | /// assert_eq!(None, re.find(&mut cache, input)); |
156 | /// |
157 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
158 | /// ``` |
159 | pub fn prefilter(mut self, pre: Option<Prefilter>) -> Config { |
160 | self.pre = Some(pre); |
161 | self |
162 | } |
163 | |
164 | /// Returns the match semantics set in this configuration. |
165 | pub fn get_match_kind(&self) -> MatchKind { |
166 | self.match_kind.unwrap_or(MatchKind::LeftmostFirst) |
167 | } |
168 | |
169 | /// Returns the prefilter set in this configuration, if one is set. |
170 | pub fn get_prefilter(&self) -> Option<&Prefilter> { |
171 | self.pre.as_ref().unwrap_or(&None).as_ref() |
172 | } |
173 | |
174 | /// Overwrite the default configuration such that the options in `o` are |
175 | /// always used. If an option in `o` is not set, then the corresponding |
176 | /// option in `self` is used. If it's not set in `self` either, then it |
177 | /// remains not set. |
178 | pub(crate) fn overwrite(&self, o: Config) -> Config { |
179 | Config { |
180 | match_kind: o.match_kind.or(self.match_kind), |
181 | pre: o.pre.or_else(|| self.pre.clone()), |
182 | } |
183 | } |
184 | } |
185 | |
186 | /// A builder for a `PikeVM`. |
187 | /// |
188 | /// This builder permits configuring options for the syntax of a pattern, |
189 | /// the NFA construction and the `PikeVM` construction. This builder is |
190 | /// different from a general purpose regex builder in that it permits fine |
191 | /// grain configuration of the construction process. The trade off for this is |
192 | /// complexity, and the possibility of setting a configuration that might not |
193 | /// make sense. For example, there are two different UTF-8 modes: |
194 | /// |
195 | /// * [`util::syntax::Config::utf8`](crate::util::syntax::Config::utf8) |
196 | /// controls whether the pattern itself can contain sub-expressions that match |
197 | /// invalid UTF-8. |
198 | /// * [`thompson::Config::utf8`] controls whether empty matches that split a |
199 | /// Unicode codepoint are reported or not. |
200 | /// |
201 | /// Generally speaking, callers will want to either enable all of these or |
202 | /// disable all of these. |
203 | /// |
204 | /// # Example |
205 | /// |
206 | /// This example shows how to disable UTF-8 mode in the syntax and the regex |
207 | /// itself. This is generally what you want for matching on arbitrary bytes. |
208 | /// |
209 | /// ``` |
210 | /// use regex_automata::{ |
211 | /// nfa::thompson::{self, pikevm::PikeVM}, |
212 | /// util::syntax, |
213 | /// Match, |
214 | /// }; |
215 | /// |
216 | /// let re = PikeVM::builder() |
217 | /// .syntax(syntax::Config::new().utf8(false)) |
218 | /// .thompson(thompson::Config::new().utf8(false)) |
219 | /// .build(r"foo(?-u:[^b])ar.*" )?; |
220 | /// let mut cache = re.create_cache(); |
221 | /// |
222 | /// let haystack = b" \xFEfoo \xFFarzz \xE2\x98\xFF\n" ; |
223 | /// let expected = Some(Match::must(0, 1..9)); |
224 | /// let got = re.find_iter(&mut cache, haystack).next(); |
225 | /// assert_eq!(expected, got); |
226 | /// // Notice that `(?-u:[^b])` matches invalid UTF-8, |
227 | /// // but the subsequent `.*` does not! Disabling UTF-8 |
228 | /// // on the syntax permits this. |
229 | /// // |
230 | /// // N.B. This example does not show the impact of |
231 | /// // disabling UTF-8 mode on a PikeVM Config, since that |
232 | /// // only impacts regexes that can produce matches of |
233 | /// // length 0. |
234 | /// assert_eq!(b"foo \xFFarzz" , &haystack[got.unwrap().range()]); |
235 | /// |
236 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
237 | /// ``` |
238 | #[derive(Clone, Debug)] |
239 | pub struct Builder { |
240 | config: Config, |
241 | #[cfg(feature = "syntax")] |
242 | thompson: thompson::Compiler, |
243 | } |
244 | |
245 | impl Builder { |
246 | /// Create a new PikeVM builder with its default configuration. |
247 | pub fn new() -> Builder { |
248 | Builder { |
249 | config: Config::default(), |
250 | #[cfg(feature = "syntax")] |
251 | thompson: thompson::Compiler::new(), |
252 | } |
253 | } |
254 | |
255 | /// Build a `PikeVM` from the given pattern. |
256 | /// |
257 | /// If there was a problem parsing or compiling the pattern, then an error |
258 | /// is returned. |
259 | #[cfg(feature = "syntax")] |
260 | pub fn build(&self, pattern: &str) -> Result<PikeVM, BuildError> { |
261 | self.build_many(&[pattern]) |
262 | } |
263 | |
264 | /// Build a `PikeVM` from the given patterns. |
265 | #[cfg(feature = "syntax")] |
266 | pub fn build_many<P: AsRef<str>>( |
267 | &self, |
268 | patterns: &[P], |
269 | ) -> Result<PikeVM, BuildError> { |
270 | let nfa = self.thompson.build_many(patterns)?; |
271 | self.build_from_nfa(nfa) |
272 | } |
273 | |
274 | /// Build a `PikeVM` directly from its NFA. |
275 | /// |
276 | /// Note that when using this method, any configuration that applies to the |
277 | /// construction of the NFA itself will of course be ignored, since the NFA |
278 | /// given here is already built. |
279 | pub fn build_from_nfa(&self, nfa: NFA) -> Result<PikeVM, BuildError> { |
280 | nfa.look_set_any().available().map_err(BuildError::word)?; |
281 | Ok(PikeVM { config: self.config.clone(), nfa }) |
282 | } |
283 | |
284 | /// Apply the given `PikeVM` configuration options to this builder. |
285 | pub fn configure(&mut self, config: Config) -> &mut Builder { |
286 | self.config = self.config.overwrite(config); |
287 | self |
288 | } |
289 | |
290 | /// Set the syntax configuration for this builder using |
291 | /// [`syntax::Config`](crate::util::syntax::Config). |
292 | /// |
293 | /// This permits setting things like case insensitivity, Unicode and multi |
294 | /// line mode. |
295 | /// |
296 | /// These settings only apply when constructing a PikeVM directly from a |
297 | /// pattern. |
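///
/// # Example
///
/// A small sketch, assuming the `case_insensitive` knob on
/// [`syntax::Config`](crate::util::syntax::Config):
///
/// ```
/// use regex_automata::{nfa::thompson::pikevm::PikeVM, util::syntax, Match};
///
/// let re = PikeVM::builder()
///     .syntax(syntax::Config::new().case_insensitive(true))
///     .build(r"foo")?;
/// let mut cache = re.create_cache();
/// assert_eq!(Some(Match::must(0, 0..3)), re.find(&mut cache, "FOO"));
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```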
298 | #[cfg(feature = "syntax")] |
299 | pub fn syntax( |
300 | &mut self, |
301 | config: crate::util::syntax::Config, |
302 | ) -> &mut Builder { |
303 | self.thompson.syntax(config); |
304 | self |
305 | } |
306 | |
307 | /// Set the Thompson NFA configuration for this builder using |
308 | /// [`nfa::thompson::Config`](crate::nfa::thompson::Config). |
309 | /// |
310 | /// This permits setting things like if additional time should be spent |
311 | /// shrinking the size of the NFA. |
312 | /// |
313 | /// These settings only apply when constructing a PikeVM directly from a |
314 | /// pattern. |
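///
/// # Example
///
/// A small sketch, assuming the `shrink` knob on [`thompson::Config`]:
///
/// ```
/// use regex_automata::{nfa::thompson::{self, pikevm::PikeVM}, Match};
///
/// let re = PikeVM::builder()
///     .thompson(thompson::Config::new().shrink(true))
///     .build(r"[a-z]+")?;
/// let mut cache = re.create_cache();
/// assert_eq!(Some(Match::must(0, 0..3)), re.find(&mut cache, "abc"));
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```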
315 | #[cfg(feature = "syntax")] |
316 | pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { |
317 | self.thompson.configure(config); |
318 | self |
319 | } |
320 | } |
321 | |
322 | /// A virtual machine for executing regex searches with capturing groups. |
323 | /// |
324 | /// # Infallible APIs |
325 | /// |
326 | /// Unlike most other regex engines in this crate, a `PikeVM` never returns an |
327 | /// error at search time. It supports all [`Anchored`] configurations, never |
328 | /// quits and works on haystacks of arbitrary length. |
329 | /// |
330 | /// There are two caveats to mention though: |
331 | /// |
332 | /// * If an invalid pattern ID is given to a search via [`Anchored::Pattern`], |
333 | /// then the PikeVM will report "no match," as shown in the example below. |
334 | /// This is consistent with all other regex engines in this crate. |
335 | /// * When using [`PikeVM::which_overlapping_matches`] with a [`PatternSet`] |
336 | /// that has insufficient capacity to store all valid pattern IDs, then if a |
337 | /// match occurs for a `PatternID` that cannot be inserted, it is silently |
338 | /// dropped as if it did not match. |
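///
/// # Example: an invalid pattern ID reports no match
///
/// A brief sketch of the first caveat above:
///
/// ```
/// use regex_automata::{
///     nfa::thompson::pikevm::PikeVM,
///     Anchored, Input, PatternID,
/// };
///
/// let re = PikeVM::new(r"[a-z]+")?;
/// let mut cache = re.create_cache();
///
/// // This PikeVM has exactly one pattern (ID 0), so restricting the search
/// // to pattern ID 5 reports "no match" rather than an error.
/// let input = Input::new("abc")
///     .anchored(Anchored::Pattern(PatternID::must(5)));
/// assert_eq!(None, re.find(&mut cache, input));
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```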
339 | /// |
340 | /// # Advice |
341 | /// |
342 | /// The `PikeVM` is generally the most "powerful" regex engine in this crate. |
343 | /// "Powerful" in this context means that it can handle any regular expression |
344 | /// that is parseable by `regex-syntax` and haystacks of any size. Regrettably, |
345 | /// the `PikeVM` is also simultaneously often the _slowest_ regex engine in |
346 | /// practice. This results in an annoying situation where one generally tries |
347 | /// to pick any other regex engine (or perhaps none at all) before being |
348 | /// forced to fall back to a `PikeVM`. |
349 | /// |
350 | /// For example, a common strategy for dealing with capturing groups is to |
351 | /// actually look for the overall match of the regex using a faster regex |
352 | /// engine, like a [lazy DFA](crate::hybrid::regex::Regex). Once the overall |
353 | /// match is found, one can then run the `PikeVM` on just the match span to |
354 | /// find the spans of the capturing groups. In this way, the faster regex |
355 | /// engine does the majority of the work, while the `PikeVM` only lends its |
356 | /// power in a more limited role. |
357 | /// |
358 | /// Unfortunately, this isn't always possible because the faster regex engines |
359 | /// don't support all of the regex features in `regex-syntax`. This notably |
360 | /// includes (and is currently limited to) Unicode word boundaries. So if |
361 | /// your pattern has Unicode word boundaries, you typically can't use a |
362 | /// DFA-based regex engine at all (unless you [enable heuristic support for |
363 | /// it](crate::hybrid::dfa::Config::unicode_word_boundary)). (The [one-pass |
364 | /// DFA](crate::dfa::onepass::DFA) can handle Unicode word boundaries for |
365 | /// anchored searches only, but in a cruel sort of joke, many Unicode features |
366 | /// tend to result in making the regex _not_ one-pass.) |
367 | /// |
368 | /// # Example |
369 | /// |
370 | /// This example shows that the `PikeVM` implements Unicode word boundaries |
371 | /// correctly by default. |
372 | /// |
373 | /// ``` |
374 | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
375 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; |
376 | /// |
377 | /// let re = PikeVM::new(r"\b\w+\b" )?; |
378 | /// let mut cache = re.create_cache(); |
379 | /// |
380 | /// let mut it = re.find_iter(&mut cache, "Шерлок Холмс" ); |
381 | /// assert_eq!(Some(Match::must(0, 0..12)), it.next()); |
382 | /// assert_eq!(Some(Match::must(0, 13..23)), it.next()); |
383 | /// assert_eq!(None, it.next()); |
384 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
385 | /// ``` |
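///
/// # Example: resolving capture groups for a known match span
///
/// A sketch of the advice above: assume some faster engine has already found
/// the overall match span (here, `5..12`), and run an anchored `PikeVM`
/// search over just that span to resolve the capturing groups.
///
/// ```
/// use regex_automata::{
///     nfa::thompson::pikevm::PikeVM,
///     Anchored, Input, Span,
/// };
///
/// let re = PikeVM::new(r"([0-9]{4})-([0-9]{2})")?;
/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
///
/// let haystack = "date=2010-03, ok";
/// // Pretend 5..12 (the span of '2010-03') came from a faster engine.
/// let input = Input::new(haystack)
///     .span(5..12)
///     .anchored(Anchored::Yes);
/// re.search(&mut cache, &input, &mut caps);
/// assert!(caps.is_match());
/// assert_eq!(Some(Span::from(5..9)), caps.get_group(1));
/// assert_eq!(Some(Span::from(10..12)), caps.get_group(2));
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```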
386 | #[derive(Clone, Debug)] |
387 | pub struct PikeVM { |
388 | config: Config, |
389 | nfa: NFA, |
390 | } |
391 | |
392 | impl PikeVM { |
393 | /// Parse the given regular expression using the default configuration and |
394 | /// return the corresponding `PikeVM`. |
395 | /// |
396 | /// If you want a non-default configuration, then use the [`Builder`] to |
397 | /// set your own configuration. |
398 | /// |
399 | /// # Example |
400 | /// |
401 | /// ``` |
402 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; |
403 | /// |
404 | /// let re = PikeVM::new("foo[0-9]+bar" )?; |
405 | /// let mut cache = re.create_cache(); |
406 | /// assert_eq!( |
407 | /// Some(Match::must(0, 3..14)), |
408 | /// re.find_iter(&mut cache, "zzzfoo12345barzzz" ).next(), |
409 | /// ); |
410 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
411 | /// ``` |
412 | #[cfg(feature = "syntax")] |
413 | pub fn new(pattern: &str) -> Result<PikeVM, BuildError> { |
414 | PikeVM::builder().build(pattern) |
415 | } |
416 | |
417 | /// Like `new`, but parses multiple patterns into a single "multi regex." |
418 | /// This similarly uses the default regex configuration. |
419 | /// |
420 | /// # Example |
421 | /// |
422 | /// ``` |
423 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; |
424 | /// |
425 | /// let re = PikeVM::new_many(&["[a-z]+", "[0-9]+"])?; |
426 | /// let mut cache = re.create_cache(); |
427 | /// |
428 | /// let mut it = re.find_iter(&mut cache, "abc 1 foo 4567 0 quux"); |
429 | /// assert_eq!(Some(Match::must(0, 0..3)), it.next()); |
430 | /// assert_eq!(Some(Match::must(1, 4..5)), it.next()); |
431 | /// assert_eq!(Some(Match::must(0, 6..9)), it.next()); |
432 | /// assert_eq!(Some(Match::must(1, 10..14)), it.next()); |
433 | /// assert_eq!(Some(Match::must(1, 15..16)), it.next()); |
434 | /// assert_eq!(Some(Match::must(0, 17..21)), it.next()); |
435 | /// assert_eq!(None, it.next()); |
436 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
437 | /// ``` |
438 | #[cfg(feature = "syntax")] |
439 | pub fn new_many<P: AsRef<str>>( |
440 | patterns: &[P], |
441 | ) -> Result<PikeVM, BuildError> { |
442 | PikeVM::builder().build_many(patterns) |
443 | } |
444 | |
445 | /// Like `new`, but builds a PikeVM directly from an NFA. This is useful |
446 | /// if you already have an NFA, or even if you hand-assembled the NFA. |
447 | /// |
448 | /// # Example |
449 | /// |
450 | /// This shows how to hand assemble a regular expression via its HIR, |
451 | /// compile an NFA from it and build a PikeVM from the NFA. |
452 | /// |
453 | /// ``` |
454 | /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match}; |
455 | /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange}; |
456 | /// |
457 | /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![ |
458 | /// ClassBytesRange::new(b'0', b'9'), |
459 | /// ClassBytesRange::new(b'A', b'Z'), |
460 | /// ClassBytesRange::new(b'_', b'_'), |
461 | /// ClassBytesRange::new(b'a', b'z'), |
462 | /// ]))); |
463 | /// |
464 | /// let config = NFA::config().nfa_size_limit(Some(1_000)); |
465 | /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?; |
466 | /// |
467 | /// let re = PikeVM::new_from_nfa(nfa)?; |
468 | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
469 | /// let expected = Some(Match::must(0, 3..4)); |
470 | /// re.captures(&mut cache, "!@#A#@!", &mut caps); |
471 | /// assert_eq!(expected, caps.get_match()); |
472 | /// |
473 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
474 | /// ``` |
475 | pub fn new_from_nfa(nfa: NFA) -> Result<PikeVM, BuildError> { |
476 | PikeVM::builder().build_from_nfa(nfa) |
477 | } |
478 | |
479 | /// Create a new `PikeVM` that matches every input. |
480 | /// |
481 | /// # Example |
482 | /// |
483 | /// ``` |
484 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; |
485 | /// |
486 | /// let re = PikeVM::always_match()?; |
487 | /// let mut cache = re.create_cache(); |
488 | /// |
489 | /// let expected = Match::must(0, 0..0); |
490 | /// assert_eq!(Some(expected), re.find_iter(&mut cache, "").next()); |
491 | /// assert_eq!(Some(expected), re.find_iter(&mut cache, "foo").next()); |
492 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
493 | /// ``` |
494 | pub fn always_match() -> Result<PikeVM, BuildError> { |
495 | let nfa = thompson::NFA::always_match(); |
496 | PikeVM::new_from_nfa(nfa) |
497 | } |
498 | |
499 | /// Create a new `PikeVM` that never matches any input. |
500 | /// |
501 | /// # Example |
502 | /// |
503 | /// ``` |
504 | /// use regex_automata::nfa::thompson::pikevm::PikeVM; |
505 | /// |
506 | /// let re = PikeVM::never_match()?; |
507 | /// let mut cache = re.create_cache(); |
508 | /// |
509 | /// assert_eq!(None, re.find_iter(&mut cache, "").next()); |
510 | /// assert_eq!(None, re.find_iter(&mut cache, "foo").next()); |
511 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
512 | /// ``` |
513 | pub fn never_match() -> Result<PikeVM, BuildError> { |
514 | let nfa = thompson::NFA::never_match(); |
515 | PikeVM::new_from_nfa(nfa) |
516 | } |
517 | |
518 | /// Return a default configuration for a `PikeVM`. |
519 | /// |
520 | /// This is a convenience routine to avoid needing to import the `Config` |
521 | /// type when customizing the construction of a `PikeVM`. |
522 | /// |
523 | /// # Example |
524 | /// |
525 | /// This example shows how to disable UTF-8 mode. When UTF-8 mode is |
526 | /// disabled, zero-width matches that split a codepoint are allowed. |
527 | /// Otherwise they are never reported. |
528 | /// |
529 | /// In the code below, notice that `""` is permitted to match positions |
530 | /// that split the encoding of a codepoint. |
531 | /// |
532 | /// ``` |
533 | /// use regex_automata::{nfa::thompson::{self, pikevm::PikeVM}, Match}; |
534 | /// |
535 | /// let re = PikeVM::builder() |
536 | /// .thompson(thompson::Config::new().utf8(false)) |
537 | /// .build(r"" )?; |
538 | /// let mut cache = re.create_cache(); |
539 | /// |
540 | /// let haystack = "a☃z" ; |
541 | /// let mut it = re.find_iter(&mut cache, haystack); |
542 | /// assert_eq!(Some(Match::must(0, 0..0)), it.next()); |
543 | /// assert_eq!(Some(Match::must(0, 1..1)), it.next()); |
544 | /// assert_eq!(Some(Match::must(0, 2..2)), it.next()); |
545 | /// assert_eq!(Some(Match::must(0, 3..3)), it.next()); |
546 | /// assert_eq!(Some(Match::must(0, 4..4)), it.next()); |
547 | /// assert_eq!(Some(Match::must(0, 5..5)), it.next()); |
548 | /// assert_eq!(None, it.next()); |
549 | /// |
550 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
551 | /// ``` |
552 | pub fn config() -> Config { |
553 | Config::new() |
554 | } |
555 | |
556 | /// Return a builder for configuring the construction of a `PikeVM`. |
557 | /// |
558 | /// This is a convenience routine to avoid needing to import the |
559 | /// [`Builder`] type in common cases. |
560 | /// |
561 | /// # Example |
562 | /// |
563 | /// This example shows how to use the builder to disable UTF-8 mode |
564 | /// everywhere. |
565 | /// |
566 | /// ``` |
567 | /// use regex_automata::{ |
568 | /// nfa::thompson::{self, pikevm::PikeVM}, |
569 | /// util::syntax, |
570 | /// Match, |
571 | /// }; |
572 | /// |
573 | /// let re = PikeVM::builder() |
574 | /// .syntax(syntax::Config::new().utf8(false)) |
575 | /// .thompson(thompson::Config::new().utf8(false)) |
576 | /// .build(r"foo(?-u:[^b])ar.*" )?; |
577 | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
578 | /// |
579 | /// let haystack = b" \xFEfoo \xFFarzz \xE2\x98\xFF\n" ; |
580 | /// let expected = Some(Match::must(0, 1..9)); |
581 | /// re.captures(&mut cache, haystack, &mut caps); |
582 | /// assert_eq!(expected, caps.get_match()); |
583 | /// |
584 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
585 | /// ``` |
586 | pub fn builder() -> Builder { |
587 | Builder::new() |
588 | } |
589 | |
590 | /// Create a new empty set of capturing groups that is guaranteed to be |
591 | /// valid for the search APIs on this `PikeVM`. |
592 | /// |
593 | /// A `Captures` value created for a specific `PikeVM` cannot be used with |
594 | /// any other `PikeVM`. |
595 | /// |
596 | /// This is a convenience function for [`Captures::all`]. See the |
597 | /// [`Captures`] documentation for an explanation of its alternative |
598 | /// constructors that permit the `PikeVM` to do less work during a search, |
599 | /// and thus might make it faster. |
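///
/// # Example
///
/// A small sketch using the search APIs shown below:
///
/// ```
/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span};
///
/// let re = PikeVM::new(r"([a-z]+)([0-9]+)?")?;
/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
///
/// re.captures(&mut cache, "abc", &mut caps);
/// assert!(caps.is_match());
/// assert_eq!(Some(Span::from(0..3)), caps.get_group(1));
/// // Group 2 did not participate in the match, so its span is absent.
/// assert_eq!(None, caps.get_group(2));
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```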
600 | pub fn create_captures(&self) -> Captures { |
601 | Captures::all(self.get_nfa().group_info().clone()) |
602 | } |
603 | |
604 | /// Create a new cache for this `PikeVM`. |
605 | /// |
606 | /// The cache returned should only be used for searches for this |
607 | /// `PikeVM`. If you want to reuse the cache for another `PikeVM`, then |
608 | /// you must call [`Cache::reset`] with that `PikeVM` (or, equivalently, |
609 | /// [`PikeVM::reset_cache`]). |
610 | pub fn create_cache(&self) -> Cache { |
611 | Cache::new(self) |
612 | } |
613 | |
614 | /// Reset the given cache such that it can be used for searching with |
615 | /// this `PikeVM` (and only this `PikeVM`). |
616 | /// |
617 | /// A cache reset permits reusing memory already allocated in this cache |
618 | /// with a different `PikeVM`. |
619 | /// |
620 | /// # Example |
621 | /// |
622 | /// This shows how to re-purpose a cache for use with a different `PikeVM`. |
623 | /// |
624 | /// ``` |
625 | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
626 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; |
627 | /// |
628 | /// let re1 = PikeVM::new(r"\w")?; |
629 | /// let re2 = PikeVM::new(r"\W")?; |
630 | /// |
631 | /// let mut cache = re1.create_cache(); |
632 | /// assert_eq!( |
633 | /// Some(Match::must(0, 0..2)), |
634 | /// re1.find_iter(&mut cache, "Δ").next(), |
635 | /// ); |
636 | /// |
637 | /// // Using 'cache' with re2 is not allowed. It may result in panics or |
638 | /// // incorrect results. In order to re-purpose the cache, we must reset |
639 | /// // it with the PikeVM we'd like to use it with. |
640 | /// // |
641 | /// // Similarly, after this reset, using the cache with 're1' is also not |
642 | /// // allowed. |
643 | /// re2.reset_cache(&mut cache); |
644 | /// assert_eq!( |
645 | /// Some(Match::must(0, 0..3)), |
646 | /// re2.find_iter(&mut cache, "☃").next(), |
647 | /// ); |
648 | /// |
649 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
650 | /// ``` |
651 | pub fn reset_cache(&self, cache: &mut Cache) { |
652 | cache.reset(self); |
653 | } |
654 | |
655 | /// Returns the total number of patterns compiled into this `PikeVM`. |
656 | /// |
657 | /// In the case of a `PikeVM` that contains no patterns, this returns `0`. |
658 | /// |
659 | /// # Example |
660 | /// |
661 | /// This example shows the pattern length for a `PikeVM` that never |
662 | /// matches: |
663 | /// |
664 | /// ``` |
665 | /// use regex_automata::nfa::thompson::pikevm::PikeVM; |
666 | /// |
667 | /// let re = PikeVM::never_match()?; |
668 | /// assert_eq!(re.pattern_len(), 0); |
669 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
670 | /// ``` |
671 | /// |
672 | /// And another example for a `PikeVM` that matches at every position: |
673 | /// |
674 | /// ``` |
675 | /// use regex_automata::nfa::thompson::pikevm::PikeVM; |
676 | /// |
677 | /// let re = PikeVM::always_match()?; |
678 | /// assert_eq!(re.pattern_len(), 1); |
679 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
680 | /// ``` |
681 | /// |
682 | /// And finally, a `PikeVM` that was constructed from multiple patterns: |
683 | /// |
684 | /// ``` |
685 | /// use regex_automata::nfa::thompson::pikevm::PikeVM; |
686 | /// |
687 | /// let re = PikeVM::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?; |
688 | /// assert_eq!(re.pattern_len(), 3); |
689 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
690 | /// ``` |
691 | pub fn pattern_len(&self) -> usize { |
692 | self.nfa.pattern_len() |
693 | } |
694 | |
695 | /// Return the config for this `PikeVM`. |
696 | #[inline] |
697 | pub fn get_config(&self) -> &Config { |
698 | &self.config |
699 | } |
700 | |
701 | /// Returns a reference to the underlying NFA. |
702 | #[inline] |
703 | pub fn get_nfa(&self) -> &NFA { |
704 | &self.nfa |
705 | } |
706 | } |
707 | |
708 | impl PikeVM { |
709 | /// Returns true if and only if this `PikeVM` matches the given haystack. |
710 | /// |
711 | /// This routine may short circuit if it knows that scanning future |
712 | /// input will never lead to a different result. In particular, if the |
713 | /// underlying NFA enters a match state, then this routine will return |
714 | /// `true` immediately without inspecting any future input. (Consider how |
715 | /// this might make a difference given the regex `a+` on the haystack |
716 | /// `aaaaaaaaaaaaaaa`. This routine can stop after it sees the first `a`, |
717 | /// but routines like `find` need to continue searching because `+` is |
718 | /// greedy by default.) |
719 | /// |
720 | /// # Example |
721 | /// |
722 | /// This shows basic usage: |
723 | /// |
724 | /// ``` |
725 | /// use regex_automata::nfa::thompson::pikevm::PikeVM; |
726 | /// |
727 | /// let re = PikeVM::new("foo[0-9]+bar" )?; |
728 | /// let mut cache = re.create_cache(); |
729 | /// |
730 | /// assert!(re.is_match(&mut cache, "foo12345bar" )); |
731 | /// assert!(!re.is_match(&mut cache, "foobar" )); |
732 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
733 | /// ``` |
734 | /// |
735 | /// # Example: consistency with search APIs |
736 | /// |
737 | /// `is_match` is guaranteed to return `true` whenever `find` returns a |
738 | /// match. This includes searches that are executed entirely within a |
739 | /// codepoint: |
740 | /// |
741 | /// ``` |
742 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Input}; |
743 | /// |
744 | /// let re = PikeVM::new("a*" )?; |
745 | /// let mut cache = re.create_cache(); |
746 | /// |
747 | /// assert!(!re.is_match(&mut cache, Input::new("☃" ).span(1..2))); |
748 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
749 | /// ``` |
750 | /// |
751 | /// Notice that when UTF-8 mode is disabled, then the above reports a |
752 | /// match because the restriction against zero-width matches that split a |
753 | /// codepoint has been lifted: |
754 | /// |
755 | /// ``` |
756 | /// use regex_automata::{nfa::thompson::{pikevm::PikeVM, NFA}, Input}; |
757 | /// |
758 | /// let re = PikeVM::builder() |
759 | /// .thompson(NFA::config().utf8(false)) |
760 | /// .build("a*" )?; |
761 | /// let mut cache = re.create_cache(); |
762 | /// |
763 | /// assert!(re.is_match(&mut cache, Input::new("☃" ).span(1..2))); |
764 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
765 | /// ``` |
766 | #[inline] |
767 | pub fn is_match<'h, I: Into<Input<'h>>>( |
768 | &self, |
769 | cache: &mut Cache, |
770 | input: I, |
771 | ) -> bool { |
772 | let input = input.into().earliest(true); |
773 | self.search_slots(cache, &input, &mut []).is_some() |
774 | } |
775 | |
776 | /// Executes a leftmost forward search and returns a `Match` if one exists. |
777 | /// |
778 | /// This routine only includes the overall match span. To get access to the |
779 | /// individual spans of each capturing group, use [`PikeVM::captures`]. |
780 | /// |
781 | /// # Example |
782 | /// |
783 | /// Leftmost first match semantics corresponds to the match with the |
784 | /// smallest starting offset, but where the end offset is determined by |
785 | /// preferring earlier branches in the original regular expression. For |
786 | /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam` |
787 | /// will match `Samwise` in `Samwise`. |
788 | /// |
789 | /// Generally speaking, the "leftmost first" match is how most backtracking |
790 | /// regular expressions tend to work. This is in contrast to POSIX-style |
791 | /// regular expressions that yield "leftmost longest" matches. Namely, |
792 | /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using |
793 | /// leftmost longest semantics. (This crate does not currently support |
794 | /// leftmost longest semantics.) |
795 | /// |
796 | /// ``` |
797 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; |
798 | /// |
799 | /// let re = PikeVM::new("foo[0-9]+" )?; |
800 | /// let mut cache = re.create_cache(); |
801 | /// let expected = Match::must(0, 0..8); |
802 | /// assert_eq!(Some(expected), re.find(&mut cache, "foo12345" )); |
803 | /// |
804 | /// // Even though a match is found after reading the first byte (`a`), |
805 | /// // the leftmost first match semantics demand that we find the earliest |
806 | /// // match that prefers earlier parts of the pattern over later parts. |
807 | /// let re = PikeVM::new("abc|a" )?; |
808 | /// let mut cache = re.create_cache(); |
809 | /// let expected = Match::must(0, 0..3); |
810 | /// assert_eq!(Some(expected), re.find(&mut cache, "abc" )); |
811 | /// |
812 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
813 | /// ``` |
814 | #[inline] |
815 | pub fn find<'h, I: Into<Input<'h>>>( |
816 | &self, |
817 | cache: &mut Cache, |
818 | input: I, |
819 | ) -> Option<Match> { |
820 | let input = input.into(); |
821 | if self.get_nfa().pattern_len() == 1 { |
822 | let mut slots = [None, None]; |
823 | let pid = self.search_slots(cache, &input, &mut slots)?; |
824 | let start = slots[0]?.get(); |
825 | let end = slots[1]?.get(); |
826 | return Some(Match::new(pid, Span { start, end })); |
827 | } |
828 | let ginfo = self.get_nfa().group_info(); |
829 | let slots_len = ginfo.implicit_slot_len(); |
830 | let mut slots = vec![None; slots_len]; |
831 | let pid = self.search_slots(cache, &input, &mut slots)?; |
832 | let start = slots[pid.as_usize() * 2]?.get(); |
833 | let end = slots[pid.as_usize() * 2 + 1]?.get(); |
834 | Some(Match::new(pid, Span { start, end })) |
835 | } |
836 | |
837 | /// Executes a leftmost forward search and writes the spans of capturing |
838 | /// groups that participated in a match into the provided [`Captures`] |
839 | /// value. If no match was found, then [`Captures::is_match`] is guaranteed |
840 | /// to return `false`. |
841 | /// |
842 | /// # Example |
843 | /// |
844 | /// ``` |
845 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span}; |
846 | /// |
847 | /// let re = PikeVM::new(r"^([0-9]{4})-([0-9]{2})-([0-9]{2})$" )?; |
848 | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
849 | /// |
850 | /// re.captures(&mut cache, "2010-03-14" , &mut caps); |
851 | /// assert!(caps.is_match()); |
852 | /// assert_eq!(Some(Span::from(0..4)), caps.get_group(1)); |
853 | /// assert_eq!(Some(Span::from(5..7)), caps.get_group(2)); |
854 | /// assert_eq!(Some(Span::from(8..10)), caps.get_group(3)); |
855 | /// |
856 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
857 | /// ``` |
858 | #[inline] |
859 | pub fn captures<'h, I: Into<Input<'h>>>( |
860 | &self, |
861 | cache: &mut Cache, |
862 | input: I, |
863 | caps: &mut Captures, |
864 | ) { |
865 | self.search(cache, &input.into(), caps) |
866 | } |
867 | |
868 | /// Returns an iterator over all non-overlapping leftmost matches in the |
869 | /// given bytes. If no match exists, then the iterator yields no elements. |
870 | /// |
871 | /// # Example |
872 | /// |
873 | /// ``` |
874 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; |
875 | /// |
876 | /// let re = PikeVM::new("foo[0-9]+" )?; |
877 | /// let mut cache = re.create_cache(); |
878 | /// |
879 | /// let text = "foo1 foo12 foo123" ; |
880 | /// let matches: Vec<Match> = re.find_iter(&mut cache, text).collect(); |
881 | /// assert_eq!(matches, vec![ |
882 | /// Match::must(0, 0..4), |
883 | /// Match::must(0, 5..10), |
884 | /// Match::must(0, 11..17), |
885 | /// ]); |
886 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
887 | /// ``` |
888 | #[inline] |
889 | pub fn find_iter<'r, 'c, 'h, I: Into<Input<'h>>>( |
890 | &'r self, |
891 | cache: &'c mut Cache, |
892 | input: I, |
893 | ) -> FindMatches<'r, 'c, 'h> { |
894 | let caps = Captures::matches(self.get_nfa().group_info().clone()); |
895 | let it = iter::Searcher::new(input.into()); |
896 | FindMatches { re: self, cache, caps, it } |
897 | } |
898 | |
899 | /// Returns an iterator over all non-overlapping `Captures` values. If no |
900 | /// match exists, then the iterator yields no elements. |
901 | /// |
902 | /// This yields the same matches as [`PikeVM::find_iter`], but it includes |
903 | /// the spans of all capturing groups that participate in each match. |
904 | /// |
905 | /// **Tip:** See [`util::iter::Searcher`](crate::util::iter::Searcher) for |
906 | /// how to correctly iterate over all matches in a haystack while avoiding |
907 | /// the creation of a new `Captures` value for every match. (Which you are |
908 | /// forced to do with an `Iterator`.) |
909 | /// |
910 | /// # Example |
911 | /// |
912 | /// ``` |
913 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span}; |
914 | /// |
915 | /// let re = PikeVM::new("foo(?P<numbers>[0-9]+)" )?; |
916 | /// let mut cache = re.create_cache(); |
917 | /// |
918 | /// let text = "foo1 foo12 foo123" ; |
919 | /// let matches: Vec<Span> = re |
920 | /// .captures_iter(&mut cache, text) |
921 | /// // The unwrap is OK since 'numbers' matches if the pattern matches. |
922 | /// .map(|caps| caps.get_group_by_name("numbers" ).unwrap()) |
923 | /// .collect(); |
924 | /// assert_eq!(matches, vec![ |
925 | /// Span::from(3..4), |
926 | /// Span::from(8..10), |
927 | /// Span::from(14..17), |
928 | /// ]); |
929 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
930 | /// ``` |
931 | #[inline] |
932 | pub fn captures_iter<'r, 'c, 'h, I: Into<Input<'h>>>( |
933 | &'r self, |
934 | cache: &'c mut Cache, |
935 | input: I, |
936 | ) -> CapturesMatches<'r, 'c, 'h> { |
937 | let caps = self.create_captures(); |
938 | let it = iter::Searcher::new(input.into()); |
939 | CapturesMatches { re: self, cache, caps, it } |
940 | } |
941 | } |
942 | |
943 | impl PikeVM { |
944 | /// Executes a leftmost forward search and writes the spans of capturing |
945 | /// groups that participated in a match into the provided [`Captures`] |
946 | /// value. If no match was found, then [`Captures::is_match`] is guaranteed |
947 | /// to return `false`. |
948 | /// |
949 | /// This is like [`PikeVM::captures`], but it accepts a concrete `&Input` |
950 | /// instead of an `Into<Input>`. |
951 | /// |
952 | /// # Example: specific pattern search |
953 | /// |
954 | /// This example shows how to build a multi-PikeVM that permits searching |
955 | /// for specific patterns. |
956 | /// |
957 | /// ``` |
958 | /// use regex_automata::{ |
959 | /// nfa::thompson::pikevm::PikeVM, |
960 | /// Anchored, Match, PatternID, Input, |
961 | /// }; |
962 | /// |
963 | /// let re = PikeVM::new_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?; |
964 | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
965 | /// let haystack = "foo123"; |
966 | /// |
967 | /// // Since we are using the default leftmost-first match and both |
968 | /// // patterns match at the same starting position, only the first pattern |
969 | /// // will be returned in this case when doing a search for any of the |
970 | /// // patterns. |
971 | /// let expected = Some(Match::must(0, 0..6)); |
972 | /// re.search(&mut cache, &Input::new(haystack), &mut caps); |
973 | /// assert_eq!(expected, caps.get_match()); |
974 | /// |
975 | /// // But if we want to check whether some other pattern matches, then we |
976 | /// // can provide its pattern ID. |
977 | /// let expected = Some(Match::must(1, 0..6)); |
978 | /// let input = Input::new(haystack) |
979 | /// .anchored(Anchored::Pattern(PatternID::must(1))); |
980 | /// re.search(&mut cache, &input, &mut caps); |
981 | /// assert_eq!(expected, caps.get_match()); |
982 | /// |
983 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
984 | /// ``` |
985 | /// |
986 | /// # Example: specifying the bounds of a search |
987 | /// |
988 | /// This example shows how providing the bounds of a search can produce |
989 | /// different results than simply sub-slicing the haystack. |
990 | /// |
991 | /// ``` |
992 | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
993 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match, Input}; |
994 | /// |
995 | /// let re = PikeVM::new(r"\b[0-9]{3}\b" )?; |
996 | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
997 | /// let haystack = "foo123bar" ; |
998 | /// |
999 | /// // Since we sub-slice the haystack, the search doesn't know about |
1000 | /// // the larger context and assumes that `123` is surrounded by word |
1001 | /// // boundaries. And of course, the match position is reported relative |
1002 | /// // to the sub-slice as well, which means we get `0..3` instead of |
1003 | /// // `3..6`. |
1004 | /// let expected = Some(Match::must(0, 0..3)); |
1005 | /// re.search(&mut cache, &Input::new(&haystack[3..6]), &mut caps); |
1006 | /// assert_eq!(expected, caps.get_match()); |
1007 | /// |
1008 | /// // But if we provide the bounds of the search within the context of the |
1009 | /// // entire haystack, then the search can take the surrounding context |
1010 | /// // into account. (And if we did find a match, it would be reported |
1011 | /// // as a valid offset into `haystack` instead of its sub-slice.) |
1012 | /// let expected = None; |
1013 | /// let input = Input::new(haystack).range(3..6); |
1014 | /// re.search(&mut cache, &input, &mut caps); |
1015 | /// assert_eq!(expected, caps.get_match()); |
1016 | /// |
1017 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1018 | /// ``` |
1019 | #[inline] |
1020 | pub fn search( |
1021 | &self, |
1022 | cache: &mut Cache, |
1023 | input: &Input<'_>, |
1024 | caps: &mut Captures, |
1025 | ) { |
1026 | caps.set_pattern(None); |
1027 | let pid = self.search_slots(cache, input, caps.slots_mut()); |
1028 | caps.set_pattern(pid); |
1029 | } |
1030 | |
1031 | /// Executes a leftmost forward search and writes the spans of capturing |
1032 | /// groups that participated in a match into the provided `slots`, and |
1033 | /// returns the matching pattern ID. The contents of the slots for patterns |
1034 | /// other than the matching pattern are unspecified. If no match was found, |
1035 | /// then `None` is returned and the contents of `slots` are unspecified. |
1036 | /// |
1037 | /// This is like [`PikeVM::search`], but it accepts a raw slots slice |
1038 | /// instead of a `Captures` value. This is useful in contexts where you |
1039 | /// don't want or need to allocate a `Captures`. |
1040 | /// |
1041 | /// It is legal to pass _any_ number of slots to this routine. If the regex |
1042 | /// engine would otherwise write a slot offset that doesn't fit in the |
1043 | /// provided slice, then it is simply skipped. In general though, there are |
1044 | /// usually three slice lengths you might want to use: |
1045 | /// |
1046 | /// * An empty slice, if you only care about which pattern matched. |
1047 | /// * A slice with |
1048 | /// [`pattern_len() * 2`](crate::nfa::thompson::NFA::pattern_len) |
1049 | /// slots, if you only care about the overall match spans for each matching |
1050 | /// pattern. |
1051 | /// * A slice with |
1052 | /// [`slot_len()`](crate::util::captures::GroupInfo::slot_len) slots, which |
1053 | /// permits recording match offsets for every capturing group in every |
1054 | /// pattern. |
1055 | /// |
1056 | /// # Example |
1057 | /// |
1058 | /// This example shows how to find the overall match offsets in a |
1059 | /// multi-pattern search without allocating a `Captures` value. Indeed, we |
1060 | /// can put our slots right on the stack. |
1061 | /// |
1062 | /// ``` |
1063 | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
1064 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID, Input}; |
1065 | /// |
1066 | /// let re = PikeVM::new_many(&[ |
1067 | /// r"\pL+" , |
1068 | /// r"\d+" , |
1069 | /// ])?; |
1070 | /// let mut cache = re.create_cache(); |
1071 | /// let input = Input::new("!@#123" ); |
1072 | /// |
1073 | /// // We only care about the overall match offsets here, so we just |
1074 | /// // allocate two slots for each pattern. Each slot records the start |
1075 | /// // and end of the match. |
1076 | /// let mut slots = [None; 4]; |
1077 | /// let pid = re.search_slots(&mut cache, &input, &mut slots); |
1078 | /// assert_eq!(Some(PatternID::must(1)), pid); |
1079 | /// |
1080 | /// // The overall match offsets are always at 'pid * 2' and 'pid * 2 + 1'. |
1081 | /// // See 'GroupInfo' for more details on the mapping between groups and |
1082 | /// // slot indices. |
1083 | /// let slot_start = pid.unwrap().as_usize() * 2; |
1084 | /// let slot_end = slot_start + 1; |
1085 | /// assert_eq!(Some(3), slots[slot_start].map(|s| s.get())); |
1086 | /// assert_eq!(Some(6), slots[slot_end].map(|s| s.get())); |
1087 | /// |
1088 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1089 | /// ``` |
1090 | #[inline] |
1091 | pub fn search_slots( |
1092 | &self, |
1093 | cache: &mut Cache, |
1094 | input: &Input<'_>, |
1095 | slots: &mut [Option<NonMaxUsize>], |
1096 | ) -> Option<PatternID> { |
1097 | let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); |
1098 | if !utf8empty { |
1099 | let hm = self.search_slots_imp(cache, input, slots)?; |
1100 | return Some(hm.pattern()); |
1101 | } |
1102 | // There is an unfortunate special case where if the regex can |
1103 | // match the empty string and UTF-8 mode is enabled, the search |
1104 | // implementation requires that the slots have at least enough space |
1105 | // to report the bounds of any match. This is so zero-width matches |
1106 | // that split a codepoint can be filtered out. |
1107 | // |
1108 | // Note that if utf8empty is true, we specialize the case for when |
1109 | // the number of patterns is 1. In that case, we can just use a stack |
1110 | // allocation. Otherwise we resort to a heap allocation, which we |
1111 | // convince ourselves we're fine with due to the pathological nature of |
1112 | // this case. |
1113 | let min = self.get_nfa().group_info().implicit_slot_len(); |
1114 | if slots.len() >= min { |
1115 | let hm = self.search_slots_imp(cache, input, slots)?; |
1116 | return Some(hm.pattern()); |
1117 | } |
1118 | if self.get_nfa().pattern_len() == 1 { |
1119 | let mut enough = [None, None]; |
1120 | let got = self.search_slots_imp(cache, input, &mut enough); |
1121 | // This is OK because we know `enough` is strictly bigger than |
1122 | // `slots`, otherwise this special case isn't reached. |
1123 | slots.copy_from_slice(&enough[..slots.len()]); |
1124 | return got.map(|hm| hm.pattern()); |
1125 | } |
1126 | let mut enough = vec![None; min]; |
1127 | let got = self.search_slots_imp(cache, input, &mut enough); |
1128 | // This is OK because we know `enough` is strictly bigger than `slots`, |
1129 | // otherwise this special case isn't reached. |
1130 | slots.copy_from_slice(&enough[..slots.len()]); |
1131 | got.map(|hm| hm.pattern()) |
1132 | } |
1133 | |
1134 | /// This is the actual implementation of `search_slots` that |
1135 | /// doesn't account for the special case when 1) the NFA has UTF-8 mode |
1136 | /// enabled, 2) the NFA can match the empty string and 3) the caller has |
1137 | /// provided an insufficient number of slots to record match offsets. |
1138 | #[inline(never)] |
1139 | fn search_slots_imp( |
1140 | &self, |
1141 | cache: &mut Cache, |
1142 | input: &Input<'_>, |
1143 | slots: &mut [Option<NonMaxUsize>], |
1144 | ) -> Option<HalfMatch> { |
1145 | let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); |
1146 | let hm = match self.search_imp(cache, input, slots) { |
1147 | None => return None, |
1148 | Some(hm) if !utf8empty => return Some(hm), |
1149 | Some(hm) => hm, |
1150 | }; |
1151 | empty::skip_splits_fwd(input, hm, hm.offset(), |input| { |
1152 | Ok(self |
1153 | .search_imp(cache, input, slots) |
1154 | .map(|hm| (hm, hm.offset()))) |
1155 | }) |
1156 | // OK because the PikeVM never errors. |
1157 | .unwrap() |
1158 | } |
1159 | |
1160 | /// Writes the set of patterns that match anywhere in the given search |
1161 | /// configuration to `patset`. If multiple patterns match at the same |
1162 | /// position and this `PikeVM` was configured with [`MatchKind::All`] |
1163 | /// semantics, then all matching patterns are written to the given set. |
1164 | /// |
1165 | /// Unless all of the patterns in this `PikeVM` are anchored, then |
1166 | /// generally speaking, this will visit every byte in the haystack. |
1167 | /// |
1168 | /// This search routine *does not* clear the pattern set. This gives some |
1169 | /// flexibility to the caller (e.g., running multiple searches with the |
1170 | /// same pattern set), but does make the API bug-prone if you're reusing |
1171 | /// the same pattern set for multiple searches but intended them to be |
1172 | /// independent. |
1173 | /// |
1174 | /// If a pattern ID matched but the given `PatternSet` does not have |
1175 | /// sufficient capacity to store it, then it is not inserted and silently |
1176 | /// dropped. |
1177 | /// |
1178 | /// # Example |
1179 | /// |
1180 | /// This example shows how to find all matching patterns in a haystack, |
1181 | /// even when some patterns match at the same position as other patterns. |
1182 | /// |
1183 | /// ``` |
1184 | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
1185 | /// use regex_automata::{ |
1186 | /// nfa::thompson::pikevm::PikeVM, |
1187 | /// Input, MatchKind, PatternSet, |
1188 | /// }; |
1189 | /// |
1190 | /// let patterns = &[ |
1191 | /// r"\w+" , r"\d+" , r"\pL+" , r"foo" , r"bar" , r"barfoo" , r"foobar" , |
1192 | /// ]; |
1193 | /// let re = PikeVM::builder() |
1194 | /// .configure(PikeVM::config().match_kind(MatchKind::All)) |
1195 | /// .build_many(patterns)?; |
1196 | /// let mut cache = re.create_cache(); |
1197 | /// |
1198 | /// let input = Input::new("foobar"); |
1199 | /// let mut patset = PatternSet::new(re.pattern_len()); |
1200 | /// re.which_overlapping_matches(&mut cache, &input, &mut patset); |
1201 | /// let expected = vec![0, 2, 3, 4, 6]; |
1202 | /// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect(); |
1203 | /// assert_eq!(expected, got); |
1204 | /// |
1205 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1206 | /// ``` |
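///
/// # Example: the pattern set is not cleared
///
/// A short sketch of the note above: successive searches accumulate results
/// into the same `PatternSet` unless the caller clears it.
///
/// ```
/// use regex_automata::{
///     nfa::thompson::pikevm::PikeVM,
///     Input, MatchKind, PatternSet,
/// };
///
/// let re = PikeVM::builder()
///     .configure(PikeVM::config().match_kind(MatchKind::All))
///     .build_many(&[r"foo", r"bar"])?;
/// let mut cache = re.create_cache();
/// let mut patset = PatternSet::new(re.pattern_len());
///
/// re.which_overlapping_matches(&mut cache, &Input::new("foo"), &mut patset);
/// re.which_overlapping_matches(&mut cache, &Input::new("bar"), &mut patset);
/// // Both searches wrote into the same set.
/// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect();
/// assert_eq!(vec![0, 1], got);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```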
1207 | #[inline] |
1208 | pub fn which_overlapping_matches( |
1209 | &self, |
1210 | cache: &mut Cache, |
1211 | input: &Input<'_>, |
1212 | patset: &mut PatternSet, |
1213 | ) { |
1214 | self.which_overlapping_imp(cache, input, patset) |
1215 | } |
1216 | } |
1217 | |
1218 | impl PikeVM { |
1219 | /// The implementation of standard leftmost search. |
1220 | /// |
1221 | /// Capturing group spans are written to `slots`, but only if requested. |
1222 | /// `slots` can be any length. Any slot in the NFA that is activated but |
1223 | /// which is out of bounds for the given `slots` is ignored. |
1224 | fn search_imp( |
1225 | &self, |
1226 | cache: &mut Cache, |
1227 | input: &Input<'_>, |
1228 | slots: &mut [Option<NonMaxUsize>], |
1229 | ) -> Option<HalfMatch> { |
1230 | cache.setup_search(slots.len()); |
1231 | if input.is_done() { |
1232 | return None; |
1233 | } |
1234 | // Why do we even care about this? Well, in our 'Captures' |
1235 | // representation, we use usize::MAX as a sentinel to indicate "no |
1236 | // match." This isn't problematic so long as our haystack doesn't have |
1237 | // a maximal length. Byte slices are guaranteed by Rust to have a |
1238 | // length that fits into isize, and so this assert should always pass. |
1239 | // But we put it here to make our assumption explicit. |
1240 | assert!( |
1241 | input.haystack().len() < core::usize::MAX, |
1242 | "byte slice lengths must be less than usize MAX" , |
1243 | ); |
1244 | instrument!(|c| c.reset(&self.nfa)); |
1245 | |
1246 | // Whether we want to visit all match states instead of emulating the |
1247 | // 'leftmost' semantics of typical backtracking regex engines. |
1248 | let allmatches = |
1249 | self.config.get_match_kind().continue_past_first_match(); |
1250 | let (anchored, start_id) = match self.start_config(input) { |
1251 | None => return None, |
1252 | Some(config) => config, |
1253 | }; |
1254 | |
1255 | let pre = |
1256 | if anchored { None } else { self.get_config().get_prefilter() }; |
1257 | let Cache { ref mut stack, ref mut curr, ref mut next } = cache; |
1258 | let mut hm = None; |
1259 | // Yes, our search doesn't end at input.end(), but includes it. This |
1260 | // is necessary because matches are delayed by one byte, just like |
1261 | // how the DFA engines work. The delay is used to handle look-behind |
1262 | // assertions. In the case of the PikeVM, the delay is implemented |
1263 | // by not considering a match to exist until it is visited in |
1264 | // 'steps'. Technically, we know a match exists in the previous |
1265 | // iteration via 'epsilon_closure'. (It's the same thing in NFA-to-DFA |
1266 | // determinization. We don't mark a DFA state as a match state if it |
1267 | // contains an NFA match state, but rather, whether the DFA state was |
1268 | // generated by a transition from a DFA state that contains an NFA |
1269 | // match state.) |
1270 | let mut at = input.start(); |
1271 | while at <= input.end() { |
1272 | // If we have no states left to visit, then there are some cases |
1273 | // where we know we can quit early or even skip ahead. |
1274 | if curr.set.is_empty() { |
1275 | // We have a match and we haven't been instructed to continue |
1276 | // on even after finding a match, so we can quit. |
1277 | if hm.is_some() && !allmatches { |
1278 | break; |
1279 | } |
1280 | // If we're running an anchored search and we've advanced |
1281 | // beyond the start position with no other states to try, then |
1282 | // we will never observe a match and thus can stop. |
1283 | if anchored && at > input.start() { |
1284 | break; |
1285 | } |
1286 | // If there are no states left to explore at this position and we |
1287 | // know we can't terminate early, then we are effectively at |
1288 | // the starting state of the NFA. If we fell through here, |
1289 | // we'd end up adding our '(?s-u:.)*?' prefix and it would be |
1290 | // the only thing in 'curr'. So we might as well just skip |
1291 | // ahead until we find something that we know might advance us |
1292 | // forward. |
1293 | if let Some(ref pre) = pre { |
1294 | let span = Span::from(at..input.end()); |
1295 | match pre.find(input.haystack(), span) { |
1296 | None => break, |
1297 | Some(ref span) => at = span.start, |
1298 | } |
1299 | } |
1300 | } |
1301 | // Instead of using the NFA's unanchored start state, we actually |
1302 | // always use its anchored starting state. As a result, when doing |
1303 | // an unanchored search, we need to simulate our own '(?s-u:.)*?' |
1304 | // prefix, to permit a match to appear anywhere. |
1305 | // |
1306 | // Now, we don't *have* to do things this way. We could use the |
1307 | // NFA's unanchored starting state and do one 'epsilon_closure' |
1308 | // call from that starting state before the main loop here. And |
1309 | // that is just as correct. However, it turns out to be slower |
1310 | // than our approach here because it slightly increases the cost |
1311 | // of processing each byte by requiring us to visit more NFA |
1312 | // states to deal with the additional NFA states in the unanchored |
1313 | // prefix. By simulating it explicitly here, we lower those costs |
1314 | // substantially. The cost is itself small, but it adds up for |
1315 | // large haystacks. |
1316 | // |
1317 | // In order to simulate the '(?s-u:.)*?' prefix---which is not |
1318 | // greedy---we are careful not to perform an epsilon closure on |
1319 | // the start state if we already have a match. Namely, if we |
1320 | // did otherwise, we would never reach a terminating condition |
1321 | // because there would always be additional states to process. |
1322 | // In effect, the exclusion of running 'epsilon_closure' when |
1323 | // we have a match corresponds to the "dead" states we have in |
1324 | // our DFA regex engines. Namely, in a DFA, match states merely |
1325 | // instruct the search execution to record the current offset as |
1326 | // the most recently seen match. It is the dead state that actually |
1327 | // indicates when to stop the search (other than EOF or quit |
1328 | // states). |
1329 | // |
1330 | // However, when 'allmatches' is true, the caller has asked us to |
1331 | // leave in every possible match state. This tends not to make a |
1332 | // whole lot of sense in unanchored searches, because it means the |
1333 | // search really cannot terminate until EOF. And often, in that |
1334 | // case, you wind up skipping over a bunch of matches and are left |
1335 | // with the "last" match. Arguably, it just doesn't make a lot of |
1336 | // sense to run a 'leftmost' search (which is what this routine is) |
1337 | // with 'allmatches' set to true. But the DFAs support it and this |
1338 | // matches their behavior. (Generally, 'allmatches' is useful for |
1339 | // overlapping searches or leftmost anchored searches to find the |
1340 | // longest possible match by ignoring match priority.) |
1341 | // |
1342 | // Additionally, when we're running an anchored search, this |
1343 | // epsilon closure should only be computed at the beginning of the |
1344 | // search. If we re-computed it at every position, we would be |
1345 | // simulating an unanchored search when we were tasked to perform |
1346 | // an anchored search. |
if (hm.is_none() || allmatches)
1348 | && (!anchored || at == input.start()) |
1349 | { |
1350 | // Since we are adding to the 'curr' active states and since |
1351 | // this is for the start ID, we use a slots slice that is |
1352 | // guaranteed to have the right length but where every element |
1353 | // is absent. This is exactly what we want, because this |
1354 | // epsilon closure is responsible for simulating an unanchored |
// '(?s-u:.)*?' prefix. It is specifically outside of any
1356 | // capturing groups, and thus, using slots that are always |
1357 | // absent is correct. |
1358 | // |
1359 | // Note though that we can't just use '&mut []' here, since |
1360 | // this epsilon closure may traverse through 'Captures' epsilon |
1361 | // transitions, and thus must be able to write offsets to the |
1362 | // slots given which are later copied to slot values in 'curr'. |
1363 | let slots = next.slot_table.all_absent(); |
1364 | self.epsilon_closure(stack, slots, curr, input, at, start_id); |
1365 | } |
1366 | if let Some(pid) = self.nexts(stack, curr, next, input, at, slots) |
1367 | { |
1368 | hm = Some(HalfMatch::new(pid, at)); |
1369 | } |
1370 | // Unless the caller asked us to return early, we need to mush on |
1371 | // to see if we can extend our match. (But note that 'nexts' will |
1372 | // quit right after seeing a match when match_kind==LeftmostFirst, |
1373 | // as is consistent with leftmost-first match priority.) |
1374 | if input.get_earliest() && hm.is_some() { |
1375 | break; |
1376 | } |
1377 | core::mem::swap(curr, next); |
1378 | next.set.clear(); |
1379 | at += 1; |
1380 | } |
1381 | instrument!(|c| c.eprint(&self.nfa)); |
1382 | hm |
1383 | } |
1384 | |
1385 | /// The implementation for the 'which_overlapping_matches' API. Basically, |
1386 | /// we do a single scan through the entire haystack (unless our regex |
1387 | /// or search is anchored) and record every pattern that matched. In |
1388 | /// particular, when MatchKind::All is used, this supports overlapping |
1389 | /// matches. So if we have the regexes 'sam' and 'samwise', they will |
1390 | /// *both* be reported in the pattern set when searching the haystack |
1391 | /// 'samwise'. |
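///
/// As a sketch, the public entry point this routine backs can be exercised
/// roughly as follows for the 'sam'/'samwise' example above. (This assumes
/// the usual builder APIs and the crate-root re-exports of `Input`,
/// `MatchKind`, `PatternID` and `PatternSet`.)
///
/// ```
/// use regex_automata::{
///     nfa::thompson::pikevm::PikeVM,
///     Input, MatchKind, PatternID, PatternSet,
/// };
///
/// let re = PikeVM::builder()
///     .configure(PikeVM::config().match_kind(MatchKind::All))
///     .build_many(&["sam", "samwise"])?;
/// let mut cache = re.create_cache();
///
/// let input = Input::new("samwise");
/// let mut patset = PatternSet::new(re.get_nfa().pattern_len());
/// re.which_overlapping_matches(&mut cache, &input, &mut patset);
/// let pids: Vec<PatternID> = patset.iter().collect();
/// assert_eq!(pids, vec![PatternID::must(0), PatternID::must(1)]);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```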
1392 | fn which_overlapping_imp( |
1393 | &self, |
1394 | cache: &mut Cache, |
1395 | input: &Input<'_>, |
1396 | patset: &mut PatternSet, |
1397 | ) { |
1398 | // NOTE: This is effectively a copy of 'search_imp' above, but with no |
1399 | // captures support and instead writes patterns that matched directly |
1400 | // to 'patset'. See that routine for better commentary about what's |
1401 | // going on in this routine. We probably could unify the routines using |
1402 | // generics or more helper routines, but I'm not sure it's worth it. |
1403 | // |
1404 | // NOTE: We somewhat go out of our way here to support things like |
1405 | // 'input.get_earliest()' and 'leftmost-first' match semantics. Neither |
1406 | // of those seem particularly relevant to this routine, but they are |
1407 | // both supported by the DFA analogs of this routine by construction |
1408 | // and composition, so it seems like good sense to have the PikeVM |
1409 | // match that behavior. |
1410 | |
1411 | cache.setup_search(0); |
1412 | if input.is_done() { |
1413 | return; |
1414 | } |
1415 | assert!( |
1416 | input.haystack().len() < core::usize::MAX, |
1417 | "byte slice lengths must be less than usize MAX" , |
1418 | ); |
1419 | instrument!(|c| c.reset(&self.nfa)); |
1420 | |
1421 | let allmatches = |
1422 | self.config.get_match_kind().continue_past_first_match(); |
1423 | let (anchored, start_id) = match self.start_config(input) { |
1424 | None => return, |
1425 | Some(config) => config, |
1426 | }; |
1427 | |
1428 | let Cache { ref mut stack, ref mut curr, ref mut next } = cache; |
1429 | for at in input.start()..=input.end() { |
1430 | let any_matches = !patset.is_empty(); |
1431 | if curr.set.is_empty() { |
1432 | if any_matches && !allmatches { |
1433 | break; |
1434 | } |
1435 | if anchored && at > input.start() { |
1436 | break; |
1437 | } |
1438 | } |
1439 | if !any_matches || allmatches { |
1440 | let slots = &mut []; |
1441 | self.epsilon_closure(stack, slots, curr, input, at, start_id); |
1442 | } |
1443 | self.nexts_overlapping(stack, curr, next, input, at, patset); |
// If we found a match and filled our set, then there is no
// additional info that we can provide. Thus, we can quit. We also
1446 | // quit if the caller asked us to stop at the earliest point that |
1447 | // we know a match exists. |
1448 | if patset.is_full() || input.get_earliest() { |
1449 | break; |
1450 | } |
1451 | core::mem::swap(curr, next); |
1452 | next.set.clear(); |
1453 | } |
1454 | instrument!(|c| c.eprint(&self.nfa)); |
1455 | } |
1456 | |
1457 | /// Process the active states in 'curr' to find the states (written to |
1458 | /// 'next') we should process for the next byte in the haystack. |
1459 | /// |
1460 | /// 'stack' is used to perform a depth first traversal of the NFA when |
1461 | /// computing an epsilon closure. |
1462 | /// |
1463 | /// When a match is found, the slots for that match state (in 'curr') are |
1464 | /// copied to 'caps'. Moreover, once a match is seen, processing for 'curr' |
1465 | /// stops (unless the PikeVM was configured with MatchKind::All semantics). |
1466 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
1467 | fn nexts( |
1468 | &self, |
1469 | stack: &mut Vec<FollowEpsilon>, |
1470 | curr: &mut ActiveStates, |
1471 | next: &mut ActiveStates, |
1472 | input: &Input<'_>, |
1473 | at: usize, |
1474 | slots: &mut [Option<NonMaxUsize>], |
1475 | ) -> Option<PatternID> { |
1476 | instrument!(|c| c.record_state_set(&curr.set)); |
1477 | let mut pid = None; |
1478 | let ActiveStates { ref set, ref mut slot_table } = *curr; |
1479 | for sid in set.iter() { |
1480 | pid = match self.next(stack, slot_table, next, input, at, sid) { |
1481 | None => continue, |
1482 | Some(pid) => Some(pid), |
1483 | }; |
1484 | slots.copy_from_slice(slot_table.for_state(sid)); |
1485 | if !self.config.get_match_kind().continue_past_first_match() { |
1486 | break; |
1487 | } |
1488 | } |
1489 | pid |
1490 | } |
1491 | |
1492 | /// Like 'nexts', but for the overlapping case. This doesn't write any |
1493 | /// slots, and instead just writes which pattern matched in 'patset'. |
1494 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
1495 | fn nexts_overlapping( |
1496 | &self, |
1497 | stack: &mut Vec<FollowEpsilon>, |
1498 | curr: &mut ActiveStates, |
1499 | next: &mut ActiveStates, |
1500 | input: &Input<'_>, |
1501 | at: usize, |
1502 | patset: &mut PatternSet, |
1503 | ) { |
1504 | instrument!(|c| c.record_state_set(&curr.set)); |
1505 | let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); |
1506 | let ActiveStates { ref set, ref mut slot_table } = *curr; |
1507 | for sid in set.iter() { |
1508 | let pid = match self.next(stack, slot_table, next, input, at, sid) |
1509 | { |
1510 | None => continue, |
1511 | Some(pid) => pid, |
1512 | }; |
1513 | // This handles the case of finding a zero-width match that splits |
1514 | // a codepoint. Namely, if we're in UTF-8 mode AND we know we can |
1515 | // match the empty string, then the only valid way of getting to |
1516 | // this point with an offset that splits a codepoint is when we |
1517 | // have an empty match. Such matches, in UTF-8 mode, must not be |
1518 | // reported. So we just skip them here and pretend as if we did |
1519 | // not see a match. |
1520 | if utf8empty && !input.is_char_boundary(at) { |
1521 | continue; |
1522 | } |
1523 | let _ = patset.try_insert(pid); |
1524 | if !self.config.get_match_kind().continue_past_first_match() { |
1525 | break; |
1526 | } |
1527 | } |
1528 | } |
1529 | |
1530 | /// Starting from 'sid', if the position 'at' in the 'input' haystack has a |
1531 | /// transition defined out of 'sid', then add the state transitioned to and |
1532 | /// its epsilon closure to the 'next' set of states to explore. |
1533 | /// |
1534 | /// 'stack' is used by the epsilon closure computation to perform a depth |
1535 | /// first traversal of the NFA. |
1536 | /// |
1537 | /// 'curr_slot_table' should be the table of slots for the current set of |
1538 | /// states being explored. If there is a transition out of 'sid', then |
1539 | /// sid's row in the slot table is used to perform the epsilon closure. |
1540 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
1541 | fn next( |
1542 | &self, |
1543 | stack: &mut Vec<FollowEpsilon>, |
1544 | curr_slot_table: &mut SlotTable, |
1545 | next: &mut ActiveStates, |
1546 | input: &Input<'_>, |
1547 | at: usize, |
1548 | sid: StateID, |
1549 | ) -> Option<PatternID> { |
1550 | instrument!(|c| c.record_step(sid)); |
1551 | match *self.nfa.state(sid) { |
1552 | State::Fail |
1553 | | State::Look { .. } |
1554 | | State::Union { .. } |
1555 | | State::BinaryUnion { .. } |
1556 | | State::Capture { .. } => None, |
1557 | State::ByteRange { ref trans } => { |
1558 | if trans.matches(input.haystack(), at) { |
1559 | let slots = curr_slot_table.for_state(sid); |
1560 | // OK because 'at <= haystack.len() < usize::MAX', so |
1561 | // adding 1 will never wrap. |
1562 | let at = at.wrapping_add(1); |
1563 | self.epsilon_closure( |
1564 | stack, slots, next, input, at, trans.next, |
1565 | ); |
1566 | } |
1567 | None |
1568 | } |
1569 | State::Sparse(ref sparse) => { |
1570 | if let Some(next_sid) = sparse.matches(input.haystack(), at) { |
1571 | let slots = curr_slot_table.for_state(sid); |
1572 | // OK because 'at <= haystack.len() < usize::MAX', so |
1573 | // adding 1 will never wrap. |
1574 | let at = at.wrapping_add(1); |
1575 | self.epsilon_closure( |
1576 | stack, slots, next, input, at, next_sid, |
1577 | ); |
1578 | } |
1579 | None |
1580 | } |
1581 | State::Dense(ref dense) => { |
1582 | if let Some(next_sid) = dense.matches(input.haystack(), at) { |
1583 | let slots = curr_slot_table.for_state(sid); |
1584 | // OK because 'at <= haystack.len() < usize::MAX', so |
1585 | // adding 1 will never wrap. |
1586 | let at = at.wrapping_add(1); |
1587 | self.epsilon_closure( |
1588 | stack, slots, next, input, at, next_sid, |
1589 | ); |
1590 | } |
1591 | None |
1592 | } |
1593 | State::Match { pattern_id } => Some(pattern_id), |
1594 | } |
1595 | } |
1596 | |
1597 | /// Compute the epsilon closure of 'sid', writing the closure into 'next' |
1598 | /// while copying slot values from 'curr_slots' into corresponding states |
1599 | /// in 'next'. 'curr_slots' should be the slot values corresponding to |
1600 | /// 'sid'. |
1601 | /// |
1602 | /// The given 'stack' is used to perform a depth first traversal of the |
1603 | /// NFA by recursively following all epsilon transitions out of 'sid'. |
1604 | /// Conditional epsilon transitions are followed if and only if they are |
1605 | /// satisfied for the position 'at' in the 'input' haystack. |
1606 | /// |
1607 | /// While this routine may write to 'curr_slots', once it returns, any |
1608 | /// writes are undone and the original values (even if absent) are |
1609 | /// restored. |
1610 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
1611 | fn epsilon_closure( |
1612 | &self, |
1613 | stack: &mut Vec<FollowEpsilon>, |
1614 | curr_slots: &mut [Option<NonMaxUsize>], |
1615 | next: &mut ActiveStates, |
1616 | input: &Input<'_>, |
1617 | at: usize, |
1618 | sid: StateID, |
1619 | ) { |
1620 | instrument!(|c| { |
1621 | c.record_closure(sid); |
1622 | c.record_stack_push(sid); |
1623 | }); |
1624 | stack.push(FollowEpsilon::Explore(sid)); |
1625 | while let Some(frame) = stack.pop() { |
1626 | match frame { |
1627 | FollowEpsilon::RestoreCapture { slot, offset: pos } => { |
1628 | curr_slots[slot] = pos; |
1629 | } |
1630 | FollowEpsilon::Explore(sid) => { |
1631 | self.epsilon_closure_explore( |
1632 | stack, curr_slots, next, input, at, sid, |
1633 | ); |
1634 | } |
1635 | } |
1636 | } |
1637 | } |
1638 | |
1639 | /// Explore all of the epsilon transitions out of 'sid'. This is mostly |
1640 | /// split out from 'epsilon_closure' in order to clearly delineate |
1641 | /// the actual work of computing an epsilon closure from the stack |
1642 | /// book-keeping. |
1643 | /// |
1644 | /// This will push any additional explorations needed on to 'stack'. |
1645 | /// |
1646 | /// 'curr_slots' should refer to the slots for the currently active NFA |
1647 | /// state. That is, the current state we are stepping through. These |
1648 | /// slots are mutated in place as new 'Captures' states are traversed |
1649 | /// during epsilon closure, but the slots are restored to their original |
1650 | /// values once the full epsilon closure is completed. The ultimate use of |
1651 | /// 'curr_slots' is to copy them to the corresponding 'next_slots', so that |
1652 | /// the capturing group spans are forwarded from the currently active state |
1653 | /// to the next. |
1654 | /// |
1655 | /// 'next' refers to the next set of active states. Computing an epsilon |
1656 | /// closure may increase the next set of active states. |
1657 | /// |
1658 | /// 'input' refers to the caller's input configuration and 'at' refers to |
1659 | /// the current position in the haystack. These are used to check whether |
1660 | /// conditional epsilon transitions (like look-around) are satisfied at |
1661 | /// the current position. If they aren't, then the epsilon closure won't |
1662 | /// include them. |
1663 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
1664 | fn epsilon_closure_explore( |
1665 | &self, |
1666 | stack: &mut Vec<FollowEpsilon>, |
1667 | curr_slots: &mut [Option<NonMaxUsize>], |
1668 | next: &mut ActiveStates, |
1669 | input: &Input<'_>, |
1670 | at: usize, |
1671 | mut sid: StateID, |
1672 | ) { |
1673 | // We can avoid pushing some state IDs on to our stack in precisely |
1674 | // the cases where a 'push(x)' would be immediately followed by a 'x |
1675 | // = pop()'. This is achieved by this outer-loop. We simply set 'sid' |
1676 | // to be the next state ID we want to explore once we're done with |
1677 | // our initial exploration. In practice, this avoids a lot of stack |
1678 | // thrashing. |
1679 | loop { |
1680 | instrument!(|c| c.record_set_insert(sid)); |
1681 | // Record this state as part of our next set of active states. If |
1682 | // we've already explored it, then no need to do it again. |
1683 | if !next.set.insert(sid) { |
1684 | return; |
1685 | } |
1686 | match *self.nfa.state(sid) { |
1687 | State::Fail |
1688 | | State::Match { .. } |
1689 | | State::ByteRange { .. } |
1690 | | State::Sparse { .. } |
1691 | | State::Dense { .. } => { |
1692 | next.slot_table.for_state(sid).copy_from_slice(curr_slots); |
1693 | return; |
1694 | } |
1695 | State::Look { look, next } => { |
1696 | // OK because we don't permit building a searcher with a |
1697 | // Unicode word boundary if the requisite Unicode data is |
1698 | // unavailable. |
1699 | if !self.nfa.look_matcher().matches_inline( |
1700 | look, |
1701 | input.haystack(), |
1702 | at, |
1703 | ) { |
1704 | return; |
1705 | } |
1706 | sid = next; |
1707 | } |
1708 | State::Union { ref alternates } => { |
1709 | sid = match alternates.get(0) { |
1710 | None => return, |
1711 | Some(&sid) => sid, |
1712 | }; |
1713 | instrument!(|c| { |
1714 | for &alt in &alternates[1..] { |
1715 | c.record_stack_push(alt); |
1716 | } |
1717 | }); |
1718 | stack.extend( |
1719 | alternates[1..] |
1720 | .iter() |
1721 | .copied() |
1722 | .rev() |
1723 | .map(FollowEpsilon::Explore), |
1724 | ); |
1725 | } |
1726 | State::BinaryUnion { alt1, alt2 } => { |
1727 | sid = alt1; |
1728 | instrument!(|c| c.record_stack_push(sid)); |
1729 | stack.push(FollowEpsilon::Explore(alt2)); |
1730 | } |
1731 | State::Capture { next, slot, .. } => { |
1732 | // There's no need to do anything with slots that |
1733 | // ultimately won't be copied into the caller-provided |
1734 | // 'Captures' value. So we just skip dealing with them at |
1735 | // all. |
1736 | if slot.as_usize() < curr_slots.len() { |
1737 | instrument!(|c| c.record_stack_push(sid)); |
1738 | stack.push(FollowEpsilon::RestoreCapture { |
1739 | slot, |
1740 | offset: curr_slots[slot], |
1741 | }); |
1742 | // OK because length of a slice must fit into an isize. |
1743 | curr_slots[slot] = Some(NonMaxUsize::new(at).unwrap()); |
1744 | } |
1745 | sid = next; |
1746 | } |
1747 | } |
1748 | } |
1749 | } |
1750 | |
1751 | /// Return the starting configuration of a PikeVM search. |
1752 | /// |
1753 | /// The "start config" is basically whether the search should be anchored |
1754 | /// or not and the NFA state ID at which to begin the search. The state ID |
1755 | /// returned always corresponds to an anchored starting state even when the |
1756 | /// search is unanchored. This is because the PikeVM search loop deals with |
1757 | /// unanchored searches with an explicit epsilon closure out of the start |
1758 | /// state. |
1759 | /// |
1760 | /// This routine accounts for both the caller's `Input` configuration |
1761 | /// and the pattern itself. For example, even if the caller asks for an |
/// unanchored search, if the pattern itself is anchored, then the anchored
/// flag returned will always be 'true' because implementing an unanchored
/// search in that case would be incorrect.
1765 | /// |
1766 | /// Similarly, if the caller requests an anchored search for a particular |
1767 | /// pattern, then the starting state ID returned will reflect that. |
1768 | /// |
1769 | /// If a pattern ID is given in the input configuration that is not in |
1770 | /// this regex, then `None` is returned. |
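///
/// As an illustrative sketch of how this surfaces through the public API
/// (assuming the crate-root re-exports of `Anchored`, `Input`, `Match` and
/// `PatternID`), an anchored search for a specific pattern might look like:
///
/// ```
/// use regex_automata::{
///     nfa::thompson::pikevm::PikeVM,
///     Anchored, Input, Match, PatternID,
/// };
///
/// let re = PikeVM::new_many(&["[a-z]+", "[0-9]+"])?;
/// let mut cache = re.create_cache();
///
/// let input = Input::new("123abc")
///     .anchored(Anchored::Pattern(PatternID::must(1)));
/// assert_eq!(Some(Match::must(1, 0..3)), re.find(&mut cache, input));
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```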
1771 | fn start_config(&self, input: &Input<'_>) -> Option<(bool, StateID)> { |
1772 | match input.get_anchored() { |
1773 | // Only way we're unanchored is if both the caller asked for an |
1774 | // unanchored search *and* the pattern is itself not anchored. |
1775 | Anchored::No => Some(( |
1776 | self.nfa.is_always_start_anchored(), |
1777 | self.nfa.start_anchored(), |
1778 | )), |
1779 | Anchored::Yes => Some((true, self.nfa.start_anchored())), |
1780 | Anchored::Pattern(pid) => { |
1781 | Some((true, self.nfa.start_pattern(pid)?)) |
1782 | } |
1783 | } |
1784 | } |
1785 | } |
1786 | |
1787 | /// An iterator over all non-overlapping matches for a particular search. |
1788 | /// |
1789 | /// The iterator yields a [`Match`] value until no more matches could be found. |
1790 | /// |
1791 | /// The lifetime parameters are as follows: |
1792 | /// |
1793 | /// * `'r` represents the lifetime of the PikeVM. |
1794 | /// * `'c` represents the lifetime of the PikeVM's cache. |
1795 | /// * `'h` represents the lifetime of the haystack being searched. |
1796 | /// |
1797 | /// This iterator can be created with the [`PikeVM::find_iter`] method. |
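///
/// # Example
///
/// A minimal sketch of typical usage (assuming the crate-root re-export of
/// [`Match`]):
///
/// ```
/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
///
/// let re = PikeVM::new(r"[0-9]+")?;
/// let mut cache = re.create_cache();
///
/// let matches: Vec<Match> =
///     re.find_iter(&mut cache, "a1 b22 c333").collect();
/// assert_eq!(matches, vec![
///     Match::must(0, 1..2),
///     Match::must(0, 4..6),
///     Match::must(0, 8..11),
/// ]);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```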
1798 | #[derive(Debug)] |
1799 | pub struct FindMatches<'r, 'c, 'h> { |
1800 | re: &'r PikeVM, |
1801 | cache: &'c mut Cache, |
1802 | caps: Captures, |
1803 | it: iter::Searcher<'h>, |
1804 | } |
1805 | |
1806 | impl<'r, 'c, 'h> Iterator for FindMatches<'r, 'c, 'h> { |
1807 | type Item = Match; |
1808 | |
1809 | #[inline ] |
1810 | fn next(&mut self) -> Option<Match> { |
1811 | // Splitting 'self' apart seems necessary to appease borrowck. |
1812 | let FindMatches { re, ref mut cache, ref mut caps, ref mut it } = |
1813 | *self; |
1814 | // 'advance' converts errors into panics, which is OK here because |
1815 | // the PikeVM can never return an error. |
1816 | it.advance(|input| { |
1817 | re.search(cache, input, caps); |
1818 | Ok(caps.get_match()) |
1819 | }) |
1820 | } |
1821 | } |
1822 | |
1823 | /// An iterator over all non-overlapping leftmost matches, with their capturing |
1824 | /// groups, for a particular search. |
1825 | /// |
1826 | /// The iterator yields a [`Captures`] value until no more matches could be |
1827 | /// found. |
1828 | /// |
1829 | /// The lifetime parameters are as follows: |
1830 | /// |
1831 | /// * `'r` represents the lifetime of the PikeVM. |
1832 | /// * `'c` represents the lifetime of the PikeVM's cache. |
1833 | /// * `'h` represents the lifetime of the haystack being searched. |
1834 | /// |
1835 | /// This iterator can be created with the [`PikeVM::captures_iter`] method. |
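///
/// # Example
///
/// A minimal sketch of typical usage (assuming the crate-root re-export of
/// [`Span`]):
///
/// ```
/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span};
///
/// let re = PikeVM::new(r"([0-9]{4})-([0-9]{2})")?;
/// let mut cache = re.create_cache();
///
/// let caps = re
///     .captures_iter(&mut cache, "2023-08")
///     .next()
///     .unwrap();
/// assert_eq!(Some(Span::from(0..4)), caps.get_group(1));
/// assert_eq!(Some(Span::from(5..7)), caps.get_group(2));
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```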
1836 | #[derive(Debug)] |
1837 | pub struct CapturesMatches<'r, 'c, 'h> { |
1838 | re: &'r PikeVM, |
1839 | cache: &'c mut Cache, |
1840 | caps: Captures, |
1841 | it: iter::Searcher<'h>, |
1842 | } |
1843 | |
1844 | impl<'r, 'c, 'h> Iterator for CapturesMatches<'r, 'c, 'h> { |
1845 | type Item = Captures; |
1846 | |
1847 | #[inline ] |
1848 | fn next(&mut self) -> Option<Captures> { |
1849 | // Splitting 'self' apart seems necessary to appease borrowck. |
1850 | let CapturesMatches { re, ref mut cache, ref mut caps, ref mut it } = |
1851 | *self; |
1852 | // 'advance' converts errors into panics, which is OK here because |
1853 | // the PikeVM can never return an error. |
1854 | it.advance(|input| { |
1855 | re.search(cache, input, caps); |
1856 | Ok(caps.get_match()) |
1857 | }); |
1858 | if caps.is_match() { |
1859 | Some(caps.clone()) |
1860 | } else { |
1861 | None |
1862 | } |
1863 | } |
1864 | } |
1865 | |
1866 | /// A cache represents mutable state that a [`PikeVM`] requires during a |
1867 | /// search. |
1868 | /// |
1869 | /// For a given [`PikeVM`], its corresponding cache may be created either via |
1870 | /// [`PikeVM::create_cache`], or via [`Cache::new`]. They are equivalent in |
1871 | /// every way, except the former does not require explicitly importing `Cache`. |
1872 | /// |
1873 | /// A particular `Cache` is coupled with the [`PikeVM`] from which it |
1874 | /// was created. It may only be used with that `PikeVM`. A cache and its |
1875 | /// allocations may be re-purposed via [`Cache::reset`], in which case, it can |
1876 | /// only be used with the new `PikeVM` (and not the old one). |
1877 | #[derive(Clone, Debug)] |
1878 | pub struct Cache { |
1879 | /// Stack used while computing epsilon closure. This effectively lets us |
1880 | /// move what is more naturally expressed through recursion to a stack |
1881 | /// on the heap. |
1882 | stack: Vec<FollowEpsilon>, |
1883 | /// The current active states being explored for the current byte in the |
1884 | /// haystack. |
1885 | curr: ActiveStates, |
1886 | /// The next set of states we're building that will be explored for the |
1887 | /// next byte in the haystack. |
1888 | next: ActiveStates, |
1889 | } |
1890 | |
1891 | impl Cache { |
1892 | /// Create a new [`PikeVM`] cache. |
1893 | /// |
1894 | /// A potentially more convenient routine to create a cache is |
1895 | /// [`PikeVM::create_cache`], as it does not require also importing the |
1896 | /// `Cache` type. |
1897 | /// |
1898 | /// If you want to reuse the returned `Cache` with some other `PikeVM`, |
1899 | /// then you must call [`Cache::reset`] with the desired `PikeVM`. |
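///
/// # Example
///
/// A short sketch of creating a cache directly (as opposed to via
/// [`PikeVM::create_cache`]):
///
/// ```
/// use regex_automata::nfa::thompson::pikevm::{PikeVM, Cache};
///
/// let re = PikeVM::new(r"[a-z]+")?;
/// let mut cache = Cache::new(&re);
/// assert!(re.is_match(&mut cache, "banana"));
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```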
1900 | pub fn new(re: &PikeVM) -> Cache { |
1901 | Cache { |
1902 | stack: vec![], |
1903 | curr: ActiveStates::new(re), |
1904 | next: ActiveStates::new(re), |
1905 | } |
1906 | } |
1907 | |
1908 | /// Reset this cache such that it can be used for searching with a |
1909 | /// different [`PikeVM`]. |
1910 | /// |
1911 | /// A cache reset permits reusing memory already allocated in this cache |
1912 | /// with a different `PikeVM`. |
1913 | /// |
1914 | /// # Example |
1915 | /// |
1916 | /// This shows how to re-purpose a cache for use with a different `PikeVM`. |
1917 | /// |
1918 | /// ``` |
1919 | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
1920 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; |
1921 | /// |
1922 | /// let re1 = PikeVM::new(r"\w" )?; |
1923 | /// let re2 = PikeVM::new(r"\W" )?; |
1924 | /// |
1925 | /// let mut cache = re1.create_cache(); |
1926 | /// assert_eq!( |
1927 | /// Some(Match::must(0, 0..2)), |
1928 | /// re1.find_iter(&mut cache, "Δ" ).next(), |
1929 | /// ); |
1930 | /// |
1931 | /// // Using 'cache' with re2 is not allowed. It may result in panics or |
1932 | /// // incorrect results. In order to re-purpose the cache, we must reset |
1933 | /// // it with the PikeVM we'd like to use it with. |
1934 | /// // |
1935 | /// // Similarly, after this reset, using the cache with 're1' is also not |
1936 | /// // allowed. |
1937 | /// cache.reset(&re2); |
1938 | /// assert_eq!( |
1939 | /// Some(Match::must(0, 0..3)), |
1940 | /// re2.find_iter(&mut cache, "☃" ).next(), |
1941 | /// ); |
1942 | /// |
1943 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1944 | /// ``` |
1945 | pub fn reset(&mut self, re: &PikeVM) { |
1946 | self.curr.reset(re); |
1947 | self.next.reset(re); |
1948 | } |
1949 | |
1950 | /// Returns the heap memory usage, in bytes, of this cache. |
1951 | /// |
1952 | /// This does **not** include the stack size used up by this cache. To |
1953 | /// compute that, use `std::mem::size_of::<Cache>()`. |
1954 | pub fn memory_usage(&self) -> usize { |
1955 | use core::mem::size_of; |
1956 | (self.stack.len() * size_of::<FollowEpsilon>()) |
1957 | + self.curr.memory_usage() |
1958 | + self.next.memory_usage() |
1959 | } |
1960 | |
1961 | /// Clears this cache. This should be called at the start of every search |
1962 | /// to ensure we start with a clean slate. |
1963 | /// |
1964 | /// This also sets the length of the capturing groups used in the current |
/// search. This permits an optimization whereby 'SlotTable::for_state'
1966 | /// only returns the number of slots equivalent to the number of slots |
1967 | /// given in the 'Captures' value. This may be less than the total number |
1968 | /// of possible slots, e.g., when one only wants to track overall match |
1969 | /// offsets. This in turn permits less copying of capturing group spans |
1970 | /// in the PikeVM. |
1971 | fn setup_search(&mut self, captures_slot_len: usize) { |
1972 | self.stack.clear(); |
1973 | self.curr.setup_search(captures_slot_len); |
1974 | self.next.setup_search(captures_slot_len); |
1975 | } |
1976 | } |
1977 | |
1978 | /// A set of active states used to "simulate" the execution of an NFA via the |
1979 | /// PikeVM. |
1980 | /// |
1981 | /// There are two sets of these used during NFA simulation. One set corresponds |
1982 | /// to the "current" set of states being traversed for the current position |
1983 | /// in a haystack. The other set corresponds to the "next" set of states being |
1984 | /// built, which will become the new "current" set for the next position in the |
1985 | /// haystack. These two sets correspond to CLIST and NLIST in Thompson's |
/// original paper on regexes: https://dl.acm.org/doi/pdf/10.1145/363347.363387
1987 | /// |
1988 | /// In addition to representing a set of NFA states, this also maintains slot |
1989 | /// values for each state. These slot values are what turn the NFA simulation |
1990 | /// into the "Pike VM." Namely, they track capturing group values for each |
1991 | /// state. During the computation of epsilon closure, we copy slot values from |
1992 | /// states in the "current" set to the "next" set. Eventually, once a match |
1993 | /// is found, the slot values for that match state are what we write to the |
1994 | /// caller provided 'Captures' value. |
1995 | #[derive(Clone, Debug)] |
1996 | struct ActiveStates { |
1997 | /// The set of active NFA states. This set preserves insertion order, which |
1998 | /// is critical for simulating the match semantics of backtracking regex |
1999 | /// engines. |
2000 | set: SparseSet, |
2001 | /// The slots for every NFA state, where each slot stores a (possibly |
2002 | /// absent) offset. Every capturing group has two slots. One for a start |
2003 | /// offset and one for an end offset. |
2004 | slot_table: SlotTable, |
2005 | } |
2006 | |
2007 | impl ActiveStates { |
2008 | /// Create a new set of active states for the given PikeVM. The active |
2009 | /// states returned may only be used with the given PikeVM. (Use 'reset' |
2010 | /// to re-purpose the allocation for a different PikeVM.) |
2011 | fn new(re: &PikeVM) -> ActiveStates { |
2012 | let mut active = ActiveStates { |
2013 | set: SparseSet::new(0), |
2014 | slot_table: SlotTable::new(), |
2015 | }; |
2016 | active.reset(re); |
2017 | active |
2018 | } |
2019 | |
2020 | /// Reset this set of active states such that it can be used with the given |
2021 | /// PikeVM (and only that PikeVM). |
2022 | fn reset(&mut self, re: &PikeVM) { |
2023 | self.set.resize(re.get_nfa().states().len()); |
2024 | self.slot_table.reset(re); |
2025 | } |
2026 | |
2027 | /// Return the heap memory usage, in bytes, used by this set of active |
2028 | /// states. |
2029 | /// |
2030 | /// This does not include the stack size of this value. |
2031 | fn memory_usage(&self) -> usize { |
2032 | self.set.memory_usage() + self.slot_table.memory_usage() |
2033 | } |
2034 | |
2035 | /// Setup this set of active states for a new search. The given slot |
2036 | /// length should be the number of slots in a caller provided 'Captures' |
2037 | /// (and may be zero). |
2038 | fn setup_search(&mut self, captures_slot_len: usize) { |
2039 | self.set.clear(); |
2040 | self.slot_table.setup_search(captures_slot_len); |
2041 | } |
2042 | } |
2043 | |
/// A table of slots, where each row represents a state in an NFA. Thus, the
2045 | /// table has room for storing slots for every single state in an NFA. |
2046 | /// |
2047 | /// This table is represented with a single contiguous allocation. In general, |
2048 | /// the notion of "capturing group" doesn't really exist at this level of |
2049 | /// abstraction, hence the name "slot" instead. (Indeed, every capturing group |
2050 | /// maps to a pair of slots, one for the start offset and one for the end |
2051 | /// offset.) Slots are indexed by the 'Captures' NFA state. |
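///
/// For example, for a single-pattern regex with one explicit capturing group
/// (plus the implicit group for the overall match), 'slots_per_state' is 4
/// and the row for state 'sid' starts at table index 'sid * 4'. Note that
/// 'for_state' below hands back only the first 'slots_for_captures' entries
/// of that row, which may be fewer than 'slots_per_state' for a particular
/// search.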
2052 | /// |
2053 | /// N.B. Not every state actually needs a row of slots. Namely, states that |
2054 | /// only have epsilon transitions currently never have anything written to |
2055 | /// their rows in this table. Thus, the table is somewhat wasteful in its heap |
2056 | /// usage. However, it is important to maintain fast random access by state |
2057 | /// ID, which means one giant table tends to work well. RE2 takes a different |
2058 | /// approach here and allocates each row as its own reference counted thing. |
2059 | /// I explored such a strategy at one point here, but couldn't get it to work |
2060 | /// well using entirely safe code. (To the ambitious reader: I encourage you to |
2061 | /// re-litigate that experiment.) I very much wanted to stick to safe code, but |
2062 | /// could be convinced otherwise if there was a solid argument and the safety |
2063 | /// was encapsulated well. |
2064 | #[derive(Clone, Debug)] |
2065 | struct SlotTable { |
2066 | /// The actual table of offsets. |
2067 | table: Vec<Option<NonMaxUsize>>, |
2068 | /// The number of slots per state, i.e., the table's stride or the length |
2069 | /// of each row. |
2070 | slots_per_state: usize, |
2071 | /// The number of slots in the caller-provided 'Captures' value for the |
2072 | /// current search. Setting this to 'slots_per_state' is always correct, |
2073 | /// but may be wasteful. |
2074 | slots_for_captures: usize, |
2075 | } |
2076 | |
2077 | impl SlotTable { |
2078 | /// Create a new slot table. |
2079 | /// |
2080 | /// One should call 'reset' with the corresponding PikeVM before use. |
2081 | fn new() -> SlotTable { |
2082 | SlotTable { table: vec![], slots_for_captures: 0, slots_per_state: 0 } |
2083 | } |
2084 | |
2085 | /// Reset this slot table such that it can be used with the given PikeVM |
2086 | /// (and only that PikeVM). |
2087 | fn reset(&mut self, re: &PikeVM) { |
2088 | let nfa = re.get_nfa(); |
2089 | self.slots_per_state = nfa.group_info().slot_len(); |
2090 | // This is always correct, but may be reduced for a particular search |
2091 | // if a 'Captures' has fewer slots, e.g., none at all or only slots |
2092 | // for tracking the overall match instead of all slots for every |
2093 | // group. |
2094 | self.slots_for_captures = core::cmp::max( |
2095 | self.slots_per_state, |
2096 | nfa.pattern_len().checked_mul(2).unwrap(), |
2097 | ); |
2098 | let len = nfa |
2099 | .states() |
2100 | .len() |
2101 | .checked_mul(self.slots_per_state) |
2102 | // Add space to account for scratch space used during a search. |
2103 | .and_then(|x| x.checked_add(self.slots_for_captures)) |
2104 | // It seems like this could actually panic on legitimate inputs on |
// 32-bit targets, and is very likely to panic on 16-bit. Should we
2106 | // somehow convert this to an error? What about something similar |
2107 | // for the lazy DFA cache? If you're tripping this assert, please |
2108 | // file a bug. |
2109 | .expect("slot table length doesn't overflow" ); |
// This happens about as often as a regex is compiled, so it arguably
// belongs at debug level, but I found it quite distracting and not
// particularly useful there, so it's emitted at trace level instead.
2113 | trace!( |
2114 | "resizing PikeVM active states table to {} entries \ |
2115 | (slots_per_state={})" , |
2116 | len, |
2117 | self.slots_per_state, |
2118 | ); |
2119 | self.table.resize(len, None); |
2120 | } |
2121 | |
2122 | /// Return the heap memory usage, in bytes, used by this slot table. |
2123 | /// |
2124 | /// This does not include the stack size of this value. |
2125 | fn memory_usage(&self) -> usize { |
2126 | self.table.len() * core::mem::size_of::<Option<NonMaxUsize>>() |
2127 | } |
2128 | |
2129 | /// Perform any per-search setup for this slot table. |
2130 | /// |
2131 | /// In particular, this sets the length of the number of slots used in the |
2132 | /// 'Captures' given by the caller (if any at all). This number may be |
2133 | /// smaller than the total number of slots available, e.g., when the caller |
2134 | /// is only interested in tracking the overall match and not the spans of |
2135 | /// every matching capturing group. Only tracking the overall match can |
2136 | /// save a substantial amount of time copying capturing spans during a |
2137 | /// search. |
2138 | fn setup_search(&mut self, captures_slot_len: usize) { |
2139 | self.slots_for_captures = captures_slot_len; |
2140 | } |
2141 | |
2142 | /// Return a mutable slice of the slots for the given state. |
2143 | /// |
2144 | /// Note that the length of the slice returned may be less than the total |
2145 | /// number of slots available for this state. In particular, the length |
2146 | /// always matches the number of slots indicated via 'setup_search'. |
2147 | fn for_state(&mut self, sid: StateID) -> &mut [Option<NonMaxUsize>] { |
2148 | let i = sid.as_usize() * self.slots_per_state; |
2149 | &mut self.table[i..i + self.slots_for_captures] |
2150 | } |
2151 | |
2152 | /// Return a slice of slots of appropriate length where every slot offset |
2153 | /// is guaranteed to be absent. This is useful in cases where you need to |
2154 | /// compute an epsilon closure outside of the user supplied regex, and thus |
2155 | /// never want it to have any capturing slots set. |
2156 | fn all_absent(&mut self) -> &mut [Option<NonMaxUsize>] { |
2157 | let i = self.table.len() - self.slots_for_captures; |
2158 | &mut self.table[i..i + self.slots_for_captures] |
2159 | } |
2160 | } |
2161 | |
2162 | /// Represents a stack frame for use while computing an epsilon closure. |
2163 | /// |
2164 | /// (An "epsilon closure" refers to the set of reachable NFA states from a |
/// single state without consuming any input. That is, the set of all states
/// reachable by following only epsilon transitions, not just from that single
/// state but from every state reached along the way. This is why it's called a
2168 | /// "closure." Computing an epsilon closure is also done during DFA |
2169 | /// determinization! Compare and contrast the epsilon closure here in this |
2170 | /// PikeVM and the one used for determinization in crate::util::determinize.) |
2171 | /// |
2172 | /// Computing the epsilon closure in a Thompson NFA proceeds via a depth |
2173 | /// first traversal over all epsilon transitions from a particular state. |
2174 | /// (A depth first traversal is important because it emulates the same priority |
2175 | /// of matches that is typically found in backtracking regex engines.) This |
2176 | /// depth first traversal is naturally expressed using recursion, but to avoid |
2177 | /// a call stack size proportional to the size of a regex, we put our stack on |
2178 | /// the heap instead. |
2179 | /// |
2180 | /// This stack thus consists of call frames. The typical call frame is |
2181 | /// `Explore`, which instructs epsilon closure to explore the epsilon |
2182 | /// transitions from that state. (Subsequent epsilon transitions are then |
2183 | /// pushed on to the stack as more `Explore` frames.) If the state ID being |
2184 | /// explored has no epsilon transitions, then the capturing group slots are |
2185 | /// copied from the original state that sparked the epsilon closure (from the |
2186 | /// 'step' routine) to the state ID being explored. This way, capturing group |
2187 | /// slots are forwarded from the previous state to the next. |
2188 | /// |
/// The other stack frame, `RestoreCapture`, instructs the epsilon closure to
2190 | /// set the position for a particular slot back to some particular offset. This |
2191 | /// frame is pushed when `Explore` sees a `Capture` transition. `Explore` will |
2192 | /// set the offset of the slot indicated in `Capture` to the current offset, |
2193 | /// and then push the old offset on to the stack as a `RestoreCapture` frame. |
2194 | /// Thus, the new offset is only used until the epsilon closure reverts back to |
2195 | /// the `RestoreCapture` frame. In effect, this gives the `Capture` epsilon |
2196 | /// transition its "scope" to only states that come "after" it during depth |
2197 | /// first traversal. |
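///
/// For example (a sketch of how this plays out for a pattern like '(a)|b'):
/// when the closure reaches the 'Capture' state that opens group 1, it
/// pushes a `RestoreCapture` frame holding the slot's old value, writes the
/// current offset into the slot and then explores the 'a' branch. Once that
/// branch is exhausted, popping the `RestoreCapture` frame restores the
/// slot's previous value before the 'b' branch is explored, so the write is
/// scoped to the first alternative only.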
2198 | #[derive(Clone, Debug)] |
2199 | enum FollowEpsilon { |
2200 | /// Explore the epsilon transitions from a state ID. |
2201 | Explore(StateID), |
2202 | /// Reset the given `slot` to the given `offset` (which might be `None`). |
2203 | RestoreCapture { slot: SmallIndex, offset: Option<NonMaxUsize> }, |
2204 | } |
2205 | |
2206 | /// A set of counters that "instruments" a PikeVM search. To enable this, you |
2207 | /// must enable the 'internal-instrument-pikevm' feature. Then run your Rust |
2208 | /// program with RUST_LOG=regex_automata::nfa::thompson::pikevm=trace set in |
2209 | /// the environment. The metrics collected will be dumped automatically for |
2210 | /// every search executed by the PikeVM. |
2211 | /// |
2212 | /// NOTE: When 'internal-instrument-pikevm' is enabled, it will likely cause an |
2213 | /// absolute decrease in wall-clock performance, even if the 'trace' log level |
2214 | /// isn't enabled. (Although, we do try to avoid extra costs when 'trace' isn't |
2215 | /// enabled.) The main point of instrumentation is to get counts of various |
2216 | /// events that occur during the PikeVM's execution. |
2217 | /// |
2218 | /// This is a somewhat hacked together collection of metrics that are useful |
2219 | /// to gather from a PikeVM search. In particular, it lets us scrutinize the |
2220 | /// performance profile of a search beyond what general purpose profiling tools |
2221 | /// give us. Namely, we orient the profiling data around the specific states of |
2222 | /// the NFA. |
2223 | /// |
2224 | /// In other words, this lets us see which parts of the NFA graph are most |
2225 | /// frequently activated. This then provides direction for optimization |
2226 | /// opportunities. |
2227 | /// |
2228 | /// The really sad part about this is that it absolutely clutters up the PikeVM |
2229 | /// implementation. :'( Another approach would be to just manually add this |
2230 | /// code in whenever I want this kind of profiling data, but it's complicated |
2231 | /// and tedious enough that I went with this approach... for now. |
2232 | /// |
2233 | /// When instrumentation is enabled (which also turns on 'logging'), then a |
2234 | /// `Counters` is initialized for every search and `trace`'d just before the |
2235 | /// search returns to the caller. |
2236 | /// |
2237 | /// Tip: When debugging performance problems with the PikeVM, it's best to try |
2238 | /// to work with an NFA that is as small as possible. Otherwise the state graph |
2239 | /// is likely to be too big to digest. |
2240 | #[cfg (feature = "internal-instrument-pikevm" )] |
2241 | #[derive(Clone, Debug)] |
2242 | struct Counters { |
2243 | /// The number of times the NFA is in a particular permutation of states. |
2244 | state_sets: alloc::collections::BTreeMap<Vec<StateID>, u64>, |
2245 | /// The number of times 'step' is called for a particular state ID (which |
2246 | /// indexes this array). |
2247 | steps: Vec<u64>, |
2248 | /// The number of times an epsilon closure was computed for a state. |
2249 | closures: Vec<u64>, |
2250 | /// The number of times a particular state ID is pushed on to a stack while |
2251 | /// computing an epsilon closure. |
2252 | stack_pushes: Vec<u64>, |
2253 | /// The number of times a particular state ID is inserted into a sparse set |
2254 | /// while computing an epsilon closure. |
2255 | set_inserts: Vec<u64>, |
2256 | } |
2257 | |
2258 | #[cfg (feature = "internal-instrument-pikevm" )] |
2259 | impl Counters { |
2260 | fn empty() -> Counters { |
2261 | Counters { |
2262 | state_sets: alloc::collections::BTreeMap::new(), |
2263 | steps: vec![], |
2264 | closures: vec![], |
2265 | stack_pushes: vec![], |
2266 | set_inserts: vec![], |
2267 | } |
2268 | } |
2269 | |
2270 | fn reset(&mut self, nfa: &NFA) { |
2271 | let len = nfa.states().len(); |
2272 | |
2273 | self.state_sets.clear(); |
2274 | |
2275 | self.steps.clear(); |
2276 | self.steps.resize(len, 0); |
2277 | |
2278 | self.closures.clear(); |
2279 | self.closures.resize(len, 0); |
2280 | |
2281 | self.stack_pushes.clear(); |
2282 | self.stack_pushes.resize(len, 0); |
2283 | |
2284 | self.set_inserts.clear(); |
2285 | self.set_inserts.resize(len, 0); |
2286 | } |
2287 | |
2288 | fn eprint(&self, nfa: &NFA) { |
2289 | trace!("===== START PikeVM Instrumentation Output =====" ); |
2290 | // We take the top-K most occurring state sets. Otherwise the output |
2291 | // is likely to be overwhelming. And we probably only care about the |
2292 | // most frequently occurring ones anyway. |
2293 | const LIMIT: usize = 20; |
2294 | let mut set_counts = |
2295 | self.state_sets.iter().collect::<Vec<(&Vec<StateID>, &u64)>>(); |
2296 | set_counts.sort_by_key(|(_, &count)| core::cmp::Reverse(count)); |
2297 | trace!("## PikeVM frequency of state sets (top {})" , LIMIT); |
2298 | for (set, count) in set_counts.iter().take(LIMIT) { |
2299 | trace!("{:?}: {}" , set, count); |
2300 | } |
2301 | if set_counts.len() > LIMIT { |
2302 | trace!( |
2303 | "... {} sets omitted (out of {} total)" , |
2304 | set_counts.len() - LIMIT, |
2305 | set_counts.len(), |
2306 | ); |
2307 | } |
2308 | |
2309 | trace!("" ); |
2310 | trace!("## PikeVM total frequency of events" ); |
2311 | trace!( |
2312 | "steps: {}, closures: {}, stack-pushes: {}, set-inserts: {}" , |
2313 | self.steps.iter().copied().sum::<u64>(), |
2314 | self.closures.iter().copied().sum::<u64>(), |
2315 | self.stack_pushes.iter().copied().sum::<u64>(), |
2316 | self.set_inserts.iter().copied().sum::<u64>(), |
2317 | ); |
2318 | |
2319 | trace!("" ); |
2320 | trace!("## PikeVM frequency of events broken down by state" ); |
2321 | for sid in 0..self.steps.len() { |
2322 | trace!( |
2323 | "{:06}: steps: {}, closures: {}, \ |
2324 | stack-pushes: {}, set-inserts: {}" , |
2325 | sid, |
2326 | self.steps[sid], |
2327 | self.closures[sid], |
2328 | self.stack_pushes[sid], |
2329 | self.set_inserts[sid], |
2330 | ); |
2331 | } |
2332 | |
2333 | trace!("" ); |
2334 | trace!("## NFA debug display" ); |
2335 | trace!("{:?}" , nfa); |
2336 | trace!("===== END PikeVM Instrumentation Output =====" ); |
2337 | } |
2338 | |
2339 | fn record_state_set(&mut self, set: &SparseSet) { |
2340 | let set = set.iter().collect::<Vec<StateID>>(); |
2341 | *self.state_sets.entry(set).or_insert(0) += 1; |
2342 | } |
2343 | |
2344 | fn record_step(&mut self, sid: StateID) { |
2345 | self.steps[sid] += 1; |
2346 | } |
2347 | |
2348 | fn record_closure(&mut self, sid: StateID) { |
2349 | self.closures[sid] += 1; |
2350 | } |
2351 | |
2352 | fn record_stack_push(&mut self, sid: StateID) { |
2353 | self.stack_pushes[sid] += 1; |
2354 | } |
2355 | |
2356 | fn record_set_insert(&mut self, sid: StateID) { |
2357 | self.set_inserts[sid] += 1; |
2358 | } |
2359 | } |
2360 | |