1 | use core::{fmt, mem}; |
2 | |
3 | use alloc::{boxed::Box, format, string::String, sync::Arc, vec, vec::Vec}; |
4 | |
5 | #[cfg (feature = "syntax" )] |
6 | use crate::nfa::thompson::{ |
7 | compiler::{Compiler, Config}, |
8 | error::BuildError, |
9 | }; |
10 | use crate::{ |
11 | nfa::thompson::builder::Builder, |
12 | util::{ |
13 | alphabet::{self, ByteClassSet, ByteClasses}, |
14 | captures::{GroupInfo, GroupInfoError}, |
15 | look::{Look, LookMatcher, LookSet}, |
16 | primitives::{ |
17 | IteratorIndexExt, PatternID, PatternIDIter, SmallIndex, StateID, |
18 | }, |
19 | sparse_set::SparseSet, |
20 | }, |
21 | }; |
22 | |
23 | /// A byte oriented Thompson non-deterministic finite automaton (NFA). |
24 | /// |
25 | /// A Thompson NFA is a finite state machine that permits unconditional epsilon |
26 | /// transitions, but guarantees that there exists at most one non-epsilon |
27 | /// transition for each element in the alphabet for each state. |
28 | /// |
29 | /// An NFA may be used directly for searching, for analysis or to build |
30 | /// a deterministic finite automaton (DFA). |
31 | /// |
32 | /// # Cheap clones |
33 | /// |
34 | /// Since an NFA is a core data type in this crate that many other regex |
35 | /// engines are based on top of, it is convenient to give ownership of an NFA |
36 | /// to said regex engines. Because of this, an NFA uses reference counting |
37 | /// internally. Therefore, it is cheap to clone and it is encouraged to do so. |
38 | /// |
39 | /// # Capabilities |
40 | /// |
41 | /// Using an NFA for searching via the |
42 | /// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) provides the most amount |
43 | /// of "power" of any regex engine in this crate. Namely, it supports the |
44 | /// following in all cases: |
45 | /// |
46 | /// 1. Detection of a match. |
47 | /// 2. Location of a match, including both the start and end offset, in a |
48 | /// single pass of the haystack. |
49 | /// 3. Location of matching capturing groups. |
50 | /// 4. Handles multiple patterns, including (1)-(3) when multiple patterns are |
51 | /// present. |
52 | /// |
53 | /// # Capturing Groups |
54 | /// |
55 | /// Groups refer to parenthesized expressions inside a regex pattern. They look |
56 | /// like this, where `exp` is an arbitrary regex: |
57 | /// |
58 | /// * `(exp)` - An unnamed capturing group. |
59 | /// * `(?P<name>exp)` or `(?<name>exp)` - A named capturing group. |
60 | /// * `(?:exp)` - A non-capturing group. |
61 | /// * `(?i:exp)` - A non-capturing group that sets flags. |
62 | /// |
63 | /// Only the first two forms are said to be _capturing_. Capturing |
64 | /// means that the last position at which they match is reportable. The |
65 | /// [`Captures`](crate::util::captures::Captures) type provides convenient |
66 | /// access to the match positions of capturing groups, which includes looking |
67 | /// up capturing groups by their name. |
68 | /// |
69 | /// # Byte oriented |
70 | /// |
71 | /// This NFA is byte oriented, which means that all of its transitions are |
72 | /// defined on bytes. In other words, the alphabet of an NFA consists of the |
73 | /// 256 different byte values. |
74 | /// |
75 | /// While DFAs nearly demand that they be byte oriented for performance |
76 | /// reasons, an NFA could conceivably be *Unicode codepoint* oriented. Indeed, |
77 | /// a previous version of this NFA supported both byte and codepoint oriented |
78 | /// modes. A codepoint oriented mode can work because an NFA fundamentally uses |
79 | /// a sparse representation of transitions, which works well with the large |
80 | /// sparse space of Unicode codepoints. |
81 | /// |
82 | /// Nevertheless, this NFA is only byte oriented. This choice is primarily |
83 | /// driven by implementation simplicity, and also in part memory usage. In |
84 | /// practice, performance between the two is roughly comparable. However, |
85 | /// building a DFA (including a hybrid DFA) really wants a byte oriented NFA. |
86 | /// So if we do have a codepoint oriented NFA, then we also need to generate |
87 | /// byte oriented NFA in order to build an hybrid NFA/DFA. Thus, by only |
88 | /// generating byte oriented NFAs, we can produce one less NFA. In other words, |
89 | /// if we made our NFA codepoint oriented, we'd need to *also* make it support |
90 | /// a byte oriented mode, which is more complicated. But a byte oriented mode |
91 | /// can support everything. |
92 | /// |
93 | /// # Differences with DFAs |
94 | /// |
95 | /// At the theoretical level, the precise difference between an NFA and a DFA |
96 | /// is that, in a DFA, for every state, an input symbol unambiguously refers |
97 | /// to a single transition _and_ that an input symbol is required for each |
98 | /// transition. At a practical level, this permits DFA implementations to be |
99 | /// implemented at their core with a small constant number of CPU instructions |
100 | /// for each byte of input searched. In practice, this makes them quite a bit |
101 | /// faster than NFAs _in general_. Namely, in order to execute a search for any |
102 | /// Thompson NFA, one needs to keep track of a _set_ of states, and execute |
103 | /// the possible transitions on all of those states for each input symbol. |
104 | /// Overall, this results in much more overhead. To a first approximation, one |
105 | /// can expect DFA searches to be about an order of magnitude faster. |
106 | /// |
107 | /// So why use an NFA at all? The main advantage of an NFA is that it takes |
108 | /// linear time (in the size of the pattern string after repetitions have been |
109 | /// expanded) to build and linear memory usage. A DFA, on the other hand, may |
110 | /// take exponential time and/or space to build. Even in non-pathological |
111 | /// cases, DFAs often take quite a bit more memory than their NFA counterparts, |
112 | /// _especially_ if large Unicode character classes are involved. Of course, |
113 | /// an NFA also provides additional capabilities. For example, it can match |
114 | /// Unicode word boundaries on non-ASCII text and resolve the positions of |
115 | /// capturing groups. |
116 | /// |
117 | /// Note that a [`hybrid::regex::Regex`](crate::hybrid::regex::Regex) strikes a |
118 | /// good balance between an NFA and a DFA. It avoids the exponential build time |
119 | /// of a DFA while maintaining its fast search time. The downside of a hybrid |
120 | /// NFA/DFA is that in some cases it can be slower at search time than the NFA. |
121 | /// (It also has less functionality than a pure NFA. It cannot handle Unicode |
122 | /// word boundaries on non-ASCII text and cannot resolve capturing groups.) |
123 | /// |
124 | /// # Example |
125 | /// |
126 | /// This shows how to build an NFA with the default configuration and execute a |
127 | /// search using the Pike VM. |
128 | /// |
129 | /// ``` |
130 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; |
131 | /// |
132 | /// let re = PikeVM::new(r"foo[0-9]+" )?; |
133 | /// let mut cache = re.create_cache(); |
134 | /// let mut caps = re.create_captures(); |
135 | /// |
136 | /// let expected = Some(Match::must(0, 0..8)); |
137 | /// re.captures(&mut cache, b"foo12345" , &mut caps); |
138 | /// assert_eq!(expected, caps.get_match()); |
139 | /// |
140 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
141 | /// ``` |
142 | /// |
143 | /// # Example: resolving capturing groups |
144 | /// |
145 | /// This example shows how to parse some simple dates and extract the |
146 | /// components of each date via capturing groups. |
147 | /// |
148 | /// ``` |
149 | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
150 | /// use regex_automata::{ |
151 | /// nfa::thompson::pikevm::PikeVM, |
152 | /// util::captures::Captures, |
153 | /// }; |
154 | /// |
155 | /// let vm = PikeVM::new(r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})" )?; |
156 | /// let mut cache = vm.create_cache(); |
157 | /// |
158 | /// let haystack = "2012-03-14, 2013-01-01 and 2014-07-05" ; |
159 | /// let all: Vec<Captures> = vm.captures_iter( |
160 | /// &mut cache, haystack.as_bytes() |
161 | /// ).collect(); |
162 | /// // There should be a total of 3 matches. |
163 | /// assert_eq!(3, all.len()); |
164 | /// // The year from the second match is '2013'. |
165 | /// let span = all[1].get_group_by_name("y" ).unwrap(); |
166 | /// assert_eq!("2013" , &haystack[span]); |
167 | /// |
168 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
169 | /// ``` |
170 | /// |
171 | /// This example shows that only the last match of a capturing group is |
172 | /// reported, even if it had to match multiple times for an overall match |
173 | /// to occur. |
174 | /// |
175 | /// ``` |
176 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span}; |
177 | /// |
178 | /// let re = PikeVM::new(r"([a-z]){4}" )?; |
179 | /// let mut cache = re.create_cache(); |
180 | /// let mut caps = re.create_captures(); |
181 | /// |
182 | /// let haystack = b"quux" ; |
183 | /// re.captures(&mut cache, haystack, &mut caps); |
184 | /// assert!(caps.is_match()); |
185 | /// assert_eq!(Some(Span::from(3..4)), caps.get_group(1)); |
186 | /// |
187 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
188 | /// ``` |
189 | #[derive(Clone)] |
190 | pub struct NFA( |
191 | // We make NFAs reference counted primarily for two reasons. First is that |
192 | // the NFA type itself is quite large (at least 0.5KB), and so it makes |
193 | // sense to put it on the heap by default anyway. Second is that, for Arc |
194 | // specifically, this enables cheap clones. This tends to be useful because |
195 | // several structures (the backtracker, the Pike VM, the hybrid NFA/DFA) |
196 | // all want to hang on to an NFA for use during search time. We could |
197 | // provide the NFA at search time via a function argument, but this makes |
198 | // for an unnecessarily annoying API. Instead, we just let each structure |
199 | // share ownership of the NFA. Using a deep clone would not be smart, since |
200 | // the NFA can use quite a bit of heap space. |
201 | Arc<Inner>, |
202 | ); |
203 | |
204 | impl NFA { |
205 | /// Parse the given regular expression using a default configuration and |
206 | /// build an NFA from it. |
207 | /// |
208 | /// If you want a non-default configuration, then use the NFA |
209 | /// [`Compiler`] with a [`Config`]. |
210 | /// |
211 | /// # Example |
212 | /// |
213 | /// ``` |
214 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; |
215 | /// |
216 | /// let re = PikeVM::new(r"foo[0-9]+" )?; |
217 | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
218 | /// |
219 | /// let expected = Some(Match::must(0, 0..8)); |
220 | /// re.captures(&mut cache, b"foo12345" , &mut caps); |
221 | /// assert_eq!(expected, caps.get_match()); |
222 | /// |
223 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
224 | /// ``` |
225 | #[cfg (feature = "syntax" )] |
226 | pub fn new(pattern: &str) -> Result<NFA, BuildError> { |
227 | NFA::compiler().build(pattern) |
228 | } |
229 | |
230 | /// Parse the given regular expressions using a default configuration and |
231 | /// build a multi-NFA from them. |
232 | /// |
233 | /// If you want a non-default configuration, then use the NFA |
234 | /// [`Compiler`] with a [`Config`]. |
235 | /// |
236 | /// # Example |
237 | /// |
238 | /// ``` |
239 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; |
240 | /// |
241 | /// let re = PikeVM::new_many(&["[0-9]+" , "[a-z]+" ])?; |
242 | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
243 | /// |
244 | /// let expected = Some(Match::must(1, 0..3)); |
245 | /// re.captures(&mut cache, b"foo12345bar" , &mut caps); |
246 | /// assert_eq!(expected, caps.get_match()); |
247 | /// |
248 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
249 | /// ``` |
250 | #[cfg (feature = "syntax" )] |
251 | pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<NFA, BuildError> { |
252 | NFA::compiler().build_many(patterns) |
253 | } |
254 | |
255 | /// Returns an NFA with a single regex pattern that always matches at every |
256 | /// position. |
257 | /// |
258 | /// # Example |
259 | /// |
260 | /// ``` |
261 | /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match}; |
262 | /// |
263 | /// let re = PikeVM::new_from_nfa(NFA::always_match())?; |
264 | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
265 | /// |
266 | /// let expected = Some(Match::must(0, 0..0)); |
267 | /// re.captures(&mut cache, b"" , &mut caps); |
268 | /// assert_eq!(expected, caps.get_match()); |
269 | /// re.captures(&mut cache, b"foo" , &mut caps); |
270 | /// assert_eq!(expected, caps.get_match()); |
271 | /// |
272 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
273 | /// ``` |
274 | pub fn always_match() -> NFA { |
275 | // We could use NFA::new("") here and we'd get the same semantics, but |
276 | // hand-assembling the NFA (as below) does the same thing with a fewer |
277 | // number of states. It also avoids needing the 'syntax' feature |
278 | // enabled. |
279 | // |
280 | // Technically all we need is the "match" state, but we add the |
281 | // "capture" states so that the PikeVM can use this NFA. |
282 | // |
283 | // The unwraps below are OK because we add so few states that they will |
284 | // never exhaust any default limits in any environment. |
285 | let mut builder = Builder::new(); |
286 | let pid = builder.start_pattern().unwrap(); |
287 | assert_eq!(pid.as_usize(), 0); |
288 | let start_id = |
289 | builder.add_capture_start(StateID::ZERO, 0, None).unwrap(); |
290 | let end_id = builder.add_capture_end(StateID::ZERO, 0).unwrap(); |
291 | let match_id = builder.add_match().unwrap(); |
292 | builder.patch(start_id, end_id).unwrap(); |
293 | builder.patch(end_id, match_id).unwrap(); |
294 | let pid = builder.finish_pattern(start_id).unwrap(); |
295 | assert_eq!(pid.as_usize(), 0); |
296 | builder.build(start_id, start_id).unwrap() |
297 | } |
298 | |
299 | /// Returns an NFA that never matches at any position. |
300 | /// |
301 | /// This is a convenience routine for creating an NFA with zero patterns. |
302 | /// |
303 | /// # Example |
304 | /// |
305 | /// ``` |
306 | /// use regex_automata::nfa::thompson::{NFA, pikevm::PikeVM}; |
307 | /// |
308 | /// let re = PikeVM::new_from_nfa(NFA::never_match())?; |
309 | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
310 | /// |
311 | /// re.captures(&mut cache, b"" , &mut caps); |
312 | /// assert!(!caps.is_match()); |
313 | /// re.captures(&mut cache, b"foo" , &mut caps); |
314 | /// assert!(!caps.is_match()); |
315 | /// |
316 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
317 | /// ``` |
318 | pub fn never_match() -> NFA { |
319 | // This always succeeds because it only requires one NFA state, which |
320 | // will never exhaust any (default) limits. |
321 | let mut builder = Builder::new(); |
322 | let sid = builder.add_fail().unwrap(); |
323 | builder.build(sid, sid).unwrap() |
324 | } |
325 | |
326 | /// Return a default configuration for an `NFA`. |
327 | /// |
328 | /// This is a convenience routine to avoid needing to import the `Config` |
329 | /// type when customizing the construction of an NFA. |
330 | /// |
331 | /// # Example |
332 | /// |
333 | /// This example shows how to build an NFA with a small size limit that |
334 | /// results in a compilation error for any regex that tries to use more |
335 | /// heap memory than the configured limit. |
336 | /// |
337 | /// ``` |
338 | /// use regex_automata::nfa::thompson::{NFA, pikevm::PikeVM}; |
339 | /// |
340 | /// let result = PikeVM::builder() |
341 | /// .thompson(NFA::config().nfa_size_limit(Some(1_000))) |
342 | /// // Remember, \w is Unicode-aware by default and thus huge. |
343 | /// .build(r"\w+" ); |
344 | /// assert!(result.is_err()); |
345 | /// ``` |
346 | #[cfg (feature = "syntax" )] |
347 | pub fn config() -> Config { |
348 | Config::new() |
349 | } |
350 | |
351 | /// Return a compiler for configuring the construction of an `NFA`. |
352 | /// |
353 | /// This is a convenience routine to avoid needing to import the |
354 | /// [`Compiler`] type in common cases. |
355 | /// |
356 | /// # Example |
357 | /// |
358 | /// This example shows how to build an NFA that is permitted match invalid |
359 | /// UTF-8. Without the additional syntax configuration here, compilation of |
360 | /// `(?-u:.)` would fail because it is permitted to match invalid UTF-8. |
361 | /// |
362 | /// ``` |
363 | /// use regex_automata::{ |
364 | /// nfa::thompson::pikevm::PikeVM, |
365 | /// util::syntax, |
366 | /// Match, |
367 | /// }; |
368 | /// |
369 | /// let re = PikeVM::builder() |
370 | /// .syntax(syntax::Config::new().utf8(false)) |
371 | /// .build(r"[a-z]+(?-u:.)" )?; |
372 | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
373 | /// |
374 | /// let expected = Some(Match::must(0, 1..5)); |
375 | /// re.captures(&mut cache, b" \xFFabc \xFF" , &mut caps); |
376 | /// assert_eq!(expected, caps.get_match()); |
377 | /// |
378 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
379 | /// ``` |
380 | #[cfg (feature = "syntax" )] |
381 | pub fn compiler() -> Compiler { |
382 | Compiler::new() |
383 | } |
384 | |
385 | /// Returns an iterator over all pattern identifiers in this NFA. |
386 | /// |
387 | /// Pattern IDs are allocated in sequential order starting from zero, |
388 | /// where the order corresponds to the order of patterns provided to the |
389 | /// [`NFA::new_many`] constructor. |
390 | /// |
391 | /// # Example |
392 | /// |
393 | /// ``` |
394 | /// use regex_automata::{nfa::thompson::NFA, PatternID}; |
395 | /// |
396 | /// let nfa = NFA::new_many(&["[0-9]+" , "[a-z]+" , "[A-Z]+" ])?; |
397 | /// let pids: Vec<PatternID> = nfa.patterns().collect(); |
398 | /// assert_eq!(pids, vec![ |
399 | /// PatternID::must(0), |
400 | /// PatternID::must(1), |
401 | /// PatternID::must(2), |
402 | /// ]); |
403 | /// |
404 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
405 | /// ``` |
406 | pub fn patterns(&self) -> PatternIter<'_> { |
407 | PatternIter { |
408 | it: PatternID::iter(self.pattern_len()), |
409 | _marker: core::marker::PhantomData, |
410 | } |
411 | } |
412 | |
413 | /// Returns the total number of regex patterns in this NFA. |
414 | /// |
415 | /// This may return zero if the NFA was constructed with no patterns. In |
416 | /// this case, the NFA can never produce a match for any input. |
417 | /// |
418 | /// This is guaranteed to be no bigger than [`PatternID::LIMIT`] because |
419 | /// NFA construction will fail if too many patterns are added. |
420 | /// |
421 | /// It is always true that `nfa.patterns().count() == nfa.pattern_len()`. |
422 | /// |
423 | /// # Example |
424 | /// |
425 | /// ``` |
426 | /// use regex_automata::nfa::thompson::NFA; |
427 | /// |
428 | /// let nfa = NFA::new_many(&["[0-9]+" , "[a-z]+" , "[A-Z]+" ])?; |
429 | /// assert_eq!(3, nfa.pattern_len()); |
430 | /// |
431 | /// let nfa = NFA::never_match(); |
432 | /// assert_eq!(0, nfa.pattern_len()); |
433 | /// |
434 | /// let nfa = NFA::always_match(); |
435 | /// assert_eq!(1, nfa.pattern_len()); |
436 | /// |
437 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
438 | /// ``` |
439 | #[inline ] |
440 | pub fn pattern_len(&self) -> usize { |
441 | self.0.start_pattern.len() |
442 | } |
443 | |
444 | /// Return the state identifier of the initial anchored state of this NFA. |
445 | /// |
446 | /// The returned identifier is guaranteed to be a valid index into the |
447 | /// slice returned by [`NFA::states`], and is also a valid argument to |
448 | /// [`NFA::state`]. |
449 | /// |
450 | /// # Example |
451 | /// |
452 | /// This example shows a somewhat contrived example where we can easily |
453 | /// predict the anchored starting state. |
454 | /// |
455 | /// ``` |
456 | /// use regex_automata::nfa::thompson::{NFA, State, WhichCaptures}; |
457 | /// |
458 | /// let nfa = NFA::compiler() |
459 | /// .configure(NFA::config().which_captures(WhichCaptures::None)) |
460 | /// .build("a" )?; |
461 | /// let state = nfa.state(nfa.start_anchored()); |
462 | /// match *state { |
463 | /// State::ByteRange { trans } => { |
464 | /// assert_eq!(b'a' , trans.start); |
465 | /// assert_eq!(b'a' , trans.end); |
466 | /// } |
467 | /// _ => unreachable!("unexpected state" ), |
468 | /// } |
469 | /// |
470 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
471 | /// ``` |
472 | #[inline ] |
473 | pub fn start_anchored(&self) -> StateID { |
474 | self.0.start_anchored |
475 | } |
476 | |
477 | /// Return the state identifier of the initial unanchored state of this |
478 | /// NFA. |
479 | /// |
480 | /// This is equivalent to the identifier returned by |
481 | /// [`NFA::start_anchored`] when the NFA has no unanchored starting state. |
482 | /// |
483 | /// The returned identifier is guaranteed to be a valid index into the |
484 | /// slice returned by [`NFA::states`], and is also a valid argument to |
485 | /// [`NFA::state`]. |
486 | /// |
487 | /// # Example |
488 | /// |
489 | /// This example shows that the anchored and unanchored starting states |
490 | /// are equivalent when an anchored NFA is built. |
491 | /// |
492 | /// ``` |
493 | /// use regex_automata::nfa::thompson::NFA; |
494 | /// |
495 | /// let nfa = NFA::new("^a" )?; |
496 | /// assert_eq!(nfa.start_anchored(), nfa.start_unanchored()); |
497 | /// |
498 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
499 | /// ``` |
500 | #[inline ] |
501 | pub fn start_unanchored(&self) -> StateID { |
502 | self.0.start_unanchored |
503 | } |
504 | |
505 | /// Return the state identifier of the initial anchored state for the given |
506 | /// pattern, or `None` if there is no pattern corresponding to the given |
507 | /// identifier. |
508 | /// |
509 | /// If one uses the starting state for a particular pattern, then the only |
510 | /// match that can be returned is for the corresponding pattern. |
511 | /// |
512 | /// The returned identifier is guaranteed to be a valid index into the |
513 | /// slice returned by [`NFA::states`], and is also a valid argument to |
514 | /// [`NFA::state`]. |
515 | /// |
516 | /// # Errors |
517 | /// |
518 | /// If the pattern doesn't exist in this NFA, then this returns an error. |
519 | /// This occurs when `pid.as_usize() >= nfa.pattern_len()`. |
520 | /// |
521 | /// # Example |
522 | /// |
523 | /// This example shows that the anchored and unanchored starting states |
524 | /// are equivalent when an anchored NFA is built. |
525 | /// |
526 | /// ``` |
527 | /// use regex_automata::{nfa::thompson::NFA, PatternID}; |
528 | /// |
529 | /// let nfa = NFA::new_many(&["^a" , "^b" ])?; |
530 | /// // The anchored and unanchored states for the entire NFA are the same, |
531 | /// // since all of the patterns are anchored. |
532 | /// assert_eq!(nfa.start_anchored(), nfa.start_unanchored()); |
533 | /// // But the anchored starting states for each pattern are distinct, |
534 | /// // because these starting states can only lead to matches for the |
535 | /// // corresponding pattern. |
536 | /// let anchored = Some(nfa.start_anchored()); |
537 | /// assert_ne!(anchored, nfa.start_pattern(PatternID::must(0))); |
538 | /// assert_ne!(anchored, nfa.start_pattern(PatternID::must(1))); |
539 | /// // Requesting a pattern not in the NFA will result in None: |
540 | /// assert_eq!(None, nfa.start_pattern(PatternID::must(2))); |
541 | /// |
542 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
543 | /// ``` |
544 | #[inline ] |
545 | pub fn start_pattern(&self, pid: PatternID) -> Option<StateID> { |
546 | self.0.start_pattern.get(pid.as_usize()).copied() |
547 | } |
548 | |
549 | /// Get the byte class set for this NFA. |
550 | /// |
551 | /// A byte class set is a partitioning of this NFA's alphabet into |
552 | /// equivalence classes. Any two bytes in the same equivalence class are |
553 | /// guaranteed to never discriminate between a match or a non-match. (The |
554 | /// partitioning may not be minimal.) |
555 | /// |
556 | /// Byte classes are used internally by this crate when building DFAs. |
557 | /// Namely, among other optimizations, they enable a space optimization |
558 | /// where the DFA's internal alphabet is defined over the equivalence |
559 | /// classes of bytes instead of all possible byte values. The former is |
560 | /// often quite a bit smaller than the latter, which permits the DFA to use |
561 | /// less space for its transition table. |
562 | #[inline ] |
563 | pub(crate) fn byte_class_set(&self) -> &ByteClassSet { |
564 | &self.0.byte_class_set |
565 | } |
566 | |
567 | /// Get the byte classes for this NFA. |
568 | /// |
569 | /// Byte classes represent a partitioning of this NFA's alphabet into |
570 | /// equivalence classes. Any two bytes in the same equivalence class are |
571 | /// guaranteed to never discriminate between a match or a non-match. (The |
572 | /// partitioning may not be minimal.) |
573 | /// |
574 | /// Byte classes are used internally by this crate when building DFAs. |
575 | /// Namely, among other optimizations, they enable a space optimization |
576 | /// where the DFA's internal alphabet is defined over the equivalence |
577 | /// classes of bytes instead of all possible byte values. The former is |
578 | /// often quite a bit smaller than the latter, which permits the DFA to use |
579 | /// less space for its transition table. |
580 | /// |
581 | /// # Example |
582 | /// |
583 | /// This example shows how to query the class of various bytes. |
584 | /// |
585 | /// ``` |
586 | /// use regex_automata::nfa::thompson::NFA; |
587 | /// |
588 | /// let nfa = NFA::new("[a-z]+" )?; |
589 | /// let classes = nfa.byte_classes(); |
590 | /// // 'a' and 'z' are in the same class for this regex. |
591 | /// assert_eq!(classes.get(b'a' ), classes.get(b'z' )); |
592 | /// // But 'a' and 'A' are not. |
593 | /// assert_ne!(classes.get(b'a' ), classes.get(b'A' )); |
594 | /// |
595 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
596 | /// ``` |
597 | #[inline ] |
598 | pub fn byte_classes(&self) -> &ByteClasses { |
599 | &self.0.byte_classes |
600 | } |
601 | |
602 | /// Return a reference to the NFA state corresponding to the given ID. |
603 | /// |
604 | /// This is a convenience routine for `nfa.states()[id]`. |
605 | /// |
606 | /// # Panics |
607 | /// |
608 | /// This panics when the given identifier does not reference a valid state. |
609 | /// That is, when `id.as_usize() >= nfa.states().len()`. |
610 | /// |
611 | /// # Example |
612 | /// |
613 | /// The anchored state for a pattern will typically correspond to a |
614 | /// capturing state for that pattern. (Although, this is not an API |
615 | /// guarantee!) |
616 | /// |
617 | /// ``` |
618 | /// use regex_automata::{nfa::thompson::{NFA, State}, PatternID}; |
619 | /// |
620 | /// let nfa = NFA::new("a" )?; |
621 | /// let state = nfa.state(nfa.start_pattern(PatternID::ZERO).unwrap()); |
622 | /// match *state { |
623 | /// State::Capture { slot, .. } => { |
624 | /// assert_eq!(0, slot.as_usize()); |
625 | /// } |
626 | /// _ => unreachable!("unexpected state" ), |
627 | /// } |
628 | /// |
629 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
630 | /// ``` |
631 | #[inline ] |
632 | pub fn state(&self, id: StateID) -> &State { |
633 | &self.states()[id] |
634 | } |
635 | |
636 | /// Returns a slice of all states in this NFA. |
637 | /// |
638 | /// The slice returned is indexed by `StateID`. This provides a convenient |
639 | /// way to access states while following transitions among those states. |
640 | /// |
641 | /// # Example |
642 | /// |
643 | /// This demonstrates that disabling UTF-8 mode can shrink the size of the |
644 | /// NFA considerably in some cases, especially when using Unicode character |
645 | /// classes. |
646 | /// |
647 | /// ``` |
648 | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
649 | /// use regex_automata::nfa::thompson::NFA; |
650 | /// |
651 | /// let nfa_unicode = NFA::new(r"\w" )?; |
652 | /// let nfa_ascii = NFA::new(r"(?-u)\w" )?; |
653 | /// // Yes, a factor of 45 difference. No lie. |
654 | /// assert!(40 * nfa_ascii.states().len() < nfa_unicode.states().len()); |
655 | /// |
656 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
657 | /// ``` |
658 | #[inline ] |
659 | pub fn states(&self) -> &[State] { |
660 | &self.0.states |
661 | } |
662 | |
663 | /// Returns the capturing group info for this NFA. |
664 | /// |
665 | /// The [`GroupInfo`] provides a way to map to and from capture index |
666 | /// and capture name for each pattern. It also provides a mapping from |
667 | /// each of the capturing groups in every pattern to their corresponding |
668 | /// slot offsets encoded in [`State::Capture`] states. |
669 | /// |
670 | /// Note that `GroupInfo` uses reference counting internally, such that |
671 | /// cloning a `GroupInfo` is very cheap. |
672 | /// |
673 | /// # Example |
674 | /// |
675 | /// This example shows how to get a list of all capture group names for |
676 | /// a particular pattern. |
677 | /// |
678 | /// ``` |
679 | /// use regex_automata::{nfa::thompson::NFA, PatternID}; |
680 | /// |
681 | /// let nfa = NFA::new(r"(a)(?P<foo>b)(c)(d)(?P<bar>e)" )?; |
682 | /// // The first is the implicit group that is always unnammed. The next |
683 | /// // 5 groups are the explicit groups found in the concrete syntax above. |
684 | /// let expected = vec![None, None, Some("foo" ), None, None, Some("bar" )]; |
685 | /// let got: Vec<Option<&str>> = |
686 | /// nfa.group_info().pattern_names(PatternID::ZERO).collect(); |
687 | /// assert_eq!(expected, got); |
688 | /// |
689 | /// // Using an invalid pattern ID will result in nothing yielded. |
690 | /// let got = nfa.group_info().pattern_names(PatternID::must(999)).count(); |
691 | /// assert_eq!(0, got); |
692 | /// |
693 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
694 | /// ``` |
695 | #[inline ] |
696 | pub fn group_info(&self) -> &GroupInfo { |
697 | &self.0.group_info() |
698 | } |
699 | |
700 | /// Returns true if and only if this NFA has at least one |
701 | /// [`Capture`](State::Capture) in its sequence of states. |
702 | /// |
703 | /// This is useful as a way to perform a quick test before attempting |
704 | /// something that does or does not require capture states. For example, |
705 | /// some regex engines (like the PikeVM) require capture states in order to |
706 | /// work at all. |
707 | /// |
708 | /// # Example |
709 | /// |
710 | /// This example shows a few different NFAs and whether they have captures |
711 | /// or not. |
712 | /// |
713 | /// ``` |
714 | /// use regex_automata::nfa::thompson::{NFA, WhichCaptures}; |
715 | /// |
716 | /// // Obviously has capture states. |
717 | /// let nfa = NFA::new("(a)" )?; |
718 | /// assert!(nfa.has_capture()); |
719 | /// |
720 | /// // Less obviously has capture states, because every pattern has at |
721 | /// // least one anonymous capture group corresponding to the match for the |
722 | /// // entire pattern. |
723 | /// let nfa = NFA::new("a" )?; |
724 | /// assert!(nfa.has_capture()); |
725 | /// |
726 | /// // Other than hand building your own NFA, this is the only way to build |
727 | /// // an NFA without capturing groups. In general, you should only do this |
728 | /// // if you don't intend to use any of the NFA-oriented regex engines. |
729 | /// // Overall, capturing groups don't have many downsides. Although they |
730 | /// // can add a bit of noise to simple NFAs, so it can be nice to disable |
731 | /// // them for debugging purposes. |
732 | /// // |
733 | /// // Notice that 'has_capture' is false here even when we have an |
734 | /// // explicit capture group in the pattern. |
735 | /// let nfa = NFA::compiler() |
736 | /// .configure(NFA::config().which_captures(WhichCaptures::None)) |
737 | /// .build("(a)" )?; |
738 | /// assert!(!nfa.has_capture()); |
739 | /// |
740 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
741 | /// ``` |
742 | #[inline ] |
743 | pub fn has_capture(&self) -> bool { |
744 | self.0.has_capture |
745 | } |
746 | |
747 | /// Returns true if and only if this NFA can match the empty string. |
748 | /// When it returns false, all possible matches are guaranteed to have a |
749 | /// non-zero length. |
750 | /// |
751 | /// This is useful as cheap way to know whether code needs to handle the |
752 | /// case of a zero length match. This is particularly important when UTF-8 |
753 | /// modes are enabled, as when UTF-8 mode is enabled, empty matches that |
754 | /// split a codepoint must never be reported. This extra handling can |
755 | /// sometimes be costly, and since regexes matching an empty string are |
756 | /// somewhat rare, it can be beneficial to treat such regexes specially. |
757 | /// |
758 | /// # Example |
759 | /// |
760 | /// This example shows a few different NFAs and whether they match the |
761 | /// empty string or not. Notice the empty string isn't merely a matter |
762 | /// of a string of length literally `0`, but rather, whether a match can |
763 | /// occur between specific pairs of bytes. |
764 | /// |
765 | /// ``` |
766 | /// use regex_automata::{nfa::thompson::NFA, util::syntax}; |
767 | /// |
768 | /// // The empty regex matches the empty string. |
769 | /// let nfa = NFA::new("" )?; |
770 | /// assert!(nfa.has_empty(), "empty matches empty" ); |
771 | /// // The '+' repetition operator requires at least one match, and so |
772 | /// // does not match the empty string. |
773 | /// let nfa = NFA::new("a+" )?; |
774 | /// assert!(!nfa.has_empty(), "+ does not match empty" ); |
775 | /// // But the '*' repetition operator does. |
776 | /// let nfa = NFA::new("a*" )?; |
777 | /// assert!(nfa.has_empty(), "* does match empty" ); |
778 | /// // And wrapping '+' in an operator that can match an empty string also |
779 | /// // causes it to match the empty string too. |
780 | /// let nfa = NFA::new("(a+)*" )?; |
781 | /// assert!(nfa.has_empty(), "+ inside of * matches empty" ); |
782 | /// |
783 | /// // If a regex is just made of a look-around assertion, even if the |
784 | /// // assertion requires some kind of non-empty string around it (such as |
785 | /// // \b), then it is still treated as if it matches the empty string. |
786 | /// // Namely, if a match occurs of just a look-around assertion, then the |
787 | /// // match returned is empty. |
788 | /// let nfa = NFA::compiler() |
789 | /// .syntax(syntax::Config::new().utf8(false)) |
790 | /// .build(r"^$\A\z\b\B(?-u:\b\B)" )?; |
791 | /// assert!(nfa.has_empty(), "assertions match empty" ); |
792 | /// // Even when an assertion is wrapped in a '+', it still matches the |
793 | /// // empty string. |
794 | /// let nfa = NFA::new(r"\b+" )?; |
795 | /// assert!(nfa.has_empty(), "+ of an assertion matches empty" ); |
796 | /// |
797 | /// // An alternation with even one branch that can match the empty string |
798 | /// // is also said to match the empty string overall. |
799 | /// let nfa = NFA::new("foo|(bar)?|quux" )?; |
800 | /// assert!(nfa.has_empty(), "alternations can match empty" ); |
801 | /// |
802 | /// // An NFA that matches nothing does not match the empty string. |
803 | /// let nfa = NFA::new("[a&&b]" )?; |
804 | /// assert!(!nfa.has_empty(), "never matching means not matching empty" ); |
805 | /// // But if it's wrapped in something that doesn't require a match at |
806 | /// // all, then it can match the empty string! |
807 | /// let nfa = NFA::new("[a&&b]*" )?; |
808 | /// assert!(nfa.has_empty(), "* on never-match still matches empty" ); |
809 | /// // Since a '+' requires a match, using it on something that can never |
810 | /// // match will itself produce a regex that can never match anything, |
811 | /// // and thus does not match the empty string. |
812 | /// let nfa = NFA::new("[a&&b]+" )?; |
813 | /// assert!(!nfa.has_empty(), "+ on never-match still matches nothing" ); |
814 | /// |
815 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
816 | /// ``` |
817 | #[inline ] |
818 | pub fn has_empty(&self) -> bool { |
819 | self.0.has_empty |
820 | } |
821 | |
822 | /// Whether UTF-8 mode is enabled for this NFA or not. |
823 | /// |
824 | /// When UTF-8 mode is enabled, all matches reported by a regex engine |
825 | /// derived from this NFA are guaranteed to correspond to spans of valid |
826 | /// UTF-8. This includes zero-width matches. For example, the regex engine |
827 | /// must guarantee that the empty regex will not match at the positions |
828 | /// between code units in the UTF-8 encoding of a single codepoint. |
829 | /// |
830 | /// See [`Config::utf8`] for more information. |
831 | /// |
832 | /// This is enabled by default. |
833 | /// |
834 | /// # Example |
835 | /// |
836 | /// This example shows how UTF-8 mode can impact the match spans that may |
837 | /// be reported in certain cases. |
838 | /// |
839 | /// ``` |
840 | /// use regex_automata::{ |
841 | /// nfa::thompson::{self, pikevm::PikeVM}, |
842 | /// Match, Input, |
843 | /// }; |
844 | /// |
845 | /// let re = PikeVM::new("" )?; |
846 | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
847 | /// |
848 | /// // UTF-8 mode is enabled by default. |
849 | /// let mut input = Input::new("☃" ); |
850 | /// re.search(&mut cache, &input, &mut caps); |
851 | /// assert_eq!(Some(Match::must(0, 0..0)), caps.get_match()); |
852 | /// |
853 | /// // Even though an empty regex matches at 1..1, our next match is |
854 | /// // 3..3 because 1..1 and 2..2 split the snowman codepoint (which is |
855 | /// // three bytes long). |
856 | /// input.set_start(1); |
857 | /// re.search(&mut cache, &input, &mut caps); |
858 | /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match()); |
859 | /// |
860 | /// // But if we disable UTF-8, then we'll get matches at 1..1 and 2..2: |
861 | /// let re = PikeVM::builder() |
862 | /// .thompson(thompson::Config::new().utf8(false)) |
863 | /// .build("" )?; |
864 | /// re.search(&mut cache, &input, &mut caps); |
865 | /// assert_eq!(Some(Match::must(0, 1..1)), caps.get_match()); |
866 | /// |
867 | /// input.set_start(2); |
868 | /// re.search(&mut cache, &input, &mut caps); |
869 | /// assert_eq!(Some(Match::must(0, 2..2)), caps.get_match()); |
870 | /// |
871 | /// input.set_start(3); |
872 | /// re.search(&mut cache, &input, &mut caps); |
873 | /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match()); |
874 | /// |
875 | /// input.set_start(4); |
876 | /// re.search(&mut cache, &input, &mut caps); |
877 | /// assert_eq!(None, caps.get_match()); |
878 | /// |
879 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
880 | /// ``` |
881 | #[inline ] |
882 | pub fn is_utf8(&self) -> bool { |
883 | self.0.utf8 |
884 | } |
885 | |
886 | /// Returns true when this NFA is meant to be matched in reverse. |
887 | /// |
888 | /// Generally speaking, when this is true, it means the NFA is supposed to |
889 | /// be used in conjunction with moving backwards through the haystack. That |
890 | /// is, from a higher memory address to a lower memory address. |
891 | /// |
892 | /// It is often the case that lower level routines dealing with an NFA |
893 | /// don't need to care about whether it is "meant" to be matched in reverse |
894 | /// or not. However, there are some specific cases where it matters. For |
895 | /// example, the implementation of CRLF-aware `^` and `$` line anchors |
896 | /// needs to know whether the search is in the forward or reverse |
897 | /// direction. In the forward direction, neither `^` nor `$` should match |
898 | /// when a `\r` has been seen previously and a `\n` is next. However, in |
899 | /// the reverse direction, neither `^` nor `$` should match when a `\n` |
900 | /// has been seen previously and a `\r` is next. This fundamentally changes |
901 | /// how the state machine is constructed, and thus needs to be altered |
902 | /// based on the direction of the search. |
903 | /// |
904 | /// This is automatically set when using a [`Compiler`] with a configuration |
905 | /// where [`Config::reverse`] is enabled. If you're building your own NFA |
906 | /// by hand via a [`Builder`] |
907 | #[inline ] |
908 | pub fn is_reverse(&self) -> bool { |
909 | self.0.reverse |
910 | } |
911 | |
912 | /// Returns true if and only if all starting states for this NFA correspond |
913 | /// to the beginning of an anchored search. |
914 | /// |
915 | /// Typically, an NFA will have both an anchored and an unanchored starting |
916 | /// state. Namely, because it tends to be useful to have both and the cost |
917 | /// of having an unanchored starting state is almost zero (for an NFA). |
918 | /// However, if all patterns in the NFA are themselves anchored, then even |
919 | /// the unanchored starting state will correspond to an anchored search |
920 | /// since the pattern doesn't permit anything else. |
921 | /// |
922 | /// # Example |
923 | /// |
924 | /// This example shows a few different scenarios where this method's |
925 | /// return value varies. |
926 | /// |
927 | /// ``` |
928 | /// use regex_automata::nfa::thompson::NFA; |
929 | /// |
930 | /// // The unanchored starting state permits matching this pattern anywhere |
931 | /// // in a haystack, instead of just at the beginning. |
932 | /// let nfa = NFA::new("a" )?; |
933 | /// assert!(!nfa.is_always_start_anchored()); |
934 | /// |
935 | /// // In this case, the pattern is itself anchored, so there is no way |
936 | /// // to run an unanchored search. |
937 | /// let nfa = NFA::new("^a" )?; |
938 | /// assert!(nfa.is_always_start_anchored()); |
939 | /// |
940 | /// // When multiline mode is enabled, '^' can match at the start of a line |
941 | /// // in addition to the start of a haystack, so an unanchored search is |
942 | /// // actually possible. |
943 | /// let nfa = NFA::new("(?m)^a" )?; |
944 | /// assert!(!nfa.is_always_start_anchored()); |
945 | /// |
946 | /// // Weird cases also work. A pattern is only considered anchored if all |
947 | /// // matches may only occur at the start of a haystack. |
948 | /// let nfa = NFA::new("(^a)|a" )?; |
949 | /// assert!(!nfa.is_always_start_anchored()); |
950 | /// |
951 | /// // When multiple patterns are present, if they are all anchored, then |
952 | /// // the NFA is always anchored too. |
953 | /// let nfa = NFA::new_many(&["^a" , "^b" , "^c" ])?; |
954 | /// assert!(nfa.is_always_start_anchored()); |
955 | /// |
956 | /// // But if one pattern is unanchored, then the NFA must permit an |
957 | /// // unanchored search. |
958 | /// let nfa = NFA::new_many(&["^a" , "b" , "^c" ])?; |
959 | /// assert!(!nfa.is_always_start_anchored()); |
960 | /// |
961 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
962 | /// ``` |
963 | #[inline ] |
964 | pub fn is_always_start_anchored(&self) -> bool { |
965 | self.start_anchored() == self.start_unanchored() |
966 | } |
967 | |
968 | /// Returns the look-around matcher associated with this NFA. |
969 | /// |
970 | /// A look-around matcher determines how to match look-around assertions. |
971 | /// In particular, some assertions are configurable. For example, the |
972 | /// `(?m:^)` and `(?m:$)` assertions can have their line terminator changed |
973 | /// from the default of `\n` to any other byte. |
974 | /// |
975 | /// If the NFA was built using a [`Compiler`], then this matcher |
976 | /// can be set via the [`Config::look_matcher`] configuration |
977 | /// knob. Otherwise, if you've built an NFA by hand, it is set via |
978 | /// [`Builder::set_look_matcher`]. |
979 | /// |
980 | /// # Example |
981 | /// |
982 | /// This shows how to change the line terminator for multi-line assertions. |
983 | /// |
984 | /// ``` |
985 | /// use regex_automata::{ |
986 | /// nfa::thompson::{self, pikevm::PikeVM}, |
987 | /// util::look::LookMatcher, |
988 | /// Match, Input, |
989 | /// }; |
990 | /// |
991 | /// let mut lookm = LookMatcher::new(); |
992 | /// lookm.set_line_terminator(b' \x00' ); |
993 | /// |
994 | /// let re = PikeVM::builder() |
995 | /// .thompson(thompson::Config::new().look_matcher(lookm)) |
996 | /// .build(r"(?m)^[a-z]+$" )?; |
997 | /// let mut cache = re.create_cache(); |
998 | /// |
999 | /// // Multi-line assertions now use NUL as a terminator. |
1000 | /// assert_eq!( |
1001 | /// Some(Match::must(0, 1..4)), |
1002 | /// re.find(&mut cache, b" \x00abc \x00" ), |
1003 | /// ); |
1004 | /// // ... and \n is no longer recognized as a terminator. |
1005 | /// assert_eq!( |
1006 | /// None, |
1007 | /// re.find(&mut cache, b" \nabc \n" ), |
1008 | /// ); |
1009 | /// |
1010 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1011 | /// ``` |
1012 | #[inline ] |
1013 | pub fn look_matcher(&self) -> &LookMatcher { |
1014 | &self.0.look_matcher |
1015 | } |
1016 | |
1017 | /// Returns the union of all look-around assertions used throughout this |
1018 | /// NFA. When the returned set is empty, it implies that the NFA has no |
1019 | /// look-around assertions and thus zero conditional epsilon transitions. |
1020 | /// |
1021 | /// This is useful in some cases enabling optimizations. It is not |
1022 | /// unusual, for example, for optimizations to be of the form, "for any |
1023 | /// regex with zero conditional epsilon transitions, do ..." where "..." |
1024 | /// is some kind of optimization. |
1025 | /// |
1026 | /// This isn't only helpful for optimizations either. Sometimes look-around |
1027 | /// assertions are difficult to support. For example, many of the DFAs in |
1028 | /// this crate don't support Unicode word boundaries or handle them using |
1029 | /// heuristics. Handling that correctly typically requires some kind of |
1030 | /// cheap check of whether the NFA has a Unicode word boundary in the first |
1031 | /// place. |
1032 | /// |
1033 | /// # Example |
1034 | /// |
1035 | /// This example shows how this routine varies based on the regex pattern: |
1036 | /// |
1037 | /// ``` |
1038 | /// use regex_automata::{nfa::thompson::NFA, util::look::Look}; |
1039 | /// |
1040 | /// // No look-around at all. |
1041 | /// let nfa = NFA::new("a" )?; |
1042 | /// assert!(nfa.look_set_any().is_empty()); |
1043 | /// |
1044 | /// // When multiple patterns are present, since this returns the union, |
1045 | /// // it will include look-around assertions that only appear in one |
1046 | /// // pattern. |
1047 | /// let nfa = NFA::new_many(&["a" , "b" , "a^b" , "c" ])?; |
1048 | /// assert!(nfa.look_set_any().contains(Look::Start)); |
1049 | /// |
1050 | /// // Some groups of assertions have various shortcuts. For example: |
1051 | /// let nfa = NFA::new(r"(?-u:\b)" )?; |
1052 | /// assert!(nfa.look_set_any().contains_word()); |
1053 | /// assert!(!nfa.look_set_any().contains_word_unicode()); |
1054 | /// assert!(nfa.look_set_any().contains_word_ascii()); |
1055 | /// |
1056 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1057 | /// ``` |
1058 | #[inline ] |
1059 | pub fn look_set_any(&self) -> LookSet { |
1060 | self.0.look_set_any |
1061 | } |
1062 | |
1063 | /// Returns the union of all prefix look-around assertions for every |
1064 | /// pattern in this NFA. When the returned set is empty, it implies none of |
1065 | /// the patterns require moving through a conditional epsilon transition |
1066 | /// before inspecting the first byte in the haystack. |
1067 | /// |
1068 | /// This can be useful for determining what kinds of assertions need to be |
1069 | /// satisfied at the beginning of a search. For example, typically DFAs |
1070 | /// in this crate will build a distinct starting state for each possible |
1071 | /// starting configuration that might result in look-around assertions |
1072 | /// being satisfied differently. However, if the set returned here is |
1073 | /// empty, then you know that the start state is invariant because there |
1074 | /// are no conditional epsilon transitions to consider. |
1075 | /// |
1076 | /// # Example |
1077 | /// |
1078 | /// This example shows how this routine varies based on the regex pattern: |
1079 | /// |
1080 | /// ``` |
1081 | /// use regex_automata::{nfa::thompson::NFA, util::look::Look}; |
1082 | /// |
1083 | /// // No look-around at all. |
1084 | /// let nfa = NFA::new("a" )?; |
1085 | /// assert!(nfa.look_set_prefix_any().is_empty()); |
1086 | /// |
1087 | /// // When multiple patterns are present, since this returns the union, |
1088 | /// // it will include look-around assertions that only appear in one |
1089 | /// // pattern. But it will only include assertions that are in the prefix |
1090 | /// // of a pattern. For example, this includes '^' but not '$' even though |
1091 | /// // '$' does appear. |
1092 | /// let nfa = NFA::new_many(&["a" , "b" , "^ab$" , "c" ])?; |
1093 | /// assert!(nfa.look_set_prefix_any().contains(Look::Start)); |
1094 | /// assert!(!nfa.look_set_prefix_any().contains(Look::End)); |
1095 | /// |
1096 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1097 | /// ``` |
1098 | #[inline ] |
1099 | pub fn look_set_prefix_any(&self) -> LookSet { |
1100 | self.0.look_set_prefix_any |
1101 | } |
1102 | |
1103 | // FIXME: The `look_set_prefix_all` computation was not correct, and it |
1104 | // seemed a little tricky to fix it. Since I wasn't actually using it for |
1105 | // anything, I just decided to remove it in the run up to the regex 1.9 |
1106 | // release. If you need this, please file an issue. |
1107 | /* |
1108 | /// Returns the intersection of all prefix look-around assertions for every |
1109 | /// pattern in this NFA. When the returned set is empty, it implies at |
1110 | /// least one of the patterns does not require moving through a conditional |
1111 | /// epsilon transition before inspecting the first byte in the haystack. |
1112 | /// Conversely, when the set contains an assertion, it implies that every |
1113 | /// pattern in the NFA also contains that assertion in its prefix. |
1114 | /// |
1115 | /// This can be useful for determining what kinds of assertions need to be |
1116 | /// satisfied at the beginning of a search. For example, if you know that |
1117 | /// [`Look::Start`] is in the prefix intersection set returned here, then |
1118 | /// you know that all searches, regardless of input configuration, will be |
1119 | /// anchored. |
1120 | /// |
1121 | /// # Example |
1122 | /// |
1123 | /// This example shows how this routine varies based on the regex pattern: |
1124 | /// |
1125 | /// ``` |
1126 | /// use regex_automata::{nfa::thompson::NFA, util::look::Look}; |
1127 | /// |
1128 | /// // No look-around at all. |
1129 | /// let nfa = NFA::new("a")?; |
1130 | /// assert!(nfa.look_set_prefix_all().is_empty()); |
1131 | /// |
1132 | /// // When multiple patterns are present, since this returns the |
1133 | /// // intersection, it will only include assertions present in every |
1134 | /// // prefix, and only the prefix. |
1135 | /// let nfa = NFA::new_many(&["^a$", "^b$", "$^ab$", "^c$"])?; |
1136 | /// assert!(nfa.look_set_prefix_all().contains(Look::Start)); |
1137 | /// assert!(!nfa.look_set_prefix_all().contains(Look::End)); |
1138 | /// |
1139 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1140 | /// ``` |
1141 | #[inline] |
1142 | pub fn look_set_prefix_all(&self) -> LookSet { |
1143 | self.0.look_set_prefix_all |
1144 | } |
1145 | */ |
1146 | |
1147 | /// Returns the memory usage, in bytes, of this NFA. |
1148 | /// |
1149 | /// This does **not** include the stack size used up by this NFA. To |
1150 | /// compute that, use `std::mem::size_of::<NFA>()`. |
1151 | /// |
1152 | /// # Example |
1153 | /// |
1154 | /// This example shows that large Unicode character classes can use quite |
1155 | /// a bit of memory. |
1156 | /// |
1157 | /// ``` |
1158 | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
1159 | /// use regex_automata::nfa::thompson::NFA; |
1160 | /// |
1161 | /// let nfa_unicode = NFA::new(r"\w" )?; |
1162 | /// let nfa_ascii = NFA::new(r"(?-u:\w)" )?; |
1163 | /// |
1164 | /// assert!(10 * nfa_ascii.memory_usage() < nfa_unicode.memory_usage()); |
1165 | /// |
1166 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1167 | /// ``` |
1168 | #[inline ] |
1169 | pub fn memory_usage(&self) -> usize { |
1170 | use core::mem::size_of; |
1171 | |
1172 | size_of::<Inner>() // allocated on the heap via Arc |
1173 | + self.0.states.len() * size_of::<State>() |
1174 | + self.0.start_pattern.len() * size_of::<StateID>() |
1175 | + self.0.group_info.memory_usage() |
1176 | + self.0.memory_extra |
1177 | } |
1178 | } |
1179 | |
1180 | impl fmt::Debug for NFA { |
1181 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
1182 | self.0.fmt(f) |
1183 | } |
1184 | } |
1185 | |
1186 | /// The "inner" part of the NFA. We split this part out so that we can easily |
1187 | /// wrap it in an `Arc` above in the definition of `NFA`. |
1188 | /// |
1189 | /// See builder.rs for the code that actually builds this type. This module |
1190 | /// does provide (internal) mutable methods for adding things to this |
1191 | /// NFA before finalizing it, but the high level construction process is |
1192 | /// controlled by the builder abstraction. (Which is complicated enough to |
1193 | /// get its own module.) |
1194 | #[derive(Default)] |
1195 | pub(super) struct Inner { |
1196 | /// The state sequence. This sequence is guaranteed to be indexable by all |
1197 | /// starting state IDs, and it is also guaranteed to contain at most one |
1198 | /// `Match` state for each pattern compiled into this NFA. (A pattern may |
1199 | /// not have a corresponding `Match` state if a `Match` state is impossible |
1200 | /// to reach.) |
1201 | states: Vec<State>, |
1202 | /// The anchored starting state of this NFA. |
1203 | start_anchored: StateID, |
1204 | /// The unanchored starting state of this NFA. |
1205 | start_unanchored: StateID, |
1206 | /// The starting states for each individual pattern. Starting at any |
1207 | /// of these states will result in only an anchored search for the |
1208 | /// corresponding pattern. The vec is indexed by pattern ID. When the NFA |
1209 | /// contains a single regex, then `start_pattern[0]` and `start_anchored` |
1210 | /// are always equivalent. |
1211 | start_pattern: Vec<StateID>, |
1212 | /// Info about the capturing groups in this NFA. This is responsible for |
1213 | /// mapping groups to slots, mapping groups to names and names to groups. |
1214 | group_info: GroupInfo, |
1215 | /// A representation of equivalence classes over the transitions in this |
1216 | /// NFA. Two bytes in the same equivalence class must not discriminate |
1217 | /// between a match or a non-match. This map can be used to shrink the |
1218 | /// total size of a DFA's transition table with a small match-time cost. |
1219 | /// |
1220 | /// Note that the NFA's transitions are *not* defined in terms of these |
1221 | /// equivalence classes. The NFA's transitions are defined on the original |
1222 | /// byte values. For the most part, this is because they wouldn't really |
1223 | /// help the NFA much since the NFA already uses a sparse representation |
1224 | /// to represent transitions. Byte classes are most effective in a dense |
1225 | /// representation. |
1226 | byte_class_set: ByteClassSet, |
1227 | /// This is generated from `byte_class_set`, and essentially represents the |
1228 | /// same thing but supports different access patterns. Namely, this permits |
1229 | /// looking up the equivalence class of a byte very cheaply. |
1230 | /// |
1231 | /// Ideally we would just store this, but because of annoying code |
1232 | /// structure reasons, we keep both this and `byte_class_set` around for |
1233 | /// now. I think I would prefer that `byte_class_set` were computed in the |
1234 | /// `Builder`, but right now, we compute it as states are added to the |
1235 | /// `NFA`. |
1236 | byte_classes: ByteClasses, |
1237 | /// Whether this NFA has a `Capture` state anywhere. |
1238 | has_capture: bool, |
1239 | /// When the empty string is in the language matched by this NFA. |
1240 | has_empty: bool, |
1241 | /// Whether UTF-8 mode is enabled for this NFA. Briefly, this means that |
1242 | /// all non-empty matches produced by this NFA correspond to spans of valid |
1243 | /// UTF-8, and any empty matches produced by this NFA that split a UTF-8 |
1244 | /// encoded codepoint should be filtered out by the corresponding regex |
1245 | /// engine. |
1246 | utf8: bool, |
1247 | /// Whether this NFA is meant to be matched in reverse or not. |
1248 | reverse: bool, |
1249 | /// The matcher to be used for look-around assertions. |
1250 | look_matcher: LookMatcher, |
1251 | /// The union of all look-around assertions that occur anywhere within |
1252 | /// this NFA. If this set is empty, then it means there are precisely zero |
1253 | /// conditional epsilon transitions in the NFA. |
1254 | look_set_any: LookSet, |
1255 | /// The union of all look-around assertions that occur as a zero-length |
1256 | /// prefix for any of the patterns in this NFA. |
1257 | look_set_prefix_any: LookSet, |
1258 | /* |
1259 | /// The intersection of all look-around assertions that occur as a |
1260 | /// zero-length prefix for any of the patterns in this NFA. |
1261 | look_set_prefix_all: LookSet, |
1262 | */ |
1263 | /// Heap memory used indirectly by NFA states and other things (like the |
1264 | /// various capturing group representations above). Since each state |
1265 | /// might use a different amount of heap, we need to keep track of this |
1266 | /// incrementally. |
1267 | memory_extra: usize, |
1268 | } |
1269 | |
1270 | impl Inner { |
1271 | /// Runs any last finalization bits and turns this into a full NFA. |
1272 | pub(super) fn into_nfa(mut self) -> NFA { |
1273 | self.byte_classes = self.byte_class_set.byte_classes(); |
1274 | // Do epsilon closure from the start state of every pattern in order |
1275 | // to compute various properties such as look-around assertions and |
1276 | // whether the empty string can be matched. |
1277 | let mut stack = vec![]; |
1278 | let mut seen = SparseSet::new(self.states.len()); |
1279 | for &start_id in self.start_pattern.iter() { |
1280 | stack.push(start_id); |
1281 | seen.clear(); |
1282 | // let mut prefix_all = LookSet::full(); |
1283 | let mut prefix_any = LookSet::empty(); |
1284 | while let Some(sid) = stack.pop() { |
1285 | if !seen.insert(sid) { |
1286 | continue; |
1287 | } |
1288 | match self.states[sid] { |
1289 | State::ByteRange { .. } |
1290 | | State::Dense { .. } |
1291 | | State::Fail => continue, |
1292 | State::Sparse(_) => { |
1293 | // This snippet below will rewrite this sparse state |
1294 | // as a dense state. By doing it here, we apply this |
1295 | // optimization to all hot "sparse" states since these |
1296 | // are the states that are reachable from the start |
1297 | // state via an epsilon closure. |
1298 | // |
1299 | // Unfortunately, this optimization did not seem to |
1300 | // help much in some very limited ad hoc benchmarking. |
1301 | // |
1302 | // I left the 'Dense' state type in place in case we |
1303 | // want to revisit this, but I suspect the real way |
1304 | // to make forward progress is a more fundamental |
1305 | // rearchitecting of how data in the NFA is laid out. |
1306 | // I think we should consider a single contiguous |
1307 | // allocation instead of all this indirection and |
1308 | // potential heap allocations for every state. But this |
1309 | // is a large re-design and will require API breaking |
1310 | // changes. |
1311 | // self.memory_extra -= self.states[sid].memory_usage(); |
1312 | // let trans = DenseTransitions::from_sparse(sparse); |
1313 | // self.states[sid] = State::Dense(trans); |
1314 | // self.memory_extra += self.states[sid].memory_usage(); |
1315 | continue; |
1316 | } |
1317 | State::Match { .. } => self.has_empty = true, |
1318 | State::Look { look, next } => { |
1319 | prefix_any = prefix_any.insert(look); |
1320 | stack.push(next); |
1321 | } |
1322 | State::Union { ref alternates } => { |
1323 | // Order doesn't matter here, since we're just dealing |
1324 | // with look-around sets. But if we do richer analysis |
1325 | // here that needs to care about preference order, then |
1326 | // this should be done in reverse. |
1327 | stack.extend(alternates.iter()); |
1328 | } |
1329 | State::BinaryUnion { alt1, alt2 } => { |
1330 | stack.push(alt2); |
1331 | stack.push(alt1); |
1332 | } |
1333 | State::Capture { next, .. } => { |
1334 | stack.push(next); |
1335 | } |
1336 | } |
1337 | } |
1338 | self.look_set_prefix_any = |
1339 | self.look_set_prefix_any.union(prefix_any); |
1340 | } |
1341 | NFA(Arc::new(self)) |
1342 | } |
1343 | |
1344 | /// Returns the capturing group info for this NFA. |
1345 | pub(super) fn group_info(&self) -> &GroupInfo { |
1346 | &self.group_info |
1347 | } |
1348 | |
1349 | /// Add the given state to this NFA after allocating a fresh identifier for |
1350 | /// it. |
1351 | /// |
1352 | /// This panics if too many states are added such that a fresh identifier |
1353 | /// could not be created. (Currently, the only caller of this routine is |
1354 | /// a `Builder`, and it upholds this invariant.) |
1355 | pub(super) fn add(&mut self, state: State) -> StateID { |
1356 | match state { |
1357 | State::ByteRange { ref trans } => { |
1358 | self.byte_class_set.set_range(trans.start, trans.end); |
1359 | } |
1360 | State::Sparse(ref sparse) => { |
1361 | for trans in sparse.transitions.iter() { |
1362 | self.byte_class_set.set_range(trans.start, trans.end); |
1363 | } |
1364 | } |
1365 | State::Dense { .. } => unreachable!(), |
1366 | State::Look { look, .. } => { |
1367 | self.look_matcher |
1368 | .add_to_byteset(look, &mut self.byte_class_set); |
1369 | self.look_set_any = self.look_set_any.insert(look); |
1370 | } |
1371 | State::Capture { .. } => { |
1372 | self.has_capture = true; |
1373 | } |
1374 | State::Union { .. } |
1375 | | State::BinaryUnion { .. } |
1376 | | State::Fail |
1377 | | State::Match { .. } => {} |
1378 | } |
1379 | |
1380 | let id = StateID::new(self.states.len()).unwrap(); |
1381 | self.memory_extra += state.memory_usage(); |
1382 | self.states.push(state); |
1383 | id |
1384 | } |
1385 | |
1386 | /// Set the starting state identifiers for this NFA. |
1387 | /// |
1388 | /// `start_anchored` and `start_unanchored` may be equivalent. When they |
1389 | /// are, then the NFA can only execute anchored searches. This might |
1390 | /// occur, for example, for patterns that are unconditionally anchored. |
1391 | /// e.g., `^foo`. |
1392 | pub(super) fn set_starts( |
1393 | &mut self, |
1394 | start_anchored: StateID, |
1395 | start_unanchored: StateID, |
1396 | start_pattern: &[StateID], |
1397 | ) { |
1398 | self.start_anchored = start_anchored; |
1399 | self.start_unanchored = start_unanchored; |
1400 | self.start_pattern = start_pattern.to_vec(); |
1401 | } |
1402 | |
1403 | /// Sets the UTF-8 mode of this NFA. |
1404 | pub(super) fn set_utf8(&mut self, yes: bool) { |
1405 | self.utf8 = yes; |
1406 | } |
1407 | |
1408 | /// Sets the reverse mode of this NFA. |
1409 | pub(super) fn set_reverse(&mut self, yes: bool) { |
1410 | self.reverse = yes; |
1411 | } |
1412 | |
1413 | /// Sets the look-around assertion matcher for this NFA. |
1414 | pub(super) fn set_look_matcher(&mut self, m: LookMatcher) { |
1415 | self.look_matcher = m; |
1416 | } |
1417 | |
1418 | /// Set the capturing groups for this NFA. |
1419 | /// |
1420 | /// The given slice should contain the capturing groups for each pattern, |
1421 | /// The capturing groups in turn should correspond to the total number of |
1422 | /// capturing groups in the pattern, including the anonymous first capture |
1423 | /// group for each pattern. If a capturing group does have a name, then it |
1424 | /// should be provided as a Arc<str>. |
1425 | /// |
1426 | /// This returns an error if a corresponding `GroupInfo` could not be |
1427 | /// built. |
1428 | pub(super) fn set_captures( |
1429 | &mut self, |
1430 | captures: &[Vec<Option<Arc<str>>>], |
1431 | ) -> Result<(), GroupInfoError> { |
1432 | self.group_info = GroupInfo::new( |
1433 | captures.iter().map(|x| x.iter().map(|y| y.as_ref())), |
1434 | )?; |
1435 | Ok(()) |
1436 | } |
1437 | |
1438 | /// Remap the transitions in every state of this NFA using the given map. |
1439 | /// The given map should be indexed according to state ID namespace used by |
1440 | /// the transitions of the states currently in this NFA. |
1441 | /// |
1442 | /// This is particularly useful to the NFA builder, since it is convenient |
1443 | /// to add NFA states in order to produce their final IDs. Then, after all |
1444 | /// of the intermediate "empty" states (unconditional epsilon transitions) |
1445 | /// have been removed from the builder's representation, we can re-map all |
1446 | /// of the transitions in the states already added to their final IDs. |
1447 | pub(super) fn remap(&mut self, old_to_new: &[StateID]) { |
1448 | for state in &mut self.states { |
1449 | state.remap(old_to_new); |
1450 | } |
1451 | self.start_anchored = old_to_new[self.start_anchored]; |
1452 | self.start_unanchored = old_to_new[self.start_unanchored]; |
1453 | for id in self.start_pattern.iter_mut() { |
1454 | *id = old_to_new[*id]; |
1455 | } |
1456 | } |
1457 | } |
1458 | |
1459 | impl fmt::Debug for Inner { |
1460 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
1461 | writeln!(f, "thompson::NFA(" )?; |
1462 | for (sid, state) in self.states.iter().with_state_ids() { |
1463 | let status = if sid == self.start_anchored { |
1464 | '^' |
1465 | } else if sid == self.start_unanchored { |
1466 | '>' |
1467 | } else { |
1468 | ' ' |
1469 | }; |
1470 | writeln!(f, "{}{:06?}: {:?}" , status, sid.as_usize(), state)?; |
1471 | } |
1472 | let pattern_len = self.start_pattern.len(); |
1473 | if pattern_len > 1 { |
1474 | writeln!(f, "" )?; |
1475 | for pid in 0..pattern_len { |
1476 | let sid = self.start_pattern[pid]; |
1477 | writeln!(f, "START({:06?}): {:?}" , pid, sid.as_usize())?; |
1478 | } |
1479 | } |
1480 | writeln!(f, "" )?; |
1481 | writeln!( |
1482 | f, |
1483 | "transition equivalence classes: {:?}" , |
1484 | self.byte_classes, |
1485 | )?; |
1486 | writeln!(f, ")" )?; |
1487 | Ok(()) |
1488 | } |
1489 | } |
1490 | |
1491 | /// A state in an NFA. |
1492 | /// |
1493 | /// In theory, it can help to conceptualize an `NFA` as a graph consisting of |
1494 | /// `State`s. Each `State` contains its complete set of outgoing transitions. |
1495 | /// |
1496 | /// In practice, it can help to conceptualize an `NFA` as a sequence of |
1497 | /// instructions for a virtual machine. Each `State` says what to do and where |
1498 | /// to go next. |
1499 | /// |
1500 | /// Strictly speaking, the practical interpretation is the most correct one, |
1501 | /// because of the [`Capture`](State::Capture) state. Namely, a `Capture` |
1502 | /// state always forwards execution to another state unconditionally. Its only |
1503 | /// purpose is to cause a side effect: the recording of the current input |
1504 | /// position at a particular location in memory. In this sense, an `NFA` |
1505 | /// has more power than a theoretical non-deterministic finite automaton. |
1506 | /// |
1507 | /// For most uses of this crate, it is likely that one may never even need to |
1508 | /// be aware of this type at all. The main use cases for looking at `State`s |
1509 | /// directly are if you need to write your own search implementation or if you |
1510 | /// need to do some kind of analysis on the NFA. |
1511 | #[derive(Clone, Eq, PartialEq)] |
1512 | pub enum State { |
1513 | /// A state with a single transition that can only be taken if the current |
1514 | /// input symbol is in a particular range of bytes. |
1515 | ByteRange { |
1516 | /// The transition from this state to the next. |
1517 | trans: Transition, |
1518 | }, |
1519 | /// A state with possibly many transitions represented in a sparse fashion. |
1520 | /// Transitions are non-overlapping and ordered lexicographically by input |
1521 | /// range. |
1522 | /// |
1523 | /// In practice, this is used for encoding UTF-8 automata. Its presence is |
1524 | /// primarily an optimization that avoids many additional unconditional |
1525 | /// epsilon transitions (via [`Union`](State::Union) states), and thus |
1526 | /// decreases the overhead of traversing the NFA. This can improve both |
1527 | /// matching time and DFA construction time. |
1528 | Sparse(SparseTransitions), |
1529 | /// A dense representation of a state with multiple transitions. |
1530 | Dense(DenseTransitions), |
1531 | /// A conditional epsilon transition satisfied via some sort of |
1532 | /// look-around. Look-around is limited to anchor and word boundary |
1533 | /// assertions. |
1534 | /// |
1535 | /// Look-around states are meant to be evaluated while performing epsilon |
1536 | /// closure (computing the set of states reachable from a particular state |
1537 | /// via only epsilon transitions). If the current position in the haystack |
1538 | /// satisfies the look-around assertion, then you're permitted to follow |
1539 | /// that epsilon transition. |
1540 | Look { |
1541 | /// The look-around assertion that must be satisfied before moving |
1542 | /// to `next`. |
1543 | look: Look, |
1544 | /// The state to transition to if the look-around assertion is |
1545 | /// satisfied. |
1546 | next: StateID, |
1547 | }, |
1548 | /// An alternation such that there exists an epsilon transition to all |
1549 | /// states in `alternates`, where matches found via earlier transitions |
1550 | /// are preferred over later transitions. |
1551 | Union { |
1552 | /// An ordered sequence of unconditional epsilon transitions to other |
1553 | /// states. Transitions earlier in the sequence are preferred over |
1554 | /// transitions later in the sequence. |
1555 | alternates: Box<[StateID]>, |
1556 | }, |
1557 | /// An alternation such that there exists precisely two unconditional |
1558 | /// epsilon transitions, where matches found via `alt1` are preferred over |
1559 | /// matches found via `alt2`. |
1560 | /// |
1561 | /// This state exists as a common special case of Union where there are |
1562 | /// only two alternates. In this case, we don't need any allocations to |
1563 | /// represent the state. This saves a bit of memory and also saves an |
1564 | /// additional memory access when traversing the NFA. |
1565 | BinaryUnion { |
1566 | /// An unconditional epsilon transition to another NFA state. This |
1567 | /// is preferred over `alt2`. |
1568 | alt1: StateID, |
1569 | /// An unconditional epsilon transition to another NFA state. Matches |
1570 | /// reported via this transition should only be reported if no matches |
1571 | /// were found by following `alt1`. |
1572 | alt2: StateID, |
1573 | }, |
1574 | /// An empty state that records a capture location. |
1575 | /// |
1576 | /// From the perspective of finite automata, this is precisely equivalent |
1577 | /// to an unconditional epsilon transition, but serves the purpose of |
1578 | /// instructing NFA simulations to record additional state when the finite |
1579 | /// state machine passes through this epsilon transition. |
1580 | /// |
1581 | /// `slot` in this context refers to the specific capture group slot |
1582 | /// offset that is being recorded. Each capturing group has two slots |
1583 | /// corresponding to the start and end of the matching portion of that |
1584 | /// group. |
1585 | /// |
1586 | /// The pattern ID and capture group index are also included in this state |
1587 | /// in case they are useful. But mostly, all you'll need is `next` and |
1588 | /// `slot`. |
1589 | Capture { |
1590 | /// The state to transition to, unconditionally. |
1591 | next: StateID, |
1592 | /// The pattern ID that this capture belongs to. |
1593 | pattern_id: PatternID, |
1594 | /// The capture group index that this capture belongs to. Capture group |
1595 | /// indices are local to each pattern. For example, when capturing |
1596 | /// groups are enabled, every pattern has a capture group at index |
1597 | /// `0`. |
1598 | group_index: SmallIndex, |
1599 | /// The slot index for this capture. Every capturing group has two |
1600 | /// slots: one for the start haystack offset and one for the end |
1601 | /// haystack offset. Unlike capture group indices, slot indices are |
1602 | /// global across all patterns in this NFA. That is, each slot belongs |
1603 | /// to a single pattern, but there is only one slot at index `i`. |
1604 | slot: SmallIndex, |
1605 | }, |
1606 | /// A state that cannot be transitioned out of. This is useful for cases |
1607 | /// where you want to prevent matching from occurring. For example, if your |
1608 | /// regex parser permits empty character classes, then one could choose |
1609 | /// a `Fail` state to represent them. (An empty character class can be |
1610 | /// thought of as an empty set. Since nothing is in an empty set, they can |
1611 | /// never match anything.) |
1612 | Fail, |
1613 | /// A match state. There is at least one such occurrence of this state for |
1614 | /// each regex that can match that is in this NFA. |
1615 | Match { |
1616 | /// The matching pattern ID. |
1617 | pattern_id: PatternID, |
1618 | }, |
1619 | } |
1620 | |
1621 | impl State { |
1622 | /// Returns true if and only if this state contains one or more epsilon |
1623 | /// transitions. |
1624 | /// |
1625 | /// In practice, a state has no outgoing transitions (like `Match`), has |
1626 | /// only non-epsilon transitions (like `ByteRange`) or has only epsilon |
1627 | /// transitions (like `Union`). |
1628 | /// |
1629 | /// # Example |
1630 | /// |
1631 | /// ``` |
1632 | /// use regex_automata::{ |
1633 | /// nfa::thompson::{State, Transition}, |
1634 | /// util::primitives::{PatternID, StateID, SmallIndex}, |
1635 | /// }; |
1636 | /// |
1637 | /// // Capture states are epsilon transitions. |
1638 | /// let state = State::Capture { |
1639 | /// next: StateID::ZERO, |
1640 | /// pattern_id: PatternID::ZERO, |
1641 | /// group_index: SmallIndex::ZERO, |
1642 | /// slot: SmallIndex::ZERO, |
1643 | /// }; |
1644 | /// assert!(state.is_epsilon()); |
1645 | /// |
1646 | /// // ByteRange states are not. |
1647 | /// let state = State::ByteRange { |
1648 | /// trans: Transition { start: b'a' , end: b'z' , next: StateID::ZERO }, |
1649 | /// }; |
1650 | /// assert!(!state.is_epsilon()); |
1651 | /// |
1652 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
1653 | /// ``` |
1654 | #[inline ] |
1655 | pub fn is_epsilon(&self) -> bool { |
1656 | match *self { |
1657 | State::ByteRange { .. } |
1658 | | State::Sparse { .. } |
1659 | | State::Dense { .. } |
1660 | | State::Fail |
1661 | | State::Match { .. } => false, |
1662 | State::Look { .. } |
1663 | | State::Union { .. } |
1664 | | State::BinaryUnion { .. } |
1665 | | State::Capture { .. } => true, |
1666 | } |
1667 | } |
1668 | |
1669 | /// Returns the heap memory usage of this NFA state in bytes. |
1670 | fn memory_usage(&self) -> usize { |
1671 | match *self { |
1672 | State::ByteRange { .. } |
1673 | | State::Look { .. } |
1674 | | State::BinaryUnion { .. } |
1675 | | State::Capture { .. } |
1676 | | State::Match { .. } |
1677 | | State::Fail => 0, |
1678 | State::Sparse(SparseTransitions { ref transitions }) => { |
1679 | transitions.len() * mem::size_of::<Transition>() |
1680 | } |
1681 | State::Dense { .. } => 256 * mem::size_of::<StateID>(), |
1682 | State::Union { ref alternates } => { |
1683 | alternates.len() * mem::size_of::<StateID>() |
1684 | } |
1685 | } |
1686 | } |
1687 | |
1688 | /// Remap the transitions in this state using the given map. Namely, the |
1689 | /// given map should be indexed according to the transitions currently |
1690 | /// in this state. |
1691 | /// |
1692 | /// This is used during the final phase of the NFA compiler, which turns |
1693 | /// its intermediate NFA into the final NFA. |
1694 | fn remap(&mut self, remap: &[StateID]) { |
1695 | match *self { |
1696 | State::ByteRange { ref mut trans } => { |
1697 | trans.next = remap[trans.next] |
1698 | } |
1699 | State::Sparse(SparseTransitions { ref mut transitions }) => { |
1700 | for t in transitions.iter_mut() { |
1701 | t.next = remap[t.next]; |
1702 | } |
1703 | } |
1704 | State::Dense(DenseTransitions { ref mut transitions }) => { |
1705 | for sid in transitions.iter_mut() { |
1706 | *sid = remap[*sid]; |
1707 | } |
1708 | } |
1709 | State::Look { ref mut next, .. } => *next = remap[*next], |
1710 | State::Union { ref mut alternates } => { |
1711 | for alt in alternates.iter_mut() { |
1712 | *alt = remap[*alt]; |
1713 | } |
1714 | } |
1715 | State::BinaryUnion { ref mut alt1, ref mut alt2 } => { |
1716 | *alt1 = remap[*alt1]; |
1717 | *alt2 = remap[*alt2]; |
1718 | } |
1719 | State::Capture { ref mut next, .. } => *next = remap[*next], |
1720 | State::Fail => {} |
1721 | State::Match { .. } => {} |
1722 | } |
1723 | } |
1724 | } |
1725 | |
1726 | impl fmt::Debug for State { |
1727 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
1728 | match *self { |
1729 | State::ByteRange { ref trans } => trans.fmt(f), |
1730 | State::Sparse(SparseTransitions { ref transitions }) => { |
1731 | let rs = transitions |
1732 | .iter() |
1733 | .map(|t| format!("{:?}" , t)) |
1734 | .collect::<Vec<String>>() |
1735 | .join(", " ); |
1736 | write!(f, "sparse({})" , rs) |
1737 | } |
1738 | State::Dense(ref dense) => { |
1739 | write!(f, "dense(" )?; |
1740 | for (i, t) in dense.iter().enumerate() { |
1741 | if i > 0 { |
1742 | write!(f, ", " )?; |
1743 | } |
1744 | write!(f, "{:?}" , t)?; |
1745 | } |
1746 | write!(f, ")" ) |
1747 | } |
1748 | State::Look { ref look, next } => { |
1749 | write!(f, "{:?} => {:?}" , look, next.as_usize()) |
1750 | } |
1751 | State::Union { ref alternates } => { |
1752 | let alts = alternates |
1753 | .iter() |
1754 | .map(|id| format!("{:?}" , id.as_usize())) |
1755 | .collect::<Vec<String>>() |
1756 | .join(", " ); |
1757 | write!(f, "union({})" , alts) |
1758 | } |
1759 | State::BinaryUnion { alt1, alt2 } => { |
1760 | write!( |
1761 | f, |
1762 | "binary-union({}, {})" , |
1763 | alt1.as_usize(), |
1764 | alt2.as_usize() |
1765 | ) |
1766 | } |
1767 | State::Capture { next, pattern_id, group_index, slot } => { |
1768 | write!( |
1769 | f, |
1770 | "capture(pid={:?}, group={:?}, slot={:?}) => {:?}" , |
1771 | pattern_id.as_usize(), |
1772 | group_index.as_usize(), |
1773 | slot.as_usize(), |
1774 | next.as_usize(), |
1775 | ) |
1776 | } |
1777 | State::Fail => write!(f, "FAIL" ), |
1778 | State::Match { pattern_id } => { |
1779 | write!(f, "MATCH({:?})" , pattern_id.as_usize()) |
1780 | } |
1781 | } |
1782 | } |
1783 | } |
1784 | |
1785 | /// A sequence of transitions used to represent a sparse state. |
1786 | /// |
1787 | /// This is the primary representation of a [`Sparse`](State::Sparse) state. |
1788 | /// It corresponds to a sorted sequence of transitions with non-overlapping |
1789 | /// byte ranges. If the byte at the current position in the haystack matches |
1790 | /// one of the byte ranges, then the finite state machine should take the |
1791 | /// corresponding transition. |
1792 | #[derive(Clone, Debug, Eq, PartialEq)] |
1793 | pub struct SparseTransitions { |
1794 | /// The sorted sequence of non-overlapping transitions. |
1795 | pub transitions: Box<[Transition]>, |
1796 | } |
1797 | |
1798 | impl SparseTransitions { |
1799 | /// This follows the matching transition for a particular byte. |
1800 | /// |
1801 | /// The matching transition is found by looking for a matching byte |
1802 | /// range (there is at most one) corresponding to the position `at` in |
1803 | /// `haystack`. |
1804 | /// |
1805 | /// If `at >= haystack.len()`, then this returns `None`. |
1806 | #[inline ] |
1807 | pub fn matches(&self, haystack: &[u8], at: usize) -> Option<StateID> { |
1808 | haystack.get(at).and_then(|&b| self.matches_byte(b)) |
1809 | } |
1810 | |
1811 | /// This follows the matching transition for any member of the alphabet. |
1812 | /// |
1813 | /// The matching transition is found by looking for a matching byte |
1814 | /// range (there is at most one) corresponding to the position `at` in |
1815 | /// `haystack`. If the given alphabet unit is [`EOI`](alphabet::Unit::eoi), |
1816 | /// then this always returns `None`. |
1817 | #[inline ] |
1818 | pub(crate) fn matches_unit( |
1819 | &self, |
1820 | unit: alphabet::Unit, |
1821 | ) -> Option<StateID> { |
1822 | unit.as_u8().map_or(None, |byte| self.matches_byte(byte)) |
1823 | } |
1824 | |
1825 | /// This follows the matching transition for a particular byte. |
1826 | /// |
1827 | /// The matching transition is found by looking for a matching byte range |
1828 | /// (there is at most one) corresponding to the byte given. |
1829 | #[inline ] |
1830 | pub fn matches_byte(&self, byte: u8) -> Option<StateID> { |
1831 | for t in self.transitions.iter() { |
1832 | if t.start > byte { |
1833 | break; |
1834 | } else if t.matches_byte(byte) { |
1835 | return Some(t.next); |
1836 | } |
1837 | } |
1838 | None |
1839 | |
1840 | /* |
1841 | // This is an alternative implementation that uses binary search. In |
1842 | // some ad hoc experiments, like |
1843 | // |
1844 | // regex-cli find match pikevm -b -p '\b\w+\b' non-ascii-file |
1845 | // |
1846 | // I could not observe any improvement, and in fact, things seemed to |
1847 | // be a bit slower. I can see an improvement in at least one benchmark: |
1848 | // |
1849 | // regex-cli find match pikevm -b -p '\pL{100}' all-codepoints-utf8 |
1850 | // |
1851 | // Where total search time goes from 3.2s to 2.4s when using binary |
1852 | // search. |
1853 | self.transitions |
1854 | .binary_search_by(|t| { |
1855 | if t.end < byte { |
1856 | core::cmp::Ordering::Less |
1857 | } else if t.start > byte { |
1858 | core::cmp::Ordering::Greater |
1859 | } else { |
1860 | core::cmp::Ordering::Equal |
1861 | } |
1862 | }) |
1863 | .ok() |
1864 | .map(|i| self.transitions[i].next) |
1865 | */ |
1866 | } |
1867 | } |
1868 | |
1869 | /// A sequence of transitions used to represent a dense state. |
1870 | /// |
1871 | /// This is the primary representation of a [`Dense`](State::Dense) state. It |
1872 | /// provides constant time matching. That is, given a byte in a haystack and |
1873 | /// a `DenseTransitions`, one can determine if the state matches in constant |
1874 | /// time. |
1875 | /// |
1876 | /// This is in contrast to `SparseTransitions`, whose time complexity is |
1877 | /// necessarily bigger than constant time. Also in contrast, `DenseTransitions` |
1878 | /// usually requires (much) more heap memory. |
1879 | #[derive(Clone, Debug, Eq, PartialEq)] |
1880 | pub struct DenseTransitions { |
1881 | /// A dense representation of this state's transitions on the heap. This |
1882 | /// always has length 256. |
1883 | pub transitions: Box<[StateID]>, |
1884 | } |
1885 | |
1886 | impl DenseTransitions { |
1887 | /// This follows the matching transition for a particular byte. |
1888 | /// |
1889 | /// The matching transition is found by looking for a transition that |
1890 | /// doesn't correspond to `StateID::ZERO` for the byte `at` the given |
1891 | /// position in `haystack`. |
1892 | /// |
1893 | /// If `at >= haystack.len()`, then this returns `None`. |
1894 | #[inline ] |
1895 | pub fn matches(&self, haystack: &[u8], at: usize) -> Option<StateID> { |
1896 | haystack.get(at).and_then(|&b| self.matches_byte(b)) |
1897 | } |
1898 | |
1899 | /// This follows the matching transition for any member of the alphabet. |
1900 | /// |
1901 | /// The matching transition is found by looking for a transition that |
1902 | /// doesn't correspond to `StateID::ZERO` for the byte `at` the given |
1903 | /// position in `haystack`. |
1904 | /// |
1905 | /// If `at >= haystack.len()` or if the given alphabet unit is |
1906 | /// [`EOI`](alphabet::Unit::eoi), then this returns `None`. |
1907 | #[inline ] |
1908 | pub(crate) fn matches_unit( |
1909 | &self, |
1910 | unit: alphabet::Unit, |
1911 | ) -> Option<StateID> { |
1912 | unit.as_u8().map_or(None, |byte| self.matches_byte(byte)) |
1913 | } |
1914 | |
1915 | /// This follows the matching transition for a particular byte. |
1916 | /// |
1917 | /// The matching transition is found by looking for a transition that |
1918 | /// doesn't correspond to `StateID::ZERO` for the given `byte`. |
1919 | /// |
1920 | /// If `at >= haystack.len()`, then this returns `None`. |
1921 | #[inline ] |
1922 | pub fn matches_byte(&self, byte: u8) -> Option<StateID> { |
1923 | let next = self.transitions[usize::from(byte)]; |
1924 | if next == StateID::ZERO { |
1925 | None |
1926 | } else { |
1927 | Some(next) |
1928 | } |
1929 | } |
1930 | |
1931 | /* |
1932 | /// The dense state optimization isn't currently enabled, so permit a |
1933 | /// little bit of dead code. |
1934 | pub(crate) fn from_sparse(sparse: &SparseTransitions) -> DenseTransitions { |
1935 | let mut dense = vec![StateID::ZERO; 256]; |
1936 | for t in sparse.transitions.iter() { |
1937 | for b in t.start..=t.end { |
1938 | dense[usize::from(b)] = t.next; |
1939 | } |
1940 | } |
1941 | DenseTransitions { transitions: dense.into_boxed_slice() } |
1942 | } |
1943 | */ |
1944 | |
1945 | /// Returns an iterator over all transitions that don't point to |
1946 | /// `StateID::ZERO`. |
1947 | pub(crate) fn iter(&self) -> impl Iterator<Item = Transition> + '_ { |
1948 | use crate::util::int::Usize; |
1949 | self.transitions |
1950 | .iter() |
1951 | .enumerate() |
1952 | .filter(|&(_, &sid)| sid != StateID::ZERO) |
1953 | .map(|(byte, &next)| Transition { |
1954 | start: byte.as_u8(), |
1955 | end: byte.as_u8(), |
1956 | next, |
1957 | }) |
1958 | } |
1959 | } |
1960 | |
1961 | /// A single transition to another state. |
1962 | /// |
1963 | /// This transition may only be followed if the current byte in the haystack |
1964 | /// falls in the inclusive range of bytes specified. |
1965 | #[derive(Clone, Copy, Eq, Hash, PartialEq)] |
1966 | pub struct Transition { |
1967 | /// The inclusive start of the byte range. |
1968 | pub start: u8, |
1969 | /// The inclusive end of the byte range. |
1970 | pub end: u8, |
1971 | /// The identifier of the state to transition to. |
1972 | pub next: StateID, |
1973 | } |
1974 | |
1975 | impl Transition { |
1976 | /// Returns true if the position `at` in `haystack` falls in this |
1977 | /// transition's range of bytes. |
1978 | /// |
1979 | /// If `at >= haystack.len()`, then this returns `false`. |
1980 | pub fn matches(&self, haystack: &[u8], at: usize) -> bool { |
1981 | haystack.get(at).map_or(false, |&b| self.matches_byte(b)) |
1982 | } |
1983 | |
1984 | /// Returns true if the given alphabet unit falls in this transition's |
1985 | /// range of bytes. If the given unit is [`EOI`](alphabet::Unit::eoi), then |
1986 | /// this returns `false`. |
1987 | pub fn matches_unit(&self, unit: alphabet::Unit) -> bool { |
1988 | unit.as_u8().map_or(false, |byte| self.matches_byte(byte)) |
1989 | } |
1990 | |
1991 | /// Returns true if the given byte falls in this transition's range of |
1992 | /// bytes. |
1993 | pub fn matches_byte(&self, byte: u8) -> bool { |
1994 | self.start <= byte && byte <= self.end |
1995 | } |
1996 | } |
1997 | |
1998 | impl fmt::Debug for Transition { |
1999 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
2000 | use crate::util::escape::DebugByte; |
2001 | |
2002 | let Transition { start, end, next } = *self; |
2003 | if self.start == self.end { |
2004 | write!(f, "{:?} => {:?}" , DebugByte(start), next.as_usize()) |
2005 | } else { |
2006 | write!( |
2007 | f, |
2008 | "{:?}-{:?} => {:?}" , |
2009 | DebugByte(start), |
2010 | DebugByte(end), |
2011 | next.as_usize(), |
2012 | ) |
2013 | } |
2014 | } |
2015 | } |
2016 | |
2017 | /// An iterator over all pattern IDs in an NFA. |
2018 | /// |
2019 | /// This iterator is created by [`NFA::patterns`]. |
2020 | /// |
2021 | /// The lifetime parameter `'a` refers to the lifetime of the NFA from which |
2022 | /// this pattern iterator was created. |
2023 | #[derive(Debug)] |
2024 | pub struct PatternIter<'a> { |
2025 | it: PatternIDIter, |
2026 | /// We explicitly associate a lifetime with this iterator even though we |
2027 | /// don't actually borrow anything from the NFA. We do this for backward |
2028 | /// compatibility purposes. If we ever do need to borrow something from |
2029 | /// the NFA, then we can and just get rid of this marker without breaking |
2030 | /// the public API. |
2031 | _marker: core::marker::PhantomData<&'a ()>, |
2032 | } |
2033 | |
2034 | impl<'a> Iterator for PatternIter<'a> { |
2035 | type Item = PatternID; |
2036 | |
2037 | fn next(&mut self) -> Option<PatternID> { |
2038 | self.it.next() |
2039 | } |
2040 | } |
2041 | |
2042 | #[cfg (all(test, feature = "nfa-pikevm" ))] |
2043 | mod tests { |
2044 | use super::*; |
2045 | use crate::{nfa::thompson::pikevm::PikeVM, Input}; |
2046 | |
2047 | // This asserts that an NFA state doesn't have its size changed. It is |
2048 | // *really* easy to accidentally increase the size, and thus potentially |
2049 | // dramatically increase the memory usage of every NFA. |
2050 | // |
2051 | // This assert doesn't mean we absolutely cannot increase the size of an |
2052 | // NFA state. We can. It's just here to make sure we do it knowingly and |
2053 | // intentionally. |
2054 | #[test] |
2055 | fn state_has_small_size() { |
2056 | #[cfg (target_pointer_width = "64" )] |
2057 | assert_eq!(24, core::mem::size_of::<State>()); |
2058 | #[cfg (target_pointer_width = "32" )] |
2059 | assert_eq!(20, core::mem::size_of::<State>()); |
2060 | } |
2061 | |
2062 | #[test] |
2063 | fn always_match() { |
2064 | let re = PikeVM::new_from_nfa(NFA::always_match()).unwrap(); |
2065 | let mut cache = re.create_cache(); |
2066 | let mut caps = re.create_captures(); |
2067 | let mut find = |haystack, start, end| { |
2068 | let input = Input::new(haystack).range(start..end); |
2069 | re.search(&mut cache, &input, &mut caps); |
2070 | caps.get_match().map(|m| m.end()) |
2071 | }; |
2072 | |
2073 | assert_eq!(Some(0), find("" , 0, 0)); |
2074 | assert_eq!(Some(0), find("a" , 0, 1)); |
2075 | assert_eq!(Some(1), find("a" , 1, 1)); |
2076 | assert_eq!(Some(0), find("ab" , 0, 2)); |
2077 | assert_eq!(Some(1), find("ab" , 1, 2)); |
2078 | assert_eq!(Some(2), find("ab" , 2, 2)); |
2079 | } |
2080 | |
2081 | #[test] |
2082 | fn never_match() { |
2083 | let re = PikeVM::new_from_nfa(NFA::never_match()).unwrap(); |
2084 | let mut cache = re.create_cache(); |
2085 | let mut caps = re.create_captures(); |
2086 | let mut find = |haystack, start, end| { |
2087 | let input = Input::new(haystack).range(start..end); |
2088 | re.search(&mut cache, &input, &mut caps); |
2089 | caps.get_match().map(|m| m.end()) |
2090 | }; |
2091 | |
2092 | assert_eq!(None, find("" , 0, 0)); |
2093 | assert_eq!(None, find("a" , 0, 1)); |
2094 | assert_eq!(None, find("a" , 1, 1)); |
2095 | assert_eq!(None, find("ab" , 0, 2)); |
2096 | assert_eq!(None, find("ab" , 1, 2)); |
2097 | assert_eq!(None, find("ab" , 2, 2)); |
2098 | } |
2099 | } |
2100 | |