1 | use core::mem; |
2 | |
3 | use alloc::{sync::Arc, vec, vec::Vec}; |
4 | |
5 | use crate::{ |
6 | nfa::thompson::{ |
7 | error::BuildError, |
8 | nfa::{self, SparseTransitions, Transition, NFA}, |
9 | }, |
10 | util::{ |
11 | look::{Look, LookMatcher}, |
12 | primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID}, |
13 | }, |
14 | }; |
15 | |
16 | /// An intermediate NFA state used during construction. |
17 | /// |
18 | /// During construction of an NFA, it is often convenient to work with states |
19 | /// that are amenable to mutation and other carry more information than we |
20 | /// otherwise need once an NFA has been built. This type represents those |
21 | /// needs. |
22 | /// |
23 | /// Once construction is finished, the builder will convert these states to a |
24 | /// [`nfa::thompson::State`](crate::nfa::thompson::State). This conversion not |
25 | /// only results in a simpler representation, but in some cases, entire classes |
26 | /// of states are completely removed (such as [`State::Empty`]). |
27 | #[derive(Clone, Debug, Eq, PartialEq)] |
28 | enum State { |
29 | /// An empty state whose only purpose is to forward the automaton to |
30 | /// another state via an unconditional epsilon transition. |
31 | /// |
32 | /// Unconditional epsilon transitions are quite useful during the |
33 | /// construction of an NFA, as they permit the insertion of no-op |
34 | /// placeholders that make it easier to compose NFA sub-graphs. When |
35 | /// the Thompson NFA builder produces a final NFA, all unconditional |
36 | /// epsilon transitions are removed, and state identifiers are remapped |
37 | /// accordingly. |
38 | Empty { |
39 | /// The next state that this state should transition to. |
40 | next: StateID, |
41 | }, |
42 | /// A state that only transitions to another state if the current input |
43 | /// byte is in a particular range of bytes. |
44 | ByteRange { trans: Transition }, |
45 | /// A state with possibly many transitions, represented in a sparse |
46 | /// fashion. Transitions must be ordered lexicographically by input range |
47 | /// and be non-overlapping. As such, this may only be used when every |
48 | /// transition has equal priority. (In practice, this is only used for |
49 | /// encoding large UTF-8 automata.) In contrast, a `Union` state has each |
50 | /// alternate in order of priority. Priority is used to implement greedy |
51 | /// matching and also alternations themselves, e.g., `abc|a` where `abc` |
52 | /// has priority over `a`. |
53 | /// |
54 | /// To clarify, it is possible to remove `Sparse` and represent all things |
55 | /// that `Sparse` is used for via `Union`. But this creates a more bloated |
56 | /// NFA with more epsilon transitions than is necessary in the special case |
57 | /// of character classes. |
58 | Sparse { transitions: Vec<Transition> }, |
59 | /// A conditional epsilon transition satisfied via some sort of |
60 | /// look-around. |
61 | Look { look: Look, next: StateID }, |
62 | /// An empty state that records the start of a capture location. This is an |
63 | /// unconditional epsilon transition like `Empty`, except it can be used to |
64 | /// record position information for a capture group when using the NFA for |
65 | /// search. |
66 | CaptureStart { |
67 | /// The ID of the pattern that this capture was defined. |
68 | pattern_id: PatternID, |
69 | /// The capture group index that this capture state corresponds to. |
70 | /// The capture group index is always relative to its corresponding |
71 | /// pattern. Therefore, in the presence of multiple patterns, both the |
72 | /// pattern ID and the capture group index are required to uniquely |
73 | /// identify a capturing group. |
74 | group_index: SmallIndex, |
75 | /// The next state that this state should transition to. |
76 | next: StateID, |
77 | }, |
78 | /// An empty state that records the end of a capture location. This is an |
79 | /// unconditional epsilon transition like `Empty`, except it can be used to |
80 | /// record position information for a capture group when using the NFA for |
81 | /// search. |
82 | CaptureEnd { |
83 | /// The ID of the pattern that this capture was defined. |
84 | pattern_id: PatternID, |
85 | /// The capture group index that this capture state corresponds to. |
86 | /// The capture group index is always relative to its corresponding |
87 | /// pattern. Therefore, in the presence of multiple patterns, both the |
88 | /// pattern ID and the capture group index are required to uniquely |
89 | /// identify a capturing group. |
90 | group_index: SmallIndex, |
91 | /// The next state that this state should transition to. |
92 | next: StateID, |
93 | }, |
94 | /// An alternation such that there exists an epsilon transition to all |
95 | /// states in `alternates`, where matches found via earlier transitions |
96 | /// are preferred over later transitions. |
97 | Union { alternates: Vec<StateID> }, |
98 | /// An alternation such that there exists an epsilon transition to all |
99 | /// states in `alternates`, where matches found via later transitions are |
100 | /// preferred over earlier transitions. |
101 | /// |
102 | /// This "reverse" state exists for convenience during compilation that |
103 | /// permits easy construction of non-greedy combinations of NFA states. At |
104 | /// the end of compilation, Union and UnionReverse states are merged into |
105 | /// one Union type of state, where the latter has its epsilon transitions |
106 | /// reversed to reflect the priority inversion. |
107 | /// |
108 | /// The "convenience" here arises from the fact that as new states are |
109 | /// added to the list of `alternates`, we would like that add operation |
110 | /// to be amortized constant time. But if we used a `Union`, we'd need to |
111 | /// prepend the state, which takes O(n) time. There are other approaches we |
112 | /// could use to solve this, but this seems simple enough. |
113 | UnionReverse { alternates: Vec<StateID> }, |
114 | /// A state that cannot be transitioned out of. This is useful for cases |
115 | /// where you want to prevent matching from occurring. For example, if your |
116 | /// regex parser permits empty character classes, then one could choose a |
117 | /// `Fail` state to represent it. |
118 | Fail, |
119 | /// A match state. There is at most one such occurrence of this state in |
120 | /// an NFA for each pattern compiled into the NFA. At time of writing, a |
121 | /// match state is always produced for every pattern given, but in theory, |
122 | /// if a pattern can never lead to a match, then the match state could be |
123 | /// omitted. |
124 | /// |
125 | /// `pattern_id` refers to the ID of the pattern itself, which corresponds |
126 | /// to the pattern's index (starting at 0). |
127 | Match { pattern_id: PatternID }, |
128 | } |
129 | |
130 | impl State { |
131 | /// If this state is an unconditional epsilon transition, then this returns |
132 | /// the target of the transition. |
133 | fn goto(&self) -> Option<StateID> { |
134 | match *self { |
135 | State::Empty { next } => Some(next), |
136 | State::Union { ref alternates } if alternates.len() == 1 => { |
137 | Some(alternates[0]) |
138 | } |
139 | State::UnionReverse { ref alternates } |
140 | if alternates.len() == 1 => |
141 | { |
142 | Some(alternates[0]) |
143 | } |
144 | _ => None, |
145 | } |
146 | } |
147 | |
148 | /// Returns the heap memory usage, in bytes, of this state. |
149 | fn memory_usage(&self) -> usize { |
150 | match *self { |
151 | State::Empty { .. } |
152 | | State::ByteRange { .. } |
153 | | State::Look { .. } |
154 | | State::CaptureStart { .. } |
155 | | State::CaptureEnd { .. } |
156 | | State::Fail |
157 | | State::Match { .. } => 0, |
158 | State::Sparse { ref transitions } => { |
159 | transitions.len() * mem::size_of::<Transition>() |
160 | } |
161 | State::Union { ref alternates } => { |
162 | alternates.len() * mem::size_of::<StateID>() |
163 | } |
164 | State::UnionReverse { ref alternates } => { |
165 | alternates.len() * mem::size_of::<StateID>() |
166 | } |
167 | } |
168 | } |
169 | } |
170 | |
171 | /// An abstraction for building Thompson NFAs by hand. |
172 | /// |
173 | /// A builder is what a [`thompson::Compiler`](crate::nfa::thompson::Compiler) |
174 | /// uses internally to translate a regex's high-level intermediate |
175 | /// representation into an [`NFA`]. |
176 | /// |
177 | /// The primary function of this builder is to abstract away the internal |
178 | /// representation of an NFA and make it difficult to produce NFAs are that |
179 | /// internally invalid or inconsistent. This builder also provides a way to |
180 | /// add "empty" states (which can be thought of as unconditional epsilon |
181 | /// transitions), despite the fact that [`thompson::State`](nfa::State) does |
182 | /// not have any "empty" representation. The advantage of "empty" states is |
183 | /// that they make the code for constructing a Thompson NFA logically simpler. |
184 | /// |
185 | /// Many of the routines on this builder may panic or return errors. Generally |
186 | /// speaking, panics occur when an invalid sequence of method calls were made, |
187 | /// where as an error occurs if things get too big. (Where "too big" might mean |
188 | /// exhausting identifier space or using up too much heap memory in accordance |
189 | /// with the configured [`size_limit`](Builder::set_size_limit).) |
190 | /// |
191 | /// # Overview |
192 | /// |
193 | /// ## Adding multiple patterns |
194 | /// |
195 | /// Each pattern you add to an NFA should correspond to a pair of |
196 | /// [`Builder::start_pattern`] and [`Builder::finish_pattern`] calls, with |
197 | /// calls inbetween that add NFA states for that pattern. NFA states may be |
198 | /// added without first calling `start_pattern`, with the exception of adding |
199 | /// capturing states. |
200 | /// |
201 | /// ## Adding NFA states |
202 | /// |
203 | /// Here is a very brief overview of each of the methods that add NFA states. |
204 | /// Every method adds a single state. |
205 | /// |
206 | /// * [`add_empty`](Builder::add_empty): Add a state with a single |
207 | /// unconditional epsilon transition to another state. |
208 | /// * [`add_union`](Builder::add_union): Adds a state with unconditional |
209 | /// epsilon transitions to two or more states, with earlier transitions |
210 | /// preferred over later ones. |
211 | /// * [`add_union_reverse`](Builder::add_union_reverse): Adds a state with |
212 | /// unconditional epsilon transitions to two or more states, with later |
213 | /// transitions preferred over earlier ones. |
214 | /// * [`add_range`](Builder::add_range): Adds a state with a single transition |
215 | /// to another state that can only be followed if the current input byte is |
216 | /// within the range given. |
217 | /// * [`add_sparse`](Builder::add_sparse): Adds a state with two or more |
218 | /// range transitions to other states, where a transition is only followed |
219 | /// if the current input byte is within one of the ranges. All transitions |
220 | /// in this state have equal priority, and the corresponding ranges must be |
221 | /// non-overlapping. |
222 | /// * [`add_look`](Builder::add_look): Adds a state with a single *conditional* |
223 | /// epsilon transition to another state, where the condition depends on a |
224 | /// limited look-around property. |
225 | /// * [`add_capture_start`](Builder::add_capture_start): Adds a state with |
226 | /// a single unconditional epsilon transition that also instructs an NFA |
227 | /// simulation to record the current input position to a specific location in |
228 | /// memory. This is intended to represent the starting location of a capturing |
229 | /// group. |
230 | /// * [`add_capture_end`](Builder::add_capture_end): Adds a state with |
231 | /// a single unconditional epsilon transition that also instructs an NFA |
232 | /// simulation to record the current input position to a specific location in |
233 | /// memory. This is intended to represent the ending location of a capturing |
234 | /// group. |
235 | /// * [`add_fail`](Builder::add_fail): Adds a state that never transitions to |
236 | /// another state. |
237 | /// * [`add_match`](Builder::add_match): Add a state that indicates a match has |
238 | /// been found for a particular pattern. A match state is a final state with |
239 | /// no outgoing transitions. |
240 | /// |
241 | /// ## Setting transitions between NFA states |
242 | /// |
243 | /// The [`Builder::patch`] method creates a transition from one state to the |
244 | /// next. If the `from` state corresponds to a state that supports multiple |
245 | /// outgoing transitions (such as "union"), then this adds the corresponding |
246 | /// transition. Otherwise, it sets the single transition. (This routine panics |
247 | /// if `from` corresponds to a state added by `add_sparse`, since sparse states |
248 | /// need more specialized handling.) |
249 | /// |
250 | /// # Example |
251 | /// |
252 | /// This annotated example shows how to hand construct the regex `[a-z]+` |
253 | /// (without an unanchored prefix). |
254 | /// |
255 | /// ``` |
256 | /// use regex_automata::{ |
257 | /// nfa::thompson::{pikevm::PikeVM, Builder, Transition}, |
258 | /// util::primitives::StateID, |
259 | /// Match, |
260 | /// }; |
261 | /// |
262 | /// let mut builder = Builder::new(); |
263 | /// // Before adding NFA states for our pattern, we need to tell the builder |
264 | /// // that we are starting the pattern. |
265 | /// builder.start_pattern()?; |
266 | /// // Since we use the Pike VM below for searching, we need to add capturing |
267 | /// // states. If you're just going to build a DFA from the NFA, then capturing |
268 | /// // states do not need to be added. |
269 | /// let start = builder.add_capture_start(StateID::ZERO, 0, None)?; |
270 | /// let range = builder.add_range(Transition { |
271 | /// // We don't know the state ID of the 'next' state yet, so we just fill |
272 | /// // in a dummy 'ZERO' value. |
273 | /// start: b'a' , end: b'z' , next: StateID::ZERO, |
274 | /// })?; |
275 | /// // This state will point back to 'range', but also enable us to move ahead. |
276 | /// // That is, this implements the '+' repetition operator. We add 'range' and |
277 | /// // then 'end' below to this alternation. |
278 | /// let alt = builder.add_union(vec![])?; |
279 | /// // The final state before the match state, which serves to capture the |
280 | /// // end location of the match. |
281 | /// let end = builder.add_capture_end(StateID::ZERO, 0)?; |
282 | /// // The match state for our pattern. |
283 | /// let mat = builder.add_match()?; |
284 | /// // Now we fill in the transitions between states. |
285 | /// builder.patch(start, range)?; |
286 | /// builder.patch(range, alt)?; |
287 | /// // If we added 'end' before 'range', then we'd implement non-greedy |
288 | /// // matching, i.e., '+?'. |
289 | /// builder.patch(alt, range)?; |
290 | /// builder.patch(alt, end)?; |
291 | /// builder.patch(end, mat)?; |
292 | /// // We must explicitly finish pattern and provide the starting state ID for |
293 | /// // this particular pattern. |
294 | /// builder.finish_pattern(start)?; |
295 | /// // Finally, when we build the NFA, we provide the anchored and unanchored |
296 | /// // starting state IDs. Since we didn't bother with an unanchored prefix |
297 | /// // here, we only support anchored searching. Thus, both starting states are |
298 | /// // the same. |
299 | /// let nfa = builder.build(start, start)?; |
300 | /// |
301 | /// // Now build a Pike VM from our NFA, and use it for searching. This shows |
302 | /// // how we can use a regex engine without ever worrying about syntax! |
303 | /// let re = PikeVM::new_from_nfa(nfa)?; |
304 | /// let mut cache = re.create_cache(); |
305 | /// let mut caps = re.create_captures(); |
306 | /// let expected = Some(Match::must(0, 0..3)); |
307 | /// re.captures(&mut cache, "foo0" , &mut caps); |
308 | /// assert_eq!(expected, caps.get_match()); |
309 | /// |
310 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
311 | /// ``` |
312 | #[derive(Clone, Debug, Default)] |
313 | pub struct Builder { |
314 | /// The ID of the pattern that we're currently building. |
315 | /// |
316 | /// Callers are required to set (and unset) this by calling |
317 | /// {start,finish}_pattern. Otherwise, most methods will panic. |
318 | pattern_id: Option<PatternID>, |
319 | /// A sequence of intermediate NFA states. Once a state is added to this |
320 | /// sequence, it is assigned a state ID equivalent to its index. Once a |
321 | /// state is added, it is still expected to be mutated, e.g., to set its |
322 | /// transition to a state that didn't exist at the time it was added. |
323 | states: Vec<State>, |
324 | /// The starting states for each individual pattern. Starting at any |
325 | /// of these states will result in only an anchored search for the |
326 | /// corresponding pattern. The vec is indexed by pattern ID. When the NFA |
327 | /// contains a single regex, then `start_pattern[0]` and `start_anchored` |
328 | /// are always equivalent. |
329 | start_pattern: Vec<StateID>, |
330 | /// A map from pattern ID to capture group index to name. (If no name |
331 | /// exists, then a None entry is present. Thus, all capturing groups are |
332 | /// present in this mapping.) |
333 | /// |
334 | /// The outer vec is indexed by pattern ID, while the inner vec is indexed |
335 | /// by capture index offset for the corresponding pattern. |
336 | /// |
337 | /// The first capture group for each pattern is always unnamed and is thus |
338 | /// always None. |
339 | captures: Vec<Vec<Option<Arc<str>>>>, |
340 | /// The combined memory used by each of the 'State's in 'states'. This |
341 | /// only includes heap usage by each state, and not the size of the state |
342 | /// itself. In other words, this tracks heap memory used that isn't |
343 | /// captured via `size_of::<State>() * states.len()`. |
344 | memory_states: usize, |
345 | /// Whether this NFA only matches UTF-8 and whether regex engines using |
346 | /// this NFA for searching should report empty matches that split a |
347 | /// codepoint. |
348 | utf8: bool, |
349 | /// Whether this NFA should be matched in reverse or not. |
350 | reverse: bool, |
351 | /// The matcher to use for look-around assertions. |
352 | look_matcher: LookMatcher, |
353 | /// A size limit to respect when building an NFA. If the total heap memory |
354 | /// of the intermediate NFA states exceeds (or would exceed) this amount, |
355 | /// then an error is returned. |
356 | size_limit: Option<usize>, |
357 | } |
358 | |
359 | impl Builder { |
360 | /// Create a new builder for hand-assembling NFAs. |
361 | pub fn new() -> Builder { |
362 | Builder::default() |
363 | } |
364 | |
365 | /// Clear this builder. |
366 | /// |
367 | /// Clearing removes all state associated with building an NFA, but does |
368 | /// not reset configuration (such as size limits and whether the NFA |
369 | /// should only match UTF-8). After clearing, the builder can be reused to |
370 | /// assemble an entirely new NFA. |
371 | pub fn clear(&mut self) { |
372 | self.pattern_id = None; |
373 | self.states.clear(); |
374 | self.start_pattern.clear(); |
375 | self.captures.clear(); |
376 | self.memory_states = 0; |
377 | } |
378 | |
379 | /// Assemble a [`NFA`] from the states added so far. |
380 | /// |
381 | /// After building an NFA, more states may be added and `build` may be |
382 | /// called again. To reuse a builder to produce an entirely new NFA from |
383 | /// scratch, call the [`clear`](Builder::clear) method first. |
384 | /// |
385 | /// `start_anchored` refers to the ID of the starting state that anchored |
386 | /// searches should use. That is, searches who matches are limited to the |
387 | /// starting position of the search. |
388 | /// |
389 | /// `start_unanchored` refers to the ID of the starting state that |
390 | /// unanchored searches should use. This permits searches to report matches |
391 | /// that start after the beginning of the search. In cases where unanchored |
392 | /// searches are not supported, the unanchored starting state ID must be |
393 | /// the same as the anchored starting state ID. |
394 | /// |
395 | /// # Errors |
396 | /// |
397 | /// This returns an error if there was a problem producing the final NFA. |
398 | /// In particular, this might include an error if the capturing groups |
399 | /// added to this builder violate any of the invariants documented on |
400 | /// [`GroupInfo`](crate::util::captures::GroupInfo). |
401 | /// |
402 | /// # Panics |
403 | /// |
404 | /// If `start_pattern` was called, then `finish_pattern` must be called |
405 | /// before `build`, otherwise this panics. |
406 | /// |
407 | /// This may panic for other invalid uses of a builder. For example, if |
408 | /// a "start capture" state was added without a corresponding "end capture" |
409 | /// state. |
410 | pub fn build( |
411 | &self, |
412 | start_anchored: StateID, |
413 | start_unanchored: StateID, |
414 | ) -> Result<NFA, BuildError> { |
415 | assert!(self.pattern_id.is_none(), "must call 'finish_pattern' first" ); |
416 | debug!( |
417 | "intermediate NFA compilation via builder is complete, \ |
418 | intermediate NFA size: {} states, {} bytes on heap" , |
419 | self.states.len(), |
420 | self.memory_usage(), |
421 | ); |
422 | |
423 | let mut nfa = nfa::Inner::default(); |
424 | nfa.set_utf8(self.utf8); |
425 | nfa.set_reverse(self.reverse); |
426 | nfa.set_look_matcher(self.look_matcher.clone()); |
427 | // A set of compiler internal state IDs that correspond to states |
428 | // that are exclusively epsilon transitions, i.e., goto instructions, |
429 | // combined with the state that they point to. This is used to |
430 | // record said states while transforming the compiler's internal NFA |
431 | // representation to the external form. |
432 | let mut empties = vec![]; |
433 | // A map used to re-map state IDs when translating this builder's |
434 | // internal NFA state representation to the final NFA representation. |
435 | let mut remap = vec![]; |
436 | remap.resize(self.states.len(), StateID::ZERO); |
437 | |
438 | nfa.set_starts(start_anchored, start_unanchored, &self.start_pattern); |
439 | nfa.set_captures(&self.captures).map_err(BuildError::captures)?; |
440 | // The idea here is to convert our intermediate states to their final |
441 | // form. The only real complexity here is the process of converting |
442 | // transitions, which are expressed in terms of state IDs. The new |
443 | // set of states will be smaller because of partial epsilon removal, |
444 | // so the state IDs will not be the same. |
445 | for (sid, state) in self.states.iter().with_state_ids() { |
446 | match *state { |
447 | State::Empty { next } => { |
448 | // Since we're removing empty states, we need to handle |
449 | // them later since we don't yet know which new state this |
450 | // empty state will be mapped to. |
451 | empties.push((sid, next)); |
452 | } |
453 | State::ByteRange { trans } => { |
454 | remap[sid] = nfa.add(nfa::State::ByteRange { trans }); |
455 | } |
456 | State::Sparse { ref transitions } => { |
457 | remap[sid] = match transitions.len() { |
458 | 0 => nfa.add(nfa::State::Fail), |
459 | 1 => nfa.add(nfa::State::ByteRange { |
460 | trans: transitions[0], |
461 | }), |
462 | _ => { |
463 | let transitions = |
464 | transitions.to_vec().into_boxed_slice(); |
465 | let sparse = SparseTransitions { transitions }; |
466 | nfa.add(nfa::State::Sparse(sparse)) |
467 | } |
468 | } |
469 | } |
470 | State::Look { look, next } => { |
471 | remap[sid] = nfa.add(nfa::State::Look { look, next }); |
472 | } |
473 | State::CaptureStart { pattern_id, group_index, next } => { |
474 | // We can't remove this empty state because of the side |
475 | // effect of capturing an offset for this capture slot. |
476 | let slot = nfa |
477 | .group_info() |
478 | .slot(pattern_id, group_index.as_usize()) |
479 | .expect("invalid capture index" ); |
480 | let slot = |
481 | SmallIndex::new(slot).expect("a small enough slot" ); |
482 | remap[sid] = nfa.add(nfa::State::Capture { |
483 | next, |
484 | pattern_id, |
485 | group_index, |
486 | slot, |
487 | }); |
488 | } |
489 | State::CaptureEnd { pattern_id, group_index, next } => { |
490 | // We can't remove this empty state because of the side |
491 | // effect of capturing an offset for this capture slot. |
492 | // Also, this always succeeds because we check that all |
493 | // slot indices are valid for all capture indices when they |
494 | // are initially added. |
495 | let slot = nfa |
496 | .group_info() |
497 | .slot(pattern_id, group_index.as_usize()) |
498 | .expect("invalid capture index" ) |
499 | .checked_add(1) |
500 | .unwrap(); |
501 | let slot = |
502 | SmallIndex::new(slot).expect("a small enough slot" ); |
503 | remap[sid] = nfa.add(nfa::State::Capture { |
504 | next, |
505 | pattern_id, |
506 | group_index, |
507 | slot, |
508 | }); |
509 | } |
510 | State::Union { ref alternates } => { |
511 | if alternates.is_empty() { |
512 | remap[sid] = nfa.add(nfa::State::Fail); |
513 | } else if alternates.len() == 1 { |
514 | empties.push((sid, alternates[0])); |
515 | remap[sid] = alternates[0]; |
516 | } else if alternates.len() == 2 { |
517 | remap[sid] = nfa.add(nfa::State::BinaryUnion { |
518 | alt1: alternates[0], |
519 | alt2: alternates[1], |
520 | }); |
521 | } else { |
522 | let alternates = |
523 | alternates.to_vec().into_boxed_slice(); |
524 | remap[sid] = nfa.add(nfa::State::Union { alternates }); |
525 | } |
526 | } |
527 | State::UnionReverse { ref alternates } => { |
528 | if alternates.is_empty() { |
529 | remap[sid] = nfa.add(nfa::State::Fail); |
530 | } else if alternates.len() == 1 { |
531 | empties.push((sid, alternates[0])); |
532 | remap[sid] = alternates[0]; |
533 | } else if alternates.len() == 2 { |
534 | remap[sid] = nfa.add(nfa::State::BinaryUnion { |
535 | alt1: alternates[1], |
536 | alt2: alternates[0], |
537 | }); |
538 | } else { |
539 | let mut alternates = |
540 | alternates.to_vec().into_boxed_slice(); |
541 | alternates.reverse(); |
542 | remap[sid] = nfa.add(nfa::State::Union { alternates }); |
543 | } |
544 | } |
545 | State::Fail => { |
546 | remap[sid] = nfa.add(nfa::State::Fail); |
547 | } |
548 | State::Match { pattern_id } => { |
549 | remap[sid] = nfa.add(nfa::State::Match { pattern_id }); |
550 | } |
551 | } |
552 | } |
553 | // Some of the new states still point to empty state IDs, so we need to |
554 | // follow each of them and remap the empty state IDs to their non-empty |
555 | // state IDs. |
556 | // |
557 | // We also keep track of which states we've already mapped. This helps |
558 | // avoid quadratic behavior in a long chain of empty states. For |
559 | // example, in 'a{0}{50000}'. |
560 | let mut remapped = vec![false; self.states.len()]; |
561 | for &(empty_id, empty_next) in empties.iter() { |
562 | if remapped[empty_id] { |
563 | continue; |
564 | } |
565 | // empty states can point to other empty states, forming a chain. |
566 | // So we must follow the chain until the end, which must end at |
567 | // a non-empty state, and therefore, a state that is correctly |
568 | // remapped. We are guaranteed to terminate because our compiler |
569 | // never builds a loop among only empty states. |
570 | let mut new_next = empty_next; |
571 | while let Some(next) = self.states[new_next].goto() { |
572 | new_next = next; |
573 | } |
574 | remap[empty_id] = remap[new_next]; |
575 | remapped[empty_id] = true; |
576 | |
577 | // Now that we've remapped the main 'empty_id' above, we re-follow |
578 | // the chain from above and remap every empty state we found along |
579 | // the way to our ultimate non-empty target. We are careful to set |
580 | // 'remapped' to true for each such state. We thus will not need |
581 | // to re-compute this chain for any subsequent empty states in |
582 | // 'empties' that are part of this chain. |
583 | let mut next2 = empty_next; |
584 | while let Some(next) = self.states[next2].goto() { |
585 | remap[next2] = remap[new_next]; |
586 | remapped[next2] = true; |
587 | next2 = next; |
588 | } |
589 | } |
590 | // Finally remap all of the state IDs. |
591 | nfa.remap(&remap); |
592 | let final_nfa = nfa.into_nfa(); |
593 | debug!( |
594 | "NFA compilation via builder complete, \ |
595 | final NFA size: {} states, {} bytes on heap, \ |
596 | has empty? {:?}, utf8? {:?}" , |
597 | final_nfa.states().len(), |
598 | final_nfa.memory_usage(), |
599 | final_nfa.has_empty(), |
600 | final_nfa.is_utf8(), |
601 | ); |
602 | Ok(final_nfa) |
603 | } |
604 | |
605 | /// Start the assembly of a pattern in this NFA. |
606 | /// |
607 | /// Upon success, this returns the identifier for the new pattern. |
608 | /// Identifiers start at `0` and are incremented by 1 for each new pattern. |
609 | /// |
610 | /// It is necessary to call this routine before adding capturing states. |
611 | /// Otherwise, any other NFA state may be added before starting a pattern. |
612 | /// |
613 | /// # Errors |
614 | /// |
615 | /// If the pattern identifier space is exhausted, then this returns an |
616 | /// error. |
617 | /// |
618 | /// # Panics |
619 | /// |
620 | /// If this is called while assembling another pattern (i.e., before |
621 | /// `finish_pattern` is called), then this panics. |
622 | pub fn start_pattern(&mut self) -> Result<PatternID, BuildError> { |
623 | assert!(self.pattern_id.is_none(), "must call 'finish_pattern' first" ); |
624 | |
625 | let proposed = self.start_pattern.len(); |
626 | let pid = PatternID::new(proposed) |
627 | .map_err(|_| BuildError::too_many_patterns(proposed))?; |
628 | self.pattern_id = Some(pid); |
629 | // This gets filled in when 'finish_pattern' is called. |
630 | self.start_pattern.push(StateID::ZERO); |
631 | Ok(pid) |
632 | } |
633 | |
634 | /// Finish the assembly of a pattern in this NFA. |
635 | /// |
636 | /// Upon success, this returns the identifier for the new pattern. |
637 | /// Identifiers start at `0` and are incremented by 1 for each new |
638 | /// pattern. This is the same identifier returned by the corresponding |
639 | /// `start_pattern` call. |
640 | /// |
641 | /// Note that `start_pattern` and `finish_pattern` pairs cannot be |
642 | /// interleaved or nested. A correct `finish_pattern` call _always_ |
643 | /// corresponds to the most recently called `start_pattern` routine. |
644 | /// |
645 | /// # Errors |
646 | /// |
647 | /// This currently never returns an error, but this is subject to change. |
648 | /// |
649 | /// # Panics |
650 | /// |
651 | /// If this is called without a corresponding `start_pattern` call, then |
652 | /// this panics. |
653 | pub fn finish_pattern( |
654 | &mut self, |
655 | start_id: StateID, |
656 | ) -> Result<PatternID, BuildError> { |
657 | let pid = self.current_pattern_id(); |
658 | self.start_pattern[pid] = start_id; |
659 | self.pattern_id = None; |
660 | Ok(pid) |
661 | } |
662 | |
663 | /// Returns the pattern identifier of the current pattern. |
664 | /// |
665 | /// # Panics |
666 | /// |
667 | /// If this doesn't occur after a `start_pattern` call and before the |
668 | /// corresponding `finish_pattern` call, then this panics. |
669 | pub fn current_pattern_id(&self) -> PatternID { |
670 | self.pattern_id.expect("must call 'start_pattern' first" ) |
671 | } |
672 | |
673 | /// Returns the number of patterns added to this builder so far. |
674 | /// |
675 | /// This only includes patterns that have had `finish_pattern` called |
676 | /// for them. |
677 | pub fn pattern_len(&self) -> usize { |
678 | self.start_pattern.len() |
679 | } |
680 | |
681 | /// Add an "empty" NFA state. |
682 | /// |
683 | /// An "empty" NFA state is a state with a single unconditional epsilon |
684 | /// transition to another NFA state. Such empty states are removed before |
685 | /// building the final [`NFA`] (which has no such "empty" states), but they |
686 | /// can be quite useful in the construction process of an NFA. |
687 | /// |
688 | /// # Errors |
689 | /// |
690 | /// This returns an error if the state identifier space is exhausted, or if |
691 | /// the configured heap size limit has been exceeded. |
692 | pub fn add_empty(&mut self) -> Result<StateID, BuildError> { |
693 | self.add(State::Empty { next: StateID::ZERO }) |
694 | } |
695 | |
696 | /// Add a "union" NFA state. |
697 | /// |
698 | /// A "union" NFA state that contains zero or more unconditional epsilon |
699 | /// transitions to other NFA states. The order of these transitions |
700 | /// reflects a priority order where earlier transitions are preferred over |
701 | /// later transitions. |
702 | /// |
703 | /// Callers may provide an empty set of alternates to this method call, and |
704 | /// then later add transitions via `patch`. At final build time, a "union" |
705 | /// state with no alternates is converted to a "fail" state, and a "union" |
706 | /// state with exactly one alternate is treated as if it were an "empty" |
707 | /// state. |
708 | /// |
709 | /// # Errors |
710 | /// |
711 | /// This returns an error if the state identifier space is exhausted, or if |
712 | /// the configured heap size limit has been exceeded. |
713 | pub fn add_union( |
714 | &mut self, |
715 | alternates: Vec<StateID>, |
716 | ) -> Result<StateID, BuildError> { |
717 | self.add(State::Union { alternates }) |
718 | } |
719 | |
720 | /// Add a "reverse union" NFA state. |
721 | /// |
722 | /// A "reverse union" NFA state contains zero or more unconditional epsilon |
723 | /// transitions to other NFA states. The order of these transitions |
724 | /// reflects a priority order where later transitions are preferred |
725 | /// over earlier transitions. This is an inverted priority order when |
726 | /// compared to `add_union`. This is useful, for example, for implementing |
727 | /// non-greedy repetition operators. |
728 | /// |
729 | /// Callers may provide an empty set of alternates to this method call, and |
730 | /// then later add transitions via `patch`. At final build time, a "reverse |
731 | /// union" state with no alternates is converted to a "fail" state, and a |
732 | /// "reverse union" state with exactly one alternate is treated as if it |
733 | /// were an "empty" state. |
734 | /// |
735 | /// # Errors |
736 | /// |
737 | /// This returns an error if the state identifier space is exhausted, or if |
738 | /// the configured heap size limit has been exceeded. |
739 | pub fn add_union_reverse( |
740 | &mut self, |
741 | alternates: Vec<StateID>, |
742 | ) -> Result<StateID, BuildError> { |
743 | self.add(State::UnionReverse { alternates }) |
744 | } |
745 | |
746 | /// Add a "range" NFA state. |
747 | /// |
748 | /// A "range" NFA state is a state with one outgoing transition to another |
749 | /// state, where that transition may only be followed if the current input |
750 | /// byte falls between a range of bytes given. |
751 | /// |
752 | /// # Errors |
753 | /// |
754 | /// This returns an error if the state identifier space is exhausted, or if |
755 | /// the configured heap size limit has been exceeded. |
756 | pub fn add_range( |
757 | &mut self, |
758 | trans: Transition, |
759 | ) -> Result<StateID, BuildError> { |
760 | self.add(State::ByteRange { trans }) |
761 | } |
762 | |
763 | /// Add a "sparse" NFA state. |
764 | /// |
765 | /// A "sparse" NFA state contains zero or more outgoing transitions, where |
766 | /// the transition to be followed (if any) is chosen based on whether the |
767 | /// current input byte falls in the range of one such transition. The |
768 | /// transitions given *must* be non-overlapping and in ascending order. (A |
769 | /// "sparse" state with no transitions is equivalent to a "fail" state.) |
770 | /// |
771 | /// A "sparse" state is like adding a "union" state and pointing it at a |
772 | /// bunch of "range" states, except that the different alternates have |
773 | /// equal priority. |
774 | /// |
775 | /// Note that a "sparse" state is the only state that cannot be patched. |
776 | /// This is because a "sparse" state has many transitions, each of which |
777 | /// may point to a different NFA state. Moreover, adding more such |
778 | /// transitions requires more than just an NFA state ID to point to. It |
779 | /// also requires a byte range. The `patch` routine does not support the |
780 | /// additional information required. Therefore, callers must ensure that |
781 | /// all outgoing transitions for this state are included when `add_sparse` |
782 | /// is called. There is no way to add more later. |
783 | /// |
784 | /// # Errors |
785 | /// |
786 | /// This returns an error if the state identifier space is exhausted, or if |
787 | /// the configured heap size limit has been exceeded. |
788 | /// |
789 | /// # Panics |
790 | /// |
791 | /// This routine _may_ panic if the transitions given overlap or are not |
792 | /// in ascending order. |
793 | pub fn add_sparse( |
794 | &mut self, |
795 | transitions: Vec<Transition>, |
796 | ) -> Result<StateID, BuildError> { |
797 | self.add(State::Sparse { transitions }) |
798 | } |
799 | |
800 | /// Add a "look" NFA state. |
801 | /// |
802 | /// A "look" NFA state corresponds to a state with exactly one |
803 | /// *conditional* epsilon transition to another NFA state. Namely, it |
804 | /// represents one of a small set of simplistic look-around operators. |
805 | /// |
806 | /// Callers may provide a "dummy" state ID (typically [`StateID::ZERO`]), |
807 | /// and then change it later with [`patch`](Builder::patch). |
808 | /// |
809 | /// # Errors |
810 | /// |
811 | /// This returns an error if the state identifier space is exhausted, or if |
812 | /// the configured heap size limit has been exceeded. |
813 | pub fn add_look( |
814 | &mut self, |
815 | next: StateID, |
816 | look: Look, |
817 | ) -> Result<StateID, BuildError> { |
818 | self.add(State::Look { look, next }) |
819 | } |
820 | |
821 | /// Add a "start capture" NFA state. |
822 | /// |
823 | /// A "start capture" NFA state corresponds to a state with exactly one |
824 | /// outgoing unconditional epsilon transition to another state. Unlike |
825 | /// "empty" states, a "start capture" state also carries with it an |
826 | /// instruction for saving the current position of input to a particular |
827 | /// location in memory. NFA simulations, like the Pike VM, may use this |
828 | /// information to report the match locations of capturing groups in a |
829 | /// regex pattern. |
830 | /// |
831 | /// If the corresponding capturing group has a name, then callers should |
832 | /// include it here. |
833 | /// |
834 | /// Callers may provide a "dummy" state ID (typically [`StateID::ZERO`]), |
835 | /// and then change it later with [`patch`](Builder::patch). |
836 | /// |
837 | /// Note that unlike `start_pattern`/`finish_pattern`, capturing start and |
838 | /// end states may be interleaved. Indeed, it is typical for many "start |
839 | /// capture" NFA states to appear before the first "end capture" state. |
840 | /// |
841 | /// # Errors |
842 | /// |
843 | /// This returns an error if the state identifier space is exhausted, or if |
844 | /// the configured heap size limit has been exceeded or if the given |
845 | /// capture index overflows `usize`. |
846 | /// |
847 | /// While the above are the only conditions in which this routine can |
848 | /// currently return an error, it is possible to call this method with an |
849 | /// inputs that results in the final `build()` step failing to produce an |
850 | /// NFA. For example, if one adds two distinct capturing groups with the |
851 | /// same name, then that will result in `build()` failing with an error. |
852 | /// |
853 | /// See the [`GroupInfo`](crate::util::captures::GroupInfo) type for |
854 | /// more information on what qualifies as valid capturing groups. |
855 | /// |
856 | /// # Example |
857 | /// |
858 | /// This example shows that an error occurs when one tries to add multiple |
859 | /// capturing groups with the same name to the same pattern. |
860 | /// |
861 | /// ``` |
862 | /// use regex_automata::{ |
863 | /// nfa::thompson::Builder, |
864 | /// util::primitives::StateID, |
865 | /// }; |
866 | /// |
867 | /// let name = Some(std::sync::Arc::from("foo" )); |
868 | /// let mut builder = Builder::new(); |
869 | /// builder.start_pattern()?; |
870 | /// // 0th capture group should always be unnamed. |
871 | /// let start = builder.add_capture_start(StateID::ZERO, 0, None)?; |
872 | /// // OK |
873 | /// builder.add_capture_start(StateID::ZERO, 1, name.clone())?; |
874 | /// // This is not OK, but 'add_capture_start' still succeeds. We don't |
875 | /// // get an error until we call 'build' below. Without this call, the |
876 | /// // call to 'build' below would succeed. |
877 | /// builder.add_capture_start(StateID::ZERO, 2, name.clone())?; |
878 | /// // Finish our pattern so we can try to build the NFA. |
879 | /// builder.finish_pattern(start)?; |
880 | /// let result = builder.build(start, start); |
881 | /// assert!(result.is_err()); |
882 | /// |
883 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
884 | /// ``` |
885 | /// |
886 | /// However, adding multiple capturing groups with the same name to |
887 | /// distinct patterns is okay: |
888 | /// |
889 | /// ``` |
890 | /// use std::sync::Arc; |
891 | /// |
892 | /// use regex_automata::{ |
893 | /// nfa::thompson::{pikevm::PikeVM, Builder, Transition}, |
894 | /// util::{ |
895 | /// captures::Captures, |
896 | /// primitives::{PatternID, StateID}, |
897 | /// }, |
898 | /// Span, |
899 | /// }; |
900 | /// |
901 | /// // Hand-compile the patterns '(?P<foo>[a-z])' and '(?P<foo>[A-Z])'. |
902 | /// let mut builder = Builder::new(); |
903 | /// // We compile them to support an unanchored search, which requires |
904 | /// // adding an implicit '(?s-u:.)*?' prefix before adding either pattern. |
905 | /// let unanchored_prefix = builder.add_union_reverse(vec![])?; |
906 | /// let any = builder.add_range(Transition { |
907 | /// start: b' \x00' , end: b' \xFF' , next: StateID::ZERO, |
908 | /// })?; |
909 | /// builder.patch(unanchored_prefix, any)?; |
910 | /// builder.patch(any, unanchored_prefix)?; |
911 | /// |
912 | /// // Compile an alternation that permits matching multiple patterns. |
913 | /// let alt = builder.add_union(vec![])?; |
914 | /// builder.patch(unanchored_prefix, alt)?; |
915 | /// |
916 | /// // Compile '(?P<foo>[a-z]+)'. |
917 | /// builder.start_pattern()?; |
918 | /// let start0 = builder.add_capture_start(StateID::ZERO, 0, None)?; |
919 | /// // N.B. 0th capture group must always be unnamed. |
920 | /// let foo_start0 = builder.add_capture_start( |
921 | /// StateID::ZERO, 1, Some(Arc::from("foo" )), |
922 | /// )?; |
923 | /// let lowercase = builder.add_range(Transition { |
924 | /// start: b'a' , end: b'z' , next: StateID::ZERO, |
925 | /// })?; |
926 | /// let foo_end0 = builder.add_capture_end(StateID::ZERO, 1)?; |
927 | /// let end0 = builder.add_capture_end(StateID::ZERO, 0)?; |
928 | /// let match0 = builder.add_match()?; |
929 | /// builder.patch(start0, foo_start0)?; |
930 | /// builder.patch(foo_start0, lowercase)?; |
931 | /// builder.patch(lowercase, foo_end0)?; |
932 | /// builder.patch(foo_end0, end0)?; |
933 | /// builder.patch(end0, match0)?; |
934 | /// builder.finish_pattern(start0)?; |
935 | /// |
936 | /// // Compile '(?P<foo>[A-Z]+)'. |
937 | /// builder.start_pattern()?; |
938 | /// let start1 = builder.add_capture_start(StateID::ZERO, 0, None)?; |
939 | /// // N.B. 0th capture group must always be unnamed. |
940 | /// let foo_start1 = builder.add_capture_start( |
941 | /// StateID::ZERO, 1, Some(Arc::from("foo" )), |
942 | /// )?; |
943 | /// let uppercase = builder.add_range(Transition { |
944 | /// start: b'A' , end: b'Z' , next: StateID::ZERO, |
945 | /// })?; |
946 | /// let foo_end1 = builder.add_capture_end(StateID::ZERO, 1)?; |
947 | /// let end1 = builder.add_capture_end(StateID::ZERO, 0)?; |
948 | /// let match1 = builder.add_match()?; |
949 | /// builder.patch(start1, foo_start1)?; |
950 | /// builder.patch(foo_start1, uppercase)?; |
951 | /// builder.patch(uppercase, foo_end1)?; |
952 | /// builder.patch(foo_end1, end1)?; |
953 | /// builder.patch(end1, match1)?; |
954 | /// builder.finish_pattern(start1)?; |
955 | /// |
956 | /// // Now add the patterns to our alternation that we started above. |
957 | /// builder.patch(alt, start0)?; |
958 | /// builder.patch(alt, start1)?; |
959 | /// |
960 | /// // Finally build the NFA. The first argument is the anchored starting |
961 | /// // state (the pattern alternation) where as the second is the |
962 | /// // unanchored starting state (the unanchored prefix). |
963 | /// let nfa = builder.build(alt, unanchored_prefix)?; |
964 | /// |
965 | /// // Now build a Pike VM from our NFA and access the 'foo' capture |
966 | /// // group regardless of which pattern matched, since it is defined |
967 | /// // for both patterns. |
968 | /// let vm = PikeVM::new_from_nfa(nfa)?; |
969 | /// let mut cache = vm.create_cache(); |
970 | /// let caps: Vec<Captures> = |
971 | /// vm.captures_iter(&mut cache, "0123aAaAA" ).collect(); |
972 | /// assert_eq!(5, caps.len()); |
973 | /// |
974 | /// assert_eq!(Some(PatternID::must(0)), caps[0].pattern()); |
975 | /// assert_eq!(Some(Span::from(4..5)), caps[0].get_group_by_name("foo" )); |
976 | /// |
977 | /// assert_eq!(Some(PatternID::must(1)), caps[1].pattern()); |
978 | /// assert_eq!(Some(Span::from(5..6)), caps[1].get_group_by_name("foo" )); |
979 | /// |
980 | /// assert_eq!(Some(PatternID::must(0)), caps[2].pattern()); |
981 | /// assert_eq!(Some(Span::from(6..7)), caps[2].get_group_by_name("foo" )); |
982 | /// |
983 | /// assert_eq!(Some(PatternID::must(1)), caps[3].pattern()); |
984 | /// assert_eq!(Some(Span::from(7..8)), caps[3].get_group_by_name("foo" )); |
985 | /// |
986 | /// assert_eq!(Some(PatternID::must(1)), caps[4].pattern()); |
987 | /// assert_eq!(Some(Span::from(8..9)), caps[4].get_group_by_name("foo" )); |
988 | /// |
989 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
990 | /// ``` |
991 | pub fn add_capture_start( |
992 | &mut self, |
993 | next: StateID, |
994 | group_index: u32, |
995 | name: Option<Arc<str>>, |
996 | ) -> Result<StateID, BuildError> { |
997 | let pid = self.current_pattern_id(); |
998 | let group_index = match SmallIndex::try_from(group_index) { |
999 | Err(_) => { |
1000 | return Err(BuildError::invalid_capture_index(group_index)) |
1001 | } |
1002 | Ok(group_index) => group_index, |
1003 | }; |
1004 | // Make sure we have space to insert our (pid,index)|-->name mapping. |
1005 | if pid.as_usize() >= self.captures.len() { |
1006 | for _ in 0..=(pid.as_usize() - self.captures.len()) { |
1007 | self.captures.push(vec![]); |
1008 | } |
1009 | } |
1010 | // In the case where 'group_index < self.captures[pid].len()', it means |
1011 | // that we are adding a duplicate capture group. This is somewhat |
1012 | // weird, but permissible because the capture group itself can be |
1013 | // repeated in the syntax. For example, '([a-z]){4}' will produce 4 |
1014 | // capture groups. In practice, only the last will be set at search |
1015 | // time when a match occurs. For duplicates, we don't need to push |
1016 | // anything other than a CaptureStart NFA state. |
1017 | if group_index.as_usize() >= self.captures[pid].len() { |
1018 | // For discontiguous indices, push placeholders for earlier capture |
1019 | // groups that weren't explicitly added. |
1020 | for _ in 0..(group_index.as_usize() - self.captures[pid].len()) { |
1021 | self.captures[pid].push(None); |
1022 | } |
1023 | self.captures[pid].push(name); |
1024 | } |
1025 | self.add(State::CaptureStart { pattern_id: pid, group_index, next }) |
1026 | } |
1027 | |
1028 | /// Add a "end capture" NFA state. |
1029 | /// |
1030 | /// A "end capture" NFA state corresponds to a state with exactly one |
1031 | /// outgoing unconditional epsilon transition to another state. Unlike |
1032 | /// "empty" states, a "end capture" state also carries with it an |
1033 | /// instruction for saving the current position of input to a particular |
1034 | /// location in memory. NFA simulations, like the Pike VM, may use this |
1035 | /// information to report the match locations of capturing groups in a |
1036 | /// |
1037 | /// Callers may provide a "dummy" state ID (typically [`StateID::ZERO`]), |
1038 | /// and then change it later with [`patch`](Builder::patch). |
1039 | /// |
1040 | /// Note that unlike `start_pattern`/`finish_pattern`, capturing start and |
1041 | /// end states may be interleaved. Indeed, it is typical for many "start |
1042 | /// capture" NFA states to appear before the first "end capture" state. |
1043 | /// |
1044 | /// # Errors |
1045 | /// |
1046 | /// This returns an error if the state identifier space is exhausted, or if |
1047 | /// the configured heap size limit has been exceeded or if the given |
1048 | /// capture index overflows `usize`. |
1049 | /// |
1050 | /// While the above are the only conditions in which this routine can |
1051 | /// currently return an error, it is possible to call this method with an |
1052 | /// inputs that results in the final `build()` step failing to produce an |
1053 | /// NFA. For example, if one adds two distinct capturing groups with the |
1054 | /// same name, then that will result in `build()` failing with an error. |
1055 | /// |
1056 | /// See the [`GroupInfo`](crate::util::captures::GroupInfo) type for |
1057 | /// more information on what qualifies as valid capturing groups. |
1058 | pub fn add_capture_end( |
1059 | &mut self, |
1060 | next: StateID, |
1061 | group_index: u32, |
1062 | ) -> Result<StateID, BuildError> { |
1063 | let pid = self.current_pattern_id(); |
1064 | let group_index = match SmallIndex::try_from(group_index) { |
1065 | Err(_) => { |
1066 | return Err(BuildError::invalid_capture_index(group_index)) |
1067 | } |
1068 | Ok(group_index) => group_index, |
1069 | }; |
1070 | self.add(State::CaptureEnd { pattern_id: pid, group_index, next }) |
1071 | } |
1072 | |
1073 | /// Adds a "fail" NFA state. |
1074 | /// |
1075 | /// A "fail" state is simply a state that has no outgoing transitions. It |
1076 | /// acts as a way to cause a search to stop without reporting a match. |
1077 | /// For example, one way to represent an NFA with zero patterns is with a |
1078 | /// single "fail" state. |
1079 | /// |
1080 | /// # Errors |
1081 | /// |
1082 | /// This returns an error if the state identifier space is exhausted, or if |
1083 | /// the configured heap size limit has been exceeded. |
1084 | pub fn add_fail(&mut self) -> Result<StateID, BuildError> { |
1085 | self.add(State::Fail) |
1086 | } |
1087 | |
1088 | /// Adds a "match" NFA state. |
1089 | /// |
1090 | /// A "match" state has no outgoing transitions (just like a "fail" |
1091 | /// state), but it has special significance in that if a search enters |
1092 | /// this state, then a match has been found. The match state that is added |
1093 | /// automatically has the current pattern ID associated with it. This is |
1094 | /// used to report the matching pattern ID at search time. |
1095 | /// |
1096 | /// # Errors |
1097 | /// |
1098 | /// This returns an error if the state identifier space is exhausted, or if |
1099 | /// the configured heap size limit has been exceeded. |
1100 | /// |
1101 | /// # Panics |
1102 | /// |
1103 | /// This must be called after a `start_pattern` call but before the |
1104 | /// corresponding `finish_pattern` call. Otherwise, it panics. |
1105 | pub fn add_match(&mut self) -> Result<StateID, BuildError> { |
1106 | let pattern_id = self.current_pattern_id(); |
1107 | let sid = self.add(State::Match { pattern_id })?; |
1108 | Ok(sid) |
1109 | } |
1110 | |
1111 | /// The common implementation of "add a state." It handles the common |
1112 | /// error cases of state ID exhausting (by owning state ID allocation) and |
1113 | /// whether the size limit has been exceeded. |
1114 | fn add(&mut self, state: State) -> Result<StateID, BuildError> { |
1115 | let id = StateID::new(self.states.len()) |
1116 | .map_err(|_| BuildError::too_many_states(self.states.len()))?; |
1117 | self.memory_states += state.memory_usage(); |
1118 | self.states.push(state); |
1119 | self.check_size_limit()?; |
1120 | Ok(id) |
1121 | } |
1122 | |
1123 | /// Add a transition from one state to another. |
1124 | /// |
1125 | /// This routine is called "patch" since it is very common to add the |
1126 | /// states you want, typically with "dummy" state ID transitions, and then |
1127 | /// "patch" in the real state IDs later. This is because you don't always |
1128 | /// know all of the necessary state IDs to add because they might not |
1129 | /// exist yet. |
1130 | /// |
1131 | /// # Errors |
1132 | /// |
1133 | /// This may error if patching leads to an increase in heap usage beyond |
1134 | /// the configured size limit. Heap usage only grows when patching adds a |
1135 | /// new transition (as in the case of a "union" state). |
1136 | /// |
1137 | /// # Panics |
1138 | /// |
1139 | /// This panics if `from` corresponds to a "sparse" state. When "sparse" |
1140 | /// states are added, there is no way to patch them after-the-fact. (If you |
1141 | /// have a use case where this would be helpful, please file an issue. It |
1142 | /// will likely require a new API.) |
1143 | pub fn patch( |
1144 | &mut self, |
1145 | from: StateID, |
1146 | to: StateID, |
1147 | ) -> Result<(), BuildError> { |
1148 | let old_memory_states = self.memory_states; |
1149 | match self.states[from] { |
1150 | State::Empty { ref mut next } => { |
1151 | *next = to; |
1152 | } |
1153 | State::ByteRange { ref mut trans } => { |
1154 | trans.next = to; |
1155 | } |
1156 | State::Sparse { .. } => { |
1157 | panic!("cannot patch from a sparse NFA state" ) |
1158 | } |
1159 | State::Look { ref mut next, .. } => { |
1160 | *next = to; |
1161 | } |
1162 | State::Union { ref mut alternates } => { |
1163 | alternates.push(to); |
1164 | self.memory_states += mem::size_of::<StateID>(); |
1165 | } |
1166 | State::UnionReverse { ref mut alternates } => { |
1167 | alternates.push(to); |
1168 | self.memory_states += mem::size_of::<StateID>(); |
1169 | } |
1170 | State::CaptureStart { ref mut next, .. } => { |
1171 | *next = to; |
1172 | } |
1173 | State::CaptureEnd { ref mut next, .. } => { |
1174 | *next = to; |
1175 | } |
1176 | State::Fail => {} |
1177 | State::Match { .. } => {} |
1178 | } |
1179 | if old_memory_states != self.memory_states { |
1180 | self.check_size_limit()?; |
1181 | } |
1182 | Ok(()) |
1183 | } |
1184 | |
1185 | /// Set whether the NFA produced by this builder should only match UTF-8. |
1186 | /// |
1187 | /// This should be set when both of the following are true: |
1188 | /// |
1189 | /// 1. The caller guarantees that the NFA created by this build will only |
1190 | /// report non-empty matches with spans that are valid UTF-8. |
1191 | /// 2. The caller desires regex engines using this NFA to avoid reporting |
1192 | /// empty matches with a span that splits a valid UTF-8 encoded codepoint. |
1193 | /// |
1194 | /// Property (1) is not checked. Instead, this requires the caller to |
1195 | /// promise that it is true. Property (2) corresponds to the behavior of |
1196 | /// regex engines using the NFA created by this builder. Namely, there |
1197 | /// is no way in the NFA's graph itself to say that empty matches found |
1198 | /// by, for example, the regex `a*` will fall on valid UTF-8 boundaries. |
1199 | /// Instead, this option is used to communicate the UTF-8 semantic to regex |
1200 | /// engines that will typically implement it as a post-processing step by |
1201 | /// filtering out empty matches that don't fall on UTF-8 boundaries. |
1202 | /// |
1203 | /// If you're building an NFA from an HIR (and not using a |
1204 | /// [`thompson::Compiler`](crate::nfa::thompson::Compiler)), then you can |
1205 | /// use the [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) |
1206 | /// option to guarantee that if the HIR detects a non-empty match, then it |
1207 | /// is guaranteed to be valid UTF-8. |
1208 | /// |
1209 | /// Note that property (2) does *not* specify the behavior of executing |
1210 | /// a search on a haystack that is not valid UTF-8. Therefore, if you're |
1211 | /// *not* running this NFA on strings that are guaranteed to be valid |
1212 | /// UTF-8, you almost certainly do not want to enable this option. |
1213 | /// Similarly, if you are running the NFA on strings that *are* guaranteed |
1214 | /// to be valid UTF-8, then you almost certainly want to enable this option |
1215 | /// unless you can guarantee that your NFA will never produce a zero-width |
1216 | /// match. |
1217 | /// |
1218 | /// It is disabled by default. |
1219 | pub fn set_utf8(&mut self, yes: bool) { |
1220 | self.utf8 = yes; |
1221 | } |
1222 | |
1223 | /// Returns whether UTF-8 mode is enabled for this builder. |
1224 | /// |
1225 | /// See [`Builder::set_utf8`] for more details about what "UTF-8 mode" is. |
1226 | pub fn get_utf8(&self) -> bool { |
1227 | self.utf8 |
1228 | } |
1229 | |
1230 | /// Sets whether the NFA produced by this builder should be matched in |
1231 | /// reverse or not. Generally speaking, when enabled, the NFA produced |
1232 | /// should be matched by moving backwards through a haystack, from a higher |
1233 | /// memory address to a lower memory address. |
1234 | /// |
1235 | /// See also [`NFA::is_reverse`] for more details. |
1236 | /// |
1237 | /// This is disabled by default, which means NFAs are by default matched |
1238 | /// in the forward direction. |
1239 | pub fn set_reverse(&mut self, yes: bool) { |
1240 | self.reverse = yes; |
1241 | } |
1242 | |
1243 | /// Returns whether reverse mode is enabled for this builder. |
1244 | /// |
1245 | /// See [`Builder::set_reverse`] for more details about what "reverse mode" |
1246 | /// is. |
1247 | pub fn get_reverse(&self) -> bool { |
1248 | self.reverse |
1249 | } |
1250 | |
1251 | /// Sets the look-around matcher that should be used for the resulting NFA. |
1252 | /// |
1253 | /// A look-around matcher can be used to configure how look-around |
1254 | /// assertions are matched. For example, a matcher might carry |
1255 | /// configuration that changes the line terminator used for `(?m:^)` and |
1256 | /// `(?m:$)` assertions. |
1257 | pub fn set_look_matcher(&mut self, m: LookMatcher) { |
1258 | self.look_matcher = m; |
1259 | } |
1260 | |
1261 | /// Returns the look-around matcher used for this builder. |
1262 | /// |
1263 | /// If a matcher was not explicitly set, then `LookMatcher::default()` is |
1264 | /// returned. |
1265 | pub fn get_look_matcher(&self) -> &LookMatcher { |
1266 | &self.look_matcher |
1267 | } |
1268 | |
1269 | /// Set the size limit on this builder. |
1270 | /// |
1271 | /// Setting the size limit will also check whether the NFA built so far |
1272 | /// fits within the given size limit. If it doesn't, then an error is |
1273 | /// returned. |
1274 | /// |
1275 | /// By default, there is no configured size limit. |
1276 | pub fn set_size_limit( |
1277 | &mut self, |
1278 | limit: Option<usize>, |
1279 | ) -> Result<(), BuildError> { |
1280 | self.size_limit = limit; |
1281 | self.check_size_limit() |
1282 | } |
1283 | |
1284 | /// Return the currently configured size limit. |
1285 | /// |
1286 | /// By default, this returns `None`, which corresponds to no configured |
1287 | /// size limit. |
1288 | pub fn get_size_limit(&self) -> Option<usize> { |
1289 | self.size_limit |
1290 | } |
1291 | |
1292 | /// Returns the heap memory usage, in bytes, used by the NFA states added |
1293 | /// so far. |
1294 | /// |
1295 | /// Note that this is an approximation of how big the final NFA will be. |
1296 | /// In practice, the final NFA will likely be a bit smaller because of |
1297 | /// its simpler state representation. (For example, using things like |
1298 | /// `Box<[StateID]>` instead of `Vec<StateID>`.) |
1299 | pub fn memory_usage(&self) -> usize { |
1300 | self.states.len() * mem::size_of::<State>() + self.memory_states |
1301 | } |
1302 | |
1303 | fn check_size_limit(&self) -> Result<(), BuildError> { |
1304 | if let Some(limit) = self.size_limit { |
1305 | if self.memory_usage() > limit { |
1306 | return Err(BuildError::exceeded_size_limit(limit)); |
1307 | } |
1308 | } |
1309 | Ok(()) |
1310 | } |
1311 | } |
1312 | |
1313 | #[cfg (test)] |
1314 | mod tests { |
1315 | use super::*; |
1316 | |
1317 | // This asserts that a builder state doesn't have its size changed. It is |
1318 | // *really* easy to accidentally increase the size, and thus potentially |
1319 | // dramatically increase the memory usage of NFA builder. |
1320 | // |
1321 | // This assert doesn't mean we absolutely cannot increase the size of a |
1322 | // builder state. We can. It's just here to make sure we do it knowingly |
1323 | // and intentionally. |
1324 | // |
1325 | // A builder state is unfortunately a little bigger than an NFA state, |
1326 | // since we really want to support adding things to a pre-existing state. |
1327 | // i.e., We use Vec<thing> instead of Box<[thing]>. So we end up using an |
1328 | // extra 8 bytes per state. Sad, but at least it gets freed once the NFA |
1329 | // is built. |
1330 | #[test] |
1331 | fn state_has_small_size() { |
1332 | #[cfg (target_pointer_width = "64" )] |
1333 | assert_eq!(32, core::mem::size_of::<State>()); |
1334 | #[cfg (target_pointer_width = "32" )] |
1335 | assert_eq!(16, core::mem::size_of::<State>()); |
1336 | } |
1337 | } |
1338 | |