1use core::mem;
2
3use alloc::{sync::Arc, vec, vec::Vec};
4
5use crate::{
6 nfa::thompson::{
7 error::BuildError,
8 nfa::{self, SparseTransitions, Transition, NFA},
9 },
10 util::{
11 look::{Look, LookMatcher},
12 primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID},
13 },
14};
15
16/// An intermediate NFA state used during construction.
17///
18/// During construction of an NFA, it is often convenient to work with states
19/// that are amenable to mutation and other carry more information than we
20/// otherwise need once an NFA has been built. This type represents those
21/// needs.
22///
23/// Once construction is finished, the builder will convert these states to a
24/// [`nfa::thompson::State`](crate::nfa::thompson::State). This conversion not
25/// only results in a simpler representation, but in some cases, entire classes
26/// of states are completely removed (such as [`State::Empty`]).
27#[derive(Clone, Debug, Eq, PartialEq)]
28enum State {
29 /// An empty state whose only purpose is to forward the automaton to
30 /// another state via an unconditional epsilon transition.
31 ///
32 /// Unconditional epsilon transitions are quite useful during the
33 /// construction of an NFA, as they permit the insertion of no-op
34 /// placeholders that make it easier to compose NFA sub-graphs. When
35 /// the Thompson NFA builder produces a final NFA, all unconditional
36 /// epsilon transitions are removed, and state identifiers are remapped
37 /// accordingly.
38 Empty {
39 /// The next state that this state should transition to.
40 next: StateID,
41 },
42 /// A state that only transitions to another state if the current input
43 /// byte is in a particular range of bytes.
44 ByteRange { trans: Transition },
45 /// A state with possibly many transitions, represented in a sparse
46 /// fashion. Transitions must be ordered lexicographically by input range
47 /// and be non-overlapping. As such, this may only be used when every
48 /// transition has equal priority. (In practice, this is only used for
49 /// encoding large UTF-8 automata.) In contrast, a `Union` state has each
50 /// alternate in order of priority. Priority is used to implement greedy
51 /// matching and also alternations themselves, e.g., `abc|a` where `abc`
52 /// has priority over `a`.
53 ///
54 /// To clarify, it is possible to remove `Sparse` and represent all things
55 /// that `Sparse` is used for via `Union`. But this creates a more bloated
56 /// NFA with more epsilon transitions than is necessary in the special case
57 /// of character classes.
58 Sparse { transitions: Vec<Transition> },
59 /// A conditional epsilon transition satisfied via some sort of
60 /// look-around.
61 Look { look: Look, next: StateID },
62 /// An empty state that records the start of a capture location. This is an
63 /// unconditional epsilon transition like `Empty`, except it can be used to
64 /// record position information for a capture group when using the NFA for
65 /// search.
66 CaptureStart {
67 /// The ID of the pattern that this capture was defined.
68 pattern_id: PatternID,
69 /// The capture group index that this capture state corresponds to.
70 /// The capture group index is always relative to its corresponding
71 /// pattern. Therefore, in the presence of multiple patterns, both the
72 /// pattern ID and the capture group index are required to uniquely
73 /// identify a capturing group.
74 group_index: SmallIndex,
75 /// The next state that this state should transition to.
76 next: StateID,
77 },
78 /// An empty state that records the end of a capture location. This is an
79 /// unconditional epsilon transition like `Empty`, except it can be used to
80 /// record position information for a capture group when using the NFA for
81 /// search.
82 CaptureEnd {
83 /// The ID of the pattern that this capture was defined.
84 pattern_id: PatternID,
85 /// The capture group index that this capture state corresponds to.
86 /// The capture group index is always relative to its corresponding
87 /// pattern. Therefore, in the presence of multiple patterns, both the
88 /// pattern ID and the capture group index are required to uniquely
89 /// identify a capturing group.
90 group_index: SmallIndex,
91 /// The next state that this state should transition to.
92 next: StateID,
93 },
94 /// An alternation such that there exists an epsilon transition to all
95 /// states in `alternates`, where matches found via earlier transitions
96 /// are preferred over later transitions.
97 Union { alternates: Vec<StateID> },
98 /// An alternation such that there exists an epsilon transition to all
99 /// states in `alternates`, where matches found via later transitions are
100 /// preferred over earlier transitions.
101 ///
102 /// This "reverse" state exists for convenience during compilation that
103 /// permits easy construction of non-greedy combinations of NFA states. At
104 /// the end of compilation, Union and UnionReverse states are merged into
105 /// one Union type of state, where the latter has its epsilon transitions
106 /// reversed to reflect the priority inversion.
107 ///
108 /// The "convenience" here arises from the fact that as new states are
109 /// added to the list of `alternates`, we would like that add operation
110 /// to be amortized constant time. But if we used a `Union`, we'd need to
111 /// prepend the state, which takes O(n) time. There are other approaches we
112 /// could use to solve this, but this seems simple enough.
113 UnionReverse { alternates: Vec<StateID> },
114 /// A state that cannot be transitioned out of. This is useful for cases
115 /// where you want to prevent matching from occurring. For example, if your
116 /// regex parser permits empty character classes, then one could choose a
117 /// `Fail` state to represent it.
118 Fail,
119 /// A match state. There is at most one such occurrence of this state in
120 /// an NFA for each pattern compiled into the NFA. At time of writing, a
121 /// match state is always produced for every pattern given, but in theory,
122 /// if a pattern can never lead to a match, then the match state could be
123 /// omitted.
124 ///
125 /// `pattern_id` refers to the ID of the pattern itself, which corresponds
126 /// to the pattern's index (starting at 0).
127 Match { pattern_id: PatternID },
128}
129
130impl State {
131 /// If this state is an unconditional epsilon transition, then this returns
132 /// the target of the transition.
133 fn goto(&self) -> Option<StateID> {
134 match *self {
135 State::Empty { next } => Some(next),
136 State::Union { ref alternates } if alternates.len() == 1 => {
137 Some(alternates[0])
138 }
139 State::UnionReverse { ref alternates }
140 if alternates.len() == 1 =>
141 {
142 Some(alternates[0])
143 }
144 _ => None,
145 }
146 }
147
148 /// Returns the heap memory usage, in bytes, of this state.
149 fn memory_usage(&self) -> usize {
150 match *self {
151 State::Empty { .. }
152 | State::ByteRange { .. }
153 | State::Look { .. }
154 | State::CaptureStart { .. }
155 | State::CaptureEnd { .. }
156 | State::Fail
157 | State::Match { .. } => 0,
158 State::Sparse { ref transitions } => {
159 transitions.len() * mem::size_of::<Transition>()
160 }
161 State::Union { ref alternates } => {
162 alternates.len() * mem::size_of::<StateID>()
163 }
164 State::UnionReverse { ref alternates } => {
165 alternates.len() * mem::size_of::<StateID>()
166 }
167 }
168 }
169}
170
171/// An abstraction for building Thompson NFAs by hand.
172///
173/// A builder is what a [`thompson::Compiler`](crate::nfa::thompson::Compiler)
174/// uses internally to translate a regex's high-level intermediate
175/// representation into an [`NFA`].
176///
177/// The primary function of this builder is to abstract away the internal
178/// representation of an NFA and make it difficult to produce NFAs are that
179/// internally invalid or inconsistent. This builder also provides a way to
180/// add "empty" states (which can be thought of as unconditional epsilon
181/// transitions), despite the fact that [`thompson::State`](nfa::State) does
182/// not have any "empty" representation. The advantage of "empty" states is
183/// that they make the code for constructing a Thompson NFA logically simpler.
184///
185/// Many of the routines on this builder may panic or return errors. Generally
186/// speaking, panics occur when an invalid sequence of method calls were made,
187/// where as an error occurs if things get too big. (Where "too big" might mean
188/// exhausting identifier space or using up too much heap memory in accordance
189/// with the configured [`size_limit`](Builder::set_size_limit).)
190///
191/// # Overview
192///
193/// ## Adding multiple patterns
194///
195/// Each pattern you add to an NFA should correspond to a pair of
196/// [`Builder::start_pattern`] and [`Builder::finish_pattern`] calls, with
197/// calls inbetween that add NFA states for that pattern. NFA states may be
198/// added without first calling `start_pattern`, with the exception of adding
199/// capturing states.
200///
201/// ## Adding NFA states
202///
203/// Here is a very brief overview of each of the methods that add NFA states.
204/// Every method adds a single state.
205///
206/// * [`add_empty`](Builder::add_empty): Add a state with a single
207/// unconditional epsilon transition to another state.
208/// * [`add_union`](Builder::add_union): Adds a state with unconditional
209/// epsilon transitions to two or more states, with earlier transitions
210/// preferred over later ones.
211/// * [`add_union_reverse`](Builder::add_union_reverse): Adds a state with
212/// unconditional epsilon transitions to two or more states, with later
213/// transitions preferred over earlier ones.
214/// * [`add_range`](Builder::add_range): Adds a state with a single transition
215/// to another state that can only be followed if the current input byte is
216/// within the range given.
217/// * [`add_sparse`](Builder::add_sparse): Adds a state with two or more
218/// range transitions to other states, where a transition is only followed
219/// if the current input byte is within one of the ranges. All transitions
220/// in this state have equal priority, and the corresponding ranges must be
221/// non-overlapping.
222/// * [`add_look`](Builder::add_look): Adds a state with a single *conditional*
223/// epsilon transition to another state, where the condition depends on a
224/// limited look-around property.
225/// * [`add_capture_start`](Builder::add_capture_start): Adds a state with
226/// a single unconditional epsilon transition that also instructs an NFA
227/// simulation to record the current input position to a specific location in
228/// memory. This is intended to represent the starting location of a capturing
229/// group.
230/// * [`add_capture_end`](Builder::add_capture_end): Adds a state with
231/// a single unconditional epsilon transition that also instructs an NFA
232/// simulation to record the current input position to a specific location in
233/// memory. This is intended to represent the ending location of a capturing
234/// group.
235/// * [`add_fail`](Builder::add_fail): Adds a state that never transitions to
236/// another state.
237/// * [`add_match`](Builder::add_match): Add a state that indicates a match has
238/// been found for a particular pattern. A match state is a final state with
239/// no outgoing transitions.
240///
241/// ## Setting transitions between NFA states
242///
243/// The [`Builder::patch`] method creates a transition from one state to the
244/// next. If the `from` state corresponds to a state that supports multiple
245/// outgoing transitions (such as "union"), then this adds the corresponding
246/// transition. Otherwise, it sets the single transition. (This routine panics
247/// if `from` corresponds to a state added by `add_sparse`, since sparse states
248/// need more specialized handling.)
249///
250/// # Example
251///
252/// This annotated example shows how to hand construct the regex `[a-z]+`
253/// (without an unanchored prefix).
254///
255/// ```
256/// use regex_automata::{
257/// nfa::thompson::{pikevm::PikeVM, Builder, Transition},
258/// util::primitives::StateID,
259/// Match,
260/// };
261///
262/// let mut builder = Builder::new();
263/// // Before adding NFA states for our pattern, we need to tell the builder
264/// // that we are starting the pattern.
265/// builder.start_pattern()?;
266/// // Since we use the Pike VM below for searching, we need to add capturing
267/// // states. If you're just going to build a DFA from the NFA, then capturing
268/// // states do not need to be added.
269/// let start = builder.add_capture_start(StateID::ZERO, 0, None)?;
270/// let range = builder.add_range(Transition {
271/// // We don't know the state ID of the 'next' state yet, so we just fill
272/// // in a dummy 'ZERO' value.
273/// start: b'a', end: b'z', next: StateID::ZERO,
274/// })?;
275/// // This state will point back to 'range', but also enable us to move ahead.
276/// // That is, this implements the '+' repetition operator. We add 'range' and
277/// // then 'end' below to this alternation.
278/// let alt = builder.add_union(vec![])?;
279/// // The final state before the match state, which serves to capture the
280/// // end location of the match.
281/// let end = builder.add_capture_end(StateID::ZERO, 0)?;
282/// // The match state for our pattern.
283/// let mat = builder.add_match()?;
284/// // Now we fill in the transitions between states.
285/// builder.patch(start, range)?;
286/// builder.patch(range, alt)?;
287/// // If we added 'end' before 'range', then we'd implement non-greedy
288/// // matching, i.e., '+?'.
289/// builder.patch(alt, range)?;
290/// builder.patch(alt, end)?;
291/// builder.patch(end, mat)?;
292/// // We must explicitly finish pattern and provide the starting state ID for
293/// // this particular pattern.
294/// builder.finish_pattern(start)?;
295/// // Finally, when we build the NFA, we provide the anchored and unanchored
296/// // starting state IDs. Since we didn't bother with an unanchored prefix
297/// // here, we only support anchored searching. Thus, both starting states are
298/// // the same.
299/// let nfa = builder.build(start, start)?;
300///
301/// // Now build a Pike VM from our NFA, and use it for searching. This shows
302/// // how we can use a regex engine without ever worrying about syntax!
303/// let re = PikeVM::new_from_nfa(nfa)?;
304/// let mut cache = re.create_cache();
305/// let mut caps = re.create_captures();
306/// let expected = Some(Match::must(0, 0..3));
307/// re.captures(&mut cache, "foo0", &mut caps);
308/// assert_eq!(expected, caps.get_match());
309///
310/// # Ok::<(), Box<dyn std::error::Error>>(())
311/// ```
312#[derive(Clone, Debug, Default)]
313pub struct Builder {
314 /// The ID of the pattern that we're currently building.
315 ///
316 /// Callers are required to set (and unset) this by calling
317 /// {start,finish}_pattern. Otherwise, most methods will panic.
318 pattern_id: Option<PatternID>,
319 /// A sequence of intermediate NFA states. Once a state is added to this
320 /// sequence, it is assigned a state ID equivalent to its index. Once a
321 /// state is added, it is still expected to be mutated, e.g., to set its
322 /// transition to a state that didn't exist at the time it was added.
323 states: Vec<State>,
324 /// The starting states for each individual pattern. Starting at any
325 /// of these states will result in only an anchored search for the
326 /// corresponding pattern. The vec is indexed by pattern ID. When the NFA
327 /// contains a single regex, then `start_pattern[0]` and `start_anchored`
328 /// are always equivalent.
329 start_pattern: Vec<StateID>,
330 /// A map from pattern ID to capture group index to name. (If no name
331 /// exists, then a None entry is present. Thus, all capturing groups are
332 /// present in this mapping.)
333 ///
334 /// The outer vec is indexed by pattern ID, while the inner vec is indexed
335 /// by capture index offset for the corresponding pattern.
336 ///
337 /// The first capture group for each pattern is always unnamed and is thus
338 /// always None.
339 captures: Vec<Vec<Option<Arc<str>>>>,
340 /// The combined memory used by each of the 'State's in 'states'. This
341 /// only includes heap usage by each state, and not the size of the state
342 /// itself. In other words, this tracks heap memory used that isn't
343 /// captured via `size_of::<State>() * states.len()`.
344 memory_states: usize,
345 /// Whether this NFA only matches UTF-8 and whether regex engines using
346 /// this NFA for searching should report empty matches that split a
347 /// codepoint.
348 utf8: bool,
349 /// Whether this NFA should be matched in reverse or not.
350 reverse: bool,
351 /// The matcher to use for look-around assertions.
352 look_matcher: LookMatcher,
353 /// A size limit to respect when building an NFA. If the total heap memory
354 /// of the intermediate NFA states exceeds (or would exceed) this amount,
355 /// then an error is returned.
356 size_limit: Option<usize>,
357}
358
359impl Builder {
360 /// Create a new builder for hand-assembling NFAs.
361 pub fn new() -> Builder {
362 Builder::default()
363 }
364
365 /// Clear this builder.
366 ///
367 /// Clearing removes all state associated with building an NFA, but does
368 /// not reset configuration (such as size limits and whether the NFA
369 /// should only match UTF-8). After clearing, the builder can be reused to
370 /// assemble an entirely new NFA.
371 pub fn clear(&mut self) {
372 self.pattern_id = None;
373 self.states.clear();
374 self.start_pattern.clear();
375 self.captures.clear();
376 self.memory_states = 0;
377 }
378
379 /// Assemble a [`NFA`] from the states added so far.
380 ///
381 /// After building an NFA, more states may be added and `build` may be
382 /// called again. To reuse a builder to produce an entirely new NFA from
383 /// scratch, call the [`clear`](Builder::clear) method first.
384 ///
385 /// `start_anchored` refers to the ID of the starting state that anchored
386 /// searches should use. That is, searches who matches are limited to the
387 /// starting position of the search.
388 ///
389 /// `start_unanchored` refers to the ID of the starting state that
390 /// unanchored searches should use. This permits searches to report matches
391 /// that start after the beginning of the search. In cases where unanchored
392 /// searches are not supported, the unanchored starting state ID must be
393 /// the same as the anchored starting state ID.
394 ///
395 /// # Errors
396 ///
397 /// This returns an error if there was a problem producing the final NFA.
398 /// In particular, this might include an error if the capturing groups
399 /// added to this builder violate any of the invariants documented on
400 /// [`GroupInfo`](crate::util::captures::GroupInfo).
401 ///
402 /// # Panics
403 ///
404 /// If `start_pattern` was called, then `finish_pattern` must be called
405 /// before `build`, otherwise this panics.
406 ///
407 /// This may panic for other invalid uses of a builder. For example, if
408 /// a "start capture" state was added without a corresponding "end capture"
409 /// state.
410 pub fn build(
411 &self,
412 start_anchored: StateID,
413 start_unanchored: StateID,
414 ) -> Result<NFA, BuildError> {
415 assert!(self.pattern_id.is_none(), "must call 'finish_pattern' first");
416 debug!(
417 "intermediate NFA compilation via builder is complete, \
418 intermediate NFA size: {} states, {} bytes on heap",
419 self.states.len(),
420 self.memory_usage(),
421 );
422
423 let mut nfa = nfa::Inner::default();
424 nfa.set_utf8(self.utf8);
425 nfa.set_reverse(self.reverse);
426 nfa.set_look_matcher(self.look_matcher.clone());
427 // A set of compiler internal state IDs that correspond to states
428 // that are exclusively epsilon transitions, i.e., goto instructions,
429 // combined with the state that they point to. This is used to
430 // record said states while transforming the compiler's internal NFA
431 // representation to the external form.
432 let mut empties = vec![];
433 // A map used to re-map state IDs when translating this builder's
434 // internal NFA state representation to the final NFA representation.
435 let mut remap = vec![];
436 remap.resize(self.states.len(), StateID::ZERO);
437
438 nfa.set_starts(start_anchored, start_unanchored, &self.start_pattern);
439 nfa.set_captures(&self.captures).map_err(BuildError::captures)?;
440 // The idea here is to convert our intermediate states to their final
441 // form. The only real complexity here is the process of converting
442 // transitions, which are expressed in terms of state IDs. The new
443 // set of states will be smaller because of partial epsilon removal,
444 // so the state IDs will not be the same.
445 for (sid, state) in self.states.iter().with_state_ids() {
446 match *state {
447 State::Empty { next } => {
448 // Since we're removing empty states, we need to handle
449 // them later since we don't yet know which new state this
450 // empty state will be mapped to.
451 empties.push((sid, next));
452 }
453 State::ByteRange { trans } => {
454 remap[sid] = nfa.add(nfa::State::ByteRange { trans });
455 }
456 State::Sparse { ref transitions } => {
457 remap[sid] = match transitions.len() {
458 0 => nfa.add(nfa::State::Fail),
459 1 => nfa.add(nfa::State::ByteRange {
460 trans: transitions[0],
461 }),
462 _ => {
463 let transitions =
464 transitions.to_vec().into_boxed_slice();
465 let sparse = SparseTransitions { transitions };
466 nfa.add(nfa::State::Sparse(sparse))
467 }
468 }
469 }
470 State::Look { look, next } => {
471 remap[sid] = nfa.add(nfa::State::Look { look, next });
472 }
473 State::CaptureStart { pattern_id, group_index, next } => {
474 // We can't remove this empty state because of the side
475 // effect of capturing an offset for this capture slot.
476 let slot = nfa
477 .group_info()
478 .slot(pattern_id, group_index.as_usize())
479 .expect("invalid capture index");
480 let slot =
481 SmallIndex::new(slot).expect("a small enough slot");
482 remap[sid] = nfa.add(nfa::State::Capture {
483 next,
484 pattern_id,
485 group_index,
486 slot,
487 });
488 }
489 State::CaptureEnd { pattern_id, group_index, next } => {
490 // We can't remove this empty state because of the side
491 // effect of capturing an offset for this capture slot.
492 // Also, this always succeeds because we check that all
493 // slot indices are valid for all capture indices when they
494 // are initially added.
495 let slot = nfa
496 .group_info()
497 .slot(pattern_id, group_index.as_usize())
498 .expect("invalid capture index")
499 .checked_add(1)
500 .unwrap();
501 let slot =
502 SmallIndex::new(slot).expect("a small enough slot");
503 remap[sid] = nfa.add(nfa::State::Capture {
504 next,
505 pattern_id,
506 group_index,
507 slot,
508 });
509 }
510 State::Union { ref alternates } => {
511 if alternates.is_empty() {
512 remap[sid] = nfa.add(nfa::State::Fail);
513 } else if alternates.len() == 1 {
514 empties.push((sid, alternates[0]));
515 remap[sid] = alternates[0];
516 } else if alternates.len() == 2 {
517 remap[sid] = nfa.add(nfa::State::BinaryUnion {
518 alt1: alternates[0],
519 alt2: alternates[1],
520 });
521 } else {
522 let alternates =
523 alternates.to_vec().into_boxed_slice();
524 remap[sid] = nfa.add(nfa::State::Union { alternates });
525 }
526 }
527 State::UnionReverse { ref alternates } => {
528 if alternates.is_empty() {
529 remap[sid] = nfa.add(nfa::State::Fail);
530 } else if alternates.len() == 1 {
531 empties.push((sid, alternates[0]));
532 remap[sid] = alternates[0];
533 } else if alternates.len() == 2 {
534 remap[sid] = nfa.add(nfa::State::BinaryUnion {
535 alt1: alternates[1],
536 alt2: alternates[0],
537 });
538 } else {
539 let mut alternates =
540 alternates.to_vec().into_boxed_slice();
541 alternates.reverse();
542 remap[sid] = nfa.add(nfa::State::Union { alternates });
543 }
544 }
545 State::Fail => {
546 remap[sid] = nfa.add(nfa::State::Fail);
547 }
548 State::Match { pattern_id } => {
549 remap[sid] = nfa.add(nfa::State::Match { pattern_id });
550 }
551 }
552 }
553 // Some of the new states still point to empty state IDs, so we need to
554 // follow each of them and remap the empty state IDs to their non-empty
555 // state IDs.
556 //
557 // We also keep track of which states we've already mapped. This helps
558 // avoid quadratic behavior in a long chain of empty states. For
559 // example, in 'a{0}{50000}'.
560 let mut remapped = vec![false; self.states.len()];
561 for &(empty_id, empty_next) in empties.iter() {
562 if remapped[empty_id] {
563 continue;
564 }
565 // empty states can point to other empty states, forming a chain.
566 // So we must follow the chain until the end, which must end at
567 // a non-empty state, and therefore, a state that is correctly
568 // remapped. We are guaranteed to terminate because our compiler
569 // never builds a loop among only empty states.
570 let mut new_next = empty_next;
571 while let Some(next) = self.states[new_next].goto() {
572 new_next = next;
573 }
574 remap[empty_id] = remap[new_next];
575 remapped[empty_id] = true;
576
577 // Now that we've remapped the main 'empty_id' above, we re-follow
578 // the chain from above and remap every empty state we found along
579 // the way to our ultimate non-empty target. We are careful to set
580 // 'remapped' to true for each such state. We thus will not need
581 // to re-compute this chain for any subsequent empty states in
582 // 'empties' that are part of this chain.
583 let mut next2 = empty_next;
584 while let Some(next) = self.states[next2].goto() {
585 remap[next2] = remap[new_next];
586 remapped[next2] = true;
587 next2 = next;
588 }
589 }
590 // Finally remap all of the state IDs.
591 nfa.remap(&remap);
592 let final_nfa = nfa.into_nfa();
593 debug!(
594 "NFA compilation via builder complete, \
595 final NFA size: {} states, {} bytes on heap, \
596 has empty? {:?}, utf8? {:?}",
597 final_nfa.states().len(),
598 final_nfa.memory_usage(),
599 final_nfa.has_empty(),
600 final_nfa.is_utf8(),
601 );
602 Ok(final_nfa)
603 }
604
605 /// Start the assembly of a pattern in this NFA.
606 ///
607 /// Upon success, this returns the identifier for the new pattern.
608 /// Identifiers start at `0` and are incremented by 1 for each new pattern.
609 ///
610 /// It is necessary to call this routine before adding capturing states.
611 /// Otherwise, any other NFA state may be added before starting a pattern.
612 ///
613 /// # Errors
614 ///
615 /// If the pattern identifier space is exhausted, then this returns an
616 /// error.
617 ///
618 /// # Panics
619 ///
620 /// If this is called while assembling another pattern (i.e., before
621 /// `finish_pattern` is called), then this panics.
622 pub fn start_pattern(&mut self) -> Result<PatternID, BuildError> {
623 assert!(self.pattern_id.is_none(), "must call 'finish_pattern' first");
624
625 let proposed = self.start_pattern.len();
626 let pid = PatternID::new(proposed)
627 .map_err(|_| BuildError::too_many_patterns(proposed))?;
628 self.pattern_id = Some(pid);
629 // This gets filled in when 'finish_pattern' is called.
630 self.start_pattern.push(StateID::ZERO);
631 Ok(pid)
632 }
633
634 /// Finish the assembly of a pattern in this NFA.
635 ///
636 /// Upon success, this returns the identifier for the new pattern.
637 /// Identifiers start at `0` and are incremented by 1 for each new
638 /// pattern. This is the same identifier returned by the corresponding
639 /// `start_pattern` call.
640 ///
641 /// Note that `start_pattern` and `finish_pattern` pairs cannot be
642 /// interleaved or nested. A correct `finish_pattern` call _always_
643 /// corresponds to the most recently called `start_pattern` routine.
644 ///
645 /// # Errors
646 ///
647 /// This currently never returns an error, but this is subject to change.
648 ///
649 /// # Panics
650 ///
651 /// If this is called without a corresponding `start_pattern` call, then
652 /// this panics.
653 pub fn finish_pattern(
654 &mut self,
655 start_id: StateID,
656 ) -> Result<PatternID, BuildError> {
657 let pid = self.current_pattern_id();
658 self.start_pattern[pid] = start_id;
659 self.pattern_id = None;
660 Ok(pid)
661 }
662
663 /// Returns the pattern identifier of the current pattern.
664 ///
665 /// # Panics
666 ///
667 /// If this doesn't occur after a `start_pattern` call and before the
668 /// corresponding `finish_pattern` call, then this panics.
669 pub fn current_pattern_id(&self) -> PatternID {
670 self.pattern_id.expect("must call 'start_pattern' first")
671 }
672
673 /// Returns the number of patterns added to this builder so far.
674 ///
675 /// This only includes patterns that have had `finish_pattern` called
676 /// for them.
677 pub fn pattern_len(&self) -> usize {
678 self.start_pattern.len()
679 }
680
681 /// Add an "empty" NFA state.
682 ///
683 /// An "empty" NFA state is a state with a single unconditional epsilon
684 /// transition to another NFA state. Such empty states are removed before
685 /// building the final [`NFA`] (which has no such "empty" states), but they
686 /// can be quite useful in the construction process of an NFA.
687 ///
688 /// # Errors
689 ///
690 /// This returns an error if the state identifier space is exhausted, or if
691 /// the configured heap size limit has been exceeded.
692 pub fn add_empty(&mut self) -> Result<StateID, BuildError> {
693 self.add(State::Empty { next: StateID::ZERO })
694 }
695
696 /// Add a "union" NFA state.
697 ///
698 /// A "union" NFA state that contains zero or more unconditional epsilon
699 /// transitions to other NFA states. The order of these transitions
700 /// reflects a priority order where earlier transitions are preferred over
701 /// later transitions.
702 ///
703 /// Callers may provide an empty set of alternates to this method call, and
704 /// then later add transitions via `patch`. At final build time, a "union"
705 /// state with no alternates is converted to a "fail" state, and a "union"
706 /// state with exactly one alternate is treated as if it were an "empty"
707 /// state.
708 ///
709 /// # Errors
710 ///
711 /// This returns an error if the state identifier space is exhausted, or if
712 /// the configured heap size limit has been exceeded.
713 pub fn add_union(
714 &mut self,
715 alternates: Vec<StateID>,
716 ) -> Result<StateID, BuildError> {
717 self.add(State::Union { alternates })
718 }
719
720 /// Add a "reverse union" NFA state.
721 ///
722 /// A "reverse union" NFA state contains zero or more unconditional epsilon
723 /// transitions to other NFA states. The order of these transitions
724 /// reflects a priority order where later transitions are preferred
725 /// over earlier transitions. This is an inverted priority order when
726 /// compared to `add_union`. This is useful, for example, for implementing
727 /// non-greedy repetition operators.
728 ///
729 /// Callers may provide an empty set of alternates to this method call, and
730 /// then later add transitions via `patch`. At final build time, a "reverse
731 /// union" state with no alternates is converted to a "fail" state, and a
732 /// "reverse union" state with exactly one alternate is treated as if it
733 /// were an "empty" state.
734 ///
735 /// # Errors
736 ///
737 /// This returns an error if the state identifier space is exhausted, or if
738 /// the configured heap size limit has been exceeded.
739 pub fn add_union_reverse(
740 &mut self,
741 alternates: Vec<StateID>,
742 ) -> Result<StateID, BuildError> {
743 self.add(State::UnionReverse { alternates })
744 }
745
746 /// Add a "range" NFA state.
747 ///
748 /// A "range" NFA state is a state with one outgoing transition to another
749 /// state, where that transition may only be followed if the current input
750 /// byte falls between a range of bytes given.
751 ///
752 /// # Errors
753 ///
754 /// This returns an error if the state identifier space is exhausted, or if
755 /// the configured heap size limit has been exceeded.
756 pub fn add_range(
757 &mut self,
758 trans: Transition,
759 ) -> Result<StateID, BuildError> {
760 self.add(State::ByteRange { trans })
761 }
762
763 /// Add a "sparse" NFA state.
764 ///
765 /// A "sparse" NFA state contains zero or more outgoing transitions, where
766 /// the transition to be followed (if any) is chosen based on whether the
767 /// current input byte falls in the range of one such transition. The
768 /// transitions given *must* be non-overlapping and in ascending order. (A
769 /// "sparse" state with no transitions is equivalent to a "fail" state.)
770 ///
771 /// A "sparse" state is like adding a "union" state and pointing it at a
772 /// bunch of "range" states, except that the different alternates have
773 /// equal priority.
774 ///
775 /// Note that a "sparse" state is the only state that cannot be patched.
776 /// This is because a "sparse" state has many transitions, each of which
777 /// may point to a different NFA state. Moreover, adding more such
778 /// transitions requires more than just an NFA state ID to point to. It
779 /// also requires a byte range. The `patch` routine does not support the
780 /// additional information required. Therefore, callers must ensure that
781 /// all outgoing transitions for this state are included when `add_sparse`
782 /// is called. There is no way to add more later.
783 ///
784 /// # Errors
785 ///
786 /// This returns an error if the state identifier space is exhausted, or if
787 /// the configured heap size limit has been exceeded.
788 ///
789 /// # Panics
790 ///
791 /// This routine _may_ panic if the transitions given overlap or are not
792 /// in ascending order.
793 pub fn add_sparse(
794 &mut self,
795 transitions: Vec<Transition>,
796 ) -> Result<StateID, BuildError> {
797 self.add(State::Sparse { transitions })
798 }
799
800 /// Add a "look" NFA state.
801 ///
802 /// A "look" NFA state corresponds to a state with exactly one
803 /// *conditional* epsilon transition to another NFA state. Namely, it
804 /// represents one of a small set of simplistic look-around operators.
805 ///
806 /// Callers may provide a "dummy" state ID (typically [`StateID::ZERO`]),
807 /// and then change it later with [`patch`](Builder::patch).
808 ///
809 /// # Errors
810 ///
811 /// This returns an error if the state identifier space is exhausted, or if
812 /// the configured heap size limit has been exceeded.
813 pub fn add_look(
814 &mut self,
815 next: StateID,
816 look: Look,
817 ) -> Result<StateID, BuildError> {
818 self.add(State::Look { look, next })
819 }
820
821 /// Add a "start capture" NFA state.
822 ///
823 /// A "start capture" NFA state corresponds to a state with exactly one
824 /// outgoing unconditional epsilon transition to another state. Unlike
825 /// "empty" states, a "start capture" state also carries with it an
826 /// instruction for saving the current position of input to a particular
827 /// location in memory. NFA simulations, like the Pike VM, may use this
828 /// information to report the match locations of capturing groups in a
829 /// regex pattern.
830 ///
831 /// If the corresponding capturing group has a name, then callers should
832 /// include it here.
833 ///
834 /// Callers may provide a "dummy" state ID (typically [`StateID::ZERO`]),
835 /// and then change it later with [`patch`](Builder::patch).
836 ///
837 /// Note that unlike `start_pattern`/`finish_pattern`, capturing start and
838 /// end states may be interleaved. Indeed, it is typical for many "start
839 /// capture" NFA states to appear before the first "end capture" state.
840 ///
841 /// # Errors
842 ///
843 /// This returns an error if the state identifier space is exhausted, or if
844 /// the configured heap size limit has been exceeded or if the given
845 /// capture index overflows `usize`.
846 ///
847 /// While the above are the only conditions in which this routine can
848 /// currently return an error, it is possible to call this method with an
849 /// inputs that results in the final `build()` step failing to produce an
850 /// NFA. For example, if one adds two distinct capturing groups with the
851 /// same name, then that will result in `build()` failing with an error.
852 ///
853 /// See the [`GroupInfo`](crate::util::captures::GroupInfo) type for
854 /// more information on what qualifies as valid capturing groups.
855 ///
856 /// # Example
857 ///
858 /// This example shows that an error occurs when one tries to add multiple
859 /// capturing groups with the same name to the same pattern.
860 ///
861 /// ```
862 /// use regex_automata::{
863 /// nfa::thompson::Builder,
864 /// util::primitives::StateID,
865 /// };
866 ///
867 /// let name = Some(std::sync::Arc::from("foo"));
868 /// let mut builder = Builder::new();
869 /// builder.start_pattern()?;
870 /// // 0th capture group should always be unnamed.
871 /// let start = builder.add_capture_start(StateID::ZERO, 0, None)?;
872 /// // OK
873 /// builder.add_capture_start(StateID::ZERO, 1, name.clone())?;
874 /// // This is not OK, but 'add_capture_start' still succeeds. We don't
875 /// // get an error until we call 'build' below. Without this call, the
876 /// // call to 'build' below would succeed.
877 /// builder.add_capture_start(StateID::ZERO, 2, name.clone())?;
878 /// // Finish our pattern so we can try to build the NFA.
879 /// builder.finish_pattern(start)?;
880 /// let result = builder.build(start, start);
881 /// assert!(result.is_err());
882 ///
883 /// # Ok::<(), Box<dyn std::error::Error>>(())
884 /// ```
885 ///
886 /// However, adding multiple capturing groups with the same name to
887 /// distinct patterns is okay:
888 ///
889 /// ```
890 /// use std::sync::Arc;
891 ///
892 /// use regex_automata::{
893 /// nfa::thompson::{pikevm::PikeVM, Builder, Transition},
894 /// util::{
895 /// captures::Captures,
896 /// primitives::{PatternID, StateID},
897 /// },
898 /// Span,
899 /// };
900 ///
901 /// // Hand-compile the patterns '(?P<foo>[a-z])' and '(?P<foo>[A-Z])'.
902 /// let mut builder = Builder::new();
903 /// // We compile them to support an unanchored search, which requires
904 /// // adding an implicit '(?s-u:.)*?' prefix before adding either pattern.
905 /// let unanchored_prefix = builder.add_union_reverse(vec![])?;
906 /// let any = builder.add_range(Transition {
907 /// start: b'\x00', end: b'\xFF', next: StateID::ZERO,
908 /// })?;
909 /// builder.patch(unanchored_prefix, any)?;
910 /// builder.patch(any, unanchored_prefix)?;
911 ///
912 /// // Compile an alternation that permits matching multiple patterns.
913 /// let alt = builder.add_union(vec![])?;
914 /// builder.patch(unanchored_prefix, alt)?;
915 ///
916 /// // Compile '(?P<foo>[a-z]+)'.
917 /// builder.start_pattern()?;
918 /// let start0 = builder.add_capture_start(StateID::ZERO, 0, None)?;
919 /// // N.B. 0th capture group must always be unnamed.
920 /// let foo_start0 = builder.add_capture_start(
921 /// StateID::ZERO, 1, Some(Arc::from("foo")),
922 /// )?;
923 /// let lowercase = builder.add_range(Transition {
924 /// start: b'a', end: b'z', next: StateID::ZERO,
925 /// })?;
926 /// let foo_end0 = builder.add_capture_end(StateID::ZERO, 1)?;
927 /// let end0 = builder.add_capture_end(StateID::ZERO, 0)?;
928 /// let match0 = builder.add_match()?;
929 /// builder.patch(start0, foo_start0)?;
930 /// builder.patch(foo_start0, lowercase)?;
931 /// builder.patch(lowercase, foo_end0)?;
932 /// builder.patch(foo_end0, end0)?;
933 /// builder.patch(end0, match0)?;
934 /// builder.finish_pattern(start0)?;
935 ///
936 /// // Compile '(?P<foo>[A-Z]+)'.
937 /// builder.start_pattern()?;
938 /// let start1 = builder.add_capture_start(StateID::ZERO, 0, None)?;
939 /// // N.B. 0th capture group must always be unnamed.
940 /// let foo_start1 = builder.add_capture_start(
941 /// StateID::ZERO, 1, Some(Arc::from("foo")),
942 /// )?;
943 /// let uppercase = builder.add_range(Transition {
944 /// start: b'A', end: b'Z', next: StateID::ZERO,
945 /// })?;
946 /// let foo_end1 = builder.add_capture_end(StateID::ZERO, 1)?;
947 /// let end1 = builder.add_capture_end(StateID::ZERO, 0)?;
948 /// let match1 = builder.add_match()?;
949 /// builder.patch(start1, foo_start1)?;
950 /// builder.patch(foo_start1, uppercase)?;
951 /// builder.patch(uppercase, foo_end1)?;
952 /// builder.patch(foo_end1, end1)?;
953 /// builder.patch(end1, match1)?;
954 /// builder.finish_pattern(start1)?;
955 ///
956 /// // Now add the patterns to our alternation that we started above.
957 /// builder.patch(alt, start0)?;
958 /// builder.patch(alt, start1)?;
959 ///
960 /// // Finally build the NFA. The first argument is the anchored starting
961 /// // state (the pattern alternation) where as the second is the
962 /// // unanchored starting state (the unanchored prefix).
963 /// let nfa = builder.build(alt, unanchored_prefix)?;
964 ///
965 /// // Now build a Pike VM from our NFA and access the 'foo' capture
966 /// // group regardless of which pattern matched, since it is defined
967 /// // for both patterns.
968 /// let vm = PikeVM::new_from_nfa(nfa)?;
969 /// let mut cache = vm.create_cache();
970 /// let caps: Vec<Captures> =
971 /// vm.captures_iter(&mut cache, "0123aAaAA").collect();
972 /// assert_eq!(5, caps.len());
973 ///
974 /// assert_eq!(Some(PatternID::must(0)), caps[0].pattern());
975 /// assert_eq!(Some(Span::from(4..5)), caps[0].get_group_by_name("foo"));
976 ///
977 /// assert_eq!(Some(PatternID::must(1)), caps[1].pattern());
978 /// assert_eq!(Some(Span::from(5..6)), caps[1].get_group_by_name("foo"));
979 ///
980 /// assert_eq!(Some(PatternID::must(0)), caps[2].pattern());
981 /// assert_eq!(Some(Span::from(6..7)), caps[2].get_group_by_name("foo"));
982 ///
983 /// assert_eq!(Some(PatternID::must(1)), caps[3].pattern());
984 /// assert_eq!(Some(Span::from(7..8)), caps[3].get_group_by_name("foo"));
985 ///
986 /// assert_eq!(Some(PatternID::must(1)), caps[4].pattern());
987 /// assert_eq!(Some(Span::from(8..9)), caps[4].get_group_by_name("foo"));
988 ///
989 /// # Ok::<(), Box<dyn std::error::Error>>(())
990 /// ```
991 pub fn add_capture_start(
992 &mut self,
993 next: StateID,
994 group_index: u32,
995 name: Option<Arc<str>>,
996 ) -> Result<StateID, BuildError> {
997 let pid = self.current_pattern_id();
998 let group_index = match SmallIndex::try_from(group_index) {
999 Err(_) => {
1000 return Err(BuildError::invalid_capture_index(group_index))
1001 }
1002 Ok(group_index) => group_index,
1003 };
1004 // Make sure we have space to insert our (pid,index)|-->name mapping.
1005 if pid.as_usize() >= self.captures.len() {
1006 for _ in 0..=(pid.as_usize() - self.captures.len()) {
1007 self.captures.push(vec![]);
1008 }
1009 }
1010 // In the case where 'group_index < self.captures[pid].len()', it means
1011 // that we are adding a duplicate capture group. This is somewhat
1012 // weird, but permissible because the capture group itself can be
1013 // repeated in the syntax. For example, '([a-z]){4}' will produce 4
1014 // capture groups. In practice, only the last will be set at search
1015 // time when a match occurs. For duplicates, we don't need to push
1016 // anything other than a CaptureStart NFA state.
1017 if group_index.as_usize() >= self.captures[pid].len() {
1018 // For discontiguous indices, push placeholders for earlier capture
1019 // groups that weren't explicitly added.
1020 for _ in 0..(group_index.as_usize() - self.captures[pid].len()) {
1021 self.captures[pid].push(None);
1022 }
1023 self.captures[pid].push(name);
1024 }
1025 self.add(State::CaptureStart { pattern_id: pid, group_index, next })
1026 }
1027
1028 /// Add a "end capture" NFA state.
1029 ///
1030 /// A "end capture" NFA state corresponds to a state with exactly one
1031 /// outgoing unconditional epsilon transition to another state. Unlike
1032 /// "empty" states, a "end capture" state also carries with it an
1033 /// instruction for saving the current position of input to a particular
1034 /// location in memory. NFA simulations, like the Pike VM, may use this
1035 /// information to report the match locations of capturing groups in a
1036 ///
1037 /// Callers may provide a "dummy" state ID (typically [`StateID::ZERO`]),
1038 /// and then change it later with [`patch`](Builder::patch).
1039 ///
1040 /// Note that unlike `start_pattern`/`finish_pattern`, capturing start and
1041 /// end states may be interleaved. Indeed, it is typical for many "start
1042 /// capture" NFA states to appear before the first "end capture" state.
1043 ///
1044 /// # Errors
1045 ///
1046 /// This returns an error if the state identifier space is exhausted, or if
1047 /// the configured heap size limit has been exceeded or if the given
1048 /// capture index overflows `usize`.
1049 ///
1050 /// While the above are the only conditions in which this routine can
1051 /// currently return an error, it is possible to call this method with an
1052 /// inputs that results in the final `build()` step failing to produce an
1053 /// NFA. For example, if one adds two distinct capturing groups with the
1054 /// same name, then that will result in `build()` failing with an error.
1055 ///
1056 /// See the [`GroupInfo`](crate::util::captures::GroupInfo) type for
1057 /// more information on what qualifies as valid capturing groups.
1058 pub fn add_capture_end(
1059 &mut self,
1060 next: StateID,
1061 group_index: u32,
1062 ) -> Result<StateID, BuildError> {
1063 let pid = self.current_pattern_id();
1064 let group_index = match SmallIndex::try_from(group_index) {
1065 Err(_) => {
1066 return Err(BuildError::invalid_capture_index(group_index))
1067 }
1068 Ok(group_index) => group_index,
1069 };
1070 self.add(State::CaptureEnd { pattern_id: pid, group_index, next })
1071 }
1072
1073 /// Adds a "fail" NFA state.
1074 ///
1075 /// A "fail" state is simply a state that has no outgoing transitions. It
1076 /// acts as a way to cause a search to stop without reporting a match.
1077 /// For example, one way to represent an NFA with zero patterns is with a
1078 /// single "fail" state.
1079 ///
1080 /// # Errors
1081 ///
1082 /// This returns an error if the state identifier space is exhausted, or if
1083 /// the configured heap size limit has been exceeded.
1084 pub fn add_fail(&mut self) -> Result<StateID, BuildError> {
1085 self.add(State::Fail)
1086 }
1087
1088 /// Adds a "match" NFA state.
1089 ///
1090 /// A "match" state has no outgoing transitions (just like a "fail"
1091 /// state), but it has special significance in that if a search enters
1092 /// this state, then a match has been found. The match state that is added
1093 /// automatically has the current pattern ID associated with it. This is
1094 /// used to report the matching pattern ID at search time.
1095 ///
1096 /// # Errors
1097 ///
1098 /// This returns an error if the state identifier space is exhausted, or if
1099 /// the configured heap size limit has been exceeded.
1100 ///
1101 /// # Panics
1102 ///
1103 /// This must be called after a `start_pattern` call but before the
1104 /// corresponding `finish_pattern` call. Otherwise, it panics.
1105 pub fn add_match(&mut self) -> Result<StateID, BuildError> {
1106 let pattern_id = self.current_pattern_id();
1107 let sid = self.add(State::Match { pattern_id })?;
1108 Ok(sid)
1109 }
1110
1111 /// The common implementation of "add a state." It handles the common
1112 /// error cases of state ID exhausting (by owning state ID allocation) and
1113 /// whether the size limit has been exceeded.
1114 fn add(&mut self, state: State) -> Result<StateID, BuildError> {
1115 let id = StateID::new(self.states.len())
1116 .map_err(|_| BuildError::too_many_states(self.states.len()))?;
1117 self.memory_states += state.memory_usage();
1118 self.states.push(state);
1119 self.check_size_limit()?;
1120 Ok(id)
1121 }
1122
1123 /// Add a transition from one state to another.
1124 ///
1125 /// This routine is called "patch" since it is very common to add the
1126 /// states you want, typically with "dummy" state ID transitions, and then
1127 /// "patch" in the real state IDs later. This is because you don't always
1128 /// know all of the necessary state IDs to add because they might not
1129 /// exist yet.
1130 ///
1131 /// # Errors
1132 ///
1133 /// This may error if patching leads to an increase in heap usage beyond
1134 /// the configured size limit. Heap usage only grows when patching adds a
1135 /// new transition (as in the case of a "union" state).
1136 ///
1137 /// # Panics
1138 ///
1139 /// This panics if `from` corresponds to a "sparse" state. When "sparse"
1140 /// states are added, there is no way to patch them after-the-fact. (If you
1141 /// have a use case where this would be helpful, please file an issue. It
1142 /// will likely require a new API.)
1143 pub fn patch(
1144 &mut self,
1145 from: StateID,
1146 to: StateID,
1147 ) -> Result<(), BuildError> {
1148 let old_memory_states = self.memory_states;
1149 match self.states[from] {
1150 State::Empty { ref mut next } => {
1151 *next = to;
1152 }
1153 State::ByteRange { ref mut trans } => {
1154 trans.next = to;
1155 }
1156 State::Sparse { .. } => {
1157 panic!("cannot patch from a sparse NFA state")
1158 }
1159 State::Look { ref mut next, .. } => {
1160 *next = to;
1161 }
1162 State::Union { ref mut alternates } => {
1163 alternates.push(to);
1164 self.memory_states += mem::size_of::<StateID>();
1165 }
1166 State::UnionReverse { ref mut alternates } => {
1167 alternates.push(to);
1168 self.memory_states += mem::size_of::<StateID>();
1169 }
1170 State::CaptureStart { ref mut next, .. } => {
1171 *next = to;
1172 }
1173 State::CaptureEnd { ref mut next, .. } => {
1174 *next = to;
1175 }
1176 State::Fail => {}
1177 State::Match { .. } => {}
1178 }
1179 if old_memory_states != self.memory_states {
1180 self.check_size_limit()?;
1181 }
1182 Ok(())
1183 }
1184
1185 /// Set whether the NFA produced by this builder should only match UTF-8.
1186 ///
1187 /// This should be set when both of the following are true:
1188 ///
1189 /// 1. The caller guarantees that the NFA created by this build will only
1190 /// report non-empty matches with spans that are valid UTF-8.
1191 /// 2. The caller desires regex engines using this NFA to avoid reporting
1192 /// empty matches with a span that splits a valid UTF-8 encoded codepoint.
1193 ///
1194 /// Property (1) is not checked. Instead, this requires the caller to
1195 /// promise that it is true. Property (2) corresponds to the behavior of
1196 /// regex engines using the NFA created by this builder. Namely, there
1197 /// is no way in the NFA's graph itself to say that empty matches found
1198 /// by, for example, the regex `a*` will fall on valid UTF-8 boundaries.
1199 /// Instead, this option is used to communicate the UTF-8 semantic to regex
1200 /// engines that will typically implement it as a post-processing step by
1201 /// filtering out empty matches that don't fall on UTF-8 boundaries.
1202 ///
1203 /// If you're building an NFA from an HIR (and not using a
1204 /// [`thompson::Compiler`](crate::nfa::thompson::Compiler)), then you can
1205 /// use the [`syntax::Config::utf8`](crate::util::syntax::Config::utf8)
1206 /// option to guarantee that if the HIR detects a non-empty match, then it
1207 /// is guaranteed to be valid UTF-8.
1208 ///
1209 /// Note that property (2) does *not* specify the behavior of executing
1210 /// a search on a haystack that is not valid UTF-8. Therefore, if you're
1211 /// *not* running this NFA on strings that are guaranteed to be valid
1212 /// UTF-8, you almost certainly do not want to enable this option.
1213 /// Similarly, if you are running the NFA on strings that *are* guaranteed
1214 /// to be valid UTF-8, then you almost certainly want to enable this option
1215 /// unless you can guarantee that your NFA will never produce a zero-width
1216 /// match.
1217 ///
1218 /// It is disabled by default.
1219 pub fn set_utf8(&mut self, yes: bool) {
1220 self.utf8 = yes;
1221 }
1222
1223 /// Returns whether UTF-8 mode is enabled for this builder.
1224 ///
1225 /// See [`Builder::set_utf8`] for more details about what "UTF-8 mode" is.
1226 pub fn get_utf8(&self) -> bool {
1227 self.utf8
1228 }
1229
1230 /// Sets whether the NFA produced by this builder should be matched in
1231 /// reverse or not. Generally speaking, when enabled, the NFA produced
1232 /// should be matched by moving backwards through a haystack, from a higher
1233 /// memory address to a lower memory address.
1234 ///
1235 /// See also [`NFA::is_reverse`] for more details.
1236 ///
1237 /// This is disabled by default, which means NFAs are by default matched
1238 /// in the forward direction.
1239 pub fn set_reverse(&mut self, yes: bool) {
1240 self.reverse = yes;
1241 }
1242
1243 /// Returns whether reverse mode is enabled for this builder.
1244 ///
1245 /// See [`Builder::set_reverse`] for more details about what "reverse mode"
1246 /// is.
1247 pub fn get_reverse(&self) -> bool {
1248 self.reverse
1249 }
1250
1251 /// Sets the look-around matcher that should be used for the resulting NFA.
1252 ///
1253 /// A look-around matcher can be used to configure how look-around
1254 /// assertions are matched. For example, a matcher might carry
1255 /// configuration that changes the line terminator used for `(?m:^)` and
1256 /// `(?m:$)` assertions.
1257 pub fn set_look_matcher(&mut self, m: LookMatcher) {
1258 self.look_matcher = m;
1259 }
1260
1261 /// Returns the look-around matcher used for this builder.
1262 ///
1263 /// If a matcher was not explicitly set, then `LookMatcher::default()` is
1264 /// returned.
1265 pub fn get_look_matcher(&self) -> &LookMatcher {
1266 &self.look_matcher
1267 }
1268
1269 /// Set the size limit on this builder.
1270 ///
1271 /// Setting the size limit will also check whether the NFA built so far
1272 /// fits within the given size limit. If it doesn't, then an error is
1273 /// returned.
1274 ///
1275 /// By default, there is no configured size limit.
1276 pub fn set_size_limit(
1277 &mut self,
1278 limit: Option<usize>,
1279 ) -> Result<(), BuildError> {
1280 self.size_limit = limit;
1281 self.check_size_limit()
1282 }
1283
1284 /// Return the currently configured size limit.
1285 ///
1286 /// By default, this returns `None`, which corresponds to no configured
1287 /// size limit.
1288 pub fn get_size_limit(&self) -> Option<usize> {
1289 self.size_limit
1290 }
1291
1292 /// Returns the heap memory usage, in bytes, used by the NFA states added
1293 /// so far.
1294 ///
1295 /// Note that this is an approximation of how big the final NFA will be.
1296 /// In practice, the final NFA will likely be a bit smaller because of
1297 /// its simpler state representation. (For example, using things like
1298 /// `Box<[StateID]>` instead of `Vec<StateID>`.)
1299 pub fn memory_usage(&self) -> usize {
1300 self.states.len() * mem::size_of::<State>() + self.memory_states
1301 }
1302
1303 fn check_size_limit(&self) -> Result<(), BuildError> {
1304 if let Some(limit) = self.size_limit {
1305 if self.memory_usage() > limit {
1306 return Err(BuildError::exceeded_size_limit(limit));
1307 }
1308 }
1309 Ok(())
1310 }
1311}
1312
1313#[cfg(test)]
1314mod tests {
1315 use super::*;
1316
1317 // This asserts that a builder state doesn't have its size changed. It is
1318 // *really* easy to accidentally increase the size, and thus potentially
1319 // dramatically increase the memory usage of NFA builder.
1320 //
1321 // This assert doesn't mean we absolutely cannot increase the size of a
1322 // builder state. We can. It's just here to make sure we do it knowingly
1323 // and intentionally.
1324 //
1325 // A builder state is unfortunately a little bigger than an NFA state,
1326 // since we really want to support adding things to a pre-existing state.
1327 // i.e., We use Vec<thing> instead of Box<[thing]>. So we end up using an
1328 // extra 8 bytes per state. Sad, but at least it gets freed once the NFA
1329 // is built.
1330 #[test]
1331 fn state_has_small_size() {
1332 #[cfg(target_pointer_width = "64")]
1333 assert_eq!(32, core::mem::size_of::<State>());
1334 #[cfg(target_pointer_width = "32")]
1335 assert_eq!(16, core::mem::size_of::<State>());
1336 }
1337}
1338