builder.rs source code [crates/regex-automata-0.4.9/src/nfa/thompson/builder.rs]

1	use core::mem;
2
3	use alloc::{sync::Arc, vec, vec::Vec};
4
5	use crate::{
6	nfa::thompson::{
7	error::BuildError,
8	nfa::{self, SparseTransitions, Transition, NFA},
9	},
10	util::{
11	look::{Look, LookMatcher},
12	primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID},
13	},
14	};
15
16	/// An intermediate NFA state used during construction.
17	///
18	/// During construction of an NFA, it is often convenient to work with states
19	/// that are amenable to mutation and other carry more information than we
20	/// otherwise need once an NFA has been built. This type represents those
21	/// needs.
22	///
23	/// Once construction is finished, the builder will convert these states to a
24	/// [`nfa::thompson::State`](crate::nfa::thompson::State). This conversion not
25	/// only results in a simpler representation, but in some cases, entire classes
26	/// of states are completely removed (such as [`State::Empty`]).
27	#[derive(Clone, Debug, Eq, PartialEq)]
28	enum State {
29	/// An empty state whose only purpose is to forward the automaton to
30	/// another state via an unconditional epsilon transition.
31	///
32	/// Unconditional epsilon transitions are quite useful during the
33	/// construction of an NFA, as they permit the insertion of no-op
34	/// placeholders that make it easier to compose NFA sub-graphs. When
35	/// the Thompson NFA builder produces a final NFA, all unconditional
36	/// epsilon transitions are removed, and state identifiers are remapped
37	/// accordingly.
38	Empty {
39	/// The next state that this state should transition to.
40	next: StateID,
41	},
42	/// A state that only transitions to another state if the current input
43	/// byte is in a particular range of bytes.
44	ByteRange { trans: Transition },
45	/// A state with possibly many transitions, represented in a sparse
46	/// fashion. Transitions must be ordered lexicographically by input range
47	/// and be non-overlapping. As such, this may only be used when every
48	/// transition has equal priority. (In practice, this is only used for
49	/// encoding large UTF-8 automata.) In contrast, a `Union` state has each
50	/// alternate in order of priority. Priority is used to implement greedy
51	/// matching and also alternations themselves, e.g., `abc\|a` where `abc`
52	/// has priority over `a`.
53	///
54	/// To clarify, it is possible to remove `Sparse` and represent all things
55	/// that `Sparse` is used for via `Union`. But this creates a more bloated
56	/// NFA with more epsilon transitions than is necessary in the special case
57	/// of character classes.
58	Sparse { transitions: Vec<Transition> },
59	/// A conditional epsilon transition satisfied via some sort of
60	/// look-around.
61	Look { look: Look, next: StateID },
62	/// An empty state that records the start of a capture location. This is an
63	/// unconditional epsilon transition like `Empty`, except it can be used to
64	/// record position information for a capture group when using the NFA for
65	/// search.
66	CaptureStart {
67	/// The ID of the pattern that this capture was defined.
68	pattern_id: PatternID,
69	/// The capture group index that this capture state corresponds to.
70	/// The capture group index is always relative to its corresponding
71	/// pattern. Therefore, in the presence of multiple patterns, both the
72	/// pattern ID and the capture group index are required to uniquely
73	/// identify a capturing group.
74	group_index: SmallIndex,
75	/// The next state that this state should transition to.
76	next: StateID,
77	},
78	/// An empty state that records the end of a capture location. This is an
79	/// unconditional epsilon transition like `Empty`, except it can be used to
80	/// record position information for a capture group when using the NFA for
81	/// search.
82	CaptureEnd {
83	/// The ID of the pattern that this capture was defined.
84	pattern_id: PatternID,
85	/// The capture group index that this capture state corresponds to.
86	/// The capture group index is always relative to its corresponding
87	/// pattern. Therefore, in the presence of multiple patterns, both the
88	/// pattern ID and the capture group index are required to uniquely
89	/// identify a capturing group.
90	group_index: SmallIndex,
91	/// The next state that this state should transition to.
92	next: StateID,
93	},
94	/// An alternation such that there exists an epsilon transition to all
95	/// states in `alternates`, where matches found via earlier transitions
96	/// are preferred over later transitions.
97	Union { alternates: Vec<StateID> },
98	/// An alternation such that there exists an epsilon transition to all
99	/// states in `alternates`, where matches found via later transitions are
100	/// preferred over earlier transitions.
101	///
102	/// This "reverse" state exists for convenience during compilation that
103	/// permits easy construction of non-greedy combinations of NFA states. At
104	/// the end of compilation, Union and UnionReverse states are merged into
105	/// one Union type of state, where the latter has its epsilon transitions
106	/// reversed to reflect the priority inversion.
107	///
108	/// The "convenience" here arises from the fact that as new states are
109	/// added to the list of `alternates`, we would like that add operation
110	/// to be amortized constant time. But if we used a `Union`, we'd need to
111	/// prepend the state, which takes O(n) time. There are other approaches we
112	/// could use to solve this, but this seems simple enough.
113	UnionReverse { alternates: Vec<StateID> },
114	/// A state that cannot be transitioned out of. This is useful for cases
115	/// where you want to prevent matching from occurring. For example, if your
116	/// regex parser permits empty character classes, then one could choose a
117	/// `Fail` state to represent it.
118	Fail,
119	/// A match state. There is at most one such occurrence of this state in
120	/// an NFA for each pattern compiled into the NFA. At time of writing, a
121	/// match state is always produced for every pattern given, but in theory,
122	/// if a pattern can never lead to a match, then the match state could be
123	/// omitted.
124	///
125	/// `pattern_id` refers to the ID of the pattern itself, which corresponds
126	/// to the pattern's index (starting at 0).
127	Match { pattern_id: PatternID },
128	}
129
130	impl State {
131	/// If this state is an unconditional epsilon transition, then this returns
132	/// the target of the transition.
133	fn goto(&self) -> Option<StateID> {
134	match *self {
135	State::Empty { next } => Some(next),
136	State::Union { ref alternates } if alternates.len() == `1` => {
137	Some(alternates[`0`])
138	}
139	State::UnionReverse { ref alternates }
140	if alternates.len() == `1` =>
141	{
142	Some(alternates[`0`])
143	}
144	_ => None,
145	}
146	}
147
148	/// Returns the heap memory usage, in bytes, of this state.
149	fn memory_usage(&self) -> usize {
150	match *self {
151	State::Empty { .. }
152	\| State::ByteRange { .. }
153	\| State::Look { .. }
154	\| State::CaptureStart { .. }
155	\| State::CaptureEnd { .. }
156	\| State::Fail
157	\| State::Match { .. } => `0`,
158	State::Sparse { ref transitions } => {
159	transitions.len() * mem::size_of::<Transition>()
160	}
161	State::Union { ref alternates } => {
162	alternates.len() * mem::size_of::<StateID>()
163	}
164	State::UnionReverse { ref alternates } => {
165	alternates.len() * mem::size_of::<StateID>()
166	}
167	}
168	}
169	}
170
171	/// An abstraction for building Thompson NFAs by hand.
172	///
173	/// A builder is what a [`thompson::Compiler`](crate::nfa::thompson::Compiler)
174	/// uses internally to translate a regex's high-level intermediate
175	/// representation into an [`NFA`].
176	///
177	/// The primary function of this builder is to abstract away the internal
178	/// representation of an NFA and make it difficult to produce NFAs are that
179	/// internally invalid or inconsistent. This builder also provides a way to
180	/// add "empty" states (which can be thought of as unconditional epsilon
181	/// transitions), despite the fact that [`thompson::State`](nfa::State) does
182	/// not have any "empty" representation. The advantage of "empty" states is
183	/// that they make the code for constructing a Thompson NFA logically simpler.
184	///
185	/// Many of the routines on this builder may panic or return errors. Generally
186	/// speaking, panics occur when an invalid sequence of method calls were made,
187	/// where as an error occurs if things get too big. (Where "too big" might mean
188	/// exhausting identifier space or using up too much heap memory in accordance
189	/// with the configured [`size_limit`](Builder::set_size_limit).)
190	///
191	/// # Overview
192	///
193	/// ## Adding multiple patterns
194	///
195	/// Each pattern you add to an NFA should correspond to a pair of
196	/// [`Builder::start_pattern`] and [`Builder::finish_pattern`] calls, with
197	/// calls inbetween that add NFA states for that pattern. NFA states may be
198	/// added without first calling `start_pattern`, with the exception of adding
199	/// capturing states.
200	///
201	/// ## Adding NFA states
202	///
203	/// Here is a very brief overview of each of the methods that add NFA states.
204	/// Every method adds a single state.
205	///
206	/// * [`add_empty`](Builder::add_empty): Add a state with a single
207	/// unconditional epsilon transition to another state.
208	/// * [`add_union`](Builder::add_union): Adds a state with unconditional
209	/// epsilon transitions to two or more states, with earlier transitions
210	/// preferred over later ones.
211	/// * [`add_union_reverse`](Builder::add_union_reverse): Adds a state with
212	/// unconditional epsilon transitions to two or more states, with later
213	/// transitions preferred over earlier ones.
214	/// * [`add_range`](Builder::add_range): Adds a state with a single transition
215	/// to another state that can only be followed if the current input byte is
216	/// within the range given.
217	/// * [`add_sparse`](Builder::add_sparse): Adds a state with two or more
218	/// range transitions to other states, where a transition is only followed
219	/// if the current input byte is within one of the ranges. All transitions
220	/// in this state have equal priority, and the corresponding ranges must be
221	/// non-overlapping.
222	/// * [`add_look`](Builder::add_look): Adds a state with a single conditional
223	/// epsilon transition to another state, where the condition depends on a
224	/// limited look-around property.
225	/// * [`add_capture_start`](Builder::add_capture_start): Adds a state with
226	/// a single unconditional epsilon transition that also instructs an NFA
227	/// simulation to record the current input position to a specific location in
228	/// memory. This is intended to represent the starting location of a capturing
229	/// group.
230	/// * [`add_capture_end`](Builder::add_capture_end): Adds a state with
231	/// a single unconditional epsilon transition that also instructs an NFA
232	/// simulation to record the current input position to a specific location in
233	/// memory. This is intended to represent the ending location of a capturing
234	/// group.
235	/// * [`add_fail`](Builder::add_fail): Adds a state that never transitions to
236	/// another state.
237	/// * [`add_match`](Builder::add_match): Add a state that indicates a match has
238	/// been found for a particular pattern. A match state is a final state with
239	/// no outgoing transitions.
240	///
241	/// ## Setting transitions between NFA states
242	///
243	/// The [`Builder::patch`] method creates a transition from one state to the
244	/// next. If the `from` state corresponds to a state that supports multiple
245	/// outgoing transitions (such as "union"), then this adds the corresponding
246	/// transition. Otherwise, it sets the single transition. (This routine panics
247	/// if `from` corresponds to a state added by `add_sparse`, since sparse states
248	/// need more specialized handling.)
249	///
250	/// # Example
251	///
252	/// This annotated example shows how to hand construct the regex `[a-z]+`
253	/// (without an unanchored prefix).
254	///
255	/// ```
256	/// use regex_automata::{
257	/// nfa::thompson::{pikevm::PikeVM, Builder, Transition},
258	/// util::primitives::StateID,
259	/// Match,
260	/// };
261	///
262	/// let mut builder = Builder::new();
263	/// // Before adding NFA states for our pattern, we need to tell the builder
264	/// // that we are starting the pattern.
265	/// builder.start_pattern()?;
266	/// // Since we use the Pike VM below for searching, we need to add capturing
267	/// // states. If you're just going to build a DFA from the NFA, then capturing
268	/// // states do not need to be added.
269	/// let start = builder.add_capture_start(StateID::ZERO, `0`, None)?;
270	/// let range = builder.add_range(Transition {
271	/// // We don't know the state ID of the 'next' state yet, so we just fill
272	/// // in a dummy 'ZERO' value.
273	/// start: b'a', end: b'z', next: StateID::ZERO,
274	/// })?;
275	/// // This state will point back to 'range', but also enable us to move ahead.
276	/// // That is, this implements the '+' repetition operator. We add 'range' and
277	/// // then 'end' below to this alternation.
278	/// let alt = builder.add_union(vec![])?;
279	/// // The final state before the match state, which serves to capture the
280	/// // end location of the match.
281	/// let end = builder.add_capture_end(StateID::ZERO, `0`)?;
282	/// // The match state for our pattern.
283	/// let mat = builder.add_match()?;
284	/// // Now we fill in the transitions between states.
285	/// builder.patch(start, range)?;
286	/// builder.patch(range, alt)?;
287	/// // If we added 'end' before 'range', then we'd implement non-greedy
288	/// // matching, i.e., '+?'.
289	/// builder.patch(alt, range)?;
290	/// builder.patch(alt, end)?;
291	/// builder.patch(end, mat)?;
292	/// // We must explicitly finish pattern and provide the starting state ID for
293	/// // this particular pattern.
294	/// builder.finish_pattern(start)?;
295	/// // Finally, when we build the NFA, we provide the anchored and unanchored
296	/// // starting state IDs. Since we didn't bother with an unanchored prefix
297	/// // here, we only support anchored searching. Thus, both starting states are
298	/// // the same.
299	/// let nfa = builder.build(start, start)?;
300	///
301	/// // Now build a Pike VM from our NFA, and use it for searching. This shows
302	/// // how we can use a regex engine without ever worrying about syntax!
303	/// let re = PikeVM::new_from_nfa(nfa)?;
304	/// let mut cache = re.create_cache();
305	/// let mut caps = re.create_captures();
306	/// let expected = Some(Match::must(`0`, `0`..`3`));
307	/// re.captures(&mut cache, "foo0", &mut caps);
308	/// assert_eq!(expected, caps.get_match());
309	///
310	/// # Ok::<(), Box<dyn std::error::Error>>(())
311	/// ```
312	#[derive(Clone, Debug, Default)]
313	pub struct Builder {
314	/// The ID of the pattern that we're currently building.
315	///
316	/// Callers are required to set (and unset) this by calling
317	/// {start,finish}_pattern. Otherwise, most methods will panic.
318	pattern_id: Option<PatternID>,
319	/// A sequence of intermediate NFA states. Once a state is added to this
320	/// sequence, it is assigned a state ID equivalent to its index. Once a
321	/// state is added, it is still expected to be mutated, e.g., to set its
322	/// transition to a state that didn't exist at the time it was added.
323	states: Vec<State>,
324	/// The starting states for each individual pattern. Starting at any
325	/// of these states will result in only an anchored search for the
326	/// corresponding pattern. The vec is indexed by pattern ID. When the NFA
327	/// contains a single regex, then `start_pattern[0]` and `start_anchored`
328	/// are always equivalent.
329	start_pattern: Vec<StateID>,
330	/// A map from pattern ID to capture group index to name. (If no name
331	/// exists, then a None entry is present. Thus, all capturing groups are
332	/// present in this mapping.)
333	///
334	/// The outer vec is indexed by pattern ID, while the inner vec is indexed
335	/// by capture index offset for the corresponding pattern.
336	///
337	/// The first capture group for each pattern is always unnamed and is thus
338	/// always None.
339	captures: Vec<Vec<Option<Arc<str>>>>,
340	/// The combined memory used by each of the 'State's in 'states'. This
341	/// only includes heap usage by each state, and not the size of the state
342	/// itself. In other words, this tracks heap memory used that isn't
343	/// captured via `size_of::<State>() states.len()`.*
344	memory_states: usize,
345	/// Whether this NFA only matches UTF-8 and whether regex engines using
346	/// this NFA for searching should report empty matches that split a
347	/// codepoint.
348	utf8: bool,
349	/// Whether this NFA should be matched in reverse or not.
350	reverse: bool,
351	/// The matcher to use for look-around assertions.
352	look_matcher: LookMatcher,
353	/// A size limit to respect when building an NFA. If the total heap memory
354	/// of the intermediate NFA states exceeds (or would exceed) this amount,
355	/// then an error is returned.
356	size_limit: Option<usize>,
357	}
358
359	impl Builder {
360	/// Create a new builder for hand-assembling NFAs.
361	pub fn new() -> Builder {
362	Builder::default()
363	}
364
365	/// Clear this builder.
366	///
367	/// Clearing removes all state associated with building an NFA, but does
368	/// not reset configuration (such as size limits and whether the NFA
369	/// should only match UTF-8). After clearing, the builder can be reused to
370	/// assemble an entirely new NFA.
371	pub fn clear(&mut self) {
372	self.pattern_id = None;
373	self.states.clear();
374	self.start_pattern.clear();
375	self.captures.clear();
376	self.memory_states = `0`;
377	}
378
379	/// Assemble a [`NFA`] from the states added so far.
380	///
381	/// After building an NFA, more states may be added and `build` may be
382	/// called again. To reuse a builder to produce an entirely new NFA from
383	/// scratch, call the [`clear`](Builder::clear) method first.
384	///
385	/// `start_anchored` refers to the ID of the starting state that anchored
386	/// searches should use. That is, searches who matches are limited to the
387	/// starting position of the search.
388	///
389	/// `start_unanchored` refers to the ID of the starting state that
390	/// unanchored searches should use. This permits searches to report matches
391	/// that start after the beginning of the search. In cases where unanchored
392	/// searches are not supported, the unanchored starting state ID must be
393	/// the same as the anchored starting state ID.
394	///
395	/// # Errors
396	///
397	/// This returns an error if there was a problem producing the final NFA.
398	/// In particular, this might include an error if the capturing groups
399	/// added to this builder violate any of the invariants documented on
400	/// [`GroupInfo`](crate::util::captures::GroupInfo).
401	///
402	/// # Panics
403	///
404	/// If `start_pattern` was called, then `finish_pattern` must be called
405	/// before `build`, otherwise this panics.
406	///
407	/// This may panic for other invalid uses of a builder. For example, if
408	/// a "start capture" state was added without a corresponding "end capture"
409	/// state.
410	pub fn build(
411	&self,
412	start_anchored: StateID,
413	start_unanchored: StateID,
414	) -> Result<NFA, BuildError> {
415	assert!(self.pattern_id.is_none(), "must call 'finish_pattern' first");
416	debug!(
417	"intermediate NFA compilation via builder is complete, \
418	intermediate NFA size: {} states, {} bytes on heap",
419	self.states.len(),
420	self.memory_usage(),
421	);
422
423	let mut nfa = nfa::Inner::default();
424	nfa.set_utf8(self.utf8);
425	nfa.set_reverse(self.reverse);
426	nfa.set_look_matcher(self.look_matcher.clone());
427	// A set of compiler internal state IDs that correspond to states
428	// that are exclusively epsilon transitions, i.e., goto instructions,
429	// combined with the state that they point to. This is used to
430	// record said states while transforming the compiler's internal NFA
431	// representation to the external form.
432	let mut empties = vec![];
433	// A map used to re-map state IDs when translating this builder's
434	// internal NFA state representation to the final NFA representation.
435	let mut remap = vec![];
436	remap.resize(self.states.len(), StateID::ZERO);
437
438	nfa.set_starts(start_anchored, start_unanchored, &self.start_pattern);
439	nfa.set_captures(&self.captures).map_err(BuildError::captures)?;
440	// The idea here is to convert our intermediate states to their final
441	// form. The only real complexity here is the process of converting
442	// transitions, which are expressed in terms of state IDs. The new
443	// set of states will be smaller because of partial epsilon removal,
444	// so the state IDs will not be the same.
445	for (sid, state) in self.states.iter().with_state_ids() {
446	match *state {
447	State::Empty { next } => {
448	// Since we're removing empty states, we need to handle
449	// them later since we don't yet know which new state this
450	// empty state will be mapped to.
451	empties.push((sid, next));
452	}
453	State::ByteRange { trans } => {
454	remap[sid] = nfa.add(nfa::State::ByteRange { trans });
455	}
456	State::Sparse { ref transitions } => {
457	remap[sid] = match transitions.len() {
458	`0` => nfa.add(nfa::State::Fail),
459	`1` => nfa.add(nfa::State::ByteRange {
460	trans: transitions[`0`],
461	}),
462	_ => {
463	let transitions =
464	transitions.to_vec().into_boxed_slice();
465	let sparse = SparseTransitions { transitions };
466	nfa.add(nfa::State::Sparse(sparse))
467	}
468	}
469	}
470	State::Look { look, next } => {
471	remap[sid] = nfa.add(nfa::State::Look { look, next });
472	}
473	State::CaptureStart { pattern_id, group_index, next } => {
474	// We can't remove this empty state because of the side
475	// effect of capturing an offset for this capture slot.
476	let slot = nfa
477	.group_info()
478	.slot(pattern_id, group_index.as_usize())
479	.expect("invalid capture index");
480	let slot =
481	SmallIndex::new(slot).expect("a small enough slot");
482	remap[sid] = nfa.add(nfa::State::Capture {
483	next,
484	pattern_id,
485	group_index,
486	slot,
487	});
488	}
489	State::CaptureEnd { pattern_id, group_index, next } => {
490	// We can't remove this empty state because of the side
491	// effect of capturing an offset for this capture slot.
492	// Also, this always succeeds because we check that all
493	// slot indices are valid for all capture indices when they
494	// are initially added.
495	let slot = nfa
496	.group_info()
497	.slot(pattern_id, group_index.as_usize())
498	.expect("invalid capture index")
499	.checked_add(`1`)
500	.unwrap();
501	let slot =
502	SmallIndex::new(slot).expect("a small enough slot");
503	remap[sid] = nfa.add(nfa::State::Capture {
504	next,
505	pattern_id,
506	group_index,
507	slot,
508	});
509	}
510	State::Union { ref alternates } => {
511	if alternates.is_empty() {
512	remap[sid] = nfa.add(nfa::State::Fail);
513	} else if alternates.len() == `1` {
514	empties.push((sid, alternates[`0`]));
515	remap[sid] = alternates[`0`];
516	} else if alternates.len() == `2` {
517	remap[sid] = nfa.add(nfa::State::BinaryUnion {
518	alt1: alternates[`0`],
519	alt2: alternates[`1`],
520	});
521	} else {
522	let alternates =
523	alternates.to_vec().into_boxed_slice();
524	remap[sid] = nfa.add(nfa::State::Union { alternates });
525	}
526	}
527	State::UnionReverse { ref alternates } => {
528	if alternates.is_empty() {
529	remap[sid] = nfa.add(nfa::State::Fail);
530	} else if alternates.len() == `1` {
531	empties.push((sid, alternates[`0`]));
532	remap[sid] = alternates[`0`];
533	} else if alternates.len() == `2` {
534	remap[sid] = nfa.add(nfa::State::BinaryUnion {
535	alt1: alternates[`1`],
536	alt2: alternates[`0`],
537	});
538	} else {
539	let mut alternates =
540	alternates.to_vec().into_boxed_slice();
541	alternates.reverse();
542	remap[sid] = nfa.add(nfa::State::Union { alternates });
543	}
544	}
545	State::Fail => {
546	remap[sid] = nfa.add(nfa::State::Fail);
547	}
548	State::Match { pattern_id } => {
549	remap[sid] = nfa.add(nfa::State::Match { pattern_id });
550	}
551	}
552	}
553	// Some of the new states still point to empty state IDs, so we need to
554	// follow each of them and remap the empty state IDs to their non-empty
555	// state IDs.
556	//
557	// We also keep track of which states we've already mapped. This helps
558	// avoid quadratic behavior in a long chain of empty states. For
559	// example, in 'a{0}{50000}'.
560	let mut remapped = vec![`false`; self.states.len()];
561	for &(empty_id, empty_next) in empties.iter() {
562	if remapped[empty_id] {
563	continue;
564	}
565	// empty states can point to other empty states, forming a chain.
566	// So we must follow the chain until the end, which must end at
567	// a non-empty state, and therefore, a state that is correctly
568	// remapped. We are guaranteed to terminate because our compiler
569	// never builds a loop among only empty states.
570	let mut new_next = empty_next;
571	while let Some(next) = self.states[new_next].goto() {
572	new_next = next;
573	}
574	remap[empty_id] = remap[new_next];
575	remapped[empty_id] = `true`;
576
577	// Now that we've remapped the main 'empty_id' above, we re-follow
578	// the chain from above and remap every empty state we found along
579	// the way to our ultimate non-empty target. We are careful to set
580	// 'remapped' to true for each such state. We thus will not need
581	// to re-compute this chain for any subsequent empty states in
582	// 'empties' that are part of this chain.
583	let mut next2 = empty_next;
584	while let Some(next) = self.states[next2].goto() {
585	remap[next2] = remap[new_next];
586	remapped[next2] = `true`;
587	next2 = next;
588	}
589	}
590	// Finally remap all of the state IDs.
591	nfa.remap(&remap);
592	let final_nfa = nfa.into_nfa();
593	debug!(
594	"NFA compilation via builder complete, \
595	final NFA size: {} states, {} bytes on heap, \
596	has empty? {:?}, utf8? {:?}",
597	final_nfa.states().len(),
598	final_nfa.memory_usage(),
599	final_nfa.has_empty(),
600	final_nfa.is_utf8(),
601	);
602	Ok(final_nfa)
603	}
604
605	/// Start the assembly of a pattern in this NFA.
606	///
607	/// Upon success, this returns the identifier for the new pattern.
608	/// Identifiers start at `0` and are incremented by 1 for each new pattern.
609	///
610	/// It is necessary to call this routine before adding capturing states.
611	/// Otherwise, any other NFA state may be added before starting a pattern.
612	///
613	/// # Errors
614	///
615	/// If the pattern identifier space is exhausted, then this returns an
616	/// error.
617	///
618	/// # Panics
619	///
620	/// If this is called while assembling another pattern (i.e., before
621	/// `finish_pattern` is called), then this panics.
622	pub fn start_pattern(&mut self) -> Result<PatternID, BuildError> {
623	assert!(self.pattern_id.is_none(), "must call 'finish_pattern' first");
624
625	let proposed = self.start_pattern.len();
626	let pid = PatternID::new(proposed)
627	.map_err(\|_\| BuildError::too_many_patterns(proposed))?;
628	self.pattern_id = Some(pid);
629	// This gets filled in when 'finish_pattern' is called.
630	self.start_pattern.push(StateID::ZERO);
631	Ok(pid)
632	}
633
634	/// Finish the assembly of a pattern in this NFA.
635	///
636	/// Upon success, this returns the identifier for the new pattern.
637	/// Identifiers start at `0` and are incremented by 1 for each new
638	/// pattern. This is the same identifier returned by the corresponding
639	/// `start_pattern` call.
640	///
641	/// Note that `start_pattern` and `finish_pattern` pairs cannot be
642	/// interleaved or nested. A correct `finish_pattern` call _always_
643	/// corresponds to the most recently called `start_pattern` routine.
644	///
645	/// # Errors
646	///
647	/// This currently never returns an error, but this is subject to change.
648	///
649	/// # Panics
650	///
651	/// If this is called without a corresponding `start_pattern` call, then
652	/// this panics.
653	pub fn finish_pattern(
654	&mut self,
655	start_id: StateID,
656	) -> Result<PatternID, BuildError> {
657	let pid = self.current_pattern_id();
658	self.start_pattern[pid] = start_id;
659	self.pattern_id = None;
660	Ok(pid)
661	}
662
663	/// Returns the pattern identifier of the current pattern.
664	///
665	/// # Panics
666	///
667	/// If this doesn't occur after a `start_pattern` call and before the
668	/// corresponding `finish_pattern` call, then this panics.
669	pub fn current_pattern_id(&self) -> PatternID {
670	self.pattern_id.expect("must call 'start_pattern' first")
671	}
672
673	/// Returns the number of patterns added to this builder so far.
674	///
675	/// This only includes patterns that have had `finish_pattern` called
676	/// for them.
677	pub fn pattern_len(&self) -> usize {
678	self.start_pattern.len()
679	}
680
681	/// Add an "empty" NFA state.
682	///
683	/// An "empty" NFA state is a state with a single unconditional epsilon
684	/// transition to another NFA state. Such empty states are removed before
685	/// building the final [`NFA`] (which has no such "empty" states), but they
686	/// can be quite useful in the construction process of an NFA.
687	///
688	/// # Errors
689	///
690	/// This returns an error if the state identifier space is exhausted, or if
691	/// the configured heap size limit has been exceeded.
692	pub fn add_empty(&mut self) -> Result<StateID, BuildError> {
693	self.add(State::Empty { next: StateID::ZERO })
694	}
695
696	/// Add a "union" NFA state.
697	///
698	/// A "union" NFA state that contains zero or more unconditional epsilon
699	/// transitions to other NFA states. The order of these transitions
700	/// reflects a priority order where earlier transitions are preferred over
701	/// later transitions.
702	///
703	/// Callers may provide an empty set of alternates to this method call, and
704	/// then later add transitions via `patch`. At final build time, a "union"
705	/// state with no alternates is converted to a "fail" state, and a "union"
706	/// state with exactly one alternate is treated as if it were an "empty"
707	/// state.
708	///
709	/// # Errors
710	///
711	/// This returns an error if the state identifier space is exhausted, or if
712	/// the configured heap size limit has been exceeded.
713	pub fn add_union(
714	&mut self,
715	alternates: Vec<StateID>,
716	) -> Result<StateID, BuildError> {
717	self.add(State::Union { alternates })
718	}
719
720	/// Add a "reverse union" NFA state.
721	///
722	/// A "reverse union" NFA state contains zero or more unconditional epsilon
723	/// transitions to other NFA states. The order of these transitions
724	/// reflects a priority order where later transitions are preferred
725	/// over earlier transitions. This is an inverted priority order when
726	/// compared to `add_union`. This is useful, for example, for implementing
727	/// non-greedy repetition operators.
728	///
729	/// Callers may provide an empty set of alternates to this method call, and
730	/// then later add transitions via `patch`. At final build time, a "reverse
731	/// union" state with no alternates is converted to a "fail" state, and a
732	/// "reverse union" state with exactly one alternate is treated as if it
733	/// were an "empty" state.
734	///
735	/// # Errors
736	///
737	/// This returns an error if the state identifier space is exhausted, or if
738	/// the configured heap size limit has been exceeded.
739	pub fn add_union_reverse(
740	&mut self,
741	alternates: Vec<StateID>,
742	) -> Result<StateID, BuildError> {
743	self.add(State::UnionReverse { alternates })
744	}
745
746	/// Add a "range" NFA state.
747	///
748	/// A "range" NFA state is a state with one outgoing transition to another
749	/// state, where that transition may only be followed if the current input
750	/// byte falls between a range of bytes given.
751	///
752	/// # Errors
753	///
754	/// This returns an error if the state identifier space is exhausted, or if
755	/// the configured heap size limit has been exceeded.
756	pub fn add_range(
757	&mut self,
758	trans: Transition,
759	) -> Result<StateID, BuildError> {
760	self.add(State::ByteRange { trans })
761	}
762
763	/// Add a "sparse" NFA state.
764	///
765	/// A "sparse" NFA state contains zero or more outgoing transitions, where
766	/// the transition to be followed (if any) is chosen based on whether the
767	/// current input byte falls in the range of one such transition. The
768	/// transitions given must* be non-overlapping and in ascending order. (A*
769	/// "sparse" state with no transitions is equivalent to a "fail" state.)
770	///
771	/// A "sparse" state is like adding a "union" state and pointing it at a
772	/// bunch of "range" states, except that the different alternates have
773	/// equal priority.
774	///
775	/// Note that a "sparse" state is the only state that cannot be patched.
776	/// This is because a "sparse" state has many transitions, each of which
777	/// may point to a different NFA state. Moreover, adding more such
778	/// transitions requires more than just an NFA state ID to point to. It
779	/// also requires a byte range. The `patch` routine does not support the
780	/// additional information required. Therefore, callers must ensure that
781	/// all outgoing transitions for this state are included when `add_sparse`
782	/// is called. There is no way to add more later.
783	///
784	/// # Errors
785	///
786	/// This returns an error if the state identifier space is exhausted, or if
787	/// the configured heap size limit has been exceeded.
788	///
789	/// # Panics
790	///
791	/// This routine _may_ panic if the transitions given overlap or are not
792	/// in ascending order.
793	pub fn add_sparse(
794	&mut self,
795	transitions: Vec<Transition>,
796	) -> Result<StateID, BuildError> {
797	self.add(State::Sparse { transitions })
798	}
799
800	/// Add a "look" NFA state.
801	///
802	/// A "look" NFA state corresponds to a state with exactly one
803	/// conditional* epsilon transition to another NFA state. Namely, it*
804	/// represents one of a small set of simplistic look-around operators.
805	///
806	/// Callers may provide a "dummy" state ID (typically [`StateID::ZERO`]),
807	/// and then change it later with [`patch`](Builder::patch).
808	///
809	/// # Errors
810	///
811	/// This returns an error if the state identifier space is exhausted, or if
812	/// the configured heap size limit has been exceeded.
813	pub fn add_look(
814	&mut self,
815	next: StateID,
816	look: Look,
817	) -> Result<StateID, BuildError> {
818	self.add(State::Look { look, next })
819	}
820
821	/// Add a "start capture" NFA state.
822	///
823	/// A "start capture" NFA state corresponds to a state with exactly one
824	/// outgoing unconditional epsilon transition to another state. Unlike
825	/// "empty" states, a "start capture" state also carries with it an
826	/// instruction for saving the current position of input to a particular
827	/// location in memory. NFA simulations, like the Pike VM, may use this
828	/// information to report the match locations of capturing groups in a
829	/// regex pattern.
830	///
831	/// If the corresponding capturing group has a name, then callers should
832	/// include it here.
833	///
834	/// Callers may provide a "dummy" state ID (typically [`StateID::ZERO`]),
835	/// and then change it later with [`patch`](Builder::patch).
836	///
837	/// Note that unlike `start_pattern`/`finish_pattern`, capturing start and
838	/// end states may be interleaved. Indeed, it is typical for many "start
839	/// capture" NFA states to appear before the first "end capture" state.
840	///
841	/// # Errors
842	///
843	/// This returns an error if the state identifier space is exhausted, or if
844	/// the configured heap size limit has been exceeded or if the given
845	/// capture index overflows `usize`.
846	///
847	/// While the above are the only conditions in which this routine can
848	/// currently return an error, it is possible to call this method with an
849	/// inputs that results in the final `build()` step failing to produce an
850	/// NFA. For example, if one adds two distinct capturing groups with the
851	/// same name, then that will result in `build()` failing with an error.
852	///
853	/// See the [`GroupInfo`](crate::util::captures::GroupInfo) type for
854	/// more information on what qualifies as valid capturing groups.
855	///
856	/// # Example
857	///
858	/// This example shows that an error occurs when one tries to add multiple
859	/// capturing groups with the same name to the same pattern.
860	///
861	/// ```
862	/// use regex_automata::{
863	/// nfa::thompson::Builder,
864	/// util::primitives::StateID,
865	/// };
866	///
867	/// let name = Some(std::sync::Arc::from("foo"));
868	/// let mut builder = Builder::new();
869	/// builder.start_pattern()?;
870	/// // 0th capture group should always be unnamed.
871	/// let start = builder.add_capture_start(StateID::ZERO, `0`, None)?;
872	/// // OK
873	/// builder.add_capture_start(StateID::ZERO, `1`, name.clone())?;
874	/// // This is not OK, but 'add_capture_start' still succeeds. We don't
875	/// // get an error until we call 'build' below. Without this call, the
876	/// // call to 'build' below would succeed.
877	/// builder.add_capture_start(StateID::ZERO, `2`, name.clone())?;
878	/// // Finish our pattern so we can try to build the NFA.
879	/// builder.finish_pattern(start)?;
880	/// let result = builder.build(start, start);
881	/// assert!(result.is_err());
882	///
883	/// # Ok::<(), Box<dyn std::error::Error>>(())
884	/// ```
885	///
886	/// However, adding multiple capturing groups with the same name to
887	/// distinct patterns is okay:
888	///
889	/// ```
890	/// use std::sync::Arc;
891	///
892	/// use regex_automata::{
893	/// nfa::thompson::{pikevm::PikeVM, Builder, Transition},
894	/// util::{
895	/// captures::Captures,
896	/// primitives::{PatternID, StateID},
897	/// },
898	/// Span,
899	/// };
900	///
901	/// // Hand-compile the patterns '(?P<foo>[a-z])' and '(?P<foo>[A-Z])'.
902	/// let mut builder = Builder::new();
903	/// // We compile them to support an unanchored search, which requires
904	/// // adding an implicit '(?s-u:.)?' prefix before adding either pattern.*
905	/// let unanchored_prefix = builder.add_union_reverse(vec![])?;
906	/// let any = builder.add_range(Transition {
907	/// start: b'`\x00`', end: b'`\xFF`', next: StateID::ZERO,
908	/// })?;
909	/// builder.patch(unanchored_prefix, any)?;
910	/// builder.patch(any, unanchored_prefix)?;
911	///
912	/// // Compile an alternation that permits matching multiple patterns.
913	/// let alt = builder.add_union(vec![])?;
914	/// builder.patch(unanchored_prefix, alt)?;
915	///
916	/// // Compile '(?P<foo>[a-z]+)'.
917	/// builder.start_pattern()?;
918	/// let start0 = builder.add_capture_start(StateID::ZERO, `0`, None)?;
919	/// // N.B. 0th capture group must always be unnamed.
920	/// let foo_start0 = builder.add_capture_start(
921	/// StateID::ZERO, `1`, Some(Arc::from("foo")),
922	/// )?;
923	/// let lowercase = builder.add_range(Transition {
924	/// start: b'a', end: b'z', next: StateID::ZERO,
925	/// })?;
926	/// let foo_end0 = builder.add_capture_end(StateID::ZERO, `1`)?;
927	/// let end0 = builder.add_capture_end(StateID::ZERO, `0`)?;
928	/// let match0 = builder.add_match()?;
929	/// builder.patch(start0, foo_start0)?;
930	/// builder.patch(foo_start0, lowercase)?;
931	/// builder.patch(lowercase, foo_end0)?;
932	/// builder.patch(foo_end0, end0)?;
933	/// builder.patch(end0, match0)?;
934	/// builder.finish_pattern(start0)?;
935	///
936	/// // Compile '(?P<foo>[A-Z]+)'.
937	/// builder.start_pattern()?;
938	/// let start1 = builder.add_capture_start(StateID::ZERO, `0`, None)?;
939	/// // N.B. 0th capture group must always be unnamed.
940	/// let foo_start1 = builder.add_capture_start(
941	/// StateID::ZERO, `1`, Some(Arc::from("foo")),
942	/// )?;
943	/// let uppercase = builder.add_range(Transition {
944	/// start: b'A', end: b'Z', next: StateID::ZERO,
945	/// })?;
946	/// let foo_end1 = builder.add_capture_end(StateID::ZERO, `1`)?;
947	/// let end1 = builder.add_capture_end(StateID::ZERO, `0`)?;
948	/// let match1 = builder.add_match()?;
949	/// builder.patch(start1, foo_start1)?;
950	/// builder.patch(foo_start1, uppercase)?;
951	/// builder.patch(uppercase, foo_end1)?;
952	/// builder.patch(foo_end1, end1)?;
953	/// builder.patch(end1, match1)?;
954	/// builder.finish_pattern(start1)?;
955	///
956	/// // Now add the patterns to our alternation that we started above.
957	/// builder.patch(alt, start0)?;
958	/// builder.patch(alt, start1)?;
959	///
960	/// // Finally build the NFA. The first argument is the anchored starting
961	/// // state (the pattern alternation) where as the second is the
962	/// // unanchored starting state (the unanchored prefix).
963	/// let nfa = builder.build(alt, unanchored_prefix)?;
964	///
965	/// // Now build a Pike VM from our NFA and access the 'foo' capture
966	/// // group regardless of which pattern matched, since it is defined
967	/// // for both patterns.
968	/// let vm = PikeVM::new_from_nfa(nfa)?;
969	/// let mut cache = vm.create_cache();
970	/// let caps: Vec<Captures> =
971	/// vm.captures_iter(&mut cache, "0123aAaAA").collect();
972	/// assert_eq!(`5`, caps.len());
973	///
974	/// assert_eq!(Some(PatternID::must(`0`)), caps[`0`].pattern());
975	/// assert_eq!(Some(Span::from(`4`..`5`)), caps[`0`].get_group_by_name("foo"));
976	///
977	/// assert_eq!(Some(PatternID::must(`1`)), caps[`1`].pattern());
978	/// assert_eq!(Some(Span::from(`5`..`6`)), caps[`1`].get_group_by_name("foo"));
979	///
980	/// assert_eq!(Some(PatternID::must(`0`)), caps[`2`].pattern());
981	/// assert_eq!(Some(Span::from(`6`..`7`)), caps[`2`].get_group_by_name("foo"));
982	///
983	/// assert_eq!(Some(PatternID::must(`1`)), caps[`3`].pattern());
984	/// assert_eq!(Some(Span::from(`7`..`8`)), caps[`3`].get_group_by_name("foo"));
985	///
986	/// assert_eq!(Some(PatternID::must(`1`)), caps[`4`].pattern());
987	/// assert_eq!(Some(Span::from(`8`..`9`)), caps[`4`].get_group_by_name("foo"));
988	///
989	/// # Ok::<(), Box<dyn std::error::Error>>(())
990	/// ```
991	pub fn add_capture_start(
992	&mut self,
993	next: StateID,
994	group_index: u32,
995	name: Option<Arc<str>>,
996	) -> Result<StateID, BuildError> {
997	let pid = self.current_pattern_id();
998	let group_index = match SmallIndex::try_from(group_index) {
999	Err(_) => {
1000	return Err(BuildError::invalid_capture_index(group_index))
1001	}
1002	Ok(group_index) => group_index,
1003	};
1004	// Make sure we have space to insert our (pid,index)\|-->name mapping.
1005	if pid.as_usize() >= self.captures.len() {
1006	for _ in `0`..=(pid.as_usize() - self.captures.len()) {
1007	self.captures.push(vec![]);
1008	}
1009	}
1010	// In the case where 'group_index < self.captures[pid].len()', it means
1011	// that we are adding a duplicate capture group. This is somewhat
1012	// weird, but permissible because the capture group itself can be
1013	// repeated in the syntax. For example, '([a-z]){4}' will produce 4
1014	// capture groups. In practice, only the last will be set at search
1015	// time when a match occurs. For duplicates, we don't need to push
1016	// anything other than a CaptureStart NFA state.
1017	if group_index.as_usize() >= self.captures[pid].len() {
1018	// For discontiguous indices, push placeholders for earlier capture
1019	// groups that weren't explicitly added.
1020	for _ in `0`..(group_index.as_usize() - self.captures[pid].len()) {
1021	self.captures[pid].push(None);
1022	}
1023	self.captures[pid].push(name);
1024	}
1025	self.add(State::CaptureStart { pattern_id: pid, group_index, next })
1026	}
1027
1028	/// Add a "end capture" NFA state.
1029	///
1030	/// A "end capture" NFA state corresponds to a state with exactly one
1031	/// outgoing unconditional epsilon transition to another state. Unlike
1032	/// "empty" states, a "end capture" state also carries with it an
1033	/// instruction for saving the current position of input to a particular
1034	/// location in memory. NFA simulations, like the Pike VM, may use this
1035	/// information to report the match locations of capturing groups in a
1036	///
1037	/// Callers may provide a "dummy" state ID (typically [`StateID::ZERO`]),
1038	/// and then change it later with [`patch`](Builder::patch).
1039	///
1040	/// Note that unlike `start_pattern`/`finish_pattern`, capturing start and
1041	/// end states may be interleaved. Indeed, it is typical for many "start
1042	/// capture" NFA states to appear before the first "end capture" state.
1043	///
1044	/// # Errors
1045	///
1046	/// This returns an error if the state identifier space is exhausted, or if
1047	/// the configured heap size limit has been exceeded or if the given
1048	/// capture index overflows `usize`.
1049	///
1050	/// While the above are the only conditions in which this routine can
1051	/// currently return an error, it is possible to call this method with an
1052	/// inputs that results in the final `build()` step failing to produce an
1053	/// NFA. For example, if one adds two distinct capturing groups with the
1054	/// same name, then that will result in `build()` failing with an error.
1055	///
1056	/// See the [`GroupInfo`](crate::util::captures::GroupInfo) type for
1057	/// more information on what qualifies as valid capturing groups.
1058	pub fn add_capture_end(
1059	&mut self,
1060	next: StateID,
1061	group_index: u32,
1062	) -> Result<StateID, BuildError> {
1063	let pid = self.current_pattern_id();
1064	let group_index = match SmallIndex::try_from(group_index) {
1065	Err(_) => {
1066	return Err(BuildError::invalid_capture_index(group_index))
1067	}
1068	Ok(group_index) => group_index,
1069	};
1070	self.add(State::CaptureEnd { pattern_id: pid, group_index, next })
1071	}
1072
1073	/// Adds a "fail" NFA state.
1074	///
1075	/// A "fail" state is simply a state that has no outgoing transitions. It
1076	/// acts as a way to cause a search to stop without reporting a match.
1077	/// For example, one way to represent an NFA with zero patterns is with a
1078	/// single "fail" state.
1079	///
1080	/// # Errors
1081	///
1082	/// This returns an error if the state identifier space is exhausted, or if
1083	/// the configured heap size limit has been exceeded.
1084	pub fn add_fail(&mut self) -> Result<StateID, BuildError> {
1085	self.add(State::Fail)
1086	}
1087
1088	/// Adds a "match" NFA state.
1089	///
1090	/// A "match" state has no outgoing transitions (just like a "fail"
1091	/// state), but it has special significance in that if a search enters
1092	/// this state, then a match has been found. The match state that is added
1093	/// automatically has the current pattern ID associated with it. This is
1094	/// used to report the matching pattern ID at search time.
1095	///
1096	/// # Errors
1097	///
1098	/// This returns an error if the state identifier space is exhausted, or if
1099	/// the configured heap size limit has been exceeded.
1100	///
1101	/// # Panics
1102	///
1103	/// This must be called after a `start_pattern` call but before the
1104	/// corresponding `finish_pattern` call. Otherwise, it panics.
1105	pub fn add_match(&mut self) -> Result<StateID, BuildError> {
1106	let pattern_id = self.current_pattern_id();
1107	let sid = self.add(State::Match { pattern_id })?;
1108	Ok(sid)
1109	}
1110
1111	/// The common implementation of "add a state." It handles the common
1112	/// error cases of state ID exhausting (by owning state ID allocation) and
1113	/// whether the size limit has been exceeded.
1114	fn add(&mut self, state: State) -> Result<StateID, BuildError> {
1115	let id = StateID::new(self.states.len())
1116	.map_err(\|_\| BuildError::too_many_states(self.states.len()))?;
1117	self.memory_states += state.memory_usage();
1118	self.states.push(state);
1119	self.check_size_limit()?;
1120	Ok(id)
1121	}
1122
1123	/// Add a transition from one state to another.
1124	///
1125	/// This routine is called "patch" since it is very common to add the
1126	/// states you want, typically with "dummy" state ID transitions, and then
1127	/// "patch" in the real state IDs later. This is because you don't always
1128	/// know all of the necessary state IDs to add because they might not
1129	/// exist yet.
1130	///
1131	/// # Errors
1132	///
1133	/// This may error if patching leads to an increase in heap usage beyond
1134	/// the configured size limit. Heap usage only grows when patching adds a
1135	/// new transition (as in the case of a "union" state).
1136	///
1137	/// # Panics
1138	///
1139	/// This panics if `from` corresponds to a "sparse" state. When "sparse"
1140	/// states are added, there is no way to patch them after-the-fact. (If you
1141	/// have a use case where this would be helpful, please file an issue. It
1142	/// will likely require a new API.)
1143	pub fn patch(
1144	&mut self,
1145	from: StateID,
1146	to: StateID,
1147	) -> Result<(), BuildError> {
1148	let old_memory_states = self.memory_states;
1149	match self.states[from] {
1150	State::Empty { ref mut next } => {
1151	*next = to;
1152	}
1153	State::ByteRange { ref mut trans } => {
1154	trans.next = to;
1155	}
1156	State::Sparse { .. } => {
1157	panic!("cannot patch from a sparse NFA state")
1158	}
1159	State::Look { ref mut next, .. } => {
1160	*next = to;
1161	}
1162	State::Union { ref mut alternates } => {
1163	alternates.push(to);
1164	self.memory_states += mem::size_of::<StateID>();
1165	}
1166	State::UnionReverse { ref mut alternates } => {
1167	alternates.push(to);
1168	self.memory_states += mem::size_of::<StateID>();
1169	}
1170	State::CaptureStart { ref mut next, .. } => {
1171	*next = to;
1172	}
1173	State::CaptureEnd { ref mut next, .. } => {
1174	*next = to;
1175	}
1176	State::Fail => {}
1177	State::Match { .. } => {}
1178	}
1179	if old_memory_states != self.memory_states {
1180	self.check_size_limit()?;
1181	}
1182	Ok(())
1183	}
1184
1185	/// Set whether the NFA produced by this builder should only match UTF-8.
1186	///
1187	/// This should be set when both of the following are true:
1188	///
1189	/// 1. The caller guarantees that the NFA created by this build will only
1190	/// report non-empty matches with spans that are valid UTF-8.
1191	/// 2. The caller desires regex engines using this NFA to avoid reporting
1192	/// empty matches with a span that splits a valid UTF-8 encoded codepoint.
1193	///
1194	/// Property (1) is not checked. Instead, this requires the caller to
1195	/// promise that it is true. Property (2) corresponds to the behavior of
1196	/// regex engines using the NFA created by this builder. Namely, there
1197	/// is no way in the NFA's graph itself to say that empty matches found
1198	/// by, for example, the regex `a` will fall on valid UTF-8 boundaries.*
1199	/// Instead, this option is used to communicate the UTF-8 semantic to regex
1200	/// engines that will typically implement it as a post-processing step by
1201	/// filtering out empty matches that don't fall on UTF-8 boundaries.
1202	///
1203	/// If you're building an NFA from an HIR (and not using a
1204	/// [`thompson::Compiler`](crate::nfa::thompson::Compiler)), then you can
1205	/// use the [`syntax::Config::utf8`](crate::util::syntax::Config::utf8)
1206	/// option to guarantee that if the HIR detects a non-empty match, then it
1207	/// is guaranteed to be valid UTF-8.
1208	///
1209	/// Note that property (2) does not* specify the behavior of executing*
1210	/// a search on a haystack that is not valid UTF-8. Therefore, if you're
1211	/// not* running this NFA on strings that are guaranteed to be valid*
1212	/// UTF-8, you almost certainly do not want to enable this option.
1213	/// Similarly, if you are running the NFA on strings that are* guaranteed*
1214	/// to be valid UTF-8, then you almost certainly want to enable this option
1215	/// unless you can guarantee that your NFA will never produce a zero-width
1216	/// match.
1217	///
1218	/// It is disabled by default.
1219	pub fn set_utf8(&mut self, yes: bool) {
1220	self.utf8 = yes;
1221	}
1222
1223	/// Returns whether UTF-8 mode is enabled for this builder.
1224	///
1225	/// See [`Builder::set_utf8`] for more details about what "UTF-8 mode" is.
1226	pub fn get_utf8(&self) -> bool {
1227	self.utf8
1228	}
1229
1230	/// Sets whether the NFA produced by this builder should be matched in
1231	/// reverse or not. Generally speaking, when enabled, the NFA produced
1232	/// should be matched by moving backwards through a haystack, from a higher
1233	/// memory address to a lower memory address.
1234	///
1235	/// See also [`NFA::is_reverse`] for more details.
1236	///
1237	/// This is disabled by default, which means NFAs are by default matched
1238	/// in the forward direction.
1239	pub fn set_reverse(&mut self, yes: bool) {
1240	self.reverse = yes;
1241	}
1242
1243	/// Returns whether reverse mode is enabled for this builder.
1244	///
1245	/// See [`Builder::set_reverse`] for more details about what "reverse mode"
1246	/// is.
1247	pub fn get_reverse(&self) -> bool {
1248	self.reverse
1249	}
1250
1251	/// Sets the look-around matcher that should be used for the resulting NFA.
1252	///
1253	/// A look-around matcher can be used to configure how look-around
1254	/// assertions are matched. For example, a matcher might carry
1255	/// configuration that changes the line terminator used for `(?m:^)` and
1256	/// `(?m:$)` assertions.
1257	pub fn set_look_matcher(&mut self, m: LookMatcher) {
1258	self.look_matcher = m;
1259	}
1260
1261	/// Returns the look-around matcher used for this builder.
1262	///
1263	/// If a matcher was not explicitly set, then `LookMatcher::default()` is
1264	/// returned.
1265	pub fn get_look_matcher(&self) -> &LookMatcher {
1266	&self.look_matcher
1267	}
1268
1269	/// Set the size limit on this builder.
1270	///
1271	/// Setting the size limit will also check whether the NFA built so far
1272	/// fits within the given size limit. If it doesn't, then an error is
1273	/// returned.
1274	///
1275	/// By default, there is no configured size limit.
1276	pub fn set_size_limit(
1277	&mut self,
1278	limit: Option<usize>,
1279	) -> Result<(), BuildError> {
1280	self.size_limit = limit;
1281	self.check_size_limit()
1282	}
1283
1284	/// Return the currently configured size limit.
1285	///
1286	/// By default, this returns `None`, which corresponds to no configured
1287	/// size limit.
1288	pub fn get_size_limit(&self) -> Option<usize> {
1289	self.size_limit
1290	}
1291
1292	/// Returns the heap memory usage, in bytes, used by the NFA states added
1293	/// so far.
1294	///
1295	/// Note that this is an approximation of how big the final NFA will be.
1296	/// In practice, the final NFA will likely be a bit smaller because of
1297	/// its simpler state representation. (For example, using things like
1298	/// `Box<[StateID]>` instead of `Vec<StateID>`.)
1299	pub fn memory_usage(&self) -> usize {
1300	self.states.len() * mem::size_of::<State>() + self.memory_states
1301	}
1302
1303	fn check_size_limit(&self) -> Result<(), BuildError> {
1304	if let Some(limit) = self.size_limit {
1305	if self.memory_usage() > limit {
1306	return Err(BuildError::exceeded_size_limit(limit));
1307	}
1308	}
1309	Ok(())
1310	}
1311	}
1312
1313	#[cfg(test)]
1314	mod tests {
1315	use super::*;
1316
1317	// This asserts that a builder state doesn't have its size changed. It is
1318	// really* easy to accidentally increase the size, and thus potentially*
1319	// dramatically increase the memory usage of NFA builder.
1320	//
1321	// This assert doesn't mean we absolutely cannot increase the size of a
1322	// builder state. We can. It's just here to make sure we do it knowingly
1323	// and intentionally.
1324	//
1325	// A builder state is unfortunately a little bigger than an NFA state,
1326	// since we really want to support adding things to a pre-existing state.
1327	// i.e., We use Vec<thing> instead of Box<[thing]>. So we end up using an
1328	// extra 8 bytes per state. Sad, but at least it gets freed once the NFA
1329	// is built.
1330	#[test]
1331	fn state_has_small_size() {
1332	#[cfg(target_pointer_width = "64")]
1333	assert_eq!(`32`, core::mem::size_of::<State>());
1334	#[cfg(target_pointer_width = "32")]
1335	assert_eq!(`16`, core::mem::size_of::<State>());
1336	}
1337	}
1338