1use core::mem;
2
3use alloc::{vec, vec::Vec};
4
5use crate::{
6 nfa::thompson::{self, compiler::ThompsonRef, BuildError, Builder},
7 util::primitives::{IteratorIndexExt, StateID},
8};
9
10/// A trie that preserves leftmost-first match semantics.
11///
12/// This is a purpose-built data structure for optimizing 'lit1|lit2|..|litN'
13/// patterns. It can *only* handle alternations of literals, which makes it
14/// somewhat restricted in its scope, but literal alternations are fairly
15/// common.
16///
17/// At a 5,000 foot level, the main idea of this trie is make an alternation of
18/// literals look more like a DFA than an NFA via epsilon removal.
19///
20/// More precisely, the main issue is in how alternations are compiled into
21/// a Thompson NFA. Namely, each alternation gets a single NFA "union" state
22/// with an epsilon transition for every branch of the alternation pointing to
23/// an NFA state corresponding to the start of that branch. The main problem
24/// with this representation is the cost of computing an epsilon closure. Once
25/// you hit the alternation's start state, it acts as a sort of "clog" that
26/// requires you to traverse all of the epsilon transitions to compute the full
27/// closure.
28///
29/// While fixing such clogs in the general case is pretty tricky without going
30/// to a DFA (or perhaps a Glushkov NFA, but that comes with other problems).
31/// But at least in the case of an alternation of literals, we can convert
32/// that to a prefix trie without too much cost. In theory, that's all you
33/// really need to do: build the trie and then compile it to a Thompson NFA.
34/// For example, if you have the pattern 'bar|baz|foo', then using a trie, it
35/// is transformed to something like 'b(a(r|z))|f'. This reduces the clog by
36/// reducing the number of epsilon transitions out of the alternation's start
37/// state from 3 to 2 (it actually gets down to 1 when you use a sparse state,
38/// which we do below). It's a small effect here, but when your alternation is
39/// huge, the savings is also huge.
40///
41/// And that is... essentially what a LiteralTrie does. But there is one
42/// hiccup. Consider a regex like 'sam|samwise'. How does a prefix trie compile
43/// that when leftmost-first semantics are used? If 'sam|samwise' was the
44/// entire regex, then you could just drop the 'samwise' branch entirely since
45/// it is impossible to match ('sam' will always take priority, and since it
46/// is a prefix of 'samwise', 'samwise' will never match). But what about the
47/// regex '\b(sam|samwise)\b'? In that case, you can't remove 'samwise' because
48/// it might match when 'sam' doesn't fall on a word boundary.
49///
50/// The main idea is that 'sam|samwise' can be translated to 'sam(?:|wise)',
51/// which is a precisely equivalent regex that also gets rid of the clog.
52///
53/// Another example is 'zapper|z|zap'. That gets translated to
54/// 'z(?:apper||ap)'.
55///
56/// We accomplish this by giving each state in the trie multiple "chunks" of
57/// transitions. Each chunk barrier represents a match. The idea is that once
58/// you know a match occurs, none of the transitions after the match can be
59/// re-ordered and mixed in with the transitions before the match. Otherwise,
60/// the match semantics could be changed.
61///
62/// See the 'State' data type for a bit more detail.
63///
64/// Future work:
65///
66/// * In theory, it would be nice to generalize the idea of removing clogs and
67/// apply it to the NFA graph itself. Then this could in theory work for
68/// case insensitive alternations of literals, or even just alternations where
69/// each branch starts with a non-epsilon transition.
70/// * Could we instead use the Aho-Corasick algorithm here? The aho-corasick
71/// crate deals with leftmost-first matches correctly, but I think this implies
72/// encoding failure transitions into a Thompson NFA somehow. Which seems fine,
73/// because failure transitions are just unconditional epsilon transitions?
74/// * Or perhaps even better, could we use an aho_corasick::AhoCorasick
75/// directly? At time of writing, 0.7 is the current version of the
76/// aho-corasick crate, and that definitely cannot be used as-is. But if we
77/// expose the underlying finite state machine API, then could we use it? That
78/// would be super. If we could figure that out, it might also lend itself to
79/// more general composition of finite state machines.
80#[derive(Clone)]
81pub(crate) struct LiteralTrie {
82 /// The set of trie states. Each state contains one or more chunks, where
83 /// each chunk is a sparse set of transitions to other states. A leaf state
84 /// is always a match state that contains only empty chunks (i.e., no
85 /// transitions).
86 states: Vec<State>,
87 /// Whether to add literals in reverse to the trie. Useful when building
88 /// a reverse NFA automaton.
89 rev: bool,
90}
91
92impl LiteralTrie {
93 /// Create a new literal trie that adds literals in the forward direction.
94 pub(crate) fn forward() -> LiteralTrie {
95 let root = State::default();
96 LiteralTrie { states: vec![root], rev: false }
97 }
98
99 /// Create a new literal trie that adds literals in reverse.
100 pub(crate) fn reverse() -> LiteralTrie {
101 let root = State::default();
102 LiteralTrie { states: vec![root], rev: true }
103 }
104
105 /// Add the given literal to this trie.
106 ///
107 /// If the literal could not be added because the `StateID` space was
108 /// exhausted, then an error is returned. If an error returns, the trie
109 /// is in an unspecified state.
110 pub(crate) fn add(&mut self, bytes: &[u8]) -> Result<(), BuildError> {
111 let mut prev = StateID::ZERO;
112 let mut it = bytes.iter().copied();
113 while let Some(b) = if self.rev { it.next_back() } else { it.next() } {
114 prev = self.get_or_add_state(prev, b)?;
115 }
116 self.states[prev].add_match();
117 Ok(())
118 }
119
120 /// If the given transition is defined, then return the next state ID.
121 /// Otherwise, add the transition to `from` and point it to a new state.
122 ///
123 /// If a new state ID could not be allocated, then an error is returned.
124 fn get_or_add_state(
125 &mut self,
126 from: StateID,
127 byte: u8,
128 ) -> Result<StateID, BuildError> {
129 let active = self.states[from].active_chunk();
130 match active.binary_search_by_key(&byte, |t| t.byte) {
131 Ok(i) => Ok(active[i].next),
132 Err(i) => {
133 // Add a new state and get its ID.
134 let next = StateID::new(self.states.len()).map_err(|_| {
135 BuildError::too_many_states(self.states.len())
136 })?;
137 self.states.push(State::default());
138 // Offset our position to account for all transitions and not
139 // just the ones in the active chunk.
140 let i = self.states[from].active_chunk_start() + i;
141 let t = Transition { byte, next };
142 self.states[from].transitions.insert(i, t);
143 Ok(next)
144 }
145 }
146 }
147
148 /// Compile this literal trie to the NFA builder given.
149 ///
150 /// This forwards any errors that may occur while using the given builder.
151 pub(crate) fn compile(
152 &self,
153 builder: &mut Builder,
154 ) -> Result<ThompsonRef, BuildError> {
155 // Compilation proceeds via depth-first traversal of the trie.
156 //
157 // This is overall pretty brutal. The recursive version of this is
158 // deliciously simple. (See 'compile_to_hir' below for what it might
159 // look like.) But recursion on a trie means your call stack grows
160 // in accordance with the longest literal, which just does not seem
161 // appropriate. So we push the call stack to the heap. But as a result,
162 // the trie traversal becomes pretty brutal because we essentially
163 // have to encode the state of a double for-loop into an explicit call
164 // frame. If someone can simplify this without using recursion, that'd
165 // be great.
166
167 // 'end' is our match state for this trie, but represented in the the
168 // NFA. Any time we see a match in the trie, we insert a transition
169 // from the current state we're in to 'end'.
170 let end = builder.add_empty()?;
171 let mut stack = vec![];
172 let mut f = Frame::new(&self.states[StateID::ZERO]);
173 loop {
174 if let Some(t) = f.transitions.next() {
175 if self.states[t.next].is_leaf() {
176 f.sparse.push(thompson::Transition {
177 start: t.byte,
178 end: t.byte,
179 next: end,
180 });
181 } else {
182 f.sparse.push(thompson::Transition {
183 start: t.byte,
184 end: t.byte,
185 // This is a little funny, but when the frame we create
186 // below completes, it will pop this parent frame off
187 // and modify this transition to point to the correct
188 // state.
189 next: StateID::ZERO,
190 });
191 stack.push(f);
192 f = Frame::new(&self.states[t.next]);
193 }
194 continue;
195 }
196 // At this point, we have visited all transitions in f.chunk, so
197 // add it as a sparse NFA state. Unless the chunk was empty, in
198 // which case, we don't do anything.
199 if !f.sparse.is_empty() {
200 let chunk_id = if f.sparse.len() == 1 {
201 builder.add_range(f.sparse.pop().unwrap())?
202 } else {
203 let sparse = mem::replace(&mut f.sparse, vec![]);
204 builder.add_sparse(sparse)?
205 };
206 f.union.push(chunk_id);
207 }
208 // Now we need to look to see if there are other chunks to visit.
209 if let Some(chunk) = f.chunks.next() {
210 // If we're here, it means we're on the second (or greater)
211 // chunk, which implies there is a match at this point. So
212 // connect this state to the final end state.
213 f.union.push(end);
214 // Advance to the next chunk.
215 f.transitions = chunk.iter();
216 continue;
217 }
218 // Now that we are out of chunks, we have completely visited
219 // this state. So turn our union of chunks into an NFA union
220 // state, and add that union state to the parent state's current
221 // sparse state. (If there is no parent, we're done.)
222 let start = builder.add_union(f.union)?;
223 match stack.pop() {
224 None => {
225 return Ok(ThompsonRef { start, end });
226 }
227 Some(mut parent) => {
228 // OK because the only way a frame gets pushed on to the
229 // stack (aside from the root) is when a transition has
230 // been added to 'sparse'.
231 parent.sparse.last_mut().unwrap().next = start;
232 f = parent;
233 }
234 }
235 }
236 }
237
238 /// Converts this trie to an equivalent HIR expression.
239 ///
240 /// We don't actually use this, but it's useful for tests. In particular,
241 /// it provides a (somewhat) human readable representation of the trie
242 /// itself.
243 #[cfg(test)]
244 fn compile_to_hir(&self) -> regex_syntax::hir::Hir {
245 self.compile_state_to_hir(StateID::ZERO)
246 }
247
248 /// The recursive implementation of 'to_hir'.
249 ///
250 /// Notice how simple this is compared to 'compile' above. 'compile' could
251 /// be similarly simple, but we opt to not use recursion in order to avoid
252 /// overflowing the stack in the case of a longer literal.
253 #[cfg(test)]
254 fn compile_state_to_hir(&self, sid: StateID) -> regex_syntax::hir::Hir {
255 use regex_syntax::hir::Hir;
256
257 let mut alt = vec![];
258 for (i, chunk) in self.states[sid].chunks().enumerate() {
259 if i > 0 {
260 alt.push(Hir::empty());
261 }
262 if chunk.is_empty() {
263 continue;
264 }
265 let mut chunk_alt = vec![];
266 for t in chunk.iter() {
267 chunk_alt.push(Hir::concat(vec![
268 Hir::literal(vec![t.byte]),
269 self.compile_state_to_hir(t.next),
270 ]));
271 }
272 alt.push(Hir::alternation(chunk_alt));
273 }
274 Hir::alternation(alt)
275 }
276}
277
278impl core::fmt::Debug for LiteralTrie {
279 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
280 writeln!(f, "LiteralTrie(")?;
281 for (sid: StateID, state: &State) in self.states.iter().with_state_ids() {
282 writeln!(f, "{:06?}: {:?}", sid.as_usize(), state)?;
283 }
284 writeln!(f, ")")?;
285 Ok(())
286 }
287}
288
289/// An explicit stack frame used for traversing the trie without using
290/// recursion.
291///
292/// Each frame is tied to the traversal of a single trie state. The frame is
293/// dropped once the entire state (and all of its children) have been visited.
294/// The "output" of compiling a state is the 'union' vector, which is turn
295/// converted to a NFA union state. Each branch of the union corresponds to a
296/// chunk in the trie state.
297///
298/// 'sparse' corresponds to the set of transitions for a particular chunk in a
299/// trie state. It is ultimately converted to an NFA sparse state. The 'sparse'
300/// field, after being converted to a sparse NFA state, is reused for any
301/// subsequent chunks in the trie state, if any exist.
302#[derive(Debug)]
303struct Frame<'a> {
304 /// The remaining chunks to visit for a trie state.
305 chunks: StateChunksIter<'a>,
306 /// The transitions of the current chunk that we're iterating over. Since
307 /// every trie state has at least one chunk, every frame is initialized
308 /// with the first chunk's transitions ready to be consumed.
309 transitions: core::slice::Iter<'a, Transition>,
310 /// The NFA state IDs pointing to the start of each chunk compiled by
311 /// this trie state. This ultimately gets converted to an NFA union once
312 /// the entire trie state (and all of its children) have been compiled.
313 /// The order of these matters for leftmost-first match semantics, since
314 /// earlier matches in the union are preferred over later ones.
315 union: Vec<StateID>,
316 /// The actual NFA transitions for a single chunk in a trie state. This
317 /// gets converted to an NFA sparse state, and its corresponding NFA state
318 /// ID should get added to 'union'.
319 sparse: Vec<thompson::Transition>,
320}
321
322impl<'a> Frame<'a> {
323 /// Create a new stack frame for trie traversal. This initializes the
324 /// 'transitions' iterator to the transitions for the first chunk, with the
325 /// 'chunks' iterator being every chunk after the first one.
326 fn new(state: &'a State) -> Frame<'a> {
327 let mut chunks: StateChunksIter<'_> = state.chunks();
328 // every state has at least 1 chunk
329 let chunk: &[Transition] = chunks.next().unwrap();
330 let transitions: Iter<'_, Transition> = chunk.iter();
331 Frame { chunks, transitions, union: vec![], sparse: vec![] }
332 }
333}
334
335/// A state in a trie.
336///
337/// This uses a sparse representation. Since we don't use literal tries
338/// for searching, and ultimately (and compilation requires visiting every
339/// transition anyway), we use a sparse representation for transitions. This
340/// means we save on memory, at the expense of 'LiteralTrie::add' being perhaps
341/// a bit slower.
342///
343/// While 'transitions' is pretty standard as far as tries goes, the 'chunks'
344/// piece here is more unusual. In effect, 'chunks' defines a partitioning
345/// of 'transitions', where each chunk corresponds to a distinct set of
346/// transitions. The key invariant is that a transition in one chunk cannot
347/// be moved to another chunk. This is the secret sauce that preserve
348/// leftmost-first match semantics.
349///
350/// A new chunk is added whenever we mark a state as a match state. Once a
351/// new chunk is added, the old active chunk is frozen and is never mutated
352/// again. The new chunk becomes the active chunk, which is defined as
353/// '&transitions[chunks.last().map_or(0, |c| c.1)..]'. Thus, a state where
354/// 'chunks' is empty actually contains one chunk. Thus, every state contains
355/// at least one (possibly empty) chunk.
356///
357/// A "leaf" state is a state that has no outgoing transitions (so
358/// 'transitions' is empty). Note that there is no way for a leaf state to be a
359/// non-matching state. (Although while building the trie, within 'add', a leaf
360/// state may exist while not containing any matches. But this invariant is
361/// only broken within 'add'. Once 'add' returns, the invariant is upheld.)
362#[derive(Clone, Default)]
363struct State {
364 transitions: Vec<Transition>,
365 chunks: Vec<(usize, usize)>,
366}
367
368impl State {
369 /// Mark this state as a match state and freeze the active chunk such that
370 /// it can not be further mutated.
371 fn add_match(&mut self) {
372 // This is not strictly necessary, but there's no point in recording
373 // another match by adding another chunk if the state has no
374 // transitions. Note though that we only skip this if we already know
375 // this is a match state, which is only true if 'chunks' is not empty.
376 // Basically, if we didn't do this, nothing semantically would change,
377 // but we'd end up pushing another chunk and potentially triggering an
378 // alloc.
379 if self.transitions.is_empty() && !self.chunks.is_empty() {
380 return;
381 }
382 let chunk_start = self.active_chunk_start();
383 let chunk_end = self.transitions.len();
384 self.chunks.push((chunk_start, chunk_end));
385 }
386
387 /// Returns true if and only if this state is a leaf state. That is, a
388 /// state that has no outgoing transitions.
389 fn is_leaf(&self) -> bool {
390 self.transitions.is_empty()
391 }
392
393 /// Returns an iterator over all of the chunks (including the currently
394 /// active chunk) in this state. Since the active chunk is included, the
395 /// iterator is guaranteed to always yield at least one chunk (although the
396 /// chunk may be empty).
397 fn chunks(&self) -> StateChunksIter<'_> {
398 StateChunksIter {
399 transitions: &*self.transitions,
400 chunks: self.chunks.iter(),
401 active: Some(self.active_chunk()),
402 }
403 }
404
405 /// Returns the active chunk as a slice of transitions.
406 fn active_chunk(&self) -> &[Transition] {
407 let start = self.active_chunk_start();
408 &self.transitions[start..]
409 }
410
411 /// Returns the index into 'transitions' where the active chunk starts.
412 fn active_chunk_start(&self) -> usize {
413 self.chunks.last().map_or(0, |&(_, end)| end)
414 }
415}
416
417impl core::fmt::Debug for State {
418 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
419 let mut spacing: &str = " ";
420 for (i: usize, chunk: &[Transition]) in self.chunks().enumerate() {
421 if i > 0 {
422 write!(f, "{}MATCH", spacing)?;
423 }
424 spacing = "";
425 for (j: usize, t: &Transition) in chunk.iter().enumerate() {
426 spacing = " ";
427 if j == 0 && i > 0 {
428 write!(f, " ")?;
429 } else if j > 0 {
430 write!(f, ", ")?;
431 }
432 write!(f, "{:?}", t)?;
433 }
434 }
435 Ok(())
436 }
437}
438
439/// An iterator over all of the chunks in a state, including the active chunk.
440///
441/// This iterator is created by `State::chunks`. We name this iterator so that
442/// we can include it in the `Frame` type for non-recursive trie traversal.
443#[derive(Debug)]
444struct StateChunksIter<'a> {
445 transitions: &'a [Transition],
446 chunks: core::slice::Iter<'a, (usize, usize)>,
447 active: Option<&'a [Transition]>,
448}
449
450impl<'a> Iterator for StateChunksIter<'a> {
451 type Item = &'a [Transition];
452
453 fn next(&mut self) -> Option<&'a [Transition]> {
454 if let Some(&(start: usize, end: usize)) = self.chunks.next() {
455 return Some(&self.transitions[start..end]);
456 }
457 if let Some(chunk: &[Transition]) = self.active.take() {
458 return Some(chunk);
459 }
460 None
461 }
462}
463
464/// A single transition in a trie to another state.
465#[derive(Clone, Copy)]
466struct Transition {
467 byte: u8,
468 next: StateID,
469}
470
471impl core::fmt::Debug for Transition {
472 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
473 write!(
474 f,
475 "{:?} => {}",
476 crate::util::escape::DebugByte(self.byte),
477 self.next.as_usize()
478 )
479 }
480}
481
482#[cfg(test)]
483mod tests {
484 use bstr::B;
485 use regex_syntax::hir::Hir;
486
487 use super::*;
488
489 #[test]
490 fn zap() {
491 let mut trie = LiteralTrie::forward();
492 trie.add(b"zapper").unwrap();
493 trie.add(b"z").unwrap();
494 trie.add(b"zap").unwrap();
495
496 let got = trie.compile_to_hir();
497 let expected = Hir::concat(vec![
498 Hir::literal(B("z")),
499 Hir::alternation(vec![
500 Hir::literal(B("apper")),
501 Hir::empty(),
502 Hir::literal(B("ap")),
503 ]),
504 ]);
505 assert_eq!(expected, got);
506 }
507
508 #[test]
509 fn maker() {
510 let mut trie = LiteralTrie::forward();
511 trie.add(b"make").unwrap();
512 trie.add(b"maple").unwrap();
513 trie.add(b"maker").unwrap();
514
515 let got = trie.compile_to_hir();
516 let expected = Hir::concat(vec![
517 Hir::literal(B("ma")),
518 Hir::alternation(vec![
519 Hir::concat(vec![
520 Hir::literal(B("ke")),
521 Hir::alternation(vec![Hir::empty(), Hir::literal(B("r"))]),
522 ]),
523 Hir::literal(B("ple")),
524 ]),
525 ]);
526 assert_eq!(expected, got);
527 }
528}
529