literal_trie.rs - Codebrowser

1	use core::mem;
2
3	use alloc::{vec, vec::Vec};
4
5	use crate::{
6	nfa::thompson::{self, compiler::ThompsonRef, BuildError, Builder},
7	util::primitives::{IteratorIndexExt, StateID},
8	};
9
10	/// A trie that preserves leftmost-first match semantics.
11	///
12	/// This is a purpose-built data structure for optimizing 'lit1\|lit2\|..\|litN'
13	/// patterns. It can only* handle alternations of literals, which makes it*
14	/// somewhat restricted in its scope, but literal alternations are fairly
15	/// common.
16	///
17	/// At a 5,000 foot level, the main idea of this trie is make an alternation of
18	/// literals look more like a DFA than an NFA via epsilon removal.
19	///
20	/// More precisely, the main issue is in how alternations are compiled into
21	/// a Thompson NFA. Namely, each alternation gets a single NFA "union" state
22	/// with an epsilon transition for every branch of the alternation pointing to
23	/// an NFA state corresponding to the start of that branch. The main problem
24	/// with this representation is the cost of computing an epsilon closure. Once
25	/// you hit the alternation's start state, it acts as a sort of "clog" that
26	/// requires you to traverse all of the epsilon transitions to compute the full
27	/// closure.
28	///
29	/// While fixing such clogs in the general case is pretty tricky without going
30	/// to a DFA (or perhaps a Glushkov NFA, but that comes with other problems).
31	/// But at least in the case of an alternation of literals, we can convert
32	/// that to a prefix trie without too much cost. In theory, that's all you
33	/// really need to do: build the trie and then compile it to a Thompson NFA.
34	/// For example, if you have the pattern 'bar\|baz\|foo', then using a trie, it
35	/// is transformed to something like 'b(a(r\|z))\|f'. This reduces the clog by
36	/// reducing the number of epsilon transitions out of the alternation's start
37	/// state from 3 to 2 (it actually gets down to 1 when you use a sparse state,
38	/// which we do below). It's a small effect here, but when your alternation is
39	/// huge, the savings is also huge.
40	///
41	/// And that is... essentially what a LiteralTrie does. But there is one
42	/// hiccup. Consider a regex like 'sam\|samwise'. How does a prefix trie compile
43	/// that when leftmost-first semantics are used? If 'sam\|samwise' was the
44	/// entire regex, then you could just drop the 'samwise' branch entirely since
45	/// it is impossible to match ('sam' will always take priority, and since it
46	/// is a prefix of 'samwise', 'samwise' will never match). But what about the
47	/// regex '\b(sam\|samwise)\b'? In that case, you can't remove 'samwise' because
48	/// it might match when 'sam' doesn't fall on a word boundary.
49	///
50	/// The main idea is that 'sam\|samwise' can be translated to 'sam(?:\|wise)',
51	/// which is a precisely equivalent regex that also gets rid of the clog.
52	///
53	/// Another example is 'zapper\|z\|zap'. That gets translated to
54	/// 'z(?:apper\|\|ap)'.
55	///
56	/// We accomplish this by giving each state in the trie multiple "chunks" of
57	/// transitions. Each chunk barrier represents a match. The idea is that once
58	/// you know a match occurs, none of the transitions after the match can be
59	/// re-ordered and mixed in with the transitions before the match. Otherwise,
60	/// the match semantics could be changed.
61	///
62	/// See the 'State' data type for a bit more detail.
63	///
64	/// Future work:
65	///
66	/// In theory, it would be nice to generalize the idea of removing clogs and*
67	/// apply it to the NFA graph itself. Then this could in theory work for
68	/// case insensitive alternations of literals, or even just alternations where
69	/// each branch starts with a non-epsilon transition.
70	/// Could we instead use the Aho-Corasick algorithm here? The aho-corasick*
71	/// crate deals with leftmost-first matches correctly, but I think this implies
72	/// encoding failure transitions into a Thompson NFA somehow. Which seems fine,
73	/// because failure transitions are just unconditional epsilon transitions?
74	/// Or perhaps even better, could we use an aho_corasick::AhoCorasick*
75	/// directly? At time of writing, 0.7 is the current version of the
76	/// aho-corasick crate, and that definitely cannot be used as-is. But if we
77	/// expose the underlying finite state machine API, then could we use it? That
78	/// would be super. If we could figure that out, it might also lend itself to
79	/// more general composition of finite state machines.
80	#[derive(Clone)]
81	pub(crate) struct LiteralTrie {
82	/// The set of trie states. Each state contains one or more chunks, where
83	/// each chunk is a sparse set of transitions to other states. A leaf state
84	/// is always a match state that contains only empty chunks (i.e., no
85	/// transitions).
86	states: Vec<State>,
87	/// Whether to add literals in reverse to the trie. Useful when building
88	/// a reverse NFA automaton.
89	rev: bool,
90	}
91
92	impl LiteralTrie {
93	/// Create a new literal trie that adds literals in the forward direction.
94	pub(crate) fn forward() -> LiteralTrie {
95	let root = State::default();
96	LiteralTrie { states: vec![root], rev: `false` }
97	}
98
99	/// Create a new literal trie that adds literals in reverse.
100	pub(crate) fn reverse() -> LiteralTrie {
101	let root = State::default();
102	LiteralTrie { states: vec![root], rev: `true` }
103	}
104
105	/// Add the given literal to this trie.
106	///
107	/// If the literal could not be added because the `StateID` space was
108	/// exhausted, then an error is returned. If an error returns, the trie
109	/// is in an unspecified state.
110	pub(crate) fn add(&mut self, bytes: &[u8]) -> Result<(), BuildError> {
111	let mut prev = StateID::ZERO;
112	let mut it = bytes.iter().copied();
113	while let Some(b) = if self.rev { it.next_back() } else { it.next() } {
114	prev = self.get_or_add_state(prev, b)?;
115	}
116	self.states[prev].add_match();
117	Ok(())
118	}
119
120	/// If the given transition is defined, then return the next state ID.
121	/// Otherwise, add the transition to `from` and point it to a new state.
122	///
123	/// If a new state ID could not be allocated, then an error is returned.
124	fn get_or_add_state(
125	&mut self,
126	from: StateID,
127	byte: u8,
128	) -> Result<StateID, BuildError> {
129	let active = self.states[from].active_chunk();
130	match active.binary_search_by_key(&byte, \|t\| t.byte) {
131	Ok(i) => Ok(active[i].next),
132	Err(i) => {
133	// Add a new state and get its ID.
134	let next = StateID::new(self.states.len()).map_err(\|_\| {
135	BuildError::too_many_states(self.states.len())
136	})?;
137	self.states.push(State::default());
138	// Offset our position to account for all transitions and not
139	// just the ones in the active chunk.
140	let i = self.states[from].active_chunk_start() + i;
141	let t = Transition { byte, next };
142	self.states[from].transitions.insert(i, t);
143	Ok(next)
144	}
145	}
146	}
147
148	/// Compile this literal trie to the NFA builder given.
149	///
150	/// This forwards any errors that may occur while using the given builder.
151	pub(crate) fn compile(
152	&self,
153	builder: &mut Builder,
154	) -> Result<ThompsonRef, BuildError> {
155	// Compilation proceeds via depth-first traversal of the trie.
156	//
157	// This is overall pretty brutal. The recursive version of this is
158	// deliciously simple. (See 'compile_to_hir' below for what it might
159	// look like.) But recursion on a trie means your call stack grows
160	// in accordance with the longest literal, which just does not seem
161	// appropriate. So we push the call stack to the heap. But as a result,
162	// the trie traversal becomes pretty brutal because we essentially
163	// have to encode the state of a double for-loop into an explicit call
164	// frame. If someone can simplify this without using recursion, that'd
165	// be great.
166
167	// 'end' is our match state for this trie, but represented in the the
168	// NFA. Any time we see a match in the trie, we insert a transition
169	// from the current state we're in to 'end'.
170	let end = builder.add_empty()?;
171	let mut stack = vec![];
172	let mut f = Frame::new(&self.states[StateID::ZERO]);
173	loop {
174	if let Some(t) = f.transitions.next() {
175	if self.states[t.next].is_leaf() {
176	f.sparse.push(thompson::Transition {
177	start: t.byte,
178	end: t.byte,
179	next: end,
180	});
181	} else {
182	f.sparse.push(thompson::Transition {
183	start: t.byte,
184	end: t.byte,
185	// This is a little funny, but when the frame we create
186	// below completes, it will pop this parent frame off
187	// and modify this transition to point to the correct
188	// state.
189	next: StateID::ZERO,
190	});
191	stack.push(f);
192	f = Frame::new(&self.states[t.next]);
193	}
194	continue;
195	}
196	// At this point, we have visited all transitions in f.chunk, so
197	// add it as a sparse NFA state. Unless the chunk was empty, in
198	// which case, we don't do anything.
199	if !f.sparse.is_empty() {
200	let chunk_id = if f.sparse.len() == `1` {
201	builder.add_range(f.sparse.pop().unwrap())?
202	} else {
203	let sparse = mem::replace(&mut f.sparse, vec![]);
204	builder.add_sparse(sparse)?
205	};
206	f.union.push(chunk_id);
207	}
208	// Now we need to look to see if there are other chunks to visit.
209	if let Some(chunk) = f.chunks.next() {
210	// If we're here, it means we're on the second (or greater)
211	// chunk, which implies there is a match at this point. So
212	// connect this state to the final end state.
213	f.union.push(end);
214	// Advance to the next chunk.
215	f.transitions = chunk.iter();
216	continue;
217	}
218	// Now that we are out of chunks, we have completely visited
219	// this state. So turn our union of chunks into an NFA union
220	// state, and add that union state to the parent state's current
221	// sparse state. (If there is no parent, we're done.)
222	let start = builder.add_union(f.union)?;
223	match stack.pop() {
224	None => {
225	return Ok(ThompsonRef { start, end });
226	}
227	Some(mut parent) => {
228	// OK because the only way a frame gets pushed on to the
229	// stack (aside from the root) is when a transition has
230	// been added to 'sparse'.
231	parent.sparse.last_mut().unwrap().next = start;
232	f = parent;
233	}
234	}
235	}
236	}
237
238	/// Converts this trie to an equivalent HIR expression.
239	///
240	/// We don't actually use this, but it's useful for tests. In particular,
241	/// it provides a (somewhat) human readable representation of the trie
242	/// itself.
243	#[cfg(test)]
244	fn compile_to_hir(&self) -> regex_syntax::hir::Hir {
245	self.compile_state_to_hir(StateID::ZERO)
246	}
247
248	/// The recursive implementation of 'to_hir'.
249	///
250	/// Notice how simple this is compared to 'compile' above. 'compile' could
251	/// be similarly simple, but we opt to not use recursion in order to avoid
252	/// overflowing the stack in the case of a longer literal.
253	#[cfg(test)]
254	fn compile_state_to_hir(&self, sid: StateID) -> regex_syntax::hir::Hir {
255	use regex_syntax::hir::Hir;
256
257	let mut alt = vec![];
258	for (i, chunk) in self.states[sid].chunks().enumerate() {
259	if i > `0` {
260	alt.push(Hir::empty());
261	}
262	if chunk.is_empty() {
263	continue;
264	}
265	let mut chunk_alt = vec![];
266	for t in chunk.iter() {
267	chunk_alt.push(Hir::concat(vec![
268	Hir::literal(vec![t.byte]),
269	self.compile_state_to_hir(t.next),
270	]));
271	}
272	alt.push(Hir::alternation(chunk_alt));
273	}
274	Hir::alternation(alt)
275	}
276	}
277
278	impl core::fmt::Debug for LiteralTrie {
279	fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
280	writeln!(f, "LiteralTrie(")?;
281	for (sid, state) in self.states.iter().with_state_ids() {
282	writeln!(f, "{:06?}: {:?}", sid.as_usize(), state)?;
283	}
284	writeln!(f, ")")?;
285	Ok(())
286	}
287	}
288
289	/// An explicit stack frame used for traversing the trie without using
290	/// recursion.
291	///
292	/// Each frame is tied to the traversal of a single trie state. The frame is
293	/// dropped once the entire state (and all of its children) have been visited.
294	/// The "output" of compiling a state is the 'union' vector, which is turn
295	/// converted to a NFA union state. Each branch of the union corresponds to a
296	/// chunk in the trie state.
297	///
298	/// 'sparse' corresponds to the set of transitions for a particular chunk in a
299	/// trie state. It is ultimately converted to an NFA sparse state. The 'sparse'
300	/// field, after being converted to a sparse NFA state, is reused for any
301	/// subsequent chunks in the trie state, if any exist.
302	#[derive(Debug)]
303	struct Frame<'a> {
304	/// The remaining chunks to visit for a trie state.
305	chunks: StateChunksIter<'a>,
306	/// The transitions of the current chunk that we're iterating over. Since
307	/// every trie state has at least one chunk, every frame is initialized
308	/// with the first chunk's transitions ready to be consumed.
309	transitions: core::slice::Iter<'a, Transition>,
310	/// The NFA state IDs pointing to the start of each chunk compiled by
311	/// this trie state. This ultimately gets converted to an NFA union once
312	/// the entire trie state (and all of its children) have been compiled.
313	/// The order of these matters for leftmost-first match semantics, since
314	/// earlier matches in the union are preferred over later ones.
315	union: Vec<StateID>,
316	/// The actual NFA transitions for a single chunk in a trie state. This
317	/// gets converted to an NFA sparse state, and its corresponding NFA state
318	/// ID should get added to 'union'.
319	sparse: Vec<thompson::Transition>,
320	}
321
322	impl<'a> Frame<'a> {
323	/// Create a new stack frame for trie traversal. This initializes the
324	/// 'transitions' iterator to the transitions for the first chunk, with the
325	/// 'chunks' iterator being every chunk after the first one.
326	fn new(state: &'a State) -> Frame<'a> {
327	let mut chunks = state.chunks();
328	// every state has at least 1 chunk
329	let chunk = chunks.next().unwrap();
330	let transitions = chunk.iter();
331	Frame { chunks, transitions, union: vec![], sparse: vec![] }
332	}
333	}
334
335	/// A state in a trie.
336	///
337	/// This uses a sparse representation. Since we don't use literal tries
338	/// for searching, and ultimately (and compilation requires visiting every
339	/// transition anyway), we use a sparse representation for transitions. This
340	/// means we save on memory, at the expense of 'LiteralTrie::add' being perhaps
341	/// a bit slower.
342	///
343	/// While 'transitions' is pretty standard as far as tries goes, the 'chunks'
344	/// piece here is more unusual. In effect, 'chunks' defines a partitioning
345	/// of 'transitions', where each chunk corresponds to a distinct set of
346	/// transitions. The key invariant is that a transition in one chunk cannot
347	/// be moved to another chunk. This is the secret sauce that preserve
348	/// leftmost-first match semantics.
349	///
350	/// A new chunk is added whenever we mark a state as a match state. Once a
351	/// new chunk is added, the old active chunk is frozen and is never mutated
352	/// again. The new chunk becomes the active chunk, which is defined as
353	/// '&transitions[chunks.last().map_or(0, \|c\| c.1)..]'. Thus, a state where
354	/// 'chunks' is empty actually contains one chunk. Thus, every state contains
355	/// at least one (possibly empty) chunk.
356	///
357	/// A "leaf" state is a state that has no outgoing transitions (so
358	/// 'transitions' is empty). Note that there is no way for a leaf state to be a
359	/// non-matching state. (Although while building the trie, within 'add', a leaf
360	/// state may exist while not containing any matches. But this invariant is
361	/// only broken within 'add'. Once 'add' returns, the invariant is upheld.)
362	#[derive(Clone, Default)]
363	struct State {
364	transitions: Vec<Transition>,
365	chunks: Vec<(usize, usize)>,
366	}
367
368	impl State {
369	/// Mark this state as a match state and freeze the active chunk such that
370	/// it can not be further mutated.
371	fn add_match(&mut self) {
372	// This is not strictly necessary, but there's no point in recording
373	// another match by adding another chunk if the state has no
374	// transitions. Note though that we only skip this if we already know
375	// this is a match state, which is only true if 'chunks' is not empty.
376	// Basically, if we didn't do this, nothing semantically would change,
377	// but we'd end up pushing another chunk and potentially triggering an
378	// alloc.
379	if self.transitions.is_empty() && !self.chunks.is_empty() {
380	return;
381	}
382	let chunk_start = self.active_chunk_start();
383	let chunk_end = self.transitions.len();
384	self.chunks.push((chunk_start, chunk_end));
385	}
386
387	/// Returns true if and only if this state is a leaf state. That is, a
388	/// state that has no outgoing transitions.
389	fn is_leaf(&self) -> bool {
390	self.transitions.is_empty()
391	}
392
393	/// Returns an iterator over all of the chunks (including the currently
394	/// active chunk) in this state. Since the active chunk is included, the
395	/// iterator is guaranteed to always yield at least one chunk (although the
396	/// chunk may be empty).
397	fn chunks(&self) -> StateChunksIter<'_> {
398	StateChunksIter {
399	transitions: &*self.transitions,
400	chunks: self.chunks.iter(),
401	active: Some(self.active_chunk()),
402	}
403	}
404
405	/// Returns the active chunk as a slice of transitions.
406	fn active_chunk(&self) -> &[Transition] {
407	let start = self.active_chunk_start();
408	&self.transitions[start..]
409	}
410
411	/// Returns the index into 'transitions' where the active chunk starts.
412	fn active_chunk_start(&self) -> usize {
413	self.chunks.last().map_or(`0`, \|&(_, end)\| end)
414	}
415	}
416
417	impl core::fmt::Debug for State {
418	fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
419	let mut spacing = " ";
420	for (i, chunk) in self.chunks().enumerate() {
421	if i > `0` {
422	write!(f, "{}MATCH", spacing)?;
423	}
424	spacing = "";
425	for (j, t) in chunk.iter().enumerate() {
426	spacing = " ";
427	if j == `0` && i > `0` {
428	write!(f, " ")?;
429	} else if j > `0` {
430	write!(f, ", ")?;
431	}
432	write!(f, "{:?}", t)?;
433	}
434	}
435	Ok(())
436	}
437	}
438
439	/// An iterator over all of the chunks in a state, including the active chunk.
440	///
441	/// This iterator is created by `State::chunks`. We name this iterator so that
442	/// we can include it in the `Frame` type for non-recursive trie traversal.
443	#[derive(Debug)]
444	struct StateChunksIter<'a> {
445	transitions: &'a [Transition],
446	chunks: core::slice::Iter<'a, (usize, usize)>,
447	active: Option<&'a [Transition]>,
448	}
449
450	impl<'a> Iterator for StateChunksIter<'a> {
451	type Item = &'a [Transition];
452
453	fn next(&mut self) -> Option<&'a [Transition]> {
454	if let Some(&(start, end)) = self.chunks.next() {
455	return Some(&self.transitions[start..end]);
456	}
457	if let Some(chunk) = self.active.take() {
458	return Some(chunk);
459	}
460	None
461	}
462	}
463
464	/// A single transition in a trie to another state.
465	#[derive(Clone, Copy)]
466	struct Transition {
467	byte: u8,
468	next: StateID,
469	}
470
471	impl core::fmt::Debug for Transition {
472	fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
473	write!(
474	f,
475	"{:?} => {}",
476	crate::util::escape::DebugByte(self.byte),
477	self.next.as_usize()
478	)
479	}
480	}
481
482	#[cfg(test)]
483	mod tests {
484	use bstr::B;
485	use regex_syntax::hir::Hir;
486
487	use super::*;
488
489	#[test]
490	fn zap() {
491	let mut trie = LiteralTrie::forward();
492	trie.add(b"zapper").unwrap();
493	trie.add(b"z").unwrap();
494	trie.add(b"zap").unwrap();
495
496	let got = trie.compile_to_hir();
497	let expected = Hir::concat(vec![
498	Hir::literal(B("z")),
499	Hir::alternation(vec![
500	Hir::literal(B("apper")),
501	Hir::empty(),
502	Hir::literal(B("ap")),
503	]),
504	]);
505	assert_eq!(expected, got);
506	}
507
508	#[test]
509	fn maker() {
510	let mut trie = LiteralTrie::forward();
511	trie.add(b"make").unwrap();
512	trie.add(b"maple").unwrap();
513	trie.add(b"maker").unwrap();
514
515	let got = trie.compile_to_hir();
516	let expected = Hir::concat(vec![
517	Hir::literal(B("ma")),
518	Hir::alternation(vec![
519	Hir::concat(vec![
520	Hir::literal(B("ke")),
521	Hir::alternation(vec![Hir::empty(), Hir::literal(B("r"))]),
522	]),
523	Hir::literal(B("ple")),
524	]),
525	]);
526	assert_eq!(expected, got);
527	}
528	}
529

Provided by KDAB

Definitions