1 | use std::cmp; |
2 | use std::collections::{BTreeSet, VecDeque}; |
3 | use std::fmt; |
4 | use std::mem::size_of; |
5 | use std::ops::{Index, IndexMut}; |
6 | |
7 | use crate::ahocorasick::MatchKind; |
8 | use crate::automaton::Automaton; |
9 | use crate::classes::{ByteClassBuilder, ByteClasses}; |
10 | use crate::error::Result; |
11 | use crate::prefilter::{self, opposite_ascii_case, Prefilter, PrefilterObj}; |
12 | use crate::state_id::{dead_id, fail_id, usize_to_state_id, StateID}; |
13 | use crate::Match; |
14 | |
/// The identifier for a pattern, which is simply the position of the pattern
/// in the sequence of patterns given by the caller. Identifiers are assigned
/// densely starting from 0, in the order patterns are enumerated.
pub type PatternID = usize;

/// The length of a pattern, in bytes.
pub type PatternLength = usize;
21 | |
/// An Aho-Corasick automaton, represented as an NFA.
///
/// This is the classical formulation of Aho-Corasick, which involves building
/// up a prefix trie of a given set of patterns, and then wiring up failure
/// transitions between states in order to guarantee linear time matching. The
/// standard formulation is, technically, an NFA because of these failure
/// transitions. That is, one can see them as enabling the automaton to be in
/// multiple states at once. Indeed, during search, it is possible to check
/// the transitions on multiple states for a single input byte.
///
/// This particular implementation not only supports the standard style of
/// matching, but also provides a mode for choosing leftmost-first or
/// leftmost-longest match semantics. When a leftmost mode is chosen, some
/// failure transitions that would otherwise be added are elided. See
/// the documentation of `MatchKind` for more details and examples on how the
/// match semantics may differ.
///
/// If one wants a DFA, then it is necessary to first build an NFA and convert
/// it into a DFA. Note, however, that because we've constrained ourselves to
/// matching literal patterns, this does not need to use subset construction
/// for determinization. Instead, the DFA has at most a number of states
/// equivalent to the number of NFA states. The only real difference between
/// them is that all failure transitions are followed and pre-computed. This
/// uses much more memory, but also executes searches more quickly.
#[derive(Clone)]
pub struct NFA<S> {
    /// The match semantics built into this NFA.
    match_kind: MatchKind,
    /// The start state id as an index into `states`. This is always index 2,
    /// following the fail and dead sentinel states described on `states`
    /// below.
    start_id: S,
    /// The length, in bytes, of the longest pattern in this automaton. This
    /// information is useful for keeping correct buffer sizes when searching
    /// on streams.
    max_pattern_len: usize,
    /// The total number of patterns added to this automaton, including
    /// patterns that may never be matched.
    pattern_count: usize,
    /// The number of bytes of heap used by this NFA's transition table.
    heap_bytes: usize,
    /// A prefilter for quickly skipping to candidate matches, if pertinent.
    prefilter: Option<PrefilterObj>,
    /// Whether this automaton anchors all matches to the start of input.
    anchored: bool,
    /// A set of equivalence classes in terms of bytes. We compute this while
    /// building the NFA, but don't use it in the NFA's states. Instead, we
    /// use this for building the DFA. We store it on the NFA since it's easy
    /// to compute while visiting the patterns.
    byte_classes: ByteClasses,
    /// A set of states. Each state defines its own transitions, a fail
    /// transition and a set of indices corresponding to matches.
    ///
    /// The first state is always the fail state, which is used only as a
    /// sentinel. Namely, in the final NFA, no transition into the fail state
    /// exists. (Well, they do, but they aren't followed. Instead, the state's
    /// failure transition is followed.)
    ///
    /// The second state (index 1) is always the dead state. Dead states are
    /// in every automaton, but only used when leftmost-{first,longest} match
    /// semantics are enabled. Specifically, they instruct search to stop
    /// at specific points in order to report the correct match location. In
    /// the standard Aho-Corasick construction, there are no transitions to
    /// the dead state.
    ///
    /// The third state (index 2) is generally intended to be the starting or
    /// "root" state.
    states: Vec<State<S>>,
}
89 | |
90 | impl<S: StateID> NFA<S> { |
91 | /// Returns the equivalence classes of bytes found while constructing |
92 | /// this NFA. |
93 | /// |
94 | /// Note that the NFA doesn't actually make use of these equivalence |
95 | /// classes. Instead, these are useful for building the DFA when desired. |
96 | pub fn byte_classes(&self) -> &ByteClasses { |
97 | &self.byte_classes |
98 | } |
99 | |
100 | /// Returns a prefilter, if one exists. |
101 | pub fn prefilter_obj(&self) -> Option<&PrefilterObj> { |
102 | self.prefilter.as_ref() |
103 | } |
104 | |
105 | /// Returns the total number of heap bytes used by this NFA's transition |
106 | /// table. |
107 | pub fn heap_bytes(&self) -> usize { |
108 | self.heap_bytes |
109 | + self.prefilter.as_ref().map_or(0, |p| p.as_ref().heap_bytes()) |
110 | } |
111 | |
112 | /// Return the length of the longest pattern in this automaton. |
113 | pub fn max_pattern_len(&self) -> usize { |
114 | self.max_pattern_len |
115 | } |
116 | |
117 | /// Return the total number of patterns added to this automaton. |
118 | pub fn pattern_count(&self) -> usize { |
119 | self.pattern_count |
120 | } |
121 | |
122 | /// Returns the total number of states in this NFA. |
123 | pub fn state_len(&self) -> usize { |
124 | self.states.len() |
125 | } |
126 | |
127 | /// Returns the matches for the given state. |
128 | pub fn matches(&self, id: S) -> &[(PatternID, PatternLength)] { |
129 | &self.states[id.to_usize()].matches |
130 | } |
131 | |
132 | /// Returns an iterator over all transitions in the given state according |
133 | /// to the given equivalence classes, including transitions to `fail_id()`. |
134 | /// The number of transitions returned is always equivalent to the number |
135 | /// of equivalence classes. |
136 | pub fn iter_all_transitions<F: FnMut(u8, S)>( |
137 | &self, |
138 | byte_classes: &ByteClasses, |
139 | id: S, |
140 | f: F, |
141 | ) { |
142 | self.states[id.to_usize()].trans.iter_all(byte_classes, f); |
143 | } |
144 | |
145 | /// Returns the failure transition for the given state. |
146 | pub fn failure_transition(&self, id: S) -> S { |
147 | self.states[id.to_usize()].fail |
148 | } |
149 | |
150 | /// Returns the next state for the given state and input byte. |
151 | /// |
152 | /// Note that this does not follow failure transitions. As such, the id |
153 | /// returned may be `fail_id`. |
154 | pub fn next_state(&self, current: S, input: u8) -> S { |
155 | self.states[current.to_usize()].next_state(input) |
156 | } |
157 | |
158 | fn state(&self, id: S) -> &State<S> { |
159 | &self.states[id.to_usize()] |
160 | } |
161 | |
162 | fn state_mut(&mut self, id: S) -> &mut State<S> { |
163 | &mut self.states[id.to_usize()] |
164 | } |
165 | |
166 | fn start(&self) -> &State<S> { |
167 | self.state(self.start_id) |
168 | } |
169 | |
170 | fn start_mut(&mut self) -> &mut State<S> { |
171 | let id = self.start_id; |
172 | self.state_mut(id) |
173 | } |
174 | |
175 | fn iter_transitions_mut(&mut self, id: S) -> IterTransitionsMut<'_, S> { |
176 | IterTransitionsMut::new(self, id) |
177 | } |
178 | |
179 | fn copy_matches(&mut self, src: S, dst: S) { |
180 | let (src, dst) = |
181 | get_two_mut(&mut self.states, src.to_usize(), dst.to_usize()); |
182 | dst.matches.extend_from_slice(&src.matches); |
183 | } |
184 | |
185 | fn copy_empty_matches(&mut self, dst: S) { |
186 | let start_id = self.start_id; |
187 | self.copy_matches(start_id, dst); |
188 | } |
189 | |
190 | fn add_dense_state(&mut self, depth: usize) -> Result<S> { |
191 | let trans = Transitions::Dense(Dense::new()); |
192 | let id = usize_to_state_id(self.states.len())?; |
193 | self.states.push(State { |
194 | trans, |
195 | // Anchored automatons do not have any failure transitions. |
196 | fail: if self.anchored { dead_id() } else { self.start_id }, |
197 | depth, |
198 | matches: vec![], |
199 | }); |
200 | Ok(id) |
201 | } |
202 | |
203 | fn add_sparse_state(&mut self, depth: usize) -> Result<S> { |
204 | let trans = Transitions::Sparse(vec![]); |
205 | let id = usize_to_state_id(self.states.len())?; |
206 | self.states.push(State { |
207 | trans, |
208 | // Anchored automatons do not have any failure transitions. |
209 | fail: if self.anchored { dead_id() } else { self.start_id }, |
210 | depth, |
211 | matches: vec![], |
212 | }); |
213 | Ok(id) |
214 | } |
215 | } |
216 | |
217 | impl<S: StateID> Automaton for NFA<S> { |
218 | type ID = S; |
219 | |
220 | fn match_kind(&self) -> &MatchKind { |
221 | &self.match_kind |
222 | } |
223 | |
224 | fn anchored(&self) -> bool { |
225 | self.anchored |
226 | } |
227 | |
228 | fn prefilter(&self) -> Option<&dyn Prefilter> { |
229 | self.prefilter.as_ref().map(|p| p.as_ref()) |
230 | } |
231 | |
232 | fn start_state(&self) -> S { |
233 | self.start_id |
234 | } |
235 | |
236 | fn is_valid(&self, id: S) -> bool { |
237 | id.to_usize() < self.states.len() |
238 | } |
239 | |
240 | fn is_match_state(&self, id: S) -> bool { |
241 | self.states[id.to_usize()].is_match() |
242 | } |
243 | |
244 | fn get_match( |
245 | &self, |
246 | id: S, |
247 | match_index: usize, |
248 | end: usize, |
249 | ) -> Option<Match> { |
250 | let state = match self.states.get(id.to_usize()) { |
251 | None => return None, |
252 | Some(state) => state, |
253 | }; |
254 | state.matches.get(match_index).map(|&(id, len)| Match { |
255 | pattern: id, |
256 | len, |
257 | end, |
258 | }) |
259 | } |
260 | |
261 | fn match_count(&self, id: S) -> usize { |
262 | self.states[id.to_usize()].matches.len() |
263 | } |
264 | |
265 | fn next_state(&self, mut current: S, input: u8) -> S { |
266 | // This terminates since: |
267 | // |
268 | // 1. `State.fail` never points to fail_id(). |
269 | // 2. All `State.fail` values point to a state closer to `start`. |
270 | // 3. The start state has no transitions to fail_id(). |
271 | loop { |
272 | let state = &self.states[current.to_usize()]; |
273 | let next = state.next_state(input); |
274 | if next != fail_id() { |
275 | return next; |
276 | } |
277 | current = state.fail; |
278 | } |
279 | } |
280 | } |
281 | |
/// A representation of an NFA state for an Aho-Corasick automaton.
///
/// It contains the transitions to the next state, a failure transition for
/// cases where there exists no other transition for the current input byte,
/// the matches implied by visiting this state (if any) and the depth of this
/// state. The depth of a state is simply the distance from it to the start
/// state in the automaton, where the depth of the start state is 0.
#[derive(Clone, Debug)]
pub struct State<S> {
    /// The transitions out of this state, keyed by input byte.
    trans: Transitions<S>,
    /// The state to fall back to when no transition exists for the current
    /// input byte.
    fail: S,
    /// The patterns (and their lengths) matched upon entering this state.
    matches: Vec<(PatternID, PatternLength)>,
    // TODO: Strictly speaking, this isn't needed for searching. It's only
    // used when building an NFA that supports leftmost match semantics. We
    // could drop this from the state and dynamically build a map only when
    // computing failure transitions, but it's not clear which is better.
    // Benchmark this.
    depth: usize,
}
301 | |
302 | impl<S: StateID> State<S> { |
303 | fn heap_bytes(&self) -> usize { |
304 | self.trans.heap_bytes() |
305 | + (self.matches.len() * size_of::<(PatternID, PatternLength)>()) |
306 | } |
307 | |
308 | fn add_match(&mut self, i: PatternID, len: PatternLength) { |
309 | self.matches.push((i, len)); |
310 | } |
311 | |
312 | fn is_match(&self) -> bool { |
313 | !self.matches.is_empty() |
314 | } |
315 | |
316 | fn next_state(&self, input: u8) -> S { |
317 | self.trans.next_state(input) |
318 | } |
319 | |
320 | fn set_next_state(&mut self, input: u8, next: S) { |
321 | self.trans.set_next_state(input, next); |
322 | } |
323 | } |
324 | |
/// Represents the transitions for a single dense state.
///
/// The primary purpose here is to encapsulate index access. Namely, since a
/// dense representation always contains 256 elements, all values of `u8` are
/// valid indices.
#[derive(Clone, Debug)]
struct Dense<S>(Vec<S>);
332 | |
333 | impl<S> Dense<S> |
334 | where |
335 | S: StateID, |
336 | { |
337 | fn new() -> Self { |
338 | Dense(vec![fail_id(); 256]) |
339 | } |
340 | |
341 | #[inline ] |
342 | fn len(&self) -> usize { |
343 | self.0.len() |
344 | } |
345 | } |
346 | |
impl<S> Index<u8> for Dense<S> {
    type Output = S;

    #[inline]
    fn index(&self, i: u8) -> &S {
        // A dense transition table always has exactly 256 elements, so every
        // u8 value is in bounds and this (checked) index can never panic.
        &self.0[i as usize]
    }
}
357 | |
impl<S> IndexMut<u8> for Dense<S> {
    #[inline]
    fn index_mut(&mut self, i: u8) -> &mut S {
        // A dense transition table always has exactly 256 elements, so every
        // u8 value is in bounds and this (checked) index can never panic.
        &mut self.0[i as usize]
    }
}
366 | |
/// A representation of transitions in an NFA.
///
/// Transitions have either a sparse representation, which is slower for
/// lookups but uses less memory, or a dense representation, which is faster
/// for lookups but uses more memory. In the sparse representation, the absence
/// of a state implies a transition to `fail_id()`. Transitions to `dead_id()`
/// are still explicitly represented.
///
/// For the NFA, by default, we use a dense representation for transitions for
/// states close to the start state because it's likely these are the states
/// that will be most frequently visited.
#[derive(Clone, Debug)]
enum Transitions<S> {
    /// A list of `(byte, next state)` pairs, kept sorted by byte (see
    /// `set_next_state`). Absent bytes imply a transition to `fail_id()`.
    Sparse(Vec<(u8, S)>),
    /// A table of 256 next-state ids, one per possible input byte.
    Dense(Dense<S>),
}
383 | |
384 | impl<S: StateID> Transitions<S> { |
385 | fn heap_bytes(&self) -> usize { |
386 | match *self { |
387 | Transitions::Sparse(ref sparse) => { |
388 | sparse.len() * size_of::<(u8, S)>() |
389 | } |
390 | Transitions::Dense(ref dense) => dense.len() * size_of::<S>(), |
391 | } |
392 | } |
393 | |
394 | fn next_state(&self, input: u8) -> S { |
395 | match *self { |
396 | Transitions::Sparse(ref sparse) => { |
397 | for &(b, id) in sparse { |
398 | if b == input { |
399 | return id; |
400 | } |
401 | } |
402 | fail_id() |
403 | } |
404 | Transitions::Dense(ref dense) => dense[input], |
405 | } |
406 | } |
407 | |
408 | fn set_next_state(&mut self, input: u8, next: S) { |
409 | match *self { |
410 | Transitions::Sparse(ref mut sparse) => { |
411 | match sparse.binary_search_by_key(&input, |&(b, _)| b) { |
412 | Ok(i) => sparse[i] = (input, next), |
413 | Err(i) => sparse.insert(i, (input, next)), |
414 | } |
415 | } |
416 | Transitions::Dense(ref mut dense) => { |
417 | dense[input] = next; |
418 | } |
419 | } |
420 | } |
421 | |
422 | /// Iterate over transitions in this state while skipping over transitions |
423 | /// to `fail_id()`. |
424 | fn iter<F: FnMut(u8, S)>(&self, mut f: F) { |
425 | match *self { |
426 | Transitions::Sparse(ref sparse) => { |
427 | for &(b, id) in sparse { |
428 | f(b, id); |
429 | } |
430 | } |
431 | Transitions::Dense(ref dense) => { |
432 | for b in AllBytesIter::new() { |
433 | let id = dense[b]; |
434 | if id != fail_id() { |
435 | f(b, id); |
436 | } |
437 | } |
438 | } |
439 | } |
440 | } |
441 | |
442 | /// Iterate over all transitions in this state according to the given |
443 | /// equivalence classes, including transitions to `fail_id()`. |
444 | fn iter_all<F: FnMut(u8, S)>(&self, classes: &ByteClasses, mut f: F) { |
445 | if classes.is_singleton() { |
446 | match *self { |
447 | Transitions::Sparse(ref sparse) => { |
448 | sparse_iter(sparse, f); |
449 | } |
450 | Transitions::Dense(ref dense) => { |
451 | for b in AllBytesIter::new() { |
452 | f(b, dense[b]); |
453 | } |
454 | } |
455 | } |
456 | } else { |
457 | // In this case, we only want to yield a single byte for each |
458 | // equivalence class. |
459 | match *self { |
460 | Transitions::Sparse(ref sparse) => { |
461 | let mut last_class = None; |
462 | sparse_iter(sparse, |b, next| { |
463 | let class = classes.get(b); |
464 | if last_class != Some(class) { |
465 | last_class = Some(class); |
466 | f(b, next); |
467 | } |
468 | }) |
469 | } |
470 | Transitions::Dense(ref dense) => { |
471 | for b in classes.representatives() { |
472 | f(b, dense[b]); |
473 | } |
474 | } |
475 | } |
476 | } |
477 | } |
478 | } |
479 | |
/// Iterator over transitions in a state, skipping transitions to `fail_id()`.
///
/// This abstracts over the representation of NFA transitions, which may be
/// either in a sparse or dense representation.
///
/// This somewhat idiosyncratically borrows the NFA mutably, so that when one
/// is iterating over transitions, the caller can still mutate the NFA. This
/// is useful when creating failure transitions.
#[derive(Debug)]
struct IterTransitionsMut<'a, S: StateID> {
    /// The NFA whose state is being iterated over.
    nfa: &'a mut NFA<S>,
    /// The identifier of the state whose transitions are yielded.
    state_id: S,
    /// The current iteration position: an index into the sparse list or a
    /// byte value for the dense table.
    cur: usize,
}
494 | |
495 | impl<'a, S: StateID> IterTransitionsMut<'a, S> { |
496 | fn new(nfa: &'a mut NFA<S>, state_id: S) -> IterTransitionsMut<'a, S> { |
497 | IterTransitionsMut { nfa, state_id, cur: 0 } |
498 | } |
499 | |
500 | fn nfa(&mut self) -> &mut NFA<S> { |
501 | self.nfa |
502 | } |
503 | } |
504 | |
impl<'a, S: StateID> Iterator for IterTransitionsMut<'a, S> {
    type Item = (u8, S);

    /// Yields the next non-fail transition of the state, as a
    /// `(byte, next state id)` pair.
    fn next(&mut self) -> Option<(u8, S)> {
        match self.nfa.states[self.state_id.to_usize()].trans {
            Transitions::Sparse(ref sparse) => {
                // Sparse transitions never contain fail_id(), so every entry
                // is yielded as-is.
                if self.cur >= sparse.len() {
                    return None;
                }
                let i = self.cur;
                self.cur += 1;
                Some(sparse[i])
            }
            Transitions::Dense(ref dense) => {
                while self.cur < dense.len() {
                    // There are always exactly 256 transitions in the dense
                    // representation, so `cur` always fits in a u8 here.
                    debug_assert!(self.cur < 256);

                    let b = self.cur as u8;
                    let id = dense[b];
                    self.cur += 1;
                    // Skip transitions to the fail state.
                    if id != fail_id() {
                        return Some((b, id));
                    }
                }
                None
            }
        }
    }
}
535 | |
/// A simple builder for configuring the NFA construction of Aho-Corasick.
#[derive(Clone, Debug)]
pub struct Builder {
    // Depth threshold controlling which states get a dense transition
    // representation. NOTE(review): presumably states at depth <= this value
    // are dense — confirm against `add_state` (defined elsewhere).
    dense_depth: usize,
    /// The match semantics to build into the automaton.
    match_kind: MatchKind,
    /// Whether to attempt building a prefilter from the patterns.
    prefilter: bool,
    /// Whether to anchor all matches to the start of the input.
    anchored: bool,
    /// Whether patterns should match ASCII letters case-insensitively.
    ascii_case_insensitive: bool,
}
545 | |
546 | impl Default for Builder { |
547 | fn default() -> Builder { |
548 | Builder { |
549 | dense_depth: 2, |
550 | match_kind: MatchKind::default(), |
551 | prefilter: true, |
552 | anchored: false, |
553 | ascii_case_insensitive: false, |
554 | } |
555 | } |
556 | } |
557 | |
impl Builder {
    /// Create a new builder with a default configuration.
    pub fn new() -> Builder {
        Builder::default()
    }

    /// Build an NFA from the given iterator of patterns.
    ///
    /// This returns an error if the automaton could not be constructed, e.g.,
    /// because the chosen state identifier representation `S` cannot hold
    /// the required number of states.
    pub fn build<I, P, S: StateID>(&self, patterns: I) -> Result<NFA<S>>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        Compiler::new(self)?.compile(patterns)
    }

    /// Set the match semantics (standard, leftmost-first or
    /// leftmost-longest) for the automaton.
    pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder {
        self.match_kind = kind;
        self
    }

    /// Set the depth threshold below which states use a dense (rather than
    /// sparse) transition representation.
    pub fn dense_depth(&mut self, depth: usize) -> &mut Builder {
        self.dense_depth = depth;
        self
    }

    /// Enable or disable construction of a prefilter from the patterns.
    pub fn prefilter(&mut self, yes: bool) -> &mut Builder {
        self.prefilter = yes;
        self
    }

    /// Enable or disable anchoring all matches to the start of the input.
    pub fn anchored(&mut self, yes: bool) -> &mut Builder {
        self.anchored = yes;
        self
    }

    /// Enable or disable ASCII case insensitive matching.
    pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder {
        self.ascii_case_insensitive = yes;
        self
    }
}
596 | |
/// A compiler uses a builder configuration and builds up the NFA formulation
/// of an Aho-Corasick automaton. This roughly corresponds to the standard
/// formulation described in textbooks.
#[derive(Debug)]
struct Compiler<'a, S: StateID> {
    /// The configuration to compile with.
    builder: &'a Builder,
    /// Accumulates the prefilter attached to the final NFA (when enabled
    /// and the search is unanchored).
    prefilter: prefilter::Builder,
    /// The NFA under construction.
    nfa: NFA<S>,
    /// Accumulates byte equivalence classes while visiting the patterns.
    byte_classes: ByteClassBuilder,
}
607 | |
608 | impl<'a, S: StateID> Compiler<'a, S> { |
    /// Creates a new compiler for the given configuration, with an empty NFA.
    ///
    /// This can fail if the state identifier representation `S` is too small
    /// to hold even the initial start state id.
    fn new(builder: &'a Builder) -> Result<Compiler<'a, S>> {
        Ok(Compiler {
            builder,
            prefilter: prefilter::Builder::new(builder.match_kind)
                .ascii_case_insensitive(builder.ascii_case_insensitive),
            nfa: NFA {
                match_kind: builder.match_kind,
                // The start state always lives at index 2, after the fail
                // (index 0) and dead (index 1) sentinel states that
                // `compile` pushes first.
                start_id: usize_to_state_id(2)?,
                max_pattern_len: 0,
                pattern_count: 0,
                heap_bytes: 0,
                prefilter: None,
                anchored: builder.anchored,
                byte_classes: ByteClasses::singletons(),
                states: vec![],
            },
            byte_classes: ByteClassBuilder::new(),
        })
    }
628 | |
    /// Compiles the given patterns into a finished NFA, consuming the
    /// compiler.
    ///
    /// The steps below are order sensitive: the three sentinel states must
    /// exist before the trie is built, the start state self-loop must be in
    /// place before failure transitions are computed, and it is closed again
    /// afterwards.
    fn compile<I, P>(mut self, patterns: I) -> Result<NFA<S>>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        self.add_state(0)?; // the fail state, which is never entered
        self.add_state(0)?; // the dead state, only used for leftmost
        self.add_state(0)?; // the start state
        self.build_trie(patterns)?;
        self.add_start_state_loop();
        self.add_dead_state_loop();
        // Anchored automatons have no failure transitions at all.
        if !self.builder.anchored {
            self.fill_failure_transitions();
        }
        self.close_start_state_loop();
        self.nfa.byte_classes = self.byte_classes.build();
        // A prefilter skips ahead to candidate start positions, which only
        // makes sense for unanchored searches.
        if !self.builder.anchored {
            self.nfa.prefilter = self.prefilter.build();
        }
        self.calculate_size();
        Ok(self.nfa)
    }
651 | |
    /// This sets up the initial prefix trie that makes up the Aho-Corasick
    /// automaton. Effectively, it creates the basic structure of the
    /// automaton, where every pattern given has a path from the start state to
    /// the end of the pattern.
    ///
    /// This returns an error if a new state identifier could not be
    /// allocated (i.e., `S` is too small for the number of states required).
    fn build_trie<I, P>(&mut self, patterns: I) -> Result<()>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        'PATTERNS: for (pati, pat) in patterns.into_iter().enumerate() {
            let pat = pat.as_ref();
            self.nfa.max_pattern_len =
                cmp::max(self.nfa.max_pattern_len, pat.len());
            self.nfa.pattern_count += 1;

            let mut prev = self.nfa.start_id;
            let mut saw_match = false;
            for (depth, &b) in pat.iter().enumerate() {
                // When leftmost-first match semantics are requested, we
                // specifically stop adding patterns when a previously added
                // pattern is a prefix of it. We avoid adding it because
                // leftmost-first semantics imply that the pattern can never
                // match. This is not just an optimization to save space! It
                // is necessary for correctness. In fact, this is the only
                // difference in the automaton between the implementations for
                // leftmost-first and leftmost-longest.
                saw_match = saw_match || self.nfa.state(prev).is_match();
                if self.builder.match_kind.is_leftmost_first() && saw_match {
                    // Skip to the next pattern immediately. This avoids
                    // incorrectly adding a match after this loop terminates.
                    continue 'PATTERNS;
                }

                // Add this byte to our equivalence classes. We don't use these
                // for NFA construction. These are instead used only if we're
                // building a DFA. They would technically be useful for the
                // NFA, but it would require a second pass over the patterns.
                self.byte_classes.set_range(b, b);
                if self.builder.ascii_case_insensitive {
                    let b = opposite_ascii_case(b);
                    self.byte_classes.set_range(b, b);
                }

                // If the transition from prev using the current byte already
                // exists, then just move through it. Otherwise, add a new
                // state. We track the depth here so that we can determine
                // how to represent transitions. States near the start state
                // use a dense representation that uses more memory but is
                // faster. Other states use a sparse representation that uses
                // less memory but is slower.
                let next = self.nfa.state(prev).next_state(b);
                if next != fail_id() {
                    prev = next;
                } else {
                    let next = self.add_state(depth + 1)?;
                    self.nfa.state_mut(prev).set_next_state(b, next);
                    if self.builder.ascii_case_insensitive {
                        // Both case variants of the byte lead to the same
                        // next state.
                        let b = opposite_ascii_case(b);
                        self.nfa.state_mut(prev).set_next_state(b, next);
                    }
                    prev = next;
                }
            }
            // Once the pattern has been added, log the match in the final
            // state that it reached.
            self.nfa.state_mut(prev).add_match(pati, pat.len());
            // ... and hand it to the prefilter builder, if applicable.
            if self.builder.prefilter {
                self.prefilter.add(pat);
            }
        }
        Ok(())
    }
725 | |
726 | /// This routine creates failure transitions according to the standard |
727 | /// textbook formulation of the Aho-Corasick algorithm, with a couple small |
728 | /// tweaks to support "leftmost" semantics. |
729 | /// |
730 | /// Building failure transitions is the most interesting part of building |
731 | /// the Aho-Corasick automaton, because they are what allow searches to |
732 | /// be performed in linear time. Specifically, a failure transition is |
733 | /// a single transition associated with each state that points back to |
734 | /// the longest proper suffix of the pattern being searched. The failure |
735 | /// transition is followed whenever there exists no transition on the |
736 | /// current state for the current input byte. If there is no other proper |
737 | /// suffix, then the failure transition points back to the starting state. |
738 | /// |
739 | /// For example, let's say we built an Aho-Corasick automaton with the |
740 | /// following patterns: 'abcd' and 'cef'. The trie looks like this: |
741 | /// |
742 | /// ```ignore |
743 | /// a - S1 - b - S2 - c - S3 - d - S4* |
744 | /// / |
745 | /// S0 - c - S5 - e - S6 - f - S7* |
746 | /// ``` |
747 | /// |
748 | /// At this point, it should be fairly straight-forward to see how this |
749 | /// trie can be used in a simplistic way. At any given position in the |
750 | /// text we're searching (called the "subject" string), all we need to do |
751 | /// is follow the transitions in the trie by consuming one transition for |
752 | /// each byte in the subject string. If we reach a match state, then we can |
753 | /// report that location as a match. |
754 | /// |
755 | /// The trick comes when searching a subject string like 'abcef'. We'll |
756 | /// initially follow the transition from S0 to S1 and wind up in S3 after |
757 | /// observng the 'c' byte. At this point, the next byte is 'e' but state |
758 | /// S3 has no transition for 'e', so the search fails. We then would need |
759 | /// to restart the search at the next position in 'abcef', which |
760 | /// corresponds to 'b'. The match would fail, but the next search starting |
761 | /// at 'c' would finally succeed. The problem with this approach is that |
762 | /// we wind up searching the subject string potentially many times. In |
763 | /// effect, this makes the algorithm have worst case `O(n * m)` complexity, |
764 | /// where `n ~ len(subject)` and `m ~ len(all patterns)`. We would instead |
765 | /// like to achieve a `O(n + m)` worst case complexity. |
766 | /// |
767 | /// This is where failure transitions come in. Instead of dying at S3 in |
768 | /// the first search, the automaton can instruct the search to move to |
769 | /// another part of the automaton that corresponds to a suffix of what |
770 | /// we've seen so far. Recall that we've seen 'abc' in the subject string, |
771 | /// and the automaton does indeed have a non-empty suffix, 'c', that could |
772 | /// potentially lead to another match. Thus, the actual Aho-Corasick |
773 | /// automaton for our patterns in this case looks like this: |
774 | /// |
775 | /// ```ignore |
776 | /// a - S1 - b - S2 - c - S3 - d - S4* |
777 | /// / / |
778 | /// / ---------------- |
779 | /// / / |
780 | /// S0 - c - S5 - e - S6 - f - S7* |
781 | /// ``` |
782 | /// |
783 | /// That is, we have a failure transition from S3 to S5, which is followed |
784 | /// exactly in cases when we are in state S3 but see any byte other than |
785 | /// 'd' (that is, we've "failed" to find a match in this portion of our |
786 | /// trie). We know we can transition back to S5 because we've already seen |
787 | /// a 'c' byte, so we don't need to re-scan it. We can then pick back up |
788 | /// with the search starting at S5 and complete our match. |
789 | /// |
790 | /// Adding failure transitions to a trie is fairly simple, but subtle. The |
791 | /// key issue is that you might have multiple failure transition that you |
792 | /// need to follow. For example, look at the trie for the patterns |
793 | /// 'abcd', 'b', 'bcd' and 'cd': |
794 | /// |
795 | /// ```ignore |
796 | /// - a - S1 - b - S2* - c - S3 - d - S4* |
797 | /// / / / |
798 | /// / ------- ------- |
799 | /// / / / |
800 | /// S0 --- b - S5* - c - S6 - d - S7* |
801 | /// \ / |
802 | /// \ -------- |
803 | /// \ / |
804 | /// - c - S8 - d - S9* |
805 | /// ``` |
806 | /// |
807 | /// The failure transitions for this trie are defined from S2 to S5, |
808 | /// S3 to S6 and S6 to S8. Moreover, state S2 needs to track that it |
809 | /// corresponds to a match, since its failure transition to S5 is itself |
810 | /// a match state. |
811 | /// |
812 | /// Perhaps simplest way to think about adding these failure transitions |
813 | /// is recursively. That is, if you know the failure transitions for every |
814 | /// possible previous state that could be visited (e.g., when computing the |
815 | /// failure transition for S3, you already know the failure transitions |
816 | /// for S0, S1 and S2), then you can simply follow the failure transition |
817 | /// of the previous state and check whether the incoming transition is |
818 | /// defined after following the failure transition. |
819 | /// |
820 | /// For example, when determining the failure state for S3, by our |
821 | /// assumptions, we already know that there is a failure transition from |
822 | /// S2 (the previous state) to S5. So we follow that transition and check |
823 | /// whether the transition connecting S2 to S3 is defined. Indeed, it is, |
824 | /// as there is a transition from S5 to S6 for the byte 'c'. If no such |
825 | /// transition existed, we could keep following the failure transitions |
826 | /// until we reach the start state, which is the failure transition for |
827 | /// every state that has no corresponding proper suffix. |
828 | /// |
829 | /// We don't actually use recursion to implement this, but instead, use a |
830 | /// breadth first search of the automaton. Our base case is the start |
831 | /// state, whose failure transition is just a transition to itself. |
832 | /// |
833 | /// When building a leftmost automaton, we proceed as above, but only |
834 | /// include a subset of failure transitions. Namely, we omit any failure |
835 | /// transitions that appear after a match state in the trie. This is |
836 | /// because failure transitions always point back to a proper suffix of |
837 | /// what has been seen so far. Thus, following a failure transition after |
838 | /// a match implies looking for a match that starts after the one that has |
839 | /// already been seen, which is of course therefore not the leftmost match. |
840 | /// |
841 | /// N.B. I came up with this algorithm on my own, and after scouring all of |
842 | /// the other AC implementations I know of (Perl, Snort, many on GitHub). |
843 | /// I couldn't find any that implement leftmost semantics like this. |
844 | /// Perl of course needs leftmost-first semantics, but they implement it |
845 | /// with a seeming hack at *search* time instead of encoding it into the |
846 | /// automaton. There are also a couple Java libraries that support leftmost |
847 | /// longest semantics, but they do it by building a queue of matches at |
848 | /// search time, which is even worse than what Perl is doing. ---AG |
    fn fill_failure_transitions(&mut self) {
        let kind = self.match_kind();
        // Initialize the queue for breadth first search with all transitions
        // out of the start state. We handle the start state specially because
        // we only want to follow non-self transitions. If we followed self
        // transitions, then this would never terminate.
        let mut queue = VecDeque::new();
        let mut seen = self.queued_set();
        let mut it = self.nfa.iter_transitions_mut(self.nfa.start_id);
        while let Some((_, next)) = it.next() {
            // Skip anything we've seen before and any self-transitions on the
            // start state.
            if next == it.nfa().start_id || seen.contains(next) {
                continue;
            }
            queue.push_back(next);
            seen.insert(next);
            // Under leftmost semantics, if a state immediately following
            // the start state is a match state, then we never want to
            // follow its failure transition since the failure transition
            // necessarily leads back to the start state, which we never
            // want to do for leftmost matching after a match has been
            // found.
            //
            // We apply the same logic to non-start states below as well.
            if kind.is_leftmost() && it.nfa().state(next).is_match() {
                it.nfa().state_mut(next).fail = dead_id();
            }
        }
        // Process states in breadth first order so that, by the time we
        // visit a state, the failure transitions of all shallower states
        // (in particular, of this state's parent) are already computed.
        while let Some(id) = queue.pop_front() {
            let mut it = self.nfa.iter_transitions_mut(id);
            while let Some((b, next)) = it.next() {
                if seen.contains(next) {
                    // The only way to visit a duplicate state in a transition
                    // list is when ASCII case insensitivity is enabled. In
                    // this case, we want to skip it since it's redundant work.
                    // But it would also end up duplicating matches, which
                    // results in reporting duplicate matches in some cases.
                    // See the 'acasei010' regression test.
                    continue;
                }
                queue.push_back(next);
                seen.insert(next);

                // As above for start states, under leftmost semantics, once
                // we see a match all subsequent states should have no failure
                // transitions because failure transitions always imply looking
                // for a match that is a suffix of what has been seen so far
                // (where "seen so far" corresponds to the string formed by
                // following the transitions from the start state to the
                // current state). Under leftmost semantics, we specifically do
                // not want to allow this to happen because we always want to
                // report the match found at the leftmost position.
                //
                // The difference between leftmost-first and leftmost-longest
                // occurs previously while we build the trie. For
                // leftmost-first, we simply omit any entries that would
                // otherwise require passing through a match state.
                //
                // Note that for correctness, the failure transition has to be
                // set to the dead state for ALL states following a match, not
                // just the match state itself. However, by setting the failure
                // transition to the dead state on all match states, the dead
                // state will automatically propagate to all subsequent states
                // via the failure state computation below.
                if kind.is_leftmost() && it.nfa().state(next).is_match() {
                    it.nfa().state_mut(next).fail = dead_id();
                    continue;
                }
                // Walk up the failure chain until we find a state with a
                // transition defined for byte `b`. This loop terminates
                // because the start state has a transition defined for
                // every byte (via its self-loop added earlier), so
                // `next_state(b)` can no longer be `fail_id()` once the
                // chain reaches the start state.
                let mut fail = it.nfa().state(id).fail;
                while it.nfa().state(fail).next_state(b) == fail_id() {
                    fail = it.nfa().state(fail).fail;
                }
                fail = it.nfa().state(fail).next_state(b);
                it.nfa().state_mut(next).fail = fail;
                // Matches of the failure state are also matches of `next`,
                // since the failure state corresponds to a proper suffix of
                // the path leading to `next`.
                it.nfa().copy_matches(fail, next);
            }
            // If the start state is a match state, then this automaton can
            // match the empty string. This implies all states are match states
            // since every position matches the empty string, so copy the
            // matches from the start state to every state. Strictly speaking,
            // this is only necessary for overlapping matches since each
            // non-empty non-start match state needs to report empty matches
            // in addition to its own. For the non-overlapping case, such
            // states only report the first match, which is never empty since
            // it isn't a start state.
            if !kind.is_leftmost() {
                it.nfa().copy_empty_matches(id);
            }
        }
    }
940 | |
941 | /// Returns a set that tracked queued states. |
942 | /// |
943 | /// This is only necessary when ASCII case insensitivity is enabled, since |
944 | /// it is the only way to visit the same state twice. Otherwise, this |
945 | /// returns an inert set that nevers adds anything and always reports |
946 | /// `false` for every member test. |
947 | fn queued_set(&self) -> QueuedSet<S> { |
948 | if self.builder.ascii_case_insensitive { |
949 | QueuedSet::active() |
950 | } else { |
951 | QueuedSet::inert() |
952 | } |
953 | } |
954 | |
955 | /// Set the failure transitions on the start state to loop back to the |
956 | /// start state. This effectively permits the Aho-Corasick automaton to |
957 | /// match at any position. This is also required for finding the next |
958 | /// state to terminate, namely, finding the next state should never return |
959 | /// a fail_id. |
960 | /// |
961 | /// This must be done after building the initial trie, since trie |
962 | /// construction depends on transitions to `fail_id` to determine whether a |
963 | /// state already exists or not. |
964 | fn add_start_state_loop(&mut self) { |
965 | let start_id = self.nfa.start_id; |
966 | let start = self.nfa.start_mut(); |
967 | for b in AllBytesIter::new() { |
968 | if start.next_state(b) == fail_id() { |
969 | start.set_next_state(b, start_id); |
970 | } |
971 | } |
972 | } |
973 | |
974 | /// Remove the start state loop by rewriting any transitions on the start |
975 | /// state back to the start state with transitions to the dead state. |
976 | /// |
977 | /// The loop is only closed when two conditions are met: the start state |
978 | /// is a match state and the match kind is leftmost-first or |
979 | /// leftmost-longest. (Alternatively, if this is an anchored automaton, |
980 | /// then the start state is always closed, regardless of aforementioned |
981 | /// conditions.) |
982 | /// |
983 | /// The reason for this is that under leftmost semantics, a start state |
984 | /// that is also a match implies that we should never restart the search |
985 | /// process. We allow normal transitions out of the start state, but if |
986 | /// none exist, we transition to the dead state, which signals that |
987 | /// searching should stop. |
988 | fn close_start_state_loop(&mut self) { |
989 | if self.builder.anchored |
990 | || (self.match_kind().is_leftmost() && self.nfa.start().is_match()) |
991 | { |
992 | let start_id = self.nfa.start_id; |
993 | let start = self.nfa.start_mut(); |
994 | for b in AllBytesIter::new() { |
995 | if start.next_state(b) == start_id { |
996 | start.set_next_state(b, dead_id()); |
997 | } |
998 | } |
999 | } |
1000 | } |
1001 | |
1002 | /// Sets all transitions on the dead state to point back to the dead state. |
1003 | /// Normally, missing transitions map back to the failure state, but the |
1004 | /// point of the dead state is to act as a sink that can never be escaped. |
1005 | fn add_dead_state_loop(&mut self) { |
1006 | let dead = self.nfa.state_mut(dead_id()); |
1007 | for b in AllBytesIter::new() { |
1008 | dead.set_next_state(b, dead_id()); |
1009 | } |
1010 | } |
1011 | |
1012 | /// Computes the total amount of heap used by this NFA in bytes. |
1013 | fn calculate_size(&mut self) { |
1014 | let mut size = 0; |
1015 | for state in &self.nfa.states { |
1016 | size += size_of::<State<S>>() + state.heap_bytes(); |
1017 | } |
1018 | self.nfa.heap_bytes = size; |
1019 | } |
1020 | |
1021 | /// Add a new state to the underlying NFA with the given depth. The depth |
1022 | /// is used to determine how to represent the transitions. |
1023 | /// |
1024 | /// If adding the new state would overflow the chosen state ID |
1025 | /// representation, then this returns an error. |
1026 | fn add_state(&mut self, depth: usize) -> Result<S> { |
1027 | if depth < self.builder.dense_depth { |
1028 | self.nfa.add_dense_state(depth) |
1029 | } else { |
1030 | self.nfa.add_sparse_state(depth) |
1031 | } |
1032 | } |
1033 | |
    /// Returns the match kind configured on the underlying builder.
    ///
    /// This returns the kind by value (it is copied out of the builder),
    /// so callers may hold onto it across mutations of `self`.
    fn match_kind(&self) -> MatchKind {
        self.builder.match_kind
    }
1038 | } |
1039 | |
/// A set of state identifiers used to avoid revisiting the same state multiple
/// times when filling in failure transitions.
///
/// This set has an "inert" and an "active" mode. When inert, the set never
/// stores anything and always returns `false` for every member test. This is
/// useful to avoid the performance and memory overhead of maintaining this
/// set when it is not needed.
#[derive (Debug)]
struct QueuedSet<S> {
    // `None` means the set is inert; `Some` holds the states seen so far.
    set: Option<BTreeSet<S>>,
}
1051 | |
1052 | impl<S: StateID> QueuedSet<S> { |
1053 | /// Return an inert set that returns `false` for every state ID membership |
1054 | /// test. |
1055 | fn inert() -> QueuedSet<S> { |
1056 | QueuedSet { set: None } |
1057 | } |
1058 | |
1059 | /// Return an active set that tracks state ID membership. |
1060 | fn active() -> QueuedSet<S> { |
1061 | QueuedSet { set: Some(BTreeSet::new()) } |
1062 | } |
1063 | |
1064 | /// Inserts the given state ID into this set. (If the set is inert, then |
1065 | /// this is a no-op.) |
1066 | fn insert(&mut self, state_id: S) { |
1067 | if let Some(ref mut set) = self.set { |
1068 | set.insert(state_id); |
1069 | } |
1070 | } |
1071 | |
1072 | /// Returns true if and only if the given state ID is in this set. If the |
1073 | /// set is inert, this always returns false. |
1074 | fn contains(&self, state_id: S) -> bool { |
1075 | match self.set { |
1076 | None => false, |
1077 | Some(ref set) => set.contains(&state_id), |
1078 | } |
1079 | } |
1080 | } |
1081 | |
/// An iterator over every byte value.
///
/// We use this instead of (0..256).map(|b| b as u8) because this optimizes
/// better in debug builds.
///
/// We also use this instead of 0..=255 because we're targeting Rust 1.24 and
/// inclusive range syntax was stabilized in Rust 1.26. We can get rid of this
/// once our MSRV is Rust 1.26 or newer.
#[derive(Debug)]
struct AllBytesIter(u16);

impl AllBytesIter {
    /// Create an iterator positioned at byte 0.
    fn new() -> AllBytesIter {
        AllBytesIter(0)
    }
}

impl Iterator for AllBytesIter {
    type Item = u8;

    fn next(&mut self) -> Option<Self::Item> {
        // The counter is a u16 precisely so it can reach 256 and stop;
        // while it is below 256 it always fits in a u8.
        if self.0 < 256 {
            let byte = self.0 as u8;
            self.0 += 1;
            Some(byte)
        } else {
            None
        }
    }
}
1112 | |
impl<S: StateID> fmt::Debug for NFA<S> {
    // Render the automaton as a multi-line listing: header fields, then one
    // entry per state showing its transitions, matches, failure target and
    // depth. Intended for diagnostics only; the format is not stable.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        writeln!(f, "NFA(" )?;
        writeln!(f, "match_kind: {:?}" , self.match_kind)?;
        writeln!(f, "prefilter: {:?}" , self.prefilter)?;
        writeln!(f, " {}" , "-" .repeat(79))?;
        for (id, s) in self.states.iter().enumerate() {
            // Collect this state's interesting transitions as strings.
            let mut trans = vec![];
            s.trans.iter(|byte, next| {
                // The start state has a bunch of uninteresting transitions
                // back into itself. It's questionable to hide them since they
                // are critical to understanding the automaton, but they are
                // very noisy without better formatting for contiguous ranges
                // to the same state.
                if id == self.start_id.to_usize() && next == self.start_id {
                    return;
                }
                // Similarly, the dead state has a bunch of uninteresting
                // transitions too.
                if id == dead_id() {
                    return;
                }
                trans.push(format!(" {} => {}" , escape(byte), next.to_usize()));
            });
            writeln!(f, " {:04}: {}" , id, trans.join(", " ))?;

            // Pattern IDs that match when the automaton is in this state.
            let matches: Vec<String> = s
                .matches
                .iter()
                .map(|&(pattern_id, _)| pattern_id.to_string())
                .collect();
            writeln!(f, " matches: {}" , matches.join(", " ))?;
            writeln!(f, " fail: {}" , s.fail.to_usize())?;
            writeln!(f, " depth: {}" , s.depth)?;
        }
        writeln!(f, " {}" , "-" .repeat(79))?;
        writeln!(f, ")" )?;
        Ok(())
    }
}
1153 | |
1154 | /// Iterate over all possible byte transitions given a sparse set. |
1155 | fn sparse_iter<S: StateID, F: FnMut(u8, S)>(trans: &[(u8, S)], mut f: F) { |
1156 | let mut byte: u16 = 0u16; |
1157 | for &(b: u8, id: S) in trans { |
1158 | while byte < (b as u16) { |
1159 | f(byte as u8, fail_id()); |
1160 | byte += 1; |
1161 | } |
1162 | f(b, id); |
1163 | byte += 1; |
1164 | } |
1165 | for b: u16 in byte..256 { |
1166 | f(b as u8, fail_id()); |
1167 | } |
1168 | } |
1169 | |
/// Safely return two mutable borrows to two different locations in the given
/// slice.
///
/// This panics if `i == j`, and (via indexing) if either index is out of
/// bounds.
fn get_two_mut<T>(xs: &mut [T], i: usize, j: usize) -> (&mut T, &mut T) {
    assert!(i != j, "{} must not be equal to {}", i, j);
    if i < j {
        // Split at the larger index so each borrow lives in a disjoint half.
        let (before, after) = xs.split_at_mut(j);
        (&mut before[i], &mut after[0])
    } else {
        let (before, after) = xs.split_at_mut(i);
        (&mut after[0], &mut before[j])
    }
}
1184 | |
1185 | /// Return the given byte as its escaped string form. |
/// Return the given byte as its escaped string form, using the standard
/// ASCII escaping rules (e.g., `\n`, `\"`, `\xff`); printable ASCII bytes
/// are returned as themselves.
fn escape(b: u8) -> String {
    use std::ascii;

    // escape_default yields ASCII bytes only, so from_utf8 cannot fail.
    String::from_utf8(ascii::escape_default(b).collect::<Vec<_>>()).unwrap()
}
1191 | |
#[cfg (test)]
mod tests {
    use super::*;

    // NOTE(review): this is a development scratch test. It only prints the
    // constructed automaton for manual inspection and asserts nothing
    // beyond `build(..)` succeeding; the commented-out lines are alternate
    // pattern sets used while debugging leftmost semantics. Consider
    // replacing it with real assertions (or removing it) before release.
    #[test ]
    fn scratch() {
        let nfa: NFA<usize> = Builder::new()
            .dense_depth(0)
            // .match_kind(MatchKind::LeftmostShortest)
            // .match_kind(MatchKind::LeftmostLongest)
            .match_kind(MatchKind::LeftmostFirst)
            // .build(&["abcd", "ce", "b"])
            // .build(&["ab", "bc"])
            // .build(&["b", "bcd", "ce"])
            // .build(&["abc", "bx"])
            // .build(&["abc", "bd", "ab"])
            // .build(&["abcdefghi", "hz", "abcdefgh"])
            // .build(&["abcd", "bce", "b"])
            .build(&["abcdefg" , "bcde" , "bcdef" ])
            .unwrap();
        println!(" {:?}" , nfa);
    }
}
1215 | |