/*!
Provides a noncontiguous NFA implementation of Aho-Corasick.

This is a low-level API that generally only needs to be used in niche
circumstances. When possible, prefer using [`AhoCorasick`](crate::AhoCorasick)
instead of a noncontiguous NFA directly. Using an `NFA` directly is typically
only necessary when one needs access to the [`Automaton`] trait implementation.
*/

9 | |

10 | use alloc::{ |

11 | collections::{BTreeSet, VecDeque}, |

12 | vec, |

13 | vec::Vec, |

14 | }; |

15 | |

16 | use crate::{ |

17 | automaton::Automaton, |

18 | util::{ |

19 | alphabet::{ByteClassSet, ByteClasses}, |

20 | error::{BuildError, MatchError}, |

21 | prefilter::{self, opposite_ascii_case, Prefilter}, |

22 | primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID}, |

23 | remapper::Remapper, |

24 | search::{Anchored, MatchKind}, |

25 | special::Special, |

26 | }, |

27 | }; |

28 | |

/// A noncontiguous NFA implementation of Aho-Corasick.
///
/// When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) instead of
/// this type directly. Using an `NFA` directly is typically only necessary
/// when one needs access to the [`Automaton`] trait implementation.
///
/// This NFA represents the "core" implementation of Aho-Corasick in this
/// crate. Namely, constructing this NFA involves building a trie and then
/// filling in the failure transitions between states, similar to what is
/// described in any standard textbook description of Aho-Corasick.
///
/// In order to minimize heap usage and to avoid additional construction costs,
/// this implementation represents the transitions of all states as distinct
/// sparse memory allocations. This is where it gets its name from. That is,
/// this NFA has no contiguous memory allocation for its transition table. Each
/// state gets its own allocation.
///
/// While the sparse representation keeps memory usage to somewhat reasonable
/// levels, it is still quite large and also results in somewhat mediocre
/// search performance. For this reason, it is almost always a good idea to
/// use a [`contiguous::NFA`](crate::nfa::contiguous::NFA) instead. It is
/// marginally slower to build, but has higher throughput and can sometimes use
/// an order of magnitude less memory. The main reason to use a noncontiguous
/// NFA is when you need the fastest possible construction time, or when a
/// contiguous NFA does not have the desired capacity. (The total number of NFA
/// states it can have is fewer than a noncontiguous NFA.)
///
/// # Example
///
/// This example shows how to build an `NFA` directly and use it to execute
/// [`Automaton::try_find`]:
///
/// ```
/// use aho_corasick::{
///     automaton::Automaton,
///     nfa::noncontiguous::NFA,
///     Input, Match,
/// };
///
/// let patterns = &["b", "abc", "abcd"];
/// let haystack = "abcd";
///
/// let nfa = NFA::new(patterns).unwrap();
/// assert_eq!(
///     Some(Match::must(0, 1..2)),
///     nfa.try_find(&Input::new(haystack))?,
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// It is also possible to implement your own version of `try_find`. See the
/// [`Automaton`] documentation for an example.
#[derive(Clone)]
pub struct NFA {
    /// The match semantics built into this NFA.
    match_kind: MatchKind,
    /// A set of states. Each state defines its own transitions, a fail
    /// transition and a set of indices corresponding to matches.
    ///
    /// The first state is always the fail state, which is used only as a
    /// sentinel. Namely, in the final NFA, no transition into the fail state
    /// exists. (Well, they do, but they aren't followed. Instead, the state's
    /// failure transition is followed.)
    ///
    /// The second state (index 1) is always the dead state. Dead states are
    /// in every automaton, but only used when leftmost-{first,longest} match
    /// semantics are enabled. Specifically, they instruct search to stop
    /// at specific points in order to report the correct match location. In
    /// the standard Aho-Corasick construction, there are no transitions to
    /// the dead state.
    ///
    /// The third state (index 2) is generally intended to be the starting or
    /// "root" state.
    states: Vec<State>,
    /// Transitions stored in a sparse representation via a linked list.
    ///
    /// Each transition contains three pieces of information: the byte it
    /// is defined for, the state it transitions to and a link to the next
    /// transition in the same state (or `StateID::ZERO` if it is the last
    /// transition).
    ///
    /// The first transition for each state is determined by `State::sparse`.
    ///
    /// Note that this contains a complete set of all transitions in this NFA,
    /// including states that have a dense representation for transitions.
    /// (Adding dense transitions for a state doesn't remove its sparse
    /// transitions, since deleting transitions from this particular sparse
    /// representation would be fairly expensive.)
    sparse: Vec<Transition>,
    /// Transitions stored in a dense representation.
    ///
    /// A state has a row in this table if and only if `State::dense` is
    /// not equal to `StateID::ZERO`. When not zero, there are precisely
    /// `NFA::byte_classes::alphabet_len()` entries beginning at `State::dense`
    /// in this table.
    ///
    /// Generally a very small minority of states have a dense representation
    /// since it uses so much memory.
    dense: Vec<StateID>,
    /// Matches stored in a linked list for each state.
    ///
    /// Like sparse transitions, each match has a link to the next match in the
    /// state.
    ///
    /// The first match for each state is determined by `State::matches`.
    matches: Vec<Match>,
    /// The length, in bytes, of each pattern in this NFA. This slice is
    /// indexed by `PatternID`.
    ///
    /// The number of entries in this vector corresponds to the total number of
    /// patterns in this automaton.
    pattern_lens: Vec<SmallIndex>,
    /// A prefilter for quickly skipping to candidate matches, if pertinent.
    prefilter: Option<Prefilter>,
    /// A set of equivalence classes in terms of bytes. We compute this while
    /// building the NFA, but don't use it in the NFA's states. Instead, we
    /// use this for building the DFA. We store it on the NFA since it's easy
    /// to compute while visiting the patterns.
    byte_classes: ByteClasses,
    /// The length, in bytes, of the shortest pattern in this automaton. This
    /// information is useful for detecting whether an automaton matches the
    /// empty string or not.
    min_pattern_len: usize,
    /// The length, in bytes, of the longest pattern in this automaton. This
    /// information is useful for keeping correct buffer sizes when searching
    /// on streams.
    max_pattern_len: usize,
    /// The information required to deduce which states are "special" in this
    /// NFA.
    ///
    /// Since the DEAD and FAIL states are always the first two states and
    /// there are only ever two start states (which follow all of the match
    /// states), it follows that we can determine whether a state is a fail,
    /// dead, match or start with just a few comparisons on the ID itself:
    ///
    ///    is_dead(sid): sid == NFA::DEAD
    ///    is_fail(sid): sid == NFA::FAIL
    ///   is_match(sid): NFA::FAIL < sid && sid <= max_match_id
    ///   is_start(sid): sid == start_unanchored_id || sid == start_anchored_id
    ///
    /// Note that this only applies to the NFA after it has been constructed.
    /// During construction, the start states are the first ones added and the
    /// match states are inter-leaved with non-match states. Once all of the
    /// states have been added, the states are shuffled such that the above
    /// predicates hold.
    special: Special,
}

176 | |

177 | impl NFA { |

178 | /// Create a new Aho-Corasick noncontiguous NFA using the default |

179 | /// configuration. |

180 | /// |

181 | /// Use a [`Builder`] if you want to change the configuration. |

182 | pub fn new<I, P>(patterns: I) -> Result<NFA, BuildError> |

183 | where |

184 | I: IntoIterator<Item = P>, |

185 | P: AsRef<[u8]>, |

186 | { |

187 | NFA::builder().build(patterns) |

188 | } |

189 | |

190 | /// A convenience method for returning a new Aho-Corasick noncontiguous NFA |

191 | /// builder. |

192 | /// |

193 | /// This usually permits one to just import the `NFA` type. |

194 | pub fn builder() -> Builder { |

195 | Builder::new() |

196 | } |

197 | } |

198 | |

impl NFA {
    /// The DEAD state is a sentinel state like the FAIL state. The DEAD state
    /// instructs any search to stop and return any currently recorded match,
    /// or no match otherwise. Generally speaking, it is impossible for an
    /// unanchored standard search to enter a DEAD state. But an anchored
    /// search can, and so too can a leftmost search.
    ///
    /// We put DEAD before FAIL so that DEAD is always 0. We repeat this
    /// decision across the other Aho-Corasick automata, so that DEAD
    /// states there are always 0 too. It's not that we need all of the
    /// implementations to agree, but rather, the contiguous NFA and the DFA
    /// use a sort of "premultiplied" state identifier where the only state
    /// whose ID is always known and constant is the first state. Subsequent
    /// state IDs depend on how much space has already been used in the
    /// transition table.
    pub(crate) const DEAD: StateID = StateID::new_unchecked(0);
    /// The FAIL state mostly just corresponds to the ID of any transition on a
    /// state that isn't explicitly defined. When one transitions into the FAIL
    /// state, one must follow the previous state's failure transition before
    /// doing the next state lookup. In this way, FAIL is more of a sentinel
    /// than a state that one actually transitions into. In particular, it is
    /// never exposed in the `Automaton` interface.
    pub(crate) const FAIL: StateID = StateID::new_unchecked(1);

    /// Returns the equivalence classes of bytes found while constructing
    /// this NFA.
    ///
    /// Note that the NFA doesn't actually make use of these equivalence
    /// classes. Instead, these are useful for building the DFA when desired.
    pub(crate) fn byte_classes(&self) -> &ByteClasses {
        &self.byte_classes
    }

    /// Returns a slice containing the length of each pattern in this searcher.
    /// It is indexed by `PatternID` and has length `NFA::patterns_len`.
    ///
    /// This is exposed for convenience when building a contiguous NFA. But it
    /// can be reconstructed from the `Automaton` API if necessary.
    pub(crate) fn pattern_lens_raw(&self) -> &[SmallIndex] {
        &self.pattern_lens
    }

    /// Returns a slice of all states in this non-contiguous NFA.
    pub(crate) fn states(&self) -> &[State] {
        &self.states
    }

    /// Returns the underlying "special" state information for this NFA.
    pub(crate) fn special(&self) -> &Special {
        &self.special
    }

    /// Swaps the states at `id1` and `id2`.
    ///
    /// This does not update the transitions of any state to account for the
    /// state swap. (Callers are expected to fix up transitions via `remap`.)
    pub(crate) fn swap_states(&mut self, id1: StateID, id2: StateID) {
        self.states.swap(id1.as_usize(), id2.as_usize());
    }

    /// Re-maps all state IDs in this NFA according to the `map` function
    /// given.
    ///
    /// This rewrites every failure transition, every sparse transition target
    /// and every dense transition target. (The linked-list `link` pointers
    /// are indices into `self.sparse`, not state IDs, so they are untouched.)
    pub(crate) fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
        let alphabet_len = self.byte_classes.alphabet_len();
        for state in self.states.iter_mut() {
            state.fail = map(state.fail);
            // Walk this state's sparse transition chain and remap each target.
            let mut link = state.sparse;
            while link != StateID::ZERO {
                let t = &mut self.sparse[link];
                t.next = map(t.next);
                link = t.link;
            }
            // If this state also has a dense row, remap that row too, since
            // the dense row duplicates (rather than replaces) the sparse
            // transitions.
            if state.dense != StateID::ZERO {
                let start = state.dense.as_usize();
                for next in self.dense[start..][..alphabet_len].iter_mut() {
                    *next = map(*next);
                }
            }
        }
    }

    /// Iterate over all of the transitions for the given state ID.
    pub(crate) fn iter_trans(
        &self,
        sid: StateID,
    ) -> impl Iterator<Item = Transition> + '_ {
        let mut link = self.states[sid].sparse;
        core::iter::from_fn(move || {
            // `StateID::ZERO` terminates the linked list.
            if link == StateID::ZERO {
                return None;
            }
            let t = self.sparse[link];
            link = t.link;
            Some(t)
        })
    }

    /// Iterate over all of the matches for the given state ID.
    pub(crate) fn iter_matches(
        &self,
        sid: StateID,
    ) -> impl Iterator<Item = PatternID> + '_ {
        let mut link = self.states[sid].matches;
        core::iter::from_fn(move || {
            // `StateID::ZERO` terminates the linked list.
            if link == StateID::ZERO {
                return None;
            }
            let m = self.matches[link];
            link = m.link;
            Some(m.pid)
        })
    }

    /// Return the link following the one given. If the one given is the last
    /// link for the given state, then return `None`.
    ///
    /// If no previous link is given, then this returns the first link in the
    /// state, if one exists.
    ///
    /// This is useful for manually iterating over the transitions in a single
    /// state without borrowing the NFA. This permits mutating other parts of
    /// the NFA during iteration. Namely, one can access the transition pointed
    /// to by the link via `self.sparse[link]`.
    fn next_link(
        &self,
        sid: StateID,
        prev: Option<StateID>,
    ) -> Option<StateID> {
        let link =
            prev.map_or(self.states[sid].sparse, |p| self.sparse[p].link);
        if link == StateID::ZERO {
            None
        } else {
            Some(link)
        }
    }

    /// Follow the transition for the given byte in the given state. If no such
    /// transition exists, then the FAIL state ID is returned.
    #[inline(always)]
    fn follow_transition(&self, sid: StateID, byte: u8) -> StateID {
        let s = &self.states[sid];
        // This is a special case that targets starting states and states
        // near a start state. Namely, after the initial trie is constructed,
        // we look for states close to the start state to convert to a dense
        // representation for their transitions. This winds up using a lot more
        // memory per state in exchange for faster transition lookups. But
        // since we only do this for a small number of states (by default), the
        // memory usage is usually minimal.
        //
        // This has *massive* benefit when executing searches because the
        // unanchored starting state is by far the hottest state and is
        // frequently visited. Moreover, the 'for' loop below that works
        // decently on an actually sparse state is disastrous on a state that
        // is nearly or completely dense.
        if s.dense == StateID::ZERO {
            self.follow_transition_sparse(sid, byte)
        } else {
            let class = usize::from(self.byte_classes.get(byte));
            self.dense[s.dense.as_usize() + class]
        }
    }

    /// Like `follow_transition`, but always uses the sparse representation.
    #[inline(always)]
    fn follow_transition_sparse(&self, sid: StateID, byte: u8) -> StateID {
        // The transition chain is kept sorted by byte (see `add_transition`),
        // so we can stop scanning as soon as we pass `byte`.
        for t in self.iter_trans(sid) {
            if byte <= t.byte {
                if byte == t.byte {
                    return t.next;
                }
                break;
            }
        }
        NFA::FAIL
    }

    /// Set the transition for the given byte to the state ID given.
    ///
    /// Note that one should not set transitions to the FAIL state. It is not
    /// technically incorrect, but it wastes space. If a transition is not
    /// defined, then it is automatically assumed to lead to the FAIL state.
    fn add_transition(
        &mut self,
        prev: StateID,
        byte: u8,
        next: StateID,
    ) -> Result<(), BuildError> {
        // Keep the dense row (if any) in sync with the sparse chain, since
        // the dense row duplicates rather than replaces sparse transitions.
        if self.states[prev].dense != StateID::ZERO {
            let dense = self.states[prev].dense;
            let class = usize::from(self.byte_classes.get(byte));
            self.dense[dense.as_usize() + class] = next;
        }

        // Insert at (or overwrite) the head of the chain when the chain is
        // empty or `byte` sorts before the head's byte.
        let head = self.states[prev].sparse;
        if head == StateID::ZERO || byte < self.sparse[head].byte {
            let new_link = self.alloc_transition()?;
            self.sparse[new_link] = Transition { byte, next, link: head };
            self.states[prev].sparse = new_link;
            return Ok(());
        } else if byte == self.sparse[head].byte {
            self.sparse[head].next = next;
            return Ok(());
        }

        // We handled the only cases where the beginning of the transition
        // chain needs to change. At this point, we now know that there is
        // at least one entry in the transition chain and the byte for that
        // transition is less than the byte for the transition we're adding.
        // Scan for the insertion point while keeping the chain sorted by byte.
        let (mut link_prev, mut link_next) = (head, self.sparse[head].link);
        while link_next != StateID::ZERO && byte > self.sparse[link_next].byte
        {
            link_prev = link_next;
            link_next = self.sparse[link_next].link;
        }
        if link_next == StateID::ZERO || byte < self.sparse[link_next].byte {
            // No existing transition for `byte`: splice a new link between
            // `link_prev` and `link_next`.
            let link = self.alloc_transition()?;
            self.sparse[link] = Transition { byte, next, link: link_next };
            self.sparse[link_prev].link = link;
        } else {
            // An existing transition for `byte`: overwrite its target.
            assert_eq!(byte, self.sparse[link_next].byte);
            self.sparse[link_next].next = next;
        }
        Ok(())
    }

    /// This sets every possible transition (all 256 of them) for the given
    /// state to the same `next` value.
    ///
    /// This is useful for efficiently initializing start/dead states.
    ///
    /// # Panics
    ///
    /// This requires that the state has no transitions added to it already.
    /// If it has any transitions, then this panics. It will also panic if
    /// the state has been densified prior to calling this.
    fn init_full_state(
        &mut self,
        prev: StateID,
        next: StateID,
    ) -> Result<(), BuildError> {
        assert_eq!(
            StateID::ZERO,
            self.states[prev].dense,
            "state must not be dense yet"
        );
        assert_eq!(
            StateID::ZERO,
            self.states[prev].sparse,
            "state must have zero transitions"
        );
        // Build the chain in ascending byte order so it remains sorted, as
        // required by `follow_transition_sparse` and `add_transition`.
        let mut prev_link = StateID::ZERO;
        for byte in 0..=255 {
            let new_link = self.alloc_transition()?;
            self.sparse[new_link] =
                Transition { byte, next, link: StateID::ZERO };
            if prev_link == StateID::ZERO {
                self.states[prev].sparse = new_link;
            } else {
                self.sparse[prev_link].link = new_link;
            }
            prev_link = new_link;
        }
        Ok(())
    }

    /// Add a match for the given pattern ID to the state for the given ID.
    fn add_match(
        &mut self,
        sid: StateID,
        pid: PatternID,
    ) -> Result<(), BuildError> {
        // Find the tail of this state's match chain. (If the state has no
        // matches, `head` is ZERO and the loop doesn't run, since
        // `matches[0]` is a sentinel whose link is ZERO.)
        let head = self.states[sid].matches;
        let mut link = head;
        while self.matches[link].link != StateID::ZERO {
            link = self.matches[link].link;
        }
        // Append the new match at the tail (or make it the head).
        let new_match_link = self.alloc_match()?;
        self.matches[new_match_link].pid = pid;
        if link == StateID::ZERO {
            self.states[sid].matches = new_match_link;
        } else {
            self.matches[link].link = new_match_link;
        }
        Ok(())
    }

    /// Copy matches from the `src` state to the `dst` state. This is useful
    /// when a match state can be reached via a failure transition. In which
    /// case, you'll want to copy the matches (if any) from the state reached
    /// by the failure transition to the original state you were at.
    fn copy_matches(
        &mut self,
        src: StateID,
        dst: StateID,
    ) -> Result<(), BuildError> {
        // Find the tail of dst's match chain so we can append to it.
        let head_dst = self.states[dst].matches;
        let mut link_dst = head_dst;
        while self.matches[link_dst].link != StateID::ZERO {
            link_dst = self.matches[link_dst].link;
        }
        // Walk src's chain, appending a copy of each match to dst. Note that
        // we can't use `alloc_match` here because we need to read
        // `self.matches[link_src]` in the same expression that pushes the
        // copy; we inline the same ID-overflow check instead.
        let mut link_src = self.states[src].matches;
        while link_src != StateID::ZERO {
            let new_match_link =
                StateID::new(self.matches.len()).map_err(|e| {
                    BuildError::state_id_overflow(
                        StateID::MAX.as_u64(),
                        e.attempted(),
                    )
                })?;
            self.matches.push(Match {
                pid: self.matches[link_src].pid,
                link: StateID::ZERO,
            });
            if link_dst == StateID::ZERO {
                self.states[dst].matches = new_match_link;
            } else {
                self.matches[link_dst].link = new_match_link;
            }

            link_dst = new_match_link;
            link_src = self.matches[link_src].link;
        }
        Ok(())
    }

    /// Create a new entry in `NFA::sparse`, if there's room, and return that
    /// entry's ID. If there's no room, then an error is returned.
    fn alloc_transition(&mut self) -> Result<StateID, BuildError> {
        let id = StateID::new(self.sparse.len()).map_err(|e| {
            BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
        })?;
        self.sparse.push(Transition::default());
        Ok(id)
    }

    /// Create a new entry in `NFA::matches`, if there's room, and return that
    /// entry's ID. If there's no room, then an error is returned.
    fn alloc_match(&mut self) -> Result<StateID, BuildError> {
        let id = StateID::new(self.matches.len()).map_err(|e| {
            BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
        })?;
        self.matches.push(Match::default());
        Ok(id)
    }

    /// Create a new set of `N` transitions in this NFA's dense transition
    /// table. The ID returned corresponds to the index at which the `N`
    /// transitions begin. So `id+0` is the first transition and `id+(N-1)` is
    /// the last.
    ///
    /// `N` is determined via `NFA::byte_classes::alphabet_len`.
    fn alloc_dense_state(&mut self) -> Result<StateID, BuildError> {
        let id = StateID::new(self.dense.len()).map_err(|e| {
            BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
        })?;
        // We use FAIL because it's the correct default. If a state doesn't
        // have a transition defined for every possible byte value, then the
        // transition function should return NFA::FAIL.
        self.dense.extend(
            core::iter::repeat(NFA::FAIL)
                .take(self.byte_classes.alphabet_len()),
        );
        Ok(id)
    }

    /// Allocate and add a fresh state to the underlying NFA and return its
    /// ID (guaranteed to be one more than the ID of the previously allocated
    /// state). If the ID would overflow `StateID`, then this returns an error.
    fn alloc_state(&mut self, depth: usize) -> Result<StateID, BuildError> {
        // This is OK because we error when building the trie if we see a
        // pattern whose length cannot fit into a 'SmallIndex', and the longest
        // possible depth corresponds to the length of the longest pattern.
        let depth = SmallIndex::new(depth)
            .expect("patterns longer than SmallIndex::MAX are not allowed");
        let id = StateID::new(self.states.len()).map_err(|e| {
            BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
        })?;
        self.states.push(State {
            sparse: StateID::ZERO,
            dense: StateID::ZERO,
            matches: StateID::ZERO,
            // Every state's failure transition starts out pointing at the
            // unanchored start state; the breadth-first failure-transition
            // pass fixes these up later.
            fail: self.special.start_unanchored_id,
            depth,
        });
        Ok(id)
    }
}

587 | |

// SAFETY: 'start_state' always returns a valid state ID, 'next_state' always
// returns a valid state ID given a valid state ID. We otherwise claim that
// all other methods are correct as well.
unsafe impl Automaton for NFA {
    #[inline(always)]
    fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> {
        match anchored {
            Anchored::No => Ok(self.special.start_unanchored_id),
            Anchored::Yes => Ok(self.special.start_anchored_id),
        }
    }

    #[inline(always)]
    fn next_state(
        &self,
        anchored: Anchored,
        mut sid: StateID,
        byte: u8,
    ) -> StateID {
        // This terminates since:
        //
        // 1. state.fail never points to the FAIL state.
        // 2. All state.fail values point to a state closer to the start state.
        // 3. The start state has no transitions to the FAIL state.
        loop {
            let next = self.follow_transition(sid, byte);
            if next != NFA::FAIL {
                return next;
            }
            // For an anchored search, we never follow failure transitions
            // because failure transitions lead us down a path to matching
            // a *proper* suffix of the path we were on. Thus, it can only
            // produce matches that appear after the beginning of the search.
            if anchored.is_anchored() {
                return NFA::DEAD;
            }
            sid = self.states[sid].fail();
        }
    }

    #[inline(always)]
    fn is_special(&self, sid: StateID) -> bool {
        sid <= self.special.max_special_id
    }

    #[inline(always)]
    fn is_dead(&self, sid: StateID) -> bool {
        sid == NFA::DEAD
    }

    #[inline(always)]
    fn is_match(&self, sid: StateID) -> bool {
        // N.B. This returns true when sid==NFA::FAIL but that's okay because
        // NFA::FAIL is not actually a valid state ID from the perspective of
        // the Automaton trait. Namely, it is never returned by 'start_state'
        // or by 'next_state'. So we don't need to care about it here.
        !self.is_dead(sid) && sid <= self.special.max_match_id
    }

    #[inline(always)]
    fn is_start(&self, sid: StateID) -> bool {
        sid == self.special.start_unanchored_id
            || sid == self.special.start_anchored_id
    }

    #[inline(always)]
    fn match_kind(&self) -> MatchKind {
        self.match_kind
    }

    #[inline(always)]
    fn patterns_len(&self) -> usize {
        self.pattern_lens.len()
    }

    #[inline(always)]
    fn pattern_len(&self, pid: PatternID) -> usize {
        self.pattern_lens[pid].as_usize()
    }

    #[inline(always)]
    fn min_pattern_len(&self) -> usize {
        self.min_pattern_len
    }

    #[inline(always)]
    fn max_pattern_len(&self) -> usize {
        self.max_pattern_len
    }

    #[inline(always)]
    fn match_len(&self, sid: StateID) -> usize {
        // Counts the state's match linked list; O(k) in the number of
        // matches for this state.
        self.iter_matches(sid).count()
    }

    #[inline(always)]
    fn match_pattern(&self, sid: StateID, index: usize) -> PatternID {
        // `index` must be less than `match_len(sid)`; otherwise this panics.
        self.iter_matches(sid).nth(index).unwrap()
    }

    #[inline(always)]
    fn memory_usage(&self) -> usize {
        // Sums the heap usage of each backing vector plus the prefilter.
        // (The `Vec` headers themselves and any excess capacity are not
        // counted.)
        self.states.len() * core::mem::size_of::<State>()
            + self.sparse.len() * core::mem::size_of::<Transition>()
            + self.matches.len() * core::mem::size_of::<Match>()
            + self.dense.len() * StateID::SIZE
            + self.pattern_lens.len() * SmallIndex::SIZE
            + self.prefilter.as_ref().map_or(0, |p| p.memory_usage())
    }

    #[inline(always)]
    fn prefilter(&self) -> Option<&Prefilter> {
        self.prefilter.as_ref()
    }
}

703 | |

/// A representation of a sparse NFA state for an Aho-Corasick automaton.
///
/// It contains the transitions to the next state, a failure transition for
/// cases where there exists no other transition for the current input byte
/// and the matches implied by visiting this state (if any).
#[derive(Clone, Debug)]
pub(crate) struct State {
    /// A pointer to `NFA::sparse` corresponding to the head of a linked list
    /// containing all of the transitions for this state.
    ///
    /// This is `StateID::ZERO` if and only if this state has zero transitions.
    sparse: StateID,
    /// A pointer to a row of `N` transitions in `NFA::dense`. These
    /// transitions correspond precisely to what is obtained by traversing
    /// `sparse`, but permits constant time lookup.
    ///
    /// When this is zero (which is true for most states in the default
    /// configuration), then this state has no dense representation.
    ///
    /// Note that `N` is equal to `NFA::byte_classes::alphabet_len()`. This is
    /// typically much less than 256 (the maximum value).
    dense: StateID,
    /// A pointer to `NFA::matches` corresponding to the head of a linked list
    /// containing all of the matches for this state.
    ///
    /// This is `StateID::ZERO` if and only if this state is not a match state.
    matches: StateID,
    /// The state that should be transitioned to if the current byte in the
    /// haystack does not have a corresponding transition defined in this
    /// state.
    fail: StateID,
    /// The depth of this state. Specifically, this is the distance from this
    /// state to the starting state. (For the special sentinel states DEAD and
    /// FAIL, their depth is always 0.) The depth of a starting state is 0.
    ///
    /// Note that depth is currently not used in this non-contiguous NFA. It
    /// may in the future, but it is used in the contiguous NFA. Namely, it
    /// permits an optimization where states near the starting state have their
    /// transitions stored in a dense fashion, but all other states have their
    /// transitions stored in a sparse fashion. (This non-contiguous NFA uses
    /// a sparse representation for all states unconditionally.) In any case,
    /// this is really the only convenient place to compute and store this
    /// information, which we need when building the contiguous NFA.
    depth: SmallIndex,
}

749 | |

750 | impl State { |

751 | /// Return true if and only if this state is a match state. |

752 | pub(crate) fn is_match(&self) -> bool { |

753 | self.matches != StateID::ZERO |

754 | } |

755 | |

756 | /// Returns the failure transition for this state. |

757 | pub(crate) fn fail(&self) -> StateID { |

758 | self.fail |

759 | } |

760 | |

761 | /// Returns the depth of this state. That is, the number of transitions |

762 | /// this state is from the start state of the NFA. |

763 | pub(crate) fn depth(&self) -> SmallIndex { |

764 | self.depth |

765 | } |

766 | } |

767 | |

/// A single transition in a non-contiguous NFA.
#[derive(Clone, Copy, Default)]
// Laid out without padding to reduce the memory used by each transition.
// Note that packed fields must be copied out by value, never borrowed.
#[repr(packed)]
pub(crate) struct Transition {
    /// The haystack byte for which this transition is defined.
    byte: u8,
    /// The state this transition points to when `byte` is seen.
    next: StateID,
    /// The next transition in the owning state's linked list of transitions
    /// (an index into `NFA::sparse`), or `StateID::ZERO` if this is the last
    /// transition in the list.
    link: StateID,
}

776 | |

777 | impl Transition { |

778 | /// Return the byte for which this transition is defined. |

779 | pub(crate) fn byte(&self) -> u8 { |

780 | self.byte |

781 | } |

782 | |

783 | /// Return the ID of the state that this transition points to. |

784 | pub(crate) fn next(&self) -> StateID { |

785 | self.next |

786 | } |

787 | |

788 | /// Return the ID of the next transition. |

789 | fn link(&self) -> StateID { |

790 | self.link |

791 | } |

792 | } |

793 | |

794 | impl core::fmt::Debug for Transition { |

795 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |

796 | write!( |

797 | f, |

798 | "Transition(byte: {:X?}, next: {:?}, link: {:?})", |

799 | self.byte, |

800 | self.next().as_usize(), |

801 | self.link().as_usize() |

802 | ) |

803 | } |

804 | } |

805 | |

/// A single match in a non-contiguous NFA.
#[derive(Clone, Copy, Default)]
struct Match {
    /// The pattern that matches when the state owning this entry is reached.
    pid: PatternID,
    /// The next match in the owning state's linked list of matches (an index
    /// into `NFA::matches`), or `StateID::ZERO` if this is the last match in
    /// the list.
    link: StateID,
}

812 | |

813 | impl Match { |

814 | /// Return the pattern ID for this match. |

815 | pub(crate) fn pattern(&self) -> PatternID { |

816 | self.pid |

817 | } |

818 | |

819 | /// Return the ID of the next match. |

820 | fn link(&self) -> StateID { |

821 | self.link |

822 | } |

823 | } |

824 | |

825 | impl core::fmt::Debug for Match { |

826 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |

827 | write!( |

828 | f, |

829 | "Match(pid: {:?}, link: {:?})", |

830 | self.pattern().as_usize(), |

831 | self.link().as_usize() |

832 | ) |

833 | } |

834 | } |

835 | |

/// A builder for configuring an Aho-Corasick noncontiguous NFA.
///
/// This builder has a subset of the options available to a
/// [`AhoCorasickBuilder`](crate::AhoCorasickBuilder). Of the shared options,
/// their behavior is identical.
#[derive(Clone, Debug)]
pub struct Builder {
    /// The match semantics to compile the NFA with.
    match_kind: MatchKind,
    /// Whether to enable heuristic prefilter optimizations.
    prefilter: bool,
    /// Whether to match with ASCII-aware case insensitivity.
    ascii_case_insensitive: bool,
    /// The limit on how many states may use a dense representation for
    /// their transitions.
    dense_depth: usize,
}

848 | |

849 | impl Default for Builder { |

850 | fn default() -> Builder { |

851 | Builder { |

852 | match_kind: MatchKind::default(), |

853 | prefilter: true, |

854 | ascii_case_insensitive: false, |

855 | dense_depth: 3, |

856 | } |

857 | } |

858 | } |

859 | |

860 | impl Builder { |

861 | /// Create a new builder for configuring an Aho-Corasick noncontiguous NFA. |

862 | pub fn new() -> Builder { |

863 | Builder::default() |

864 | } |

865 | |

866 | /// Build an Aho-Corasick noncontiguous NFA from the given iterator of |

867 | /// patterns. |

868 | /// |

869 | /// A builder may be reused to create more NFAs. |

870 | pub fn build<I, P>(&self, patterns: I) -> Result<NFA, BuildError> |

871 | where |

872 | I: IntoIterator<Item = P>, |

873 | P: AsRef<[u8]>, |

874 | { |

875 | debug!("building non-contiguous NFA"); |

876 | let nfa = Compiler::new(self)?.compile(patterns)?; |

877 | debug!( |

878 | "non-contiguous NFA built, <states: {:?}, size: {:?}>", |

879 | nfa.states.len(), |

880 | nfa.memory_usage() |

881 | ); |

882 | Ok(nfa) |

883 | } |

884 | |

885 | /// Set the desired match semantics. |

886 | /// |

887 | /// See |

888 | /// [`AhoCorasickBuilder::match_kind`](crate::AhoCorasickBuilder::match_kind) |

889 | /// for more documentation and examples. |

890 | pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder { |

891 | self.match_kind = kind; |

892 | self |

893 | } |

894 | |

895 | /// Enable ASCII-aware case insensitive matching. |

896 | /// |

897 | /// See |

898 | /// [`AhoCorasickBuilder::ascii_case_insensitive`](crate::AhoCorasickBuilder::ascii_case_insensitive) |

899 | /// for more documentation and examples. |

900 | pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder { |

901 | self.ascii_case_insensitive = yes; |

902 | self |

903 | } |

904 | |

905 | /// Set the limit on how many states use a dense representation for their |

906 | /// transitions. Other states will generally use a sparse representation. |

907 | /// |

908 | /// See |

909 | /// [`AhoCorasickBuilder::dense_depth`](crate::AhoCorasickBuilder::dense_depth) |

910 | /// for more documentation and examples. |

911 | pub fn dense_depth(&mut self, depth: usize) -> &mut Builder { |

912 | self.dense_depth = depth; |

913 | self |

914 | } |

915 | |

916 | /// Enable heuristic prefilter optimizations. |

917 | /// |

918 | /// See |

919 | /// [`AhoCorasickBuilder::prefilter`](crate::AhoCorasickBuilder::prefilter) |

920 | /// for more documentation and examples. |

921 | pub fn prefilter(&mut self, yes: bool) -> &mut Builder { |

922 | self.prefilter = yes; |

923 | self |

924 | } |

925 | } |

926 | |

/// A compiler uses a builder configuration and builds up the NFA formulation
/// of an Aho-Corasick automaton. This roughly corresponds to the standard
/// formulation described in textbooks, with some tweaks to support leftmost
/// searching.
#[derive(Debug)]
struct Compiler<'a> {
    /// The configuration to build with.
    builder: &'a Builder,
    /// Accumulates the patterns for building an optional prefilter, which
    /// is attached to the NFA at the end of compilation.
    prefilter: prefilter::Builder,
    /// The NFA under construction.
    nfa: NFA,
    /// The set of byte ranges seen in the patterns, used to compute the
    /// partition of bytes into equivalence classes once the trie is built.
    byteset: ByteClassSet,
}

938 | |

939 | impl<'a> Compiler<'a> { |

    /// Create a new compiler for building a noncontiguous NFA with the given
    /// builder's configuration. The NFA starts out empty; `compile` fills it
    /// in.
    fn new(builder: &'a Builder) -> Result<Compiler<'a>, BuildError> {
        // The prefilter builder mirrors the relevant pieces of our own
        // configuration so that the literals it accumulates are interpreted
        // with the same semantics.
        let prefilter = prefilter::Builder::new(builder.match_kind)
            .ascii_case_insensitive(builder.ascii_case_insensitive);
        Ok(Compiler {
            builder,
            prefilter,
            nfa: NFA {
                match_kind: builder.match_kind,
                states: vec![],
                sparse: vec![],
                dense: vec![],
                matches: vec![],
                pattern_lens: vec![],
                prefilter: None,
                byte_classes: ByteClasses::singletons(),
                // Starts at usize::MAX so that the running minimum computed
                // in 'build_trie' works for the very first pattern.
                min_pattern_len: usize::MAX,
                max_pattern_len: 0,
                special: Special::zero(),
            },
            byteset: ByteClassSet::empty(),
        })
    }

962 | |

    /// Compile the given patterns into a fully constructed NFA.
    ///
    /// This runs every construction phase in its required order: sentinel
    /// setup, trie building, byte class computation, anchored start state
    /// setup, densification, failure transition filling and finally state
    /// shuffling. The ordering of these phases is load bearing; see the
    /// comments on each step below.
    fn compile<I, P>(mut self, patterns: I) -> Result<NFA, BuildError>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        // Add dummy transition/match links, so that no valid link will point
        // to another link at index 0.
        self.nfa.sparse.push(Transition::default());
        self.nfa.matches.push(Match::default());
        // Add a dummy dense transition so that no states can have dense==0
        // represent a valid pointer to dense transitions. This permits
        // dense==0 to be a sentinel indicating "no dense transitions."
        self.nfa.dense.push(NFA::DEAD);
        // the dead state, only used for leftmost and fixed to id==0
        self.nfa.alloc_state(0)?;
        // the fail state, which is never entered and fixed to id==1
        self.nfa.alloc_state(0)?;
        // unanchored start state, initially fixed to id==2 but later shuffled
        // to appear after all non-start match states.
        self.nfa.special.start_unanchored_id = self.nfa.alloc_state(0)?;
        // anchored start state, initially fixed to id==3 but later shuffled
        // to appear after unanchored start state.
        self.nfa.special.start_anchored_id = self.nfa.alloc_state(0)?;
        // Initialize the unanchored starting state in order to make it dense,
        // and thus make transition lookups on this state faster.
        self.init_unanchored_start_state()?;
        // Set all transitions on the DEAD state to point to itself. This way,
        // the DEAD state can never be escaped. It MUST be used as a sentinel
        // in any correct search.
        self.add_dead_state_loop()?;
        // Build the base trie from the given patterns.
        self.build_trie(patterns)?;
        self.nfa.states.shrink_to_fit();
        // Turn our set of bytes into equivalent classes. This NFA
        // implementation uses byte classes only for states that use a dense
        // representation of transitions. (And that's why this comes before
        // `self.densify()`, as the byte classes need to be set first.)
        self.nfa.byte_classes = self.byteset.byte_classes();
        // Add transitions (and maybe matches) to the anchored starting state.
        // The anchored starting state is used for anchored searches. The only
        // mechanical difference between it and the unanchored start state is
        // that missing transitions map to the DEAD state instead of the FAIL
        // state.
        self.set_anchored_start_state()?;
        // Rewrite transitions to the FAIL state on the unanchored start state
        // as self-transitions. This keeps the start state active at all times.
        self.add_unanchored_start_state_loop();
        // Make some (possibly zero) states use a dense representation for
        // transitions. It's important to do this right after the states
        // and non-failure transitions are solidified. That way, subsequent
        // accesses (particularly `fill_failure_transitions`) will benefit from
        // the faster transition lookup in densified states.
        self.densify()?;
        // The meat of the Aho-Corasick algorithm: compute and write failure
        // transitions. i.e., the state to move to when a transition isn't
        // defined in the current state. These are epsilon transitions and thus
        // make this formulation an NFA.
        self.fill_failure_transitions()?;
        // Handle a special case under leftmost semantics when at least one
        // of the patterns is the empty string.
        self.close_start_state_loop_for_leftmost();
        // Shuffle states so that we have DEAD, FAIL, MATCH, ..., START, START,
        // NON-MATCH, ... This permits us to very quickly query the type of
        // the state we're currently in during a search.
        self.shuffle();
        self.nfa.prefilter = self.prefilter.build();
        // Store the maximum ID of all *relevant* special states. Start states
        // are only relevant when we have a prefilter, otherwise, there is zero
        // reason to care about whether a state is a start state or not during
        // a search. Indeed, without a prefilter, we are careful to explicitly
        // NOT care about start states, otherwise the search can ping pong
        // between the unrolled loop and the handling of special-status states
        // and destroy perf.
        self.nfa.special.max_special_id = if self.nfa.prefilter.is_some() {
            // Why the anchored starting state? Because we always put it
            // after the unanchored starting state and it is therefore the
            // maximum. Why put unanchored followed by anchored? No particular
            // reason, but that's how the states are logically organized in the
            // Thompson NFA implementation found in regex-automata. ¯\_(ツ)_/¯
            self.nfa.special.start_anchored_id
        } else {
            self.nfa.special.max_match_id
        };
        self.nfa.sparse.shrink_to_fit();
        self.nfa.dense.shrink_to_fit();
        self.nfa.matches.shrink_to_fit();
        self.nfa.pattern_lens.shrink_to_fit();
        Ok(self.nfa)
    }

1052 | |

    /// This sets up the initial prefix trie that makes up the Aho-Corasick
    /// automaton. Effectively, it creates the basic structure of the
    /// automaton, where every pattern given has a path from the start state to
    /// the end of the pattern.
    ///
    /// This returns an error if the number of patterns exceeds the pattern ID
    /// space, or if any individual pattern is too long to be indexed by a
    /// `SmallIndex`.
    fn build_trie<I, P>(&mut self, patterns: I) -> Result<(), BuildError>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        'PATTERNS: for (i, pat) in patterns.into_iter().enumerate() {
            let pid = PatternID::new(i).map_err(|e| {
                BuildError::pattern_id_overflow(
                    PatternID::MAX.as_u64(),
                    e.attempted(),
                )
            })?;
            let pat = pat.as_ref();
            let patlen = SmallIndex::new(pat.len())
                .map_err(|_| BuildError::pattern_too_long(pid, pat.len()))?;
            self.nfa.min_pattern_len =
                core::cmp::min(self.nfa.min_pattern_len, pat.len());
            self.nfa.max_pattern_len =
                core::cmp::max(self.nfa.max_pattern_len, pat.len());
            assert_eq!(
                i,
                self.nfa.pattern_lens.len(),
                "expected number of patterns to match pattern ID"
            );
            self.nfa.pattern_lens.push(patlen);
            // We add the pattern to the prefilter here because the pattern
            // ID in the prefilter is determined with respect to the patterns
            // added to the prefilter. That is, it isn't the ID we have here,
            // but the one determined by its own accounting of patterns.
            // To ensure they line up, we add every pattern we see to the
            // prefilter, even if some patterns ultimately are impossible to
            // match (in leftmost-first semantics specifically).
            //
            // Another way of doing this would be to expose an API in the
            // prefilter to permit setting your own pattern IDs. Or to just use
            // our own map and go between them. But this case is sufficiently
            // rare that we don't bother and just make sure they're in sync.
            if self.builder.prefilter {
                self.prefilter.add(pat);
            }

            // Every pattern's path through the trie begins at the unanchored
            // start state.
            let mut prev = self.nfa.special.start_unanchored_id;
            let mut saw_match = false;
            for (depth, &b) in pat.iter().enumerate() {
                // When leftmost-first match semantics are requested, we
                // specifically stop adding patterns when a previously added
                // pattern is a prefix of it. We avoid adding it because
                // leftmost-first semantics imply that the pattern can never
                // match. This is not just an optimization to save space! It
                // is necessary for correctness. In fact, this is the only
                // difference in the automaton between the implementations for
                // leftmost-first and leftmost-longest.
                saw_match = saw_match || self.nfa.states[prev].is_match();
                if self.builder.match_kind.is_leftmost_first() && saw_match {
                    // Skip to the next pattern immediately. This avoids
                    // incorrectly adding a match after this loop terminates.
                    continue 'PATTERNS;
                }

                // Add this byte to our equivalence classes. These don't
                // get used while building the trie, but other Aho-Corasick
                // implementations may use them.
                self.byteset.set_range(b, b);
                if self.builder.ascii_case_insensitive {
                    let b = opposite_ascii_case(b);
                    self.byteset.set_range(b, b);
                }

                // If the transition from prev using the current byte already
                // exists, then just move through it. Otherwise, add a new
                // state. We track the depth here so that we can determine
                // how to represent transitions. States near the start state
                // use a dense representation that uses more memory but is
                // faster. Other states use a sparse representation that uses
                // less memory but is slower.
                let next = self.nfa.follow_transition(prev, b);
                if next != NFA::FAIL {
                    prev = next;
                } else {
                    let next = self.nfa.alloc_state(depth)?;
                    self.nfa.add_transition(prev, b, next)?;
                    if self.builder.ascii_case_insensitive {
                        let b = opposite_ascii_case(b);
                        self.nfa.add_transition(prev, b, next)?;
                    }
                    prev = next;
                }
            }
            // Once the pattern has been added, log the match in the final
            // state that it reached.
            self.nfa.add_match(prev, pid)?;
        }
        Ok(())
    }

1151 | |

1152 | /// This routine creates failure transitions according to the standard |

1153 | /// textbook formulation of the Aho-Corasick algorithm, with a couple small |

1154 | /// tweaks to support "leftmost" semantics. |

1155 | /// |

1156 | /// Building failure transitions is the most interesting part of building |

1157 | /// the Aho-Corasick automaton, because they are what allow searches to |

1158 | /// be performed in linear time. Specifically, a failure transition is |

1159 | /// a single transition associated with each state that points back to |

1160 | /// the longest proper suffix of the pattern being searched. The failure |

1161 | /// transition is followed whenever there exists no transition on the |

1162 | /// current state for the current input byte. If there is no other proper |

1163 | /// suffix, then the failure transition points back to the starting state. |

1164 | /// |

1165 | /// For example, let's say we built an Aho-Corasick automaton with the |

1166 | /// following patterns: 'abcd' and 'cef'. The trie looks like this: |

1167 | /// |

1168 | /// ```ignore |

1169 | /// a - S1 - b - S2 - c - S3 - d - S4* |

1170 | /// / |

1171 | /// S0 - c - S5 - e - S6 - f - S7* |

1172 | /// ``` |

1173 | /// |

1174 | /// At this point, it should be fairly straight-forward to see how this |

1175 | /// trie can be used in a simplistic way. At any given position in the |

1176 | /// text we're searching (called the "subject" string), all we need to do |

1177 | /// is follow the transitions in the trie by consuming one transition for |

1178 | /// each byte in the subject string. If we reach a match state, then we can |

1179 | /// report that location as a match. |

1180 | /// |

1181 | /// The trick comes when searching a subject string like 'abcef'. We'll |

1182 | /// initially follow the transition from S0 to S1 and wind up in S3 after |

    /// observing the 'c' byte. At this point, the next byte is 'e' but state

1184 | /// S3 has no transition for 'e', so the search fails. We then would need |

1185 | /// to restart the search at the next position in 'abcef', which |

1186 | /// corresponds to 'b'. The match would fail, but the next search starting |

1187 | /// at 'c' would finally succeed. The problem with this approach is that |

1188 | /// we wind up searching the subject string potentially many times. In |

1189 | /// effect, this makes the algorithm have worst case `O(n * m)` complexity, |

1190 | /// where `n ~ len(subject)` and `m ~ len(all patterns)`. We would instead |

1191 | /// like to achieve a `O(n + m)` worst case complexity. |

1192 | /// |

1193 | /// This is where failure transitions come in. Instead of dying at S3 in |

1194 | /// the first search, the automaton can instruct the search to move to |

1195 | /// another part of the automaton that corresponds to a suffix of what |

1196 | /// we've seen so far. Recall that we've seen 'abc' in the subject string, |

1197 | /// and the automaton does indeed have a non-empty suffix, 'c', that could |

1198 | /// potentially lead to another match. Thus, the actual Aho-Corasick |

1199 | /// automaton for our patterns in this case looks like this: |

1200 | /// |

1201 | /// ```ignore |

1202 | /// a - S1 - b - S2 - c - S3 - d - S4* |

1203 | /// / / |

1204 | /// / ---------------- |

1205 | /// / / |

1206 | /// S0 - c - S5 - e - S6 - f - S7* |

1207 | /// ``` |

1208 | /// |

1209 | /// That is, we have a failure transition from S3 to S5, which is followed |

1210 | /// exactly in cases when we are in state S3 but see any byte other than |

1211 | /// 'd' (that is, we've "failed" to find a match in this portion of our |

1212 | /// trie). We know we can transition back to S5 because we've already seen |

1213 | /// a 'c' byte, so we don't need to re-scan it. We can then pick back up |

1214 | /// with the search starting at S5 and complete our match. |

1215 | /// |

1216 | /// Adding failure transitions to a trie is fairly simple, but subtle. The |

    /// key issue is that you might have multiple failure transitions that you

1218 | /// need to follow. For example, look at the trie for the patterns |

1219 | /// 'abcd', 'b', 'bcd' and 'cd': |

1220 | /// |

1221 | /// ```ignore |

1222 | /// - a - S1 - b - S2* - c - S3 - d - S4* |

1223 | /// / / / |

1224 | /// / ------- ------- |

1225 | /// / / / |

1226 | /// S0 --- b - S5* - c - S6 - d - S7* |

1227 | /// \ / |

1228 | /// \ -------- |

1229 | /// \ / |

1230 | /// - c - S8 - d - S9* |

1231 | /// ``` |

1232 | /// |

1233 | /// The failure transitions for this trie are defined from S2 to S5, |

1234 | /// S3 to S6 and S6 to S8. Moreover, state S2 needs to track that it |

1235 | /// corresponds to a match, since its failure transition to S5 is itself |

1236 | /// a match state. |

1237 | /// |

    /// Perhaps the simplest way to think about adding these failure transitions

1239 | /// is recursively. That is, if you know the failure transitions for every |

1240 | /// possible previous state that could be visited (e.g., when computing the |

1241 | /// failure transition for S3, you already know the failure transitions |

1242 | /// for S0, S1 and S2), then you can simply follow the failure transition |

1243 | /// of the previous state and check whether the incoming transition is |

1244 | /// defined after following the failure transition. |

1245 | /// |

1246 | /// For example, when determining the failure state for S3, by our |

1247 | /// assumptions, we already know that there is a failure transition from |

1248 | /// S2 (the previous state) to S5. So we follow that transition and check |

1249 | /// whether the transition connecting S2 to S3 is defined. Indeed, it is, |

1250 | /// as there is a transition from S5 to S6 for the byte 'c'. If no such |

1251 | /// transition existed, we could keep following the failure transitions |

1252 | /// until we reach the start state, which is the failure transition for |

1253 | /// every state that has no corresponding proper suffix. |

1254 | /// |

1255 | /// We don't actually use recursion to implement this, but instead, use a |

1256 | /// breadth first search of the automaton. Our base case is the start |

1257 | /// state, whose failure transition is just a transition to itself. |

1258 | /// |

1259 | /// When building a leftmost automaton, we proceed as above, but only |

1260 | /// include a subset of failure transitions. Namely, we omit any failure |

1261 | /// transitions that appear after a match state in the trie. This is |

1262 | /// because failure transitions always point back to a proper suffix of |

1263 | /// what has been seen so far. Thus, following a failure transition after |

1264 | /// a match implies looking for a match that starts after the one that has |

1265 | /// already been seen, which is of course therefore not the leftmost match. |

1266 | /// |

1267 | /// N.B. I came up with this algorithm on my own, and after scouring all of |

1268 | /// the other AC implementations I know of (Perl, Snort, many on GitHub). |

1269 | /// I couldn't find any that implement leftmost semantics like this. |

1270 | /// Perl of course needs leftmost-first semantics, but they implement it |

1271 | /// with a seeming hack at *search* time instead of encoding it into the |

1272 | /// automaton. There are also a couple Java libraries that support leftmost |

1273 | /// longest semantics, but they do it by building a queue of matches at |

1274 | /// search time, which is even worse than what Perl is doing. ---AG |

    fn fill_failure_transitions(&mut self) -> Result<(), BuildError> {
        let is_leftmost = self.builder.match_kind.is_leftmost();
        let start_uid = self.nfa.special.start_unanchored_id;
        // Initialize the queue for breadth first search with all transitions
        // out of the start state. We handle the start state specially because
        // we only want to follow non-self transitions. If we followed self
        // transitions, then this would never terminate.
        let mut queue = VecDeque::new();
        let mut seen = self.queued_set();
        let mut prev_link = None;
        while let Some(link) = self.nfa.next_link(start_uid, prev_link) {
            prev_link = Some(link);
            let t = self.nfa.sparse[link];

            // Skip anything we've seen before and any self-transitions on the
            // start state.
            if start_uid == t.next() || seen.contains(t.next) {
                continue;
            }
            queue.push_back(t.next);
            seen.insert(t.next);
            // Under leftmost semantics, if a state immediately following
            // the start state is a match state, then we never want to
            // follow its failure transition since the failure transition
            // necessarily leads back to the start state, which we never
            // want to do for leftmost matching after a match has been
            // found.
            //
            // We apply the same logic to non-start states below as well.
            if is_leftmost && self.nfa.states[t.next].is_match() {
                self.nfa.states[t.next].fail = NFA::DEAD;
            }
        }
        // Now run the breadth first search proper. By the time a state is
        // popped here, the failure transitions of all states closer to the
        // start state have already been computed, which is exactly what the
        // failure computation below relies on.
        while let Some(id) = queue.pop_front() {
            let mut prev_link = None;
            while let Some(link) = self.nfa.next_link(id, prev_link) {
                prev_link = Some(link);
                let t = self.nfa.sparse[link];

                if seen.contains(t.next) {
                    // The only way to visit a duplicate state in a transition
                    // list is when ASCII case insensitivity is enabled. In
                    // this case, we want to skip it since it's redundant work.
                    // But it would also end up duplicating matches, which
                    // results in reporting duplicate matches in some cases.
                    // See the 'acasei010' regression test.
                    continue;
                }
                queue.push_back(t.next);
                seen.insert(t.next);

                // As above for start states, under leftmost semantics, once
                // we see a match all subsequent states should have no failure
                // transitions because failure transitions always imply looking
                // for a match that is a suffix of what has been seen so far
                // (where "seen so far" corresponds to the string formed by
                // following the transitions from the start state to the
                // current state). Under leftmost semantics, we specifically do
                // not want to allow this to happen because we always want to
                // report the match found at the leftmost position.
                //
                // The difference between leftmost-first and leftmost-longest
                // occurs previously while we build the trie. For
                // leftmost-first, we simply omit any entries that would
                // otherwise require passing through a match state.
                //
                // Note that for correctness, the failure transition has to be
                // set to the dead state for ALL states following a match, not
                // just the match state itself. However, by setting the failure
                // transition to the dead state on all match states, the dead
                // state will automatically propagate to all subsequent states
                // via the failure state computation below.
                if is_leftmost && self.nfa.states[t.next].is_match() {
                    self.nfa.states[t.next].fail = NFA::DEAD;
                    continue;
                }
                // Chase failure transitions of the predecessor until we find
                // one that defines a transition for this byte. This always
                // terminates because the start state defines transitions (or
                // self-loops) for every byte.
                let mut fail = self.nfa.states[id].fail;
                while self.nfa.follow_transition(fail, t.byte) == NFA::FAIL {
                    fail = self.nfa.states[fail].fail;
                }
                fail = self.nfa.follow_transition(fail, t.byte);
                self.nfa.states[t.next].fail = fail;
                self.nfa.copy_matches(fail, t.next)?;
            }
            // If the start state is a match state, then this automaton can
            // match the empty string. This implies all states are match states
            // since every position matches the empty string, so copy the
            // matches from the start state to every state. Strictly speaking,
            // this is only necessary for overlapping matches since each
            // non-empty non-start match state needs to report empty matches
            // in addition to its own. For the non-overlapping case, such
            // states only report the first match, which is never empty since
            // it isn't a start state.
            if !is_leftmost {
                self.nfa
                    .copy_matches(self.nfa.special.start_unanchored_id, id)?;
            }
        }
        Ok(())
    }

1375 | |

1376 | /// Shuffle the states so that they appear in this sequence: |

1377 | /// |

1378 | /// DEAD, FAIL, MATCH..., START, START, NON-MATCH... |

1379 | /// |

1380 | /// The idea here is that if we know how special states are laid out in our |

1381 | /// transition table, then we can determine what "kind" of state we're in |

1382 | /// just by comparing our current state ID with a particular value. In this |

1383 | /// way, we avoid doing extra memory lookups. |

1384 | /// |

1385 | /// Before shuffling begins, our states look something like this: |

1386 | /// |

1387 | /// DEAD, FAIL, START, START, (MATCH | NON-MATCH)... |

1388 | /// |

1389 | /// So all we need to do is move all of the MATCH states so that they |

1390 | /// all appear before any NON-MATCH state, like so: |

1391 | /// |

1392 | /// DEAD, FAIL, START, START, MATCH... NON-MATCH... |

1393 | /// |

1394 | /// Then it's just a simple matter of swapping the two START states with |

1395 | /// the last two MATCH states. |

1396 | /// |

1397 | /// (This is the same technique used for fully compiled DFAs in |

1398 | /// regex-automata.) |

fn shuffle(&mut self) {
    let old_start_uid = self.nfa.special.start_unanchored_id;
    let old_start_aid = self.nfa.special.start_anchored_id;
    assert!(old_start_uid < old_start_aid);
    assert_eq!(
        3,
        old_start_aid.as_usize(),
        "anchored start state should be at index 3"
    );
    // We implement shuffling by a sequence of pairwise swaps of states.
    // Since we have a number of things referencing states via their
    // IDs and swapping them changes their IDs, we need to record every
    // swap we make so that we can remap IDs. The remapper handles this
    // book-keeping for us.
    let mut remapper = Remapper::new(&self.nfa, 0);
    // The way we proceed here is by moving all match states so that
    // they directly follow the start states. So it will go: DEAD, FAIL,
    // START-UNANCHORED, START-ANCHORED, MATCH, ..., NON-MATCH, ...
    //
    // To do that, we proceed forward through all states after
    // START-ANCHORED and swap match states so that they appear before all
    // non-match states.
    //
    // 'next_avail' starts at 4 because IDs 0..=3 are occupied by DEAD,
    // FAIL and the two start states (the anchored start being at index 3
    // is asserted above).
    let mut next_avail = StateID::from(4u8);
    for i in next_avail.as_usize()..self.nfa.states.len() {
        let sid = StateID::new(i).unwrap();
        if !self.nfa.states[sid].is_match() {
            continue;
        }
        remapper.swap(&mut self.nfa, sid, next_avail);
        // The key invariant here is that only non-match states exist
        // between 'next_avail' and 'sid' (with them being potentially
        // equivalent). Thus, incrementing 'next_avail' by 1 is guaranteed
        // to land on the leftmost non-match state. (Unless 'next_avail'
        // and 'sid' are equivalent, in which case, a swap will occur but
        // it is a no-op.)
        next_avail = StateID::new(next_avail.one_more()).unwrap();
    }
    // Now we'd like to move the start states to immediately following the
    // match states. (The start states may themselves be match states, but
    // we'll handle that later.) We arrange the states this way so that we
    // don't necessarily need to check whether a state is a start state or
    // not before checking whether a state is a match state. For example,
    // we'd like to be able to write this as our state machine loop:
    //
    //   sid = start()
    //   for byte in haystack:
    //     sid = next(sid, byte)
    //     if sid <= nfa.max_start_id:
    //       if sid <= nfa.max_dead_id:
    //         # search complete
    //       elif sid <= nfa.max_match_id:
    //         # found match
    //
    // The important context here is that we might not want to look for
    // start states at all. Namely, if a searcher doesn't have a prefilter,
    // then there is no reason to care about whether we're in a start state
    // or not. And indeed, if we did check for it, this very hot loop would
    // ping pong between the special state handling and the main state
    // transition logic. This in turn stalls the CPU by killing branch
    // prediction.
    //
    // So essentially, we really want to be able to "forget" that start
    // states even exist and this is why we put them at the end.
    //
    // At this point, 'next_avail' is one past the last match state, so the
    // last two slots of the "match" region become the new start states.
    let new_start_aid =
        StateID::new(next_avail.as_usize().checked_sub(1).unwrap())
            .unwrap();
    remapper.swap(&mut self.nfa, old_start_aid, new_start_aid);
    let new_start_uid =
        StateID::new(next_avail.as_usize().checked_sub(2).unwrap())
            .unwrap();
    remapper.swap(&mut self.nfa, old_start_uid, new_start_uid);
    let new_max_match_id =
        StateID::new(next_avail.as_usize().checked_sub(3).unwrap())
            .unwrap();
    self.nfa.special.max_match_id = new_max_match_id;
    self.nfa.special.start_unanchored_id = new_start_uid;
    self.nfa.special.start_anchored_id = new_start_aid;
    // If one start state is a match state, then they both are.
    if self.nfa.states[self.nfa.special.start_anchored_id].is_match() {
        self.nfa.special.max_match_id = self.nfa.special.start_anchored_id;
    }
    // Apply all of the recorded swaps so that every state ID stored
    // anywhere in the NFA is rewritten to its post-shuffle value.
    remapper.remap(&mut self.nfa);
}

1482 | |

1483 | /// Attempts to convert the transition representation of a subset of states |

1484 | /// in this NFA from sparse to dense. This can greatly improve search |

1485 | /// performance since states with a higher number of transitions tend to |

1486 | /// correlate with very active states. |

1487 | /// |

1488 | /// We generally only densify states that are close to the start state. |

1489 | /// These tend to be the most active states and thus benefit from a dense |

1490 | /// representation more than other states. |

1491 | /// |

1492 | /// This tends to best balance between memory usage and performance. In |

1493 | /// particular, the *vast majority* of all states in a typical Aho-Corasick |

1494 | /// automaton have only 1 transition and are usually farther from the start |

1495 | /// state and thus don't get densified. |

1496 | /// |

1497 | /// Note that this doesn't remove the sparse representation of transitions |

1498 | /// for states that are densified. It could be done, but actually removing |

1499 | /// entries from `NFA::sparse` is likely more expensive than it's worth. |

1500 | fn densify(&mut self) -> Result<(), BuildError> { |

1501 | for i in 0..self.nfa.states.len() { |

1502 | let sid = StateID::new(i).unwrap(); |

1503 | // Don't bother densifying states that are only used as sentinels. |

1504 | if sid == NFA::DEAD || sid == NFA::FAIL { |

1505 | continue; |

1506 | } |

1507 | // Only densify states that are "close enough" to the start state. |

1508 | if self.nfa.states[sid].depth.as_usize() |

1509 | >= self.builder.dense_depth |

1510 | { |

1511 | continue; |

1512 | } |

1513 | let dense = self.nfa.alloc_dense_state()?; |

1514 | let mut prev_link = None; |

1515 | while let Some(link) = self.nfa.next_link(sid, prev_link) { |

1516 | prev_link = Some(link); |

1517 | let t = self.nfa.sparse[link]; |

1518 | |

1519 | let class = usize::from(self.nfa.byte_classes.get(t.byte)); |

1520 | let index = dense.as_usize() + class; |

1521 | self.nfa.dense[index] = t.next; |

1522 | } |

1523 | self.nfa.states[sid].dense = dense; |

1524 | } |

1525 | Ok(()) |

1526 | } |

1527 | |

1528 | /// Returns a set that tracked queued states. |

1529 | /// |

1530 | /// This is only necessary when ASCII case insensitivity is enabled, since |

1531 | /// it is the only way to visit the same state twice. Otherwise, this |

1532 | /// returns an inert set that nevers adds anything and always reports |

1533 | /// `false` for every member test. |

1534 | fn queued_set(&self) -> QueuedSet { |

1535 | if self.builder.ascii_case_insensitive { |

1536 | QueuedSet::active() |

1537 | } else { |

1538 | QueuedSet::inert() |

1539 | } |

1540 | } |

1541 | |

1542 | /// Initializes the unanchored start state by making it dense. This is |

1543 | /// achieved by explicitly setting every transition to the FAIL state. |

1544 | /// This isn't necessary for correctness, since any missing transition is |

1545 | /// automatically assumed to be mapped to the FAIL state. We do this to |

1546 | /// make the unanchored starting state dense, and thus in turn make |

1547 | /// transition lookups on it faster. (Which is worth doing because it's |

1548 | /// the most active state.) |

1549 | fn init_unanchored_start_state(&mut self) -> Result<(), BuildError> { |

1550 | let start_uid = self.nfa.special.start_unanchored_id; |

1551 | let start_aid = self.nfa.special.start_anchored_id; |

1552 | self.nfa.init_full_state(start_uid, NFA::FAIL)?; |

1553 | self.nfa.init_full_state(start_aid, NFA::FAIL)?; |

1554 | Ok(()) |

1555 | } |

1556 | |

/// Setup the anchored start state by copying all of the transitions and
/// matches from the unanchored starting state with one change: the failure
/// transition is changed to the DEAD state, so that for any undefined
/// transitions, the search will stop.
fn set_anchored_start_state(&mut self) -> Result<(), BuildError> {
    let start_uid = self.nfa.special.start_unanchored_id;
    let start_aid = self.nfa.special.start_anchored_id;
    // Walk both sparse transition lists in lock-step, copying each
    // unanchored transition target onto the corresponding anchored one.
    let (mut uprev_link, mut aprev_link) = (None, None);
    loop {
        let unext = self.nfa.next_link(start_uid, uprev_link);
        let anext = self.nfa.next_link(start_aid, aprev_link);
        let (ulink, alink) = match (unext, anext) {
            (Some(ulink), Some(alink)) => (ulink, alink),
            (None, None) => break,
            // Both start states were initialized as full states (one
            // transition per equivalence class), so their transition
            // lists always have the same length.
            _ => unreachable!(),
        };
        uprev_link = Some(ulink);
        aprev_link = Some(alink);
        self.nfa.sparse[alink].next = self.nfa.sparse[ulink].next;
    }
    self.nfa.copy_matches(start_uid, start_aid)?;
    // This is the main difference between the unanchored and anchored
    // starting states. If a lookup on an anchored starting state fails,
    // then the search should stop.
    //
    // N.B. This assumes that the loop on the unanchored starting state
    // hasn't been created yet.
    self.nfa.states[start_aid].fail = NFA::DEAD;
    Ok(())
}

1587 | |

1588 | /// Set the failure transitions on the start state to loop back to the |

1589 | /// start state. This effectively permits the Aho-Corasick automaton to |

1590 | /// match at any position. This is also required for finding the next |

1591 | /// state to terminate, namely, finding the next state should never return |

1592 | /// a fail_id. |

1593 | /// |

1594 | /// This must be done after building the initial trie, since trie |

1595 | /// construction depends on transitions to `fail_id` to determine whether a |

1596 | /// state already exists or not. |

1597 | fn add_unanchored_start_state_loop(&mut self) { |

1598 | let start_uid = self.nfa.special.start_unanchored_id; |

1599 | let mut prev_link = None; |

1600 | while let Some(link) = self.nfa.next_link(start_uid, prev_link) { |

1601 | prev_link = Some(link); |

1602 | if self.nfa.sparse[link].next() == NFA::FAIL { |

1603 | self.nfa.sparse[link].next = start_uid; |

1604 | } |

1605 | } |

1606 | } |

1607 | |

/// Remove the start state loop by rewriting any transitions on the start
/// state back to the start state with transitions to the dead state.
///
/// The loop is only closed when two conditions are met: the start state
/// is a match state and the match kind is leftmost-first or
/// leftmost-longest.
///
/// The reason for this is that under leftmost semantics, a start state
/// that is also a match implies that we should never restart the search
/// process. We allow normal transitions out of the start state, but if
/// none exist, we transition to the dead state, which signals that
/// searching should stop.
fn close_start_state_loop_for_leftmost(&mut self) {
    let start_uid = self.nfa.special.start_unanchored_id;
    let start = &mut self.nfa.states[start_uid];
    let dense = start.dense;
    if self.builder.match_kind.is_leftmost() && start.is_match() {
        let mut prev_link = None;
        while let Some(link) = self.nfa.next_link(start_uid, prev_link) {
            prev_link = Some(link);
            if self.nfa.sparse[link].next() == start_uid {
                self.nfa.sparse[link].next = NFA::DEAD;
                // If this state was densified, the dense copy of its
                // transitions must be rewritten too so that it stays in
                // sync with the sparse representation changed above.
                if dense != StateID::ZERO {
                    let b = self.nfa.sparse[link].byte;
                    let class = usize::from(self.nfa.byte_classes.get(b));
                    self.nfa.dense[dense.as_usize() + class] = NFA::DEAD;
                }
            }
        }
    }
}

1639 | |

1640 | /// Sets all transitions on the dead state to point back to the dead state. |

1641 | /// Normally, missing transitions map back to the failure state, but the |

1642 | /// point of the dead state is to act as a sink that can never be escaped. |

1643 | fn add_dead_state_loop(&mut self) -> Result<(), BuildError> { |

1644 | self.nfa.init_full_state(NFA::DEAD, NFA::DEAD)?; |

1645 | Ok(()) |

1646 | } |

1647 | } |

1648 | |

/// A set of state identifiers used to avoid revisiting the same state multiple
/// times when filling in failure transitions.
///
/// This set has an "inert" and an "active" mode. When inert, the set never
/// stores anything and always returns `false` for every member test. This is
/// useful to avoid the performance and memory overhead of maintaining this
/// set when it is not needed.
#[derive(Debug)]
struct QueuedSet {
    // `None` means the set is inert (insertions are dropped, membership
    // tests are always false); `Some` holds the actual membership set.
    set: Option<BTreeSet<StateID>>,
}

1660 | |

1661 | impl QueuedSet { |

1662 | /// Return an inert set that returns `false` for every state ID membership |

1663 | /// test. |

1664 | fn inert() -> QueuedSet { |

1665 | QueuedSet { set: None } |

1666 | } |

1667 | |

1668 | /// Return an active set that tracks state ID membership. |

1669 | fn active() -> QueuedSet { |

1670 | QueuedSet { set: Some(BTreeSet::new()) } |

1671 | } |

1672 | |

1673 | /// Inserts the given state ID into this set. (If the set is inert, then |

1674 | /// this is a no-op.) |

1675 | fn insert(&mut self, state_id: StateID) { |

1676 | if let Some(ref mut set) = self.set { |

1677 | set.insert(state_id); |

1678 | } |

1679 | } |

1680 | |

1681 | /// Returns true if and only if the given state ID is in this set. If the |

1682 | /// set is inert, this always returns false. |

1683 | fn contains(&self, state_id: StateID) -> bool { |

1684 | match self.set { |

1685 | None => false, |

1686 | Some(ref set) => set.contains(&state_id), |

1687 | } |

1688 | } |

1689 | } |

1690 | |

// Renders a human-readable dump of the NFA: one line per state showing its
// ID, failure transition and (range-compressed) byte transitions, followed
// by summary lines (match kind, prefilter presence, sizes, memory usage).
impl core::fmt::Debug for NFA {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        use crate::{
            automaton::{fmt_state_indicator, sparse_transitions},
            util::debug::DebugByte,
        };

        writeln!(f, "noncontiguous::NFA(")?;
        for (sid, state) in self.states.iter().with_state_ids() {
            // The FAIL state doesn't actually have space for a state allocated
            // for it, so we have to treat it as a special case.
            if sid == NFA::FAIL {
                writeln!(f, "F {:06}:", sid.as_usize())?;
                continue;
            }
            fmt_state_indicator(f, self, sid)?;
            // Print "state-id(fail-transition-id): " as the line header.
            write!(
                f,
                "{:06}({:06}): ",
                sid.as_usize(),
                state.fail.as_usize()
            )?;

            // Compress runs of consecutive bytes mapping to the same next
            // state into "start-end => next" ranges for readability.
            let it = sparse_transitions(
                self.iter_trans(sid).map(|t| (t.byte, t.next)),
            )
            .enumerate();
            for (i, (start, end, sid)) in it {
                if i > 0 {
                    write!(f, ", ")?;
                }
                if start == end {
                    write!(
                        f,
                        "{:?} => {:?}",
                        DebugByte(start),
                        sid.as_usize()
                    )?;
                } else {
                    write!(
                        f,
                        "{:?}-{:?} => {:?}",
                        DebugByte(start),
                        DebugByte(end),
                        sid.as_usize()
                    )?;
                }
            }

            write!(f, " \n")?;
            if self.is_match(sid) {
                // For match states, list the pattern IDs that match here.
                write!(f, " matches: ")?;
                for (i, pid) in self.iter_matches(sid).enumerate() {
                    if i > 0 {
                        write!(f, ", ")?;
                    }
                    write!(f, "{}", pid.as_usize())?;
                }
                write!(f, " \n")?;
            }
        }
        writeln!(f, "match kind: {:?}", self.match_kind)?;
        writeln!(f, "prefilter: {:?}", self.prefilter.is_some())?;
        writeln!(f, "state length: {:?}", self.states.len())?;
        writeln!(f, "pattern length: {:?}", self.patterns_len())?;
        writeln!(f, "shortest pattern length: {:?}", self.min_pattern_len)?;
        writeln!(f, "longest pattern length: {:?}", self.max_pattern_len)?;
        writeln!(f, "memory usage: {:?}", self.memory_usage())?;
        writeln!(f, ")")?;
        Ok(())
    }
}

1763 |