1 | use core::mem; |
---|---|

2 | |

3 | use alloc::{vec, vec::Vec}; |

4 | |

5 | use crate::{ |

6 | nfa::thompson::{self, compiler::ThompsonRef, BuildError, Builder}, |

7 | util::primitives::{IteratorIndexExt, StateID}, |

8 | }; |

9 | |

10 | /// A trie that preserves leftmost-first match semantics. |

11 | /// |

12 | /// This is a purpose-built data structure for optimizing 'lit1|lit2|..|litN' |

13 | /// patterns. It can *only* handle alternations of literals, which makes it |

14 | /// somewhat restricted in its scope, but literal alternations are fairly |

15 | /// common. |

16 | /// |

17 | /// At a 5,000 foot level, the main idea of this trie is make an alternation of |

18 | /// literals look more like a DFA than an NFA via epsilon removal. |

19 | /// |

20 | /// More precisely, the main issue is in how alternations are compiled into |

21 | /// a Thompson NFA. Namely, each alternation gets a single NFA "union" state |

22 | /// with an epsilon transition for every branch of the alternation pointing to |

23 | /// an NFA state corresponding to the start of that branch. The main problem |

24 | /// with this representation is the cost of computing an epsilon closure. Once |

25 | /// you hit the alternation's start state, it acts as a sort of "clog" that |

26 | /// requires you to traverse all of the epsilon transitions to compute the full |

27 | /// closure. |

28 | /// |

29 | /// While fixing such clogs in the general case is pretty tricky without going |

30 | /// to a DFA (or perhaps a Glushkov NFA, but that comes with other problems). |

31 | /// But at least in the case of an alternation of literals, we can convert |

32 | /// that to a prefix trie without too much cost. In theory, that's all you |

33 | /// really need to do: build the trie and then compile it to a Thompson NFA. |

34 | /// For example, if you have the pattern 'bar|baz|foo', then using a trie, it |

35 | /// is transformed to something like 'b(a(r|z))|f'. This reduces the clog by |

36 | /// reducing the number of epsilon transitions out of the alternation's start |

37 | /// state from 3 to 2 (it actually gets down to 1 when you use a sparse state, |

38 | /// which we do below). It's a small effect here, but when your alternation is |

39 | /// huge, the savings is also huge. |

40 | /// |

41 | /// And that is... essentially what a LiteralTrie does. But there is one |

42 | /// hiccup. Consider a regex like 'sam|samwise'. How does a prefix trie compile |

43 | /// that when leftmost-first semantics are used? If 'sam|samwise' was the |

44 | /// entire regex, then you could just drop the 'samwise' branch entirely since |

45 | /// it is impossible to match ('sam' will always take priority, and since it |

46 | /// is a prefix of 'samwise', 'samwise' will never match). But what about the |

47 | /// regex '\b(sam|samwise)\b'? In that case, you can't remove 'samwise' because |

48 | /// it might match when 'sam' doesn't fall on a word boundary. |

49 | /// |

50 | /// The main idea is that 'sam|samwise' can be translated to 'sam(?:|wise)', |

51 | /// which is a precisely equivalent regex that also gets rid of the clog. |

52 | /// |

53 | /// Another example is 'zapper|z|zap'. That gets translated to |

54 | /// 'z(?:apper||ap)'. |

55 | /// |

56 | /// We accomplish this by giving each state in the trie multiple "chunks" of |

57 | /// transitions. Each chunk barrier represents a match. The idea is that once |

58 | /// you know a match occurs, none of the transitions after the match can be |

59 | /// re-ordered and mixed in with the transitions before the match. Otherwise, |

60 | /// the match semantics could be changed. |

61 | /// |

62 | /// See the 'State' data type for a bit more detail. |

63 | /// |

64 | /// Future work: |

65 | /// |

66 | /// * In theory, it would be nice to generalize the idea of removing clogs and |

67 | /// apply it to the NFA graph itself. Then this could in theory work for |

68 | /// case insensitive alternations of literals, or even just alternations where |

69 | /// each branch starts with a non-epsilon transition. |

70 | /// * Could we instead use the Aho-Corasick algorithm here? The aho-corasick |

71 | /// crate deals with leftmost-first matches correctly, but I think this implies |

72 | /// encoding failure transitions into a Thompson NFA somehow. Which seems fine, |

73 | /// because failure transitions are just unconditional epsilon transitions? |

74 | /// * Or perhaps even better, could we use an aho_corasick::AhoCorasick |

75 | /// directly? At time of writing, 0.7 is the current version of the |

76 | /// aho-corasick crate, and that definitely cannot be used as-is. But if we |

77 | /// expose the underlying finite state machine API, then could we use it? That |

78 | /// would be super. If we could figure that out, it might also lend itself to |

79 | /// more general composition of finite state machines. |

80 | #[derive(Clone)] |

81 | pub(crate) struct LiteralTrie { |

82 | /// The set of trie states. Each state contains one or more chunks, where |

83 | /// each chunk is a sparse set of transitions to other states. A leaf state |

84 | /// is always a match state that contains only empty chunks (i.e., no |

85 | /// transitions). |

86 | states: Vec<State>, |

87 | /// Whether to add literals in reverse to the trie. Useful when building |

88 | /// a reverse NFA automaton. |

89 | rev: bool, |

90 | } |

91 | |

92 | impl LiteralTrie { |

93 | /// Create a new literal trie that adds literals in the forward direction. |

94 | pub(crate) fn forward() -> LiteralTrie { |

95 | let root = State::default(); |

96 | LiteralTrie { states: vec![root], rev: false } |

97 | } |

98 | |

99 | /// Create a new literal trie that adds literals in reverse. |

100 | pub(crate) fn reverse() -> LiteralTrie { |

101 | let root = State::default(); |

102 | LiteralTrie { states: vec![root], rev: true } |

103 | } |

104 | |

105 | /// Add the given literal to this trie. |

106 | /// |

107 | /// If the literal could not be added because the `StateID` space was |

108 | /// exhausted, then an error is returned. If an error returns, the trie |

109 | /// is in an unspecified state. |

110 | pub(crate) fn add(&mut self, bytes: &[u8]) -> Result<(), BuildError> { |

111 | let mut prev = StateID::ZERO; |

112 | let mut it = bytes.iter().copied(); |

113 | while let Some(b) = if self.rev { it.next_back() } else { it.next() } { |

114 | prev = self.get_or_add_state(prev, b)?; |

115 | } |

116 | self.states[prev].add_match(); |

117 | Ok(()) |

118 | } |

119 | |

120 | /// If the given transition is defined, then return the next state ID. |

121 | /// Otherwise, add the transition to `from` and point it to a new state. |

122 | /// |

123 | /// If a new state ID could not be allocated, then an error is returned. |

124 | fn get_or_add_state( |

125 | &mut self, |

126 | from: StateID, |

127 | byte: u8, |

128 | ) -> Result<StateID, BuildError> { |

129 | let active = self.states[from].active_chunk(); |

130 | match active.binary_search_by_key(&byte, |t| t.byte) { |

131 | Ok(i) => Ok(active[i].next), |

132 | Err(i) => { |

133 | // Add a new state and get its ID. |

134 | let next = StateID::new(self.states.len()).map_err(|_| { |

135 | BuildError::too_many_states(self.states.len()) |

136 | })?; |

137 | self.states.push(State::default()); |

138 | // Offset our position to account for all transitions and not |

139 | // just the ones in the active chunk. |

140 | let i = self.states[from].active_chunk_start() + i; |

141 | let t = Transition { byte, next }; |

142 | self.states[from].transitions.insert(i, t); |

143 | Ok(next) |

144 | } |

145 | } |

146 | } |

147 | |

148 | /// Compile this literal trie to the NFA builder given. |

149 | /// |

150 | /// This forwards any errors that may occur while using the given builder. |

151 | pub(crate) fn compile( |

152 | &self, |

153 | builder: &mut Builder, |

154 | ) -> Result<ThompsonRef, BuildError> { |

155 | // Compilation proceeds via depth-first traversal of the trie. |

156 | // |

157 | // This is overall pretty brutal. The recursive version of this is |

158 | // deliciously simple. (See 'compile_to_hir' below for what it might |

159 | // look like.) But recursion on a trie means your call stack grows |

160 | // in accordance with the longest literal, which just does not seem |

161 | // appropriate. So we push the call stack to the heap. But as a result, |

162 | // the trie traversal becomes pretty brutal because we essentially |

163 | // have to encode the state of a double for-loop into an explicit call |

164 | // frame. If someone can simplify this without using recursion, that'd |

165 | // be great. |

166 | |

167 | // 'end' is our match state for this trie, but represented in the the |

168 | // NFA. Any time we see a match in the trie, we insert a transition |

169 | // from the current state we're in to 'end'. |

170 | let end = builder.add_empty()?; |

171 | let mut stack = vec![]; |

172 | let mut f = Frame::new(&self.states[StateID::ZERO]); |

173 | loop { |

174 | if let Some(t) = f.transitions.next() { |

175 | if self.states[t.next].is_leaf() { |

176 | f.sparse.push(thompson::Transition { |

177 | start: t.byte, |

178 | end: t.byte, |

179 | next: end, |

180 | }); |

181 | } else { |

182 | f.sparse.push(thompson::Transition { |

183 | start: t.byte, |

184 | end: t.byte, |

185 | // This is a little funny, but when the frame we create |

186 | // below completes, it will pop this parent frame off |

187 | // and modify this transition to point to the correct |

188 | // state. |

189 | next: StateID::ZERO, |

190 | }); |

191 | stack.push(f); |

192 | f = Frame::new(&self.states[t.next]); |

193 | } |

194 | continue; |

195 | } |

196 | // At this point, we have visited all transitions in f.chunk, so |

197 | // add it as a sparse NFA state. Unless the chunk was empty, in |

198 | // which case, we don't do anything. |

199 | if !f.sparse.is_empty() { |

200 | let chunk_id = if f.sparse.len() == 1 { |

201 | builder.add_range(f.sparse.pop().unwrap())? |

202 | } else { |

203 | let sparse = mem::replace(&mut f.sparse, vec![]); |

204 | builder.add_sparse(sparse)? |

205 | }; |

206 | f.union.push(chunk_id); |

207 | } |

208 | // Now we need to look to see if there are other chunks to visit. |

209 | if let Some(chunk) = f.chunks.next() { |

210 | // If we're here, it means we're on the second (or greater) |

211 | // chunk, which implies there is a match at this point. So |

212 | // connect this state to the final end state. |

213 | f.union.push(end); |

214 | // Advance to the next chunk. |

215 | f.transitions = chunk.iter(); |

216 | continue; |

217 | } |

218 | // Now that we are out of chunks, we have completely visited |

219 | // this state. So turn our union of chunks into an NFA union |

220 | // state, and add that union state to the parent state's current |

221 | // sparse state. (If there is no parent, we're done.) |

222 | let start = builder.add_union(f.union)?; |

223 | match stack.pop() { |

224 | None => { |

225 | return Ok(ThompsonRef { start, end }); |

226 | } |

227 | Some(mut parent) => { |

228 | // OK because the only way a frame gets pushed on to the |

229 | // stack (aside from the root) is when a transition has |

230 | // been added to 'sparse'. |

231 | parent.sparse.last_mut().unwrap().next = start; |

232 | f = parent; |

233 | } |

234 | } |

235 | } |

236 | } |

237 | |

238 | /// Converts this trie to an equivalent HIR expression. |

239 | /// |

240 | /// We don't actually use this, but it's useful for tests. In particular, |

241 | /// it provides a (somewhat) human readable representation of the trie |

242 | /// itself. |

243 | #[cfg(test)] |

244 | fn compile_to_hir(&self) -> regex_syntax::hir::Hir { |

245 | self.compile_state_to_hir(StateID::ZERO) |

246 | } |

247 | |

248 | /// The recursive implementation of 'to_hir'. |

249 | /// |

250 | /// Notice how simple this is compared to 'compile' above. 'compile' could |

251 | /// be similarly simple, but we opt to not use recursion in order to avoid |

252 | /// overflowing the stack in the case of a longer literal. |

253 | #[cfg(test)] |

254 | fn compile_state_to_hir(&self, sid: StateID) -> regex_syntax::hir::Hir { |

255 | use regex_syntax::hir::Hir; |

256 | |

257 | let mut alt = vec![]; |

258 | for (i, chunk) in self.states[sid].chunks().enumerate() { |

259 | if i > 0 { |

260 | alt.push(Hir::empty()); |

261 | } |

262 | if chunk.is_empty() { |

263 | continue; |

264 | } |

265 | let mut chunk_alt = vec![]; |

266 | for t in chunk.iter() { |

267 | chunk_alt.push(Hir::concat(vec![ |

268 | Hir::literal(vec![t.byte]), |

269 | self.compile_state_to_hir(t.next), |

270 | ])); |

271 | } |

272 | alt.push(Hir::alternation(chunk_alt)); |

273 | } |

274 | Hir::alternation(alt) |

275 | } |

276 | } |

277 | |

278 | impl core::fmt::Debug for LiteralTrie { |

279 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |

280 | writeln!(f, "LiteralTrie(")?; |

281 | for (sid, state) in self.states.iter().with_state_ids() { |

282 | writeln!(f, "{:06?}: {:?}", sid.as_usize(), state)?; |

283 | } |

284 | writeln!(f, ")")?; |

285 | Ok(()) |

286 | } |

287 | } |

288 | |

289 | /// An explicit stack frame used for traversing the trie without using |

290 | /// recursion. |

291 | /// |

292 | /// Each frame is tied to the traversal of a single trie state. The frame is |

293 | /// dropped once the entire state (and all of its children) have been visited. |

294 | /// The "output" of compiling a state is the 'union' vector, which is turn |

295 | /// converted to a NFA union state. Each branch of the union corresponds to a |

296 | /// chunk in the trie state. |

297 | /// |

298 | /// 'sparse' corresponds to the set of transitions for a particular chunk in a |

299 | /// trie state. It is ultimately converted to an NFA sparse state. The 'sparse' |

300 | /// field, after being converted to a sparse NFA state, is reused for any |

301 | /// subsequent chunks in the trie state, if any exist. |

302 | #[derive(Debug)] |

303 | struct Frame<'a> { |

304 | /// The remaining chunks to visit for a trie state. |

305 | chunks: StateChunksIter<'a>, |

306 | /// The transitions of the current chunk that we're iterating over. Since |

307 | /// every trie state has at least one chunk, every frame is initialized |

308 | /// with the first chunk's transitions ready to be consumed. |

309 | transitions: core::slice::Iter<'a, Transition>, |

310 | /// The NFA state IDs pointing to the start of each chunk compiled by |

311 | /// this trie state. This ultimately gets converted to an NFA union once |

312 | /// the entire trie state (and all of its children) have been compiled. |

313 | /// The order of these matters for leftmost-first match semantics, since |

314 | /// earlier matches in the union are preferred over later ones. |

315 | union: Vec<StateID>, |

316 | /// The actual NFA transitions for a single chunk in a trie state. This |

317 | /// gets converted to an NFA sparse state, and its corresponding NFA state |

318 | /// ID should get added to 'union'. |

319 | sparse: Vec<thompson::Transition>, |

320 | } |

321 | |

322 | impl<'a> Frame<'a> { |

323 | /// Create a new stack frame for trie traversal. This initializes the |

324 | /// 'transitions' iterator to the transitions for the first chunk, with the |

325 | /// 'chunks' iterator being every chunk after the first one. |

326 | fn new(state: &'a State) -> Frame<'a> { |

327 | let mut chunks = state.chunks(); |

328 | // every state has at least 1 chunk |

329 | let chunk = chunks.next().unwrap(); |

330 | let transitions = chunk.iter(); |

331 | Frame { chunks, transitions, union: vec![], sparse: vec![] } |

332 | } |

333 | } |

334 | |

335 | /// A state in a trie. |

336 | /// |

337 | /// This uses a sparse representation. Since we don't use literal tries |

338 | /// for searching, and ultimately (and compilation requires visiting every |

339 | /// transition anyway), we use a sparse representation for transitions. This |

340 | /// means we save on memory, at the expense of 'LiteralTrie::add' being perhaps |

341 | /// a bit slower. |

342 | /// |

343 | /// While 'transitions' is pretty standard as far as tries goes, the 'chunks' |

344 | /// piece here is more unusual. In effect, 'chunks' defines a partitioning |

345 | /// of 'transitions', where each chunk corresponds to a distinct set of |

346 | /// transitions. The key invariant is that a transition in one chunk cannot |

347 | /// be moved to another chunk. This is the secret sauce that preserve |

348 | /// leftmost-first match semantics. |

349 | /// |

350 | /// A new chunk is added whenever we mark a state as a match state. Once a |

351 | /// new chunk is added, the old active chunk is frozen and is never mutated |

352 | /// again. The new chunk becomes the active chunk, which is defined as |

353 | /// '&transitions[chunks.last().map_or(0, |c| c.1)..]'. Thus, a state where |

354 | /// 'chunks' is empty actually contains one chunk. Thus, every state contains |

355 | /// at least one (possibly empty) chunk. |

356 | /// |

357 | /// A "leaf" state is a state that has no outgoing transitions (so |

358 | /// 'transitions' is empty). Note that there is no way for a leaf state to be a |

359 | /// non-matching state. (Although while building the trie, within 'add', a leaf |

360 | /// state may exist while not containing any matches. But this invariant is |

361 | /// only broken within 'add'. Once 'add' returns, the invariant is upheld.) |

362 | #[derive(Clone, Default)] |

363 | struct State { |

364 | transitions: Vec<Transition>, |

365 | chunks: Vec<(usize, usize)>, |

366 | } |

367 | |

368 | impl State { |

369 | /// Mark this state as a match state and freeze the active chunk such that |

370 | /// it can not be further mutated. |

371 | fn add_match(&mut self) { |

372 | // This is not strictly necessary, but there's no point in recording |

373 | // another match by adding another chunk if the state has no |

374 | // transitions. Note though that we only skip this if we already know |

375 | // this is a match state, which is only true if 'chunks' is not empty. |

376 | // Basically, if we didn't do this, nothing semantically would change, |

377 | // but we'd end up pushing another chunk and potentially triggering an |

378 | // alloc. |

379 | if self.transitions.is_empty() && !self.chunks.is_empty() { |

380 | return; |

381 | } |

382 | let chunk_start = self.active_chunk_start(); |

383 | let chunk_end = self.transitions.len(); |

384 | self.chunks.push((chunk_start, chunk_end)); |

385 | } |

386 | |

387 | /// Returns true if and only if this state is a leaf state. That is, a |

388 | /// state that has no outgoing transitions. |

389 | fn is_leaf(&self) -> bool { |

390 | self.transitions.is_empty() |

391 | } |

392 | |

393 | /// Returns an iterator over all of the chunks (including the currently |

394 | /// active chunk) in this state. Since the active chunk is included, the |

395 | /// iterator is guaranteed to always yield at least one chunk (although the |

396 | /// chunk may be empty). |

397 | fn chunks(&self) -> StateChunksIter<'_> { |

398 | StateChunksIter { |

399 | transitions: &*self.transitions, |

400 | chunks: self.chunks.iter(), |

401 | active: Some(self.active_chunk()), |

402 | } |

403 | } |

404 | |

405 | /// Returns the active chunk as a slice of transitions. |

406 | fn active_chunk(&self) -> &[Transition] { |

407 | let start = self.active_chunk_start(); |

408 | &self.transitions[start..] |

409 | } |

410 | |

411 | /// Returns the index into 'transitions' where the active chunk starts. |

412 | fn active_chunk_start(&self) -> usize { |

413 | self.chunks.last().map_or(0, |&(_, end)| end) |

414 | } |

415 | } |

416 | |

417 | impl core::fmt::Debug for State { |

418 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |

419 | let mut spacing = " "; |

420 | for (i, chunk) in self.chunks().enumerate() { |

421 | if i > 0 { |

422 | write!(f, "{}MATCH", spacing)?; |

423 | } |

424 | spacing = ""; |

425 | for (j, t) in chunk.iter().enumerate() { |

426 | spacing = " "; |

427 | if j == 0 && i > 0 { |

428 | write!(f, " ")?; |

429 | } else if j > 0 { |

430 | write!(f, ", ")?; |

431 | } |

432 | write!(f, "{:?}", t)?; |

433 | } |

434 | } |

435 | Ok(()) |

436 | } |

437 | } |

438 | |

439 | /// An iterator over all of the chunks in a state, including the active chunk. |

440 | /// |

441 | /// This iterator is created by `State::chunks`. We name this iterator so that |

442 | /// we can include it in the `Frame` type for non-recursive trie traversal. |

443 | #[derive(Debug)] |

444 | struct StateChunksIter<'a> { |

445 | transitions: &'a [Transition], |

446 | chunks: core::slice::Iter<'a, (usize, usize)>, |

447 | active: Option<&'a [Transition]>, |

448 | } |

449 | |

450 | impl<'a> Iterator for StateChunksIter<'a> { |

451 | type Item = &'a [Transition]; |

452 | |

453 | fn next(&mut self) -> Option<&'a [Transition]> { |

454 | if let Some(&(start, end)) = self.chunks.next() { |

455 | return Some(&self.transitions[start..end]); |

456 | } |

457 | if let Some(chunk) = self.active.take() { |

458 | return Some(chunk); |

459 | } |

460 | None |

461 | } |

462 | } |

463 | |

464 | /// A single transition in a trie to another state. |

465 | #[derive(Clone, Copy)] |

466 | struct Transition { |

467 | byte: u8, |

468 | next: StateID, |

469 | } |

470 | |

471 | impl core::fmt::Debug for Transition { |

472 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |

473 | write!( |

474 | f, |

475 | "{:?} => {}", |

476 | crate::util::escape::DebugByte(self.byte), |

477 | self.next.as_usize() |

478 | ) |

479 | } |

480 | } |

481 | |

482 | #[cfg(test)] |

483 | mod tests { |

484 | use bstr::B; |

485 | use regex_syntax::hir::Hir; |

486 | |

487 | use super::*; |

488 | |

489 | #[test] |

490 | fn zap() { |

491 | let mut trie = LiteralTrie::forward(); |

492 | trie.add(b"zapper").unwrap(); |

493 | trie.add(b"z").unwrap(); |

494 | trie.add(b"zap").unwrap(); |

495 | |

496 | let got = trie.compile_to_hir(); |

497 | let expected = Hir::concat(vec![ |

498 | Hir::literal(B("z")), |

499 | Hir::alternation(vec![ |

500 | Hir::literal(B("apper")), |

501 | Hir::empty(), |

502 | Hir::literal(B("ap")), |

503 | ]), |

504 | ]); |

505 | assert_eq!(expected, got); |

506 | } |

507 | |

508 | #[test] |

509 | fn maker() { |

510 | let mut trie = LiteralTrie::forward(); |

511 | trie.add(b"make").unwrap(); |

512 | trie.add(b"maple").unwrap(); |

513 | trie.add(b"maker").unwrap(); |

514 | |

515 | let got = trie.compile_to_hir(); |

516 | let expected = Hir::concat(vec![ |

517 | Hir::literal(B("ma")), |

518 | Hir::alternation(vec![ |

519 | Hir::concat(vec![ |

520 | Hir::literal(B("ke")), |

521 | Hir::alternation(vec![Hir::empty(), Hir::literal(B("r"))]), |

522 | ]), |

523 | Hir::literal(B("ple")), |

524 | ]), |

525 | ]); |

526 | assert_eq!(expected, got); |

527 | } |

528 | } |

529 |