/*!
This module contains types and routines for implementing determinization.

In this crate, there are at least two places where we implement
determinization: fully ahead-of-time compiled DFAs in the `dfa` module and
lazily compiled DFAs in the `hybrid` module. The stuff in this module
corresponds to the things that are in common between these implementations.

There are three broad things that our implementations of determinization have
in common, as defined by this module:

* The classification of start states. That is, whether we're dealing with
word boundaries, line boundaries, etc., is all the same. This also includes
the look-behind assertions that are satisfied by each starting state
classification.
* The representation of DFA states as sets of NFA states, including
convenience types for building these DFA states that are amenable to reusing
allocations.
* Routines for the "classical" parts of determinization: computing the
epsilon closure, tracking match states (with corresponding pattern IDs, since
we support multi-pattern finite automata) and, of course, computing the
transition function between states for units of input.

I did consider a couple of alternatives to this particular form of code reuse:

1. Don't do any code reuse. The problem here is that we *really* want both
forms of determinization to do exactly identical things when it comes to
their handling of NFA states. While our tests generally ensure this, the code
is tricky and large enough where not reusing code is a pretty big bummer.

2. Implement all of determinization once and make it generic over fully
compiled DFAs and lazily compiled DFAs. While I didn't actually try this
approach, my instinct is that it would be more complex than is needed here.
And the interface required would be pretty hairy. Instead, I think splitting
it into logical sub-components works better.
*/

37 | |

38 | use alloc::vec::Vec; |

39 | |

40 | pub(crate) use self::state::{ |

41 | State, StateBuilderEmpty, StateBuilderMatches, StateBuilderNFA, |

42 | }; |

43 | |

44 | use crate::{ |

45 | nfa::thompson, |

46 | util::{ |

47 | alphabet, |

48 | look::{Look, LookSet}, |

49 | primitives::StateID, |

50 | search::MatchKind, |

51 | sparse_set::{SparseSet, SparseSets}, |

52 | start::Start, |

53 | utf8, |

54 | }, |

55 | }; |

56 | |

57 | mod state; |

58 | |

59 | /// Compute the set of all reachable NFA states, including the full epsilon |

60 | /// closure, from a DFA state for a single unit of input. The set of reachable |

61 | /// states is returned as a `StateBuilderNFA`. The `StateBuilderNFA` returned |

62 | /// also includes any look-behind assertions satisfied by `unit`, in addition |

63 | /// to whether it is a match state. For multi-pattern DFAs, the builder will |

64 | /// also include the pattern IDs that match (in the order seen). |

65 | /// |

66 | /// `nfa` must be able to resolve any NFA state in `state` and any NFA state |

67 | /// reachable via the epsilon closure of any NFA state in `state`. `sparses` |

68 | /// must have capacity equivalent to `nfa.len()`. |

69 | /// |

70 | /// `match_kind` should correspond to the match semantics implemented by the |

71 | /// DFA being built. Generally speaking, for leftmost-first match semantics, |

72 | /// states that appear after the first NFA match state will not be included in |

73 | /// the `StateBuilderNFA` returned since they are impossible to visit. |

74 | /// |

75 | /// `sparses` is used as scratch space for NFA traversal. Other than their |

76 | /// capacity requirements (detailed above), there are no requirements on what's |

77 | /// contained within them (if anything). Similarly, what's inside of them once |

78 | /// this routine returns is unspecified. |

79 | /// |

80 | /// `stack` must have length 0. It is used as scratch space for depth first |

81 | /// traversal. After returning, it is guaranteed that `stack` will have length |

82 | /// 0. |

83 | /// |

84 | /// `state` corresponds to the current DFA state on which one wants to compute |

85 | /// the transition for the input `unit`. |

86 | /// |

87 | /// `empty_builder` corresponds to the builder allocation to use to produce a |

88 | /// complete `StateBuilderNFA` state. If the state is not needed (or is already |

89 | /// cached), then it can be cleared and reused without needing to create a new |

90 | /// `State`. The `StateBuilderNFA` state returned is final and ready to be |

91 | /// turned into a `State` if necessary. |

92 | pub(crate) fn next( |

93 | nfa: &thompson::NFA, |

94 | match_kind: MatchKind, |

95 | sparses: &mut SparseSets, |

96 | stack: &mut Vec<StateID>, |

97 | state: &State, |

98 | unit: alphabet::Unit, |

99 | empty_builder: StateBuilderEmpty, |

100 | ) -> StateBuilderNFA { |

101 | sparses.clear(); |

102 | |

103 | // Whether the NFA is matched in reverse or not. We use this in some |

104 | // conditional logic for dealing with the exceptionally annoying CRLF-aware |

105 | // line anchors. |

106 | let rev = nfa.is_reverse(); |

107 | // The look-around matcher that our NFA is configured with. We don't |

108 | // actually use it to match look-around assertions, but we do need its |

109 | // configuration for constructing states consistent with how it matches. |

110 | let lookm = nfa.look_matcher(); |

111 | |

112 | // Put the NFA state IDs into a sparse set in case we need to |

113 | // re-compute their epsilon closure. |

114 | // |

115 | // Doing this state shuffling is technically not necessary unless some |

116 | // kind of look-around is used in the DFA. Some ad hoc experiments |

117 | // suggested that avoiding this didn't lead to much of an improvement, |

118 | // but perhaps more rigorous experimentation should be done. And in |

119 | // particular, avoiding this check requires some light refactoring of |

120 | // the code below. |

121 | state.iter_nfa_state_ids(|nfa_id| { |

122 | sparses.set1.insert(nfa_id); |

123 | }); |

124 | |

125 | // Compute look-ahead assertions originating from the current state. Based |

126 | // on the input unit we're transitioning over, some additional set of |

127 | // assertions may be true. Thus, we re-compute this state's epsilon closure |

128 | // (but only if necessary). Notably, when we build a DFA state initially, |

129 | // we don't enable any look-ahead assertions because we don't know whether |

130 | // they're true or not at that point. |

131 | if !state.look_need().is_empty() { |

132 | // Add look-ahead assertions that are now true based on the current |

133 | // input unit. |

134 | let mut look_have = state.look_have().clone(); |

135 | match unit.as_u8() { |

136 | Some(b' \r') => { |

137 | if !rev || !state.is_half_crlf() { |

138 | look_have = look_have.insert(Look::EndCRLF); |

139 | } |

140 | } |

141 | Some(b' \n') => { |

142 | if rev || !state.is_half_crlf() { |

143 | look_have = look_have.insert(Look::EndCRLF); |

144 | } |

145 | } |

146 | Some(_) => {} |

147 | None => { |

148 | look_have = look_have |

149 | .insert(Look::End) |

150 | .insert(Look::EndLF) |

151 | .insert(Look::EndCRLF); |

152 | } |

153 | } |

154 | if unit.is_byte(lookm.get_line_terminator()) { |

155 | look_have = look_have.insert(Look::EndLF); |

156 | } |

157 | if state.is_half_crlf() |

158 | && ((rev && !unit.is_byte(b' \r')) |

159 | || (!rev && !unit.is_byte(b' \n'))) |

160 | { |

161 | look_have = look_have.insert(Look::StartCRLF); |

162 | } |

163 | if state.is_from_word() == unit.is_word_byte() { |

164 | look_have = look_have |

165 | .insert(Look::WordAsciiNegate) |

166 | .insert(Look::WordUnicodeNegate); |

167 | } else { |

168 | look_have = |

169 | look_have.insert(Look::WordAscii).insert(Look::WordUnicode); |

170 | } |

171 | if !unit.is_word_byte() { |

172 | look_have = look_have |

173 | .insert(Look::WordEndHalfAscii) |

174 | .insert(Look::WordEndHalfUnicode); |

175 | } |

176 | if state.is_from_word() && !unit.is_word_byte() { |

177 | look_have = look_have |

178 | .insert(Look::WordEndAscii) |

179 | .insert(Look::WordEndUnicode); |

180 | } else if !state.is_from_word() && unit.is_word_byte() { |

181 | look_have = look_have |

182 | .insert(Look::WordStartAscii) |

183 | .insert(Look::WordStartUnicode); |

184 | } |

185 | // If we have new assertions satisfied that are among the set of |

186 | // assertions that exist in this state (that is, just because we added |

187 | // an EndLF assertion above doesn't mean there is an EndLF conditional |

188 | // epsilon transition in this state), then we re-compute this state's |

189 | // epsilon closure using the updated set of assertions. |

190 | // |

191 | // Note that since our DFA states omit unconditional epsilon |

192 | // transitions, this check is necessary for correctness. If we re-did |

193 | // the epsilon closure below needlessly, it could change based on the |

194 | // fact that we omitted epsilon states originally. |

195 | if !look_have |

196 | .subtract(state.look_have()) |

197 | .intersect(state.look_need()) |

198 | .is_empty() |

199 | { |

200 | for nfa_id in sparses.set1.iter() { |

201 | epsilon_closure( |

202 | nfa, |

203 | nfa_id, |

204 | look_have, |

205 | stack, |

206 | &mut sparses.set2, |

207 | ); |

208 | } |

209 | sparses.swap(); |

210 | sparses.set2.clear(); |

211 | } |

212 | } |

213 | |

214 | // Convert our empty builder into one that can record assertions and match |

215 | // pattern IDs. |

216 | let mut builder = empty_builder.into_matches(); |

217 | // Set whether the StartLF look-behind assertion is true for this |

218 | // transition or not. The look-behind assertion for ASCII word boundaries |

219 | // is handled below. |

220 | if nfa.look_set_any().contains_anchor_line() |

221 | && unit.is_byte(lookm.get_line_terminator()) |

222 | { |

223 | // Why only handle StartLF here and not Start? That's because Start |

224 | // can only impact the starting state, which is special cased in |

225 | // start state handling. |

226 | builder.set_look_have(|have| have.insert(Look::StartLF)); |

227 | } |

228 | // We also need to add StartCRLF to our assertions too, if we can. This |

229 | // is unfortunately a bit more complicated, because it depends on the |

230 | // direction of the search. In the forward direction, ^ matches after a |

231 | // \n, but in the reverse direction, ^ only matches after a \r. (This is |

232 | // further complicated by the fact that reverse a regex means changing a ^ |

233 | // to a $ and vice versa.) |

234 | if nfa.look_set_any().contains_anchor_crlf() |

235 | && ((rev && unit.is_byte(b' \r')) || (!rev && unit.is_byte( b' \n'))) |

236 | { |

237 | builder.set_look_have(|have| have.insert(Look::StartCRLF)); |

238 | } |

239 | // And also for the start-half word boundary assertions. As long as the |

240 | // look-behind byte is not a word char, then the assertions are satisfied. |

241 | if nfa.look_set_any().contains_word() && !unit.is_word_byte() { |

242 | builder.set_look_have(|have| { |

243 | have.insert(Look::WordStartHalfAscii) |

244 | .insert(Look::WordStartHalfUnicode) |

245 | }); |

246 | } |

247 | for nfa_id in sparses.set1.iter() { |

248 | match *nfa.state(nfa_id) { |

249 | thompson::State::Union { .. } |

250 | | thompson::State::BinaryUnion { .. } |

251 | | thompson::State::Fail |

252 | | thompson::State::Look { .. } |

253 | | thompson::State::Capture { .. } => {} |

254 | thompson::State::Match { pattern_id } => { |

255 | // Notice here that we are calling the NEW state a match |

256 | // state if the OLD state we are transitioning from |

257 | // contains an NFA match state. This is precisely how we |

258 | // delay all matches by one byte and also what therefore |

259 | // guarantees that starting states cannot be match states. |

260 | // |

261 | // If we didn't delay matches by one byte, then whether |

262 | // a DFA is a matching state or not would be determined |

263 | // by whether one of its own constituent NFA states |

264 | // was a match state. (And that would be done in |

265 | // 'add_nfa_states'.) |

266 | // |

267 | // Also, 'add_match_pattern_id' requires that callers never |

268 | // pass duplicative pattern IDs. We do in fact uphold that |

269 | // guarantee here, but it's subtle. In particular, a Thompson |

270 | // NFA guarantees that each pattern has exactly one match |

271 | // state. Moreover, since we're iterating over the NFA state |

272 | // IDs in a set, we are guarateed not to have any duplicative |

273 | // match states. Thus, it is impossible to add the same pattern |

274 | // ID more than once. |

275 | // |

276 | // N.B. We delay matches by 1 byte as a way to hack 1-byte |

277 | // look-around into DFA searches. This lets us support ^, $ |

278 | // and ASCII-only \b. The delay is also why we need a special |

279 | // "end-of-input" (EOI) sentinel and why we need to follow the |

280 | // EOI sentinel at the end of every search. This final EOI |

281 | // transition is necessary to report matches found at the end |

282 | // of a haystack. |

283 | builder.add_match_pattern_id(pattern_id); |

284 | if !match_kind.continue_past_first_match() { |

285 | break; |

286 | } |

287 | } |

288 | thompson::State::ByteRange { ref trans } => { |

289 | if trans.matches_unit(unit) { |

290 | epsilon_closure( |

291 | nfa, |

292 | trans.next, |

293 | builder.look_have(), |

294 | stack, |

295 | &mut sparses.set2, |

296 | ); |

297 | } |

298 | } |

299 | thompson::State::Sparse(ref sparse) => { |

300 | if let Some(next) = sparse.matches_unit(unit) { |

301 | epsilon_closure( |

302 | nfa, |

303 | next, |

304 | builder.look_have(), |

305 | stack, |

306 | &mut sparses.set2, |

307 | ); |

308 | } |

309 | } |

310 | thompson::State::Dense(ref dense) => { |

311 | if let Some(next) = dense.matches_unit(unit) { |

312 | epsilon_closure( |

313 | nfa, |

314 | next, |

315 | builder.look_have(), |

316 | stack, |

317 | &mut sparses.set2, |

318 | ); |

319 | } |

320 | } |

321 | } |

322 | } |

323 | // We only set the word byte if there's a word boundary look-around |

324 | // anywhere in this regex. Otherwise, there's no point in bloating the |

325 | // number of states if we don't have one. |

326 | // |

327 | // We also only set it when the state has a non-zero number of NFA states. |

328 | // Otherwise, we could wind up with states that *should* be DEAD states |

329 | // but are otherwise distinct from DEAD states because of this look-behind |

330 | // assertion being set. While this can't technically impact correctness *in |

331 | // theory*, it can create pathological DFAs that consume input until EOI or |

332 | // a quit byte is seen. Consuming until EOI isn't a correctness problem, |

333 | // but a (serious) perf problem. Hitting a quit byte, however, could be a |

334 | // correctness problem since it could cause search routines to report an |

335 | // error instead of a detected match once the quit state is entered. (The |

336 | // search routine could be made to be a bit smarter by reporting a match |

337 | // if one was detected once it enters a quit state (and indeed, the search |

338 | // routines in this crate do just that), but it seems better to prevent |

339 | // these things by construction if possible.) |

340 | if !sparses.set2.is_empty() { |

341 | if nfa.look_set_any().contains_word() && unit.is_word_byte() { |

342 | builder.set_is_from_word(); |

343 | } |

344 | if nfa.look_set_any().contains_anchor_crlf() |

345 | && ((rev && unit.is_byte(b' \n')) || (!rev && unit.is_byte( b' \r'))) |

346 | { |

347 | builder.set_is_half_crlf(); |

348 | } |

349 | } |

350 | let mut builder_nfa = builder.into_nfa(); |

351 | add_nfa_states(nfa, &sparses.set2, &mut builder_nfa); |

352 | builder_nfa |

353 | } |

354 | |

/// Compute the epsilon closure for the given NFA state. The epsilon closure
/// consists of all NFA state IDs, including `start_nfa_id`, that can be
/// reached from `start_nfa_id` without consuming any input. These state IDs
/// are written to `set` in the order they are visited, but only if they are
/// not already in `set`. `start_nfa_id` must be a valid state ID for the NFA
/// given.
///
/// `look_have` consists of the satisfied assertions at the current
/// position. For conditional look-around epsilon transitions, these are
/// only followed if they are satisfied by `look_have`.
///
/// `stack` must have length 0. It is used as scratch space for depth first
/// traversal. After returning, it is guaranteed that `stack` will have length
/// 0.
///
/// Note that the visitation order (and thus the insertion order into `set`)
/// is significant: it encodes match preference for leftmost-first semantics,
/// which is why earlier union alternates are explored before later ones.
pub(crate) fn epsilon_closure(
    nfa: &thompson::NFA,
    start_nfa_id: StateID,
    look_have: LookSet,
    stack: &mut Vec<StateID>,
    set: &mut SparseSet,
) {
    assert!(stack.is_empty());
    // If this isn't an epsilon state, then the epsilon closure is always just
    // itself, so there's no need to spin up the machinery below to handle it.
    if !nfa.state(start_nfa_id).is_epsilon() {
        set.insert(start_nfa_id);
        return;
    }

    stack.push(start_nfa_id);
    while let Some(mut id) = stack.pop() {
        // In many cases, we can avoid stack operations when an NFA state only
        // adds one new state to visit. In that case, we just set our ID to
        // that state and mush on. We only use the stack when an NFA state
        // introduces multiple new states to visit.
        loop {
            // Insert this NFA state, and if it's already in the set and thus
            // already visited, then we can move on to the next one.
            if !set.insert(id) {
                break;
            }
            match *nfa.state(id) {
                // Non-epsilon states terminate this branch of the traversal:
                // they consume input (or end the search), so the closure does
                // not extend through them.
                thompson::State::ByteRange { .. }
                | thompson::State::Sparse { .. }
                | thompson::State::Dense { .. }
                | thompson::State::Fail
                | thompson::State::Match { .. } => break,
                thompson::State::Look { look, next } => {
                    // Conditional epsilon transition: only follow it when the
                    // assertion is satisfied at the current position.
                    if !look_have.contains(look) {
                        break;
                    }
                    id = next;
                }
                thompson::State::Union { ref alternates } => {
                    id = match alternates.get(0) {
                        None => break,
                        Some(&id) => id,
                    };
                    // We need to process our alternates in order to preserve
                    // match preferences, so put the earliest alternates closer
                    // to the top of the stack.
                    stack.extend(alternates[1..].iter().rev());
                }
                thompson::State::BinaryUnion { alt1, alt2 } => {
                    // Continue with the first alternate immediately; the
                    // second is deferred so the first is visited earlier,
                    // preserving match preference.
                    id = alt1;
                    stack.push(alt2);
                }
                thompson::State::Capture { next, .. } => {
                    // Captures are unconditional epsilon transitions for the
                    // purposes of determinization; just follow them.
                    id = next;
                }
            }
        }
    }
}

429 | |

430 | /// Add the NFA state IDs in the given `set` to the given DFA builder state. |

431 | /// The order in which states are added corresponds to the order in which they |

432 | /// were added to `set`. |

433 | /// |

434 | /// The DFA builder state given should already have its complete set of match |

435 | /// pattern IDs added (if any) and any look-behind assertions (StartLF, Start |

436 | /// and whether this state is being generated for a transition over a word byte |

437 | /// when applicable) that are true immediately prior to transitioning into this |

438 | /// state (via `builder.look_have()`). The match pattern IDs should correspond |

439 | /// to matches that occurred on the previous transition, since all matches are |

440 | /// delayed by one byte. The things that should _not_ be set are look-ahead |

441 | /// assertions (EndLF, End and whether the next byte is a word byte or not). |

442 | /// The builder state should also not have anything in `look_need` set, as this |

443 | /// routine will compute that for you. |

444 | /// |

445 | /// The given NFA should be able to resolve all identifiers in `set` to a |

446 | /// particular NFA state. Additionally, `set` must have capacity equivalent |

447 | /// to `nfa.len()`. |

448 | pub(crate) fn add_nfa_states( |

449 | nfa: &thompson::NFA, |

450 | set: &SparseSet, |

451 | builder: &mut StateBuilderNFA, |

452 | ) { |

453 | for nfa_id in set.iter() { |

454 | match *nfa.state(nfa_id) { |

455 | thompson::State::ByteRange { .. } => { |

456 | builder.add_nfa_state_id(nfa_id); |

457 | } |

458 | thompson::State::Sparse { .. } => { |

459 | builder.add_nfa_state_id(nfa_id); |

460 | } |

461 | thompson::State::Dense { .. } => { |

462 | builder.add_nfa_state_id(nfa_id); |

463 | } |

464 | thompson::State::Look { look, .. } => { |

465 | builder.add_nfa_state_id(nfa_id); |

466 | builder.set_look_need(|need| need.insert(look)); |

467 | } |

468 | thompson::State::Union { .. } |

469 | | thompson::State::BinaryUnion { .. } => { |

470 | // Pure epsilon transitions don't need to be tracked as part |

471 | // of the DFA state. Tracking them is actually superfluous; |

472 | // they won't cause any harm other than making determinization |

473 | // slower. |

474 | // |

475 | // Why aren't these needed? Well, in an NFA, epsilon |

476 | // transitions are really just jumping points to other states. |

477 | // So once you hit an epsilon transition, the same set of |

478 | // resulting states always appears. Therefore, putting them in |

479 | // a DFA's set of ordered NFA states is strictly redundant. |

480 | // |

481 | // Look-around states are also epsilon transitions, but |

482 | // they are *conditional*. So their presence could be |

483 | // discriminatory, and thus, they are tracked above. |

484 | // |

485 | // But wait... why are epsilon states in our `set` in the first |

486 | // place? Why not just leave them out? They're in our `set` |

487 | // because it was generated by computing an epsilon closure, |

488 | // and we want to keep track of all states we visited to avoid |

489 | // re-visiting them. In exchange, we have to do this second |

490 | // iteration over our collected states to finalize our DFA |

491 | // state. In theory, we could avoid this second iteration if |

492 | // we maintained two sets during epsilon closure: the set of |

493 | // visited states (to avoid cycles) and the set of states that |

494 | // will actually be used to construct the next DFA state. |

495 | // |

496 | // Note that this optimization requires that we re-compute the |

497 | // epsilon closure to account for look-ahead in 'next' *only |

498 | // when necessary*. Namely, only when the set of look-around |

499 | // assertions changes and only when those changes are within |

500 | // the set of assertions that are needed in order to step |

501 | // through the closure correctly. Otherwise, if we re-do the |

502 | // epsilon closure needlessly, it could change based on the |

503 | // fact that we are omitting epsilon states here. |

504 | // |

505 | // ----- |

506 | // |

507 | // Welp, scratch the above. It turns out that recording these |

508 | // is in fact necessary to seemingly handle one particularly |

509 | // annoying case: when a conditional epsilon transition is |

510 | // put inside of a repetition operator. One specific case I |

511 | // ran into was the regex `(?:\b|%)+` on the haystack `z%`. |

512 | // The correct leftmost first matches are: [0, 0] and [1, 1]. |

513 | // But the DFA was reporting [0, 0] and [1, 2]. To understand |

514 | // why this happens, consider the NFA for the aforementioned |

515 | // regex: |

516 | // |

517 | // >000000: binary-union(4, 1) |

518 | // 000001: \x00-\xFF => 0 |

519 | // 000002: WordAscii => 5 |

520 | // 000003: % => 5 |

521 | // ^000004: binary-union(2, 3) |

522 | // 000005: binary-union(4, 6) |

523 | // 000006: MATCH(0) |

524 | // |

525 | // The problem here is that one of the DFA start states is |

526 | // going to consist of the NFA states [2, 3] by computing the |

527 | // epsilon closure of state 4. State 4 isn't included because |

528 | // we previously were not keeping track of union states. But |

529 | // only a subset of transitions out of this state will be able |

530 | // to follow WordAscii, and in those cases, the epsilon closure |

531 | // is redone. The only problem is that computing the epsilon |

532 | // closure from [2, 3] is different than computing the epsilon |

533 | // closure from [4]. In the former case, assuming the WordAscii |

534 | // assertion is satisfied, you get: [2, 3, 6]. In the latter |

535 | // case, you get: [2, 6, 3]. Notice that '6' is the match state |

536 | // and appears AFTER '3' in the former case. This leads to a |

537 | // preferential but incorrect match of '%' before returning |

538 | // a match. In the latter case, the match is preferred over |

539 | // continuing to accept the '%'. |

540 | // |

541 | // It almost feels like we might be able to fix the NFA states |

542 | // to avoid this, or to at least only keep track of union |

543 | // states where this actually matters, since in the vast |

544 | // majority of cases, this doesn't matter. |

545 | // |

546 | // Another alternative would be to define a new HIR property |

547 | // called "assertion is repeated anywhere" and compute it |

548 | // inductively over the entire pattern. If it happens anywhere, |

549 | // which is probably pretty rare, then we record union states. |

550 | // Otherwise we don't. |

551 | builder.add_nfa_state_id(nfa_id); |

552 | } |

553 | // Capture states we definitely do not need to record, since they |

554 | // are unconditional epsilon transitions with no branching. |

555 | thompson::State::Capture { .. } => {} |

556 | // It's not totally clear whether we need to record fail states or |

557 | // not, but we do so out of an abundance of caution. Since they are |

558 | // quite rare in practice, there isn't much cost to recording them. |

559 | thompson::State::Fail => { |

560 | builder.add_nfa_state_id(nfa_id); |

561 | } |

562 | thompson::State::Match { .. } => { |

563 | // Normally, the NFA match state doesn't actually need to |

564 | // be inside the DFA state. But since we delay matches by |

565 | // one byte, the matching DFA state corresponds to states |

566 | // that transition from the one we're building here. And |

567 | // the way we detect those cases is by looking for an NFA |

568 | // match state. See 'next' for how this is handled. |

569 | builder.add_nfa_state_id(nfa_id); |

570 | } |

571 | } |

572 | } |

573 | // If we know this state contains no look-around assertions, then |

574 | // there's no reason to track which look-around assertions were |

575 | // satisfied when this state was created. |

576 | if builder.look_need().is_empty() { |

577 | builder.set_look_have(|_| LookSet::empty()); |

578 | } |

579 | } |

580 | |

581 | /// Sets the appropriate look-behind assertions on the given state based on |

582 | /// this starting configuration. |

583 | pub(crate) fn set_lookbehind_from_start( |

584 | nfa: &thompson::NFA, |

585 | start: &Start, |

586 | builder: &mut StateBuilderMatches, |

587 | ) { |

588 | let rev = nfa.is_reverse(); |

589 | let lineterm = nfa.look_matcher().get_line_terminator(); |

590 | let lookset = nfa.look_set_any(); |

591 | match *start { |

592 | Start::NonWordByte => { |

593 | if lookset.contains_word() { |

594 | builder.set_look_have(|have| { |

595 | have.insert(Look::WordStartHalfAscii) |

596 | .insert(Look::WordStartHalfUnicode) |

597 | }); |

598 | } |

599 | } |

600 | Start::WordByte => { |

601 | if lookset.contains_word() { |

602 | builder.set_is_from_word(); |

603 | } |

604 | } |

605 | Start::Text => { |

606 | if lookset.contains_anchor_haystack() { |

607 | builder.set_look_have(|have| have.insert(Look::Start)); |

608 | } |

609 | if lookset.contains_anchor_line() { |

610 | builder.set_look_have(|have| { |

611 | have.insert(Look::StartLF).insert(Look::StartCRLF) |

612 | }); |

613 | } |

614 | if lookset.contains_word() { |

615 | builder.set_look_have(|have| { |

616 | have.insert(Look::WordStartHalfAscii) |

617 | .insert(Look::WordStartHalfUnicode) |

618 | }); |

619 | } |

620 | } |

621 | Start::LineLF => { |

622 | if rev { |

623 | if lookset.contains_anchor_crlf() { |

624 | builder.set_is_half_crlf(); |

625 | } |

626 | if lookset.contains_anchor_line() { |

627 | builder.set_look_have(|have| have.insert(Look::StartLF)); |

628 | } |

629 | } else { |

630 | if lookset.contains_anchor_line() { |

631 | builder.set_look_have(|have| have.insert(Look::StartCRLF)); |

632 | } |

633 | } |

634 | if lookset.contains_anchor_line() && lineterm == b' \n'{ |

635 | builder.set_look_have(|have| have.insert(Look::StartLF)); |

636 | } |

637 | if lookset.contains_word() { |

638 | builder.set_look_have(|have| { |

639 | have.insert(Look::WordStartHalfAscii) |

640 | .insert(Look::WordStartHalfUnicode) |

641 | }); |

642 | } |

643 | } |

644 | Start::LineCR => { |

645 | if lookset.contains_anchor_crlf() { |

646 | if rev { |

647 | builder.set_look_have(|have| have.insert(Look::StartCRLF)); |

648 | } else { |

649 | builder.set_is_half_crlf(); |

650 | } |

651 | } |

652 | if lookset.contains_anchor_line() && lineterm == b' \r'{ |

653 | builder.set_look_have(|have| have.insert(Look::StartLF)); |

654 | } |

655 | if lookset.contains_word() { |

656 | builder.set_look_have(|have| { |

657 | have.insert(Look::WordStartHalfAscii) |

658 | .insert(Look::WordStartHalfUnicode) |

659 | }); |

660 | } |

661 | } |

662 | Start::CustomLineTerminator => { |

663 | if lookset.contains_anchor_line() { |

664 | builder.set_look_have(|have| have.insert(Look::StartLF)); |

665 | } |

666 | // This is a bit of a tricky case, but if the line terminator was |

667 | // set to a word byte, then we also need to behave as if the start |

668 | // configuration is Start::WordByte. That is, we need to mark our |

669 | // state as having come from a word byte. |

670 | if lookset.contains_word() { |

671 | if utf8::is_word_byte(lineterm) { |

672 | builder.set_is_from_word(); |

673 | } else { |

674 | builder.set_look_have(|have| { |

675 | have.insert(Look::WordStartHalfAscii) |

676 | .insert(Look::WordStartHalfUnicode) |

677 | }); |

678 | } |

679 | } |

680 | } |

681 | } |

682 | } |

683 |