1 | /*! |
---|---|

2 | Types and routines specific to sparse DFAs. |

3 | |

4 | This module is the home of [`sparse::DFA`](DFA). |

5 | |

6 | Unlike the [`dense`] module, this module does not contain a builder or |

7 | configuration specific for sparse DFAs. Instead, the intended way to build a |

8 | sparse DFA is either by using a default configuration with its constructor |

9 | [`sparse::DFA::new`](DFA::new), or by first configuring the construction of a |

10 | dense DFA with [`dense::Builder`] and then calling [`dense::DFA::to_sparse`]. |

11 | For example, this configures a sparse DFA to do an overlapping search: |

12 | |

13 | ``` |

14 | use regex_automata::{ |

15 | dfa::{Automaton, OverlappingState, dense}, |

16 | HalfMatch, Input, MatchKind, |

17 | }; |

18 | |

19 | let dense_re = dense::Builder::new() |

20 | .configure(dense::Config::new().match_kind(MatchKind::All)) |

21 | .build(r"Samwise|Sam")?; |

22 | let sparse_re = dense_re.to_sparse()?; |

23 | |

24 | // Setup our haystack and initial start state. |

25 | let input = Input::new("Samwise"); |

26 | let mut state = OverlappingState::start(); |

27 | |

28 | // First, 'Sam' will match. |

29 | sparse_re.try_search_overlapping_fwd(&input, &mut state)?; |

30 | assert_eq!(Some(HalfMatch::must(0, 3)), state.get_match()); |

31 | |

32 | // And now 'Samwise' will match. |

33 | sparse_re.try_search_overlapping_fwd(&input, &mut state)?; |

34 | assert_eq!(Some(HalfMatch::must(0, 7)), state.get_match()); |

35 | # Ok::<(), Box<dyn std::error::Error>>(()) |

36 | ``` |

37 | */ |

38 | |

39 | #[cfg(feature = "dfa-build")] |

40 | use core::iter; |

41 | use core::{ |

42 | convert::{TryFrom, TryInto}, |

43 | fmt, |

44 | mem::size_of, |

45 | }; |

46 | |

47 | #[cfg(feature = "dfa-build")] |

48 | use alloc::{vec, vec::Vec}; |

49 | |

50 | #[cfg(feature = "dfa-build")] |

51 | use crate::dfa::dense::{self, BuildError}; |

52 | use crate::{ |

53 | dfa::{ |

54 | automaton::{fmt_state_indicator, Automaton, StartError}, |

55 | dense::Flags, |

56 | special::Special, |

57 | StartKind, DEAD, |

58 | }, |

59 | util::{ |

60 | alphabet::{ByteClasses, ByteSet}, |

61 | escape::DebugByte, |

62 | int::{Pointer, Usize, U16, U32}, |

63 | prefilter::Prefilter, |

64 | primitives::{PatternID, StateID}, |

65 | search::Anchored, |

66 | start::{self, Start, StartByteMap}, |

67 | wire::{self, DeserializeError, Endian, SerializeError}, |

68 | }, |

69 | }; |

70 | |

// Identifying label and format version associated with sparse DFAs.
//
// NOTE(review): presumably both are written into, and checked against, the
// header of a serialized sparse DFA. The routines that would do so are not
// visible in this part of the file, so confirm before relying on it.
const LABEL: &str = "rust-regex-automata-dfa-sparse";
const VERSION: u32 = 2;

73 | |

74 | /// A sparse deterministic finite automaton (DFA) with variable sized states. |

75 | /// |

76 | /// In contrast to a [dense::DFA], a sparse DFA uses a more space efficient |

77 | /// representation for its transitions. Consequently, sparse DFAs may use much |

78 | /// less memory than dense DFAs, but this comes at a price. In particular, |

79 | /// reading the more space efficient transitions takes more work, and |

80 | /// consequently, searching using a sparse DFA is typically slower than a dense |

81 | /// DFA. |

82 | /// |

83 | /// A sparse DFA can be built using the default configuration via the |

84 | /// [`DFA::new`] constructor. Otherwise, one can configure various aspects of a |

85 | /// dense DFA via [`dense::Builder`], and then convert a dense DFA to a sparse |

86 | /// DFA using [`dense::DFA::to_sparse`]. |

87 | /// |

88 | /// In general, a sparse DFA supports all the same search operations as a dense |

89 | /// DFA. |

90 | /// |

91 | /// Making the choice between a dense and sparse DFA depends on your specific |

92 | /// work load. If you can sacrifice a bit of search time performance, then a |

93 | /// sparse DFA might be the best choice. In particular, while sparse DFAs are |

94 | /// probably always slower than dense DFAs, you may find that they are easily |

95 | /// fast enough for your purposes! |

96 | /// |

97 | /// # Type parameters |

98 | /// |

99 | /// A `DFA` has one type parameter, `T`, which is used to represent the parts |

100 | /// of a sparse DFA. `T` is typically a `Vec<u8>` or a `&[u8]`. |

101 | /// |

102 | /// # The `Automaton` trait |

103 | /// |

104 | /// This type implements the [`Automaton`] trait, which means it can be used |

105 | /// for searching. For example: |

106 | /// |

107 | /// ``` |

108 | /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; |

109 | /// |

110 | /// let dfa = DFA::new("foo[0-9]+")?; |

111 | /// let expected = Some(HalfMatch::must(0, 8)); |

112 | /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); |

113 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |

114 | /// ``` |

115 | #[derive(Clone)] |

116 | pub struct DFA<T> { |

117 | // When compared to a dense DFA, a sparse DFA *looks* a lot simpler |

118 | // representation-wise. In reality, it is perhaps more complicated. Namely, |

119 | // in a dense DFA, all information needs to be very cheaply accessible |

120 | // using only state IDs. In a sparse DFA however, each state uses a |

121 | // variable amount of space because each state encodes more information |

122 | // than just its transitions. Each state also includes an accelerator if |

123 | // one exists, along with the matching pattern IDs if the state is a match |

124 | // state. |

125 | // |

126 | // That is, a lot of the complexity is pushed down into how each state |

127 | // itself is represented. |

128 | tt: Transitions<T>, |

129 | st: StartTable<T>, |

130 | special: Special, |

131 | pre: Option<Prefilter>, |

132 | quitset: ByteSet, |

133 | flags: Flags, |

134 | } |

135 | |

#[cfg(feature = "dfa-build")]
impl DFA<Vec<u8>> {
    /// Compile a single regular expression into a sparse DFA using the
    /// default configuration.
    ///
    /// The pattern is first compiled to a dense DFA, which is then converted
    /// to its sparse form. To use a non-default configuration, set it up
    /// with [`dense::Builder`] and call [`dense::DFA::to_sparse`] yourself.
    ///
    /// # Errors
    ///
    /// Returns an error if building the dense DFA fails or if converting it
    /// to a sparse DFA fails.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse}, HalfMatch, Input};
    ///
    /// let dfa = sparse::DFA::new("foo[0-9]+bar")?;
    ///
    /// let expected = Some(HalfMatch::must(0, 11));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345bar"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[cfg(feature = "syntax")]
    pub fn new(pattern: &str) -> Result<DFA<Vec<u8>>, BuildError> {
        let dense = dense::Builder::new().build(pattern)?;
        dense.to_sparse()
    }

    /// Compile several regular expressions into one sparse multi-DFA using
    /// the default configuration.
    ///
    /// The patterns are first compiled to a dense DFA, which is then
    /// converted to its sparse form. To use a non-default configuration, set
    /// it up with [`dense::Builder`] and call [`dense::DFA::to_sparse`]
    /// yourself.
    ///
    /// # Errors
    ///
    /// Returns an error if building the dense DFA fails or if converting it
    /// to a sparse DFA fails.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse}, HalfMatch, Input};
    ///
    /// let dfa = sparse::DFA::new_many(&["[0-9]+", "[a-z]+"])?;
    /// let expected = Some(HalfMatch::must(1, 3));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345bar"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[cfg(feature = "syntax")]
    pub fn new_many<P: AsRef<str>>(
        patterns: &[P],
    ) -> Result<DFA<Vec<u8>>, BuildError> {
        let dense = dense::Builder::new().build_many(patterns)?;
        dense.to_sparse()
    }
}

189 | |

#[cfg(feature = "dfa-build")]
impl DFA<Vec<u8>> {
    /// Build a sparse DFA that matches every input.
    ///
    /// # Errors
    ///
    /// Returns an error if building the equivalent dense DFA fails or if
    /// converting it to a sparse DFA fails.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{
    ///     dfa::{Automaton, sparse},
    ///     HalfMatch, Input,
    /// };
    ///
    /// let dfa = sparse::DFA::always_match()?;
    ///
    /// let expected = Some(HalfMatch::must(0, 0));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new(""))?);
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn always_match() -> Result<DFA<Vec<u8>>, BuildError> {
        dense::DFA::always_match().and_then(|dense| dense.to_sparse())
    }

    /// Build a sparse DFA that never matches any input.
    ///
    /// # Errors
    ///
    /// Returns an error if building the equivalent dense DFA fails or if
    /// converting it to a sparse DFA fails.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse}, Input};
    ///
    /// let dfa = sparse::DFA::never_match()?;
    /// assert_eq!(None, dfa.try_search_fwd(&Input::new(""))?);
    /// assert_eq!(None, dfa.try_search_fwd(&Input::new("foo"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn never_match() -> Result<DFA<Vec<u8>>, BuildError> {
        dense::DFA::never_match().and_then(|dense| dense.to_sparse())
    }

    /// Convert the given dense DFA into a sparse DFA.
    ///
    /// # Errors
    ///
    /// Returns a "too many states" error when the byte offset of a state in
    /// the sparse transition table cannot be represented as a `StateID`. Any
    /// error produced while building the start table is propagated as well.
    ///
    /// # Panics
    ///
    /// Panics if one of the internal invariants checked below (for example,
    /// a state having at most 257 transitions) does not hold.
    pub(crate) fn from_dense<T: AsRef<[u32]>>(
        dfa: &dense::DFA<T>,
    ) -> Result<DFA<Vec<u8>>, BuildError> {
        // A sparse state ID is the byte offset at which that state is
        // encoded in the transition table. We therefore cannot know the ID
        // of a "next" state until every state before it has been given
        // exactly the space it needs, and so the table is built in two
        // passes:
        //
        // 1. Lay out the shell of every state: its transition length, its
        //    input byte ranges, zeroed slots for the next-state IDs, and the
        //    pattern IDs and accelerator where present. While doing so,
        //    record the sparse state ID assigned to each dense state.
        // 2. Revisit every state and overwrite the zeroed next-state slots
        //    using the IDs recorded in the first pass.

        // This capacity is only a lower bound (the real size is likely
        // bigger), but it should avoid a few early reallocations.
        let mut sparse: Vec<u8> =
            Vec::with_capacity(StateID::SIZE * dfa.state_len());
        // Indexed by dense state index; holds the sparse state ID assigned
        // to that state. Filled in by pass one and consumed by pass two.
        let mut remap: Vec<StateID> = vec![DEAD; dfa.state_len()];
        for state in dfa.states() {
            let state_start = sparse.len();
            let sparse_id = StateID::new(state_start)
                .map_err(|_| BuildError::too_many_states())?;
            remap[dfa.to_index(state.id())] = sparse_id;

            // Reserve two bytes for the transition length. The real value is
            // written once the transitions have been counted.
            sparse.extend_from_slice(&[0, 0]);

            let mut transition_len = 0;
            for (start, end, _) in state.sparse_transitions() {
                match (start.as_u8(), end.as_u8()) {
                    (Some(lo), Some(hi)) => {
                        sparse.extend_from_slice(&[lo, hi]);
                        transition_len += 1;
                    }
                    // The EOI transition has no byte range. It is accounted
                    // for separately below.
                    (None, None) => {}
                    (Some(_), None) | (None, Some(_)) => {
                        // Cannot happen: sparse_transitions never groups EOI
                        // together with any other transition.
                        unreachable!()
                    }
                }
            }
            // Always append a dummy byte range for the EOI transition. The
            // range itself is never read during a search, but it keeps the
            // number of byte ranges equal to the number of 'next' states, so
            // only one length needs to be tracked.
            //
            // N.B. The loop above may not yield an EOI transition at all,
            // since it may point to a DEAD state. Writing it unconditionally
            // here guarantees a transition length >0, and lets
            // Automaton::next_eoi_state simply ask for the last transition.
            // There are probably other/better ways to do this.
            transition_len += 1;
            sparse.extend_from_slice(&[0, 0]);

            // Sanity check our assumptions about the transition length.
            assert_ne!(
                transition_len, 0,
                "transition length should be non-zero",
            );
            assert!(
                transition_len <= 257,
                "expected transition length {} to be <= 257",
                transition_len,
            );

            // Write the transition length into the space reserved above.
            // Because the length never exceeds 257, the most significant bit
            // is free to flag whether this is a match state.
            let is_match = dfa.is_match_state(state.id());
            let ntrans = if is_match {
                transition_len | (1 << 15)
            } else {
                transition_len
            };
            wire::NE::write_u16(ntrans, &mut sparse[state_start..]);

            // Zero-fill one state ID sized slot per transition. The unwraps
            // are OK since the transition length is <= 257 and the minimum
            // supported usize is 16 bits wide.
            let next_slots_len = usize::try_from(transition_len)
                .unwrap()
                .checked_mul(StateID::SIZE)
                .unwrap();
            sparse.extend(iter::repeat(0).take(next_slots_len));

            // A match state is followed by the pattern IDs it matches,
            // prefixed by their count as a u32.
            if is_match {
                let plen = dfa.match_pattern_len(state.id());
                let mut at = sparse.len();
                // Zero-fill room for the length prefix plus one u32 per
                // pattern ID. The unwraps are OK since plen is guaranteed to
                // be <= PatternID::LIMIT, which in turn is guaranteed to fit
                // into a u32.
                let patterns_len = size_of::<u32>()
                    .checked_mul(plen)
                    .unwrap()
                    .checked_add(size_of::<u32>())
                    .unwrap();
                sparse.extend(iter::repeat(0).take(patterns_len));

                // Length prefix first. This never fails since u32::MAX is an
                // invalid pattern ID, which means the number of pattern IDs
                // is representable by a u32.
                let plen_prefix: u32 =
                    plen.try_into().expect("pattern ID length fits in u32");
                wire::NE::write_u32(plen_prefix, &mut sparse[at..]);
                at += size_of::<u32>();

                // Then the pattern IDs themselves.
                for &pid in dfa.pattern_id_slice(state.id()) {
                    at += wire::write_pattern_id::<wire::NE>(
                        pid,
                        &mut sparse[at..],
                    );
                }
            }

            // Finally, the accelerator. It is always at least 1 byte and at
            // most 4 bytes: a length byte N followed by N bytes. Those N
            // bytes are (exhaustively) the bytes that must be seen in order
            // to leave this state.
            let accel = dfa.accelerator(state.id());
            let accel_len: u8 = accel.len().try_into().unwrap();
            sparse.push(accel_len);
            sparse.extend_from_slice(accel);
        }

        let tt = Transitions {
            sparse,
            classes: dfa.byte_classes().clone(),
            state_len: dfa.state_len(),
            pattern_len: dfa.pattern_len(),
        };
        let st = StartTable::from_dense_dfa(dfa, &remap)?;
        let special = dfa.special().remap(|id| remap[dfa.to_index(id)]);
        let pre = dfa.get_prefilter().cloned();
        let quitset = dfa.quitset().clone();
        let flags = dfa.flags().clone();
        let mut new = DFA { tt, st, special, pre, quitset, flags };

        // Second pass: walk the dense states again and back-fill the 'next'
        // state of every transition with its remapped sparse state ID.
        //
        // NOTE(review): this relies on sparse_transitions yielding an EOI
        // transition (if any) last, so that index `i` lines up with the
        // slots laid out in the first pass. Confirm against the dense state
        // iterator.
        for dense_state in dfa.states() {
            let sparse_id = remap[dfa.to_index(dense_state.id())];
            let mut sparse_state = new.tt.state_mut(sparse_id);
            let transitions = dense_state.sparse_transitions().enumerate();
            for (i, (_, _, dense_next)) in transitions {
                sparse_state.set_next_at(i, remap[dfa.to_index(dense_next)]);
            }
        }
        debug!(
            "created sparse DFA, memory usage: {} (dense memory usage: {})",
            new.memory_usage(),
            dfa.memory_usage(),
        );
        Ok(new)
    }
}

409 | |

410 | impl<T: AsRef<[u8]>> DFA<T> { |

411 | /// Cheaply return a borrowed version of this sparse DFA. Specifically, the |

412 | /// DFA returned always uses `&[u8]` for its transitions. |

413 | pub fn as_ref<'a>(&'a self) -> DFA<&'a [u8]> { |

414 | DFA { |

415 | tt: self.tt.as_ref(), |

416 | st: self.st.as_ref(), |

417 | special: self.special, |

418 | pre: self.pre.clone(), |

419 | quitset: self.quitset, |

420 | flags: self.flags, |

421 | } |

422 | } |

423 | |

424 | /// Return an owned version of this sparse DFA. Specifically, the DFA |

425 | /// returned always uses `Vec<u8>` for its transitions. |

426 | /// |

427 | /// Effectively, this returns a sparse DFA whose transitions live on the |

428 | /// heap. |

429 | #[cfg(feature = "alloc")] |

430 | pub fn to_owned(&self) -> DFA<alloc::vec::Vec<u8>> { |

431 | DFA { |

432 | tt: self.tt.to_owned(), |

433 | st: self.st.to_owned(), |

434 | special: self.special, |

435 | pre: self.pre.clone(), |

436 | quitset: self.quitset, |

437 | flags: self.flags, |

438 | } |

439 | } |

440 | |

441 | /// Returns the starting state configuration for this DFA. |

442 | /// |

443 | /// The default is [`StartKind::Both`], which means the DFA supports both |

444 | /// unanchored and anchored searches. However, this can generally lead to |

445 | /// bigger DFAs. Therefore, a DFA might be compiled with support for just |

446 | /// unanchored or anchored searches. In that case, running a search with |

447 | /// an unsupported configuration will panic. |

448 | pub fn start_kind(&self) -> StartKind { |

449 | self.st.kind |

450 | } |

451 | |

452 | /// Returns true only if this DFA has starting states for each pattern. |

453 | /// |

454 | /// When a DFA has starting states for each pattern, then a search with the |

455 | /// DFA can be configured to only look for anchored matches of a specific |

456 | /// pattern. Specifically, APIs like [`Automaton::try_search_fwd`] can |

457 | /// accept a [`Anchored::Pattern`] if and only if this method returns true. |

458 | /// Otherwise, an error will be returned. |

459 | /// |

460 | /// Note that if the DFA is empty, this always returns false. |

461 | pub fn starts_for_each_pattern(&self) -> bool { |

462 | self.st.pattern_len.is_some() |

463 | } |

464 | |

465 | /// Returns the equivalence classes that make up the alphabet for this DFA. |

466 | /// |

467 | /// Unless [`dense::Config::byte_classes`] was disabled, it is possible |

468 | /// that multiple distinct bytes are grouped into the same equivalence |

469 | /// class if it is impossible for them to discriminate between a match and |

470 | /// a non-match. This has the effect of reducing the overall alphabet size |

471 | /// and in turn potentially substantially reducing the size of the DFA's |

472 | /// transition table. |

473 | /// |

474 | /// The downside of using equivalence classes like this is that every state |

475 | /// transition will automatically use this map to convert an arbitrary |

476 | /// byte to its corresponding equivalence class. In practice this has a |

477 | /// negligible impact on performance. |

478 | pub fn byte_classes(&self) -> &ByteClasses { |

479 | &self.tt.classes |

480 | } |

481 | |

482 | /// Returns the memory usage, in bytes, of this DFA. |

483 | /// |

484 | /// The memory usage is computed based on the number of bytes used to |

485 | /// represent this DFA. |

486 | /// |

487 | /// This does **not** include the stack size used up by this DFA. To |

488 | /// compute that, use `std::mem::size_of::<sparse::DFA>()`. |

489 | pub fn memory_usage(&self) -> usize { |

490 | self.tt.memory_usage() + self.st.memory_usage() |

491 | } |

492 | } |

493 | |

494 | /// Routines for converting a sparse DFA to other representations, such as raw |

495 | /// bytes suitable for persistent storage. |

496 | impl<T: AsRef<[u8]>> DFA<T> { |

/// Serialize this DFA into a freshly allocated `Vec<u8>` using little endian
/// format.
///
/// The bytes produced here are guaranteed to deserialize correctly and
/// without errors in a semver compatible release of this crate through a
/// `DFA`'s deserialization APIs (assuming all other criteria for those APIs
/// have been satisfied):
///
/// * [`DFA::from_bytes`]
/// * [`DFA::from_bytes_unchecked`]
///
/// Unlike a [`dense::DFA`]'s serialization methods, no initial padding is
/// added to the returned bytes. Sparse DFAs have no alignment requirements,
/// so padding is not needed.
///
/// # Example
///
/// This example shows how to serialize and deserialize a DFA:
///
/// ```
/// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
///
/// // Compile our original DFA.
/// let original_dfa = DFA::new("foo[0-9]+")?;
///
/// // N.B. We use native endianness here to make the example work, but
/// // using to_bytes_little_endian would work on a little endian target.
/// let buf = original_dfa.to_bytes_native_endian();
/// // The serialized bytes carry no initial padding, so they can be handed
/// // to DFA::from_bytes as-is.
/// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0;
///
/// let expected = Some(HalfMatch::must(0, 8));
/// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[cfg(feature = "dfa-build")]
pub fn to_bytes_little_endian(&self) -> Vec<u8> {
    self.to_bytes::<wire::LE>()
}

537 | |

/// Serialize this DFA into a freshly allocated `Vec<u8>` using big endian
/// format.
///
/// The bytes produced here are guaranteed to deserialize correctly and
/// without errors in a semver compatible release of this crate through a
/// `DFA`'s deserialization APIs (assuming all other criteria for those APIs
/// have been satisfied):
///
/// * [`DFA::from_bytes`]
/// * [`DFA::from_bytes_unchecked`]
///
/// Unlike a [`dense::DFA`]'s serialization methods, no initial padding is
/// added to the returned bytes. Sparse DFAs have no alignment requirements,
/// so padding is not needed.
///
/// # Example
///
/// This example shows how to serialize and deserialize a DFA:
///
/// ```
/// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
///
/// // Compile our original DFA.
/// let original_dfa = DFA::new("foo[0-9]+")?;
///
/// // N.B. We use native endianness here to make the example work, but
/// // using to_bytes_big_endian would work on a big endian target.
/// let buf = original_dfa.to_bytes_native_endian();
/// // The serialized bytes carry no initial padding, so they can be handed
/// // to DFA::from_bytes as-is.
/// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0;
///
/// let expected = Some(HalfMatch::must(0, 8));
/// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[cfg(feature = "dfa-build")]
pub fn to_bytes_big_endian(&self) -> Vec<u8> {
    self.to_bytes::<wire::BE>()
}

578 | |

/// Serialize this DFA into a freshly allocated `Vec<u8>` using native endian
/// format.
///
/// The bytes produced here are guaranteed to deserialize correctly and
/// without errors in a semver compatible release of this crate through a
/// `DFA`'s deserialization APIs (assuming all other criteria for those APIs
/// have been satisfied):
///
/// * [`DFA::from_bytes`]
/// * [`DFA::from_bytes_unchecked`]
///
/// Unlike a [`dense::DFA`]'s serialization methods, no initial padding is
/// added to the returned bytes. Sparse DFAs have no alignment requirements,
/// so padding is not needed.
///
/// Generally speaking, native endian format should only be used when you
/// know that the target you're compiling the DFA for has the same
/// endianness as the target on which you're compiling the DFA, for example
/// when serialization and deserialization happen in the same process or on
/// the same machine. Otherwise, when serializing a DFA for use in a portable
/// environment, you'll almost certainly want to serialize _both_ a little
/// endian and a big endian version and then load the correct one based on
/// the target's configuration.
///
/// # Example
///
/// This example shows how to serialize and deserialize a DFA:
///
/// ```
/// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
///
/// // Compile our original DFA.
/// let original_dfa = DFA::new("foo[0-9]+")?;
///
/// let buf = original_dfa.to_bytes_native_endian();
/// // The serialized bytes carry no initial padding, so they can be handed
/// // to DFA::from_bytes as-is.
/// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0;
///
/// let expected = Some(HalfMatch::must(0, 8));
/// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[cfg(feature = "dfa-build")]
pub fn to_bytes_native_endian(&self) -> Vec<u8> {
    self.to_bytes::<wire::NE>()
}

626 | |

/// Shared implementation behind the public `to_bytes_*` serialization
/// methods, generic over the endianness `E` of the output.
#[cfg(feature = "dfa-build")]
fn to_bytes<E: Endian>(&self) -> Vec<u8> {
    let len = self.write_to_len();
    let mut buf = vec![0; len];
    // The only possible serialization error is a destination buffer that is
    // too small. `buf` was sized from `write_to_len` above, so this should
    // always succeed.
    self.write_to::<E>(&mut buf).unwrap();
    buf
}

638 | |

639 | /// Serialize this DFA as raw bytes to the given slice, in little endian |

640 | /// format. Upon success, the total number of bytes written to `dst` is |

641 | /// returned. |

642 | /// |

643 | /// The written bytes are guaranteed to be deserialized correctly and |

644 | /// without errors in a semver compatible release of this crate by a |

645 | /// `DFA`'s deserialization APIs (assuming all other criteria for the |

646 | /// deserialization APIs has been satisfied): |

647 | /// |

648 | /// * [`DFA::from_bytes`] |

649 | /// * [`DFA::from_bytes_unchecked`] |

650 | /// |

651 | /// # Errors |

652 | /// |

653 | /// This returns an error if the given destination slice is not big enough |

654 | /// to contain the full serialized DFA. If an error occurs, then nothing |

655 | /// is written to `dst`. |

656 | /// |

657 | /// # Example |

658 | /// |

659 | /// This example shows how to serialize and deserialize a DFA without |

660 | /// dynamic memory allocation. |

661 | /// |

662 | /// ``` |

663 | /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; |

664 | /// |

665 | /// // Compile our original DFA. |

666 | /// let original_dfa = DFA::new("foo[0-9]+")?; |

667 | /// |

668 | /// // Create a 4KB buffer on the stack to store our serialized DFA. |

669 | /// let mut buf = [0u8; 4 * (1<<10)]; |

670 | /// // N.B. We use native endianness here to make the example work, but |

671 | /// // using write_to_little_endian would work on a little endian target. |

672 | /// let written = original_dfa.write_to_native_endian(&mut buf)?; |

673 | /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0; |

674 | /// |

675 | /// let expected = Some(HalfMatch::must(0, 8)); |

676 | /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); |

677 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |

678 | /// ``` |

679 | pub fn write_to_little_endian( |

680 | &self, |

681 | dst: &mut [u8], |

682 | ) -> Result<usize, SerializeError> { |

683 | self.write_to::<wire::LE>(dst) |

684 | } |

685 | |

686 | /// Serialize this DFA as raw bytes to the given slice, in big endian |

687 | /// format. Upon success, the total number of bytes written to `dst` is |

688 | /// returned. |

689 | /// |

690 | /// The written bytes are guaranteed to be deserialized correctly and |

691 | /// without errors in a semver compatible release of this crate by a |

692 | /// `DFA`'s deserialization APIs (assuming all other criteria for the |

693 | /// deserialization APIs has been satisfied): |

694 | /// |

695 | /// * [`DFA::from_bytes`] |

696 | /// * [`DFA::from_bytes_unchecked`] |

697 | /// |

698 | /// # Errors |

699 | /// |

700 | /// This returns an error if the given destination slice is not big enough |

701 | /// to contain the full serialized DFA. If an error occurs, then nothing |

702 | /// is written to `dst`. |

703 | /// |

704 | /// # Example |

705 | /// |

706 | /// This example shows how to serialize and deserialize a DFA without |

707 | /// dynamic memory allocation. |

708 | /// |

709 | /// ``` |

710 | /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; |

711 | /// |

712 | /// // Compile our original DFA. |

713 | /// let original_dfa = DFA::new("foo[0-9]+")?; |

714 | /// |

715 | /// // Create a 4KB buffer on the stack to store our serialized DFA. |

716 | /// let mut buf = [0u8; 4 * (1<<10)]; |

717 | /// // N.B. We use native endianness here to make the example work, but |

718 | /// // using write_to_big_endian would work on a big endian target. |

719 | /// let written = original_dfa.write_to_native_endian(&mut buf)?; |

720 | /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0; |

721 | /// |

722 | /// let expected = Some(HalfMatch::must(0, 8)); |

723 | /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); |

724 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |

725 | /// ``` |

726 | pub fn write_to_big_endian( |

727 | &self, |

728 | dst: &mut [u8], |

729 | ) -> Result<usize, SerializeError> { |

730 | self.write_to::<wire::BE>(dst) |

731 | } |

732 | |

733 | /// Serialize this DFA as raw bytes to the given slice, in native endian |

734 | /// format. Upon success, the total number of bytes written to `dst` is |

735 | /// returned. |

736 | /// |

737 | /// The written bytes are guaranteed to be deserialized correctly and |

738 | /// without errors in a semver compatible release of this crate by a |

739 | /// `DFA`'s deserialization APIs (assuming all other criteria for the |

740 | /// deserialization APIs has been satisfied): |

741 | /// |

742 | /// * [`DFA::from_bytes`] |

743 | /// * [`DFA::from_bytes_unchecked`] |

744 | /// |

745 | /// Generally speaking, native endian format should only be used when |

746 | /// you know that the target you're compiling the DFA for matches the |

747 | /// endianness of the target on which you're compiling DFA. For example, |

748 | /// if serialization and deserialization happen in the same process or on |

749 | /// the same machine. Otherwise, when serializing a DFA for use in a |

750 | /// portable environment, you'll almost certainly want to serialize _both_ |

751 | /// a little endian and a big endian version and then load the correct one |

752 | /// based on the target's configuration. |

753 | /// |

754 | /// # Errors |

755 | /// |

756 | /// This returns an error if the given destination slice is not big enough |

757 | /// to contain the full serialized DFA. If an error occurs, then nothing |

758 | /// is written to `dst`. |

759 | /// |

760 | /// # Example |

761 | /// |

762 | /// This example shows how to serialize and deserialize a DFA without |

763 | /// dynamic memory allocation. |

764 | /// |

765 | /// ``` |

766 | /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; |

767 | /// |

768 | /// // Compile our original DFA. |

769 | /// let original_dfa = DFA::new("foo[0-9]+")?; |

770 | /// |

771 | /// // Create a 4KB buffer on the stack to store our serialized DFA. |

772 | /// let mut buf = [0u8; 4 * (1<<10)]; |

773 | /// let written = original_dfa.write_to_native_endian(&mut buf)?; |

774 | /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0; |

775 | /// |

776 | /// let expected = Some(HalfMatch::must(0, 8)); |

777 | /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); |

778 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |

779 | /// ``` |

780 | pub fn write_to_native_endian( |

781 | &self, |

782 | dst: &mut [u8], |

783 | ) -> Result<usize, SerializeError> { |

784 | self.write_to::<wire::NE>(dst) |

785 | } |

786 | |

787 | /// The implementation of the public `write_to` serialization methods, |

788 | /// which is generic over endianness. |

789 | fn write_to<E: Endian>( |

790 | &self, |

791 | dst: &mut [u8], |

792 | ) -> Result<usize, SerializeError> { |

793 | let mut nw = 0; |

794 | nw += wire::write_label(LABEL, &mut dst[nw..])?; |

795 | nw += wire::write_endianness_check::<E>(&mut dst[nw..])?; |

796 | nw += wire::write_version::<E>(VERSION, &mut dst[nw..])?; |

797 | nw += { |

798 | // Currently unused, intended for future flexibility |

799 | E::write_u32(0, &mut dst[nw..]); |

800 | size_of::<u32>() |

801 | }; |

802 | nw += self.flags.write_to::<E>(&mut dst[nw..])?; |

803 | nw += self.tt.write_to::<E>(&mut dst[nw..])?; |

804 | nw += self.st.write_to::<E>(&mut dst[nw..])?; |

805 | nw += self.special.write_to::<E>(&mut dst[nw..])?; |

806 | nw += self.quitset.write_to::<E>(&mut dst[nw..])?; |

807 | Ok(nw) |

808 | } |

809 | |

810 | /// Return the total number of bytes required to serialize this DFA. |

811 | /// |

812 | /// This is useful for determining the size of the buffer required to pass |

813 | /// to one of the serialization routines: |

814 | /// |

815 | /// * [`DFA::write_to_little_endian`] |

816 | /// * [`DFA::write_to_big_endian`] |

817 | /// * [`DFA::write_to_native_endian`] |

818 | /// |

819 | /// Passing a buffer smaller than the size returned by this method will |

820 | /// result in a serialization error. |

821 | /// |

822 | /// # Example |

823 | /// |

824 | /// This example shows how to dynamically allocate enough room to serialize |

825 | /// a sparse DFA. |

826 | /// |

827 | /// ``` |

828 | /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; |

829 | /// |

830 | /// // Compile our original DFA. |

831 | /// let original_dfa = DFA::new("foo[0-9]+")?; |

832 | /// |

833 | /// let mut buf = vec![0; original_dfa.write_to_len()]; |

834 | /// let written = original_dfa.write_to_native_endian(&mut buf)?; |

835 | /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0; |

836 | /// |

837 | /// let expected = Some(HalfMatch::must(0, 8)); |

838 | /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); |

839 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |

840 | /// ``` |

841 | pub fn write_to_len(&self) -> usize { |

842 | wire::write_label_len(LABEL) |

843 | + wire::write_endianness_check_len() |

844 | + wire::write_version_len() |

845 | + size_of::<u32>() // unused, intended for future flexibility |

846 | + self.flags.write_to_len() |

847 | + self.tt.write_to_len() |

848 | + self.st.write_to_len() |

849 | + self.special.write_to_len() |

850 | + self.quitset.write_to_len() |

851 | } |

852 | } |

853 | |

854 | impl<'a> DFA<&'a [u8]> { |

855 | /// Safely deserialize a sparse DFA with a specific state identifier |

856 | /// representation. Upon success, this returns both the deserialized DFA |

857 | /// and the number of bytes read from the given slice. Namely, the contents |

858 | /// of the slice beyond the DFA are not read. |

859 | /// |

860 | /// Deserializing a DFA using this routine will never allocate heap memory. |

861 | /// For safety purposes, the DFA's transitions will be verified such that |

862 | /// every transition points to a valid state. If this verification is too |

863 | /// costly, then a [`DFA::from_bytes_unchecked`] API is provided, which |

864 | /// will always execute in constant time. |

865 | /// |

866 | /// The bytes given must be generated by one of the serialization APIs |

867 | /// of a `DFA` using a semver compatible release of this crate. Those |

868 | /// include: |

869 | /// |

870 | /// * [`DFA::to_bytes_little_endian`] |

871 | /// * [`DFA::to_bytes_big_endian`] |

872 | /// * [`DFA::to_bytes_native_endian`] |

873 | /// * [`DFA::write_to_little_endian`] |

874 | /// * [`DFA::write_to_big_endian`] |

875 | /// * [`DFA::write_to_native_endian`] |

876 | /// |

877 | /// The `to_bytes` methods allocate and return a `Vec<u8>` for you. The |

878 | /// `write_to` methods do not allocate and write to an existing slice |

879 | /// (which may be on the stack). Since deserialization always uses the |

880 | /// native endianness of the target platform, the serialization API you use |

881 | /// should match the endianness of the target platform. (It's often a good |

882 | /// idea to generate serialized DFAs for both forms of endianness and then |

883 | /// load the correct one based on endianness.) |

884 | /// |

885 | /// # Errors |

886 | /// |

887 | /// Generally speaking, it's easier to state the conditions in which an |

888 | /// error is _not_ returned. All of the following must be true: |

889 | /// |

890 | /// * The bytes given must be produced by one of the serialization APIs |

891 | /// on this DFA, as mentioned above. |

892 | /// * The endianness of the target platform matches the endianness used to |

893 | /// serialized the provided DFA. |

894 | /// |

895 | /// If any of the above are not true, then an error will be returned. |

896 | /// |

897 | /// Note that unlike deserializing a [`dense::DFA`], deserializing a sparse |

898 | /// DFA has no alignment requirements. That is, an alignment of `1` is |

899 | /// valid. |

900 | /// |

901 | /// # Panics |

902 | /// |

903 | /// This routine will never panic for any input. |

904 | /// |

905 | /// # Example |

906 | /// |

907 | /// This example shows how to serialize a DFA to raw bytes, deserialize it |

908 | /// and then use it for searching. |

909 | /// |

910 | /// ``` |

911 | /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; |

912 | /// |

913 | /// let initial = DFA::new("foo[0-9]+")?; |

914 | /// let bytes = initial.to_bytes_native_endian(); |

915 | /// let dfa: DFA<&[u8]> = DFA::from_bytes(&bytes)?.0; |

916 | /// |

917 | /// let expected = Some(HalfMatch::must(0, 8)); |

918 | /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); |

919 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |

920 | /// ``` |

921 | /// |

922 | /// # Example: loading a DFA from static memory |

923 | /// |

924 | /// One use case this library supports is the ability to serialize a |

925 | /// DFA to disk and then use `include_bytes!` to store it in a compiled |

926 | /// Rust program. Those bytes can then be cheaply deserialized into a |

927 | /// `DFA` structure at runtime and used for searching without having to |

928 | /// re-compile the DFA (which can be quite costly). |

929 | /// |

930 | /// We can show this in two parts. The first part is serializing the DFA to |

931 | /// a file: |

932 | /// |

933 | /// ```no_run |

934 | /// use regex_automata::dfa::sparse::DFA; |

935 | /// |

936 | /// let dfa = DFA::new("foo[0-9]+")?; |

937 | /// |

938 | /// // Write a big endian serialized version of this DFA to a file. |

939 | /// let bytes = dfa.to_bytes_big_endian(); |

940 | /// std::fs::write("foo.bigendian.dfa", &bytes)?; |

941 | /// |

942 | /// // Do it again, but this time for little endian. |

943 | /// let bytes = dfa.to_bytes_little_endian(); |

944 | /// std::fs::write("foo.littleendian.dfa", &bytes)?; |

945 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |

946 | /// ``` |

947 | /// |

948 | /// And now the second part is embedding the DFA into the compiled program |

949 | /// and deserializing it at runtime on first use. We use conditional |

950 | /// compilation to choose the correct endianness. We do not need to employ |

951 | /// any special tricks to ensure a proper alignment, since a sparse DFA has |

952 | /// no alignment requirements. |

953 | /// |

954 | /// ```no_run |

955 | /// use regex_automata::{ |

956 | /// dfa::{Automaton, sparse::DFA}, |

957 | /// util::lazy::Lazy, |

958 | /// HalfMatch, Input, |

959 | /// }; |

960 | /// |

961 | /// // This crate provides its own "lazy" type, kind of like |

962 | /// // lazy_static! or once_cell::sync::Lazy. But it works in no-alloc |

963 | /// // no-std environments and let's us write this using completely |

964 | /// // safe code. |

965 | /// static RE: Lazy<DFA<&'static [u8]>> = Lazy::new(|| { |

966 | /// # const _: &str = stringify! { |

967 | /// #[cfg(target_endian = "big")] |

968 | /// static BYTES: &[u8] = include_bytes!("foo.bigendian.dfa"); |

969 | /// #[cfg(target_endian = "little")] |

970 | /// static BYTES: &[u8] = include_bytes!("foo.littleendian.dfa"); |

971 | /// # }; |

972 | /// # static BYTES: &[u8] = b""; |

973 | /// |

974 | /// let (dfa, _) = DFA::from_bytes(BYTES) |

975 | /// .expect("serialized DFA should be valid"); |

976 | /// dfa |

977 | /// }); |

978 | /// |

979 | /// let expected = Ok(Some(HalfMatch::must(0, 8))); |

980 | /// assert_eq!(expected, RE.try_search_fwd(&Input::new("foo12345"))); |

981 | /// ``` |

982 | /// |

983 | /// Alternatively, consider using |

984 | /// [`lazy_static`](https://crates.io/crates/lazy_static) |

985 | /// or |

986 | /// [`once_cell`](https://crates.io/crates/once_cell), |

987 | /// which will guarantee safety for you. |

988 | pub fn from_bytes( |

989 | slice: &'a [u8], |

990 | ) -> Result<(DFA<&'a [u8]>, usize), DeserializeError> { |

991 | // SAFETY: This is safe because we validate both the sparse transitions |

992 | // (by trying to decode every state) and start state ID list below. If |

993 | // either validation fails, then we return an error. |

994 | let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? }; |

995 | let seen = dfa.tt.validate(&dfa.special)?; |

996 | dfa.st.validate(&dfa.special, &seen)?; |

997 | // N.B. dfa.special doesn't have a way to do unchecked deserialization, |

998 | // so it has already been validated. |

999 | Ok((dfa, nread)) |

1000 | } |

1001 | |

1002 | /// Deserialize a DFA with a specific state identifier representation in |

1003 | /// constant time by omitting the verification of the validity of the |

1004 | /// sparse transitions. |

1005 | /// |

1006 | /// This is just like [`DFA::from_bytes`], except it can potentially return |

1007 | /// a DFA that exhibits undefined behavior if its transitions contains |

1008 | /// invalid state identifiers. |

1009 | /// |

1010 | /// This routine is useful if you need to deserialize a DFA cheaply and |

1011 | /// cannot afford the transition validation performed by `from_bytes`. |

1012 | /// |

1013 | /// # Safety |

1014 | /// |

1015 | /// This routine is not safe because it permits callers to provide |

1016 | /// arbitrary transitions with possibly incorrect state identifiers. While |

1017 | /// the various serialization routines will never return an incorrect |

1018 | /// DFA, there is no guarantee that the bytes provided here are correct. |

1019 | /// While `from_bytes_unchecked` will still do several forms of basic |

1020 | /// validation, this routine does not check that the transitions themselves |

1021 | /// are correct. Given an incorrect transition table, it is possible for |

1022 | /// the search routines to access out-of-bounds memory because of explicit |

1023 | /// bounds check elision. |

1024 | /// |

1025 | /// # Example |

1026 | /// |

1027 | /// ``` |

1028 | /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; |

1029 | /// |

1030 | /// let initial = DFA::new("foo[0-9]+")?; |

1031 | /// let bytes = initial.to_bytes_native_endian(); |

1032 | /// // SAFETY: This is guaranteed to be safe since the bytes given come |

1033 | /// // directly from a compatible serialization routine. |

1034 | /// let dfa: DFA<&[u8]> = unsafe { DFA::from_bytes_unchecked(&bytes)?.0 }; |

1035 | /// |

1036 | /// let expected = Some(HalfMatch::must(0, 8)); |

1037 | /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); |

1038 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |

1039 | /// ``` |

1040 | pub unsafe fn from_bytes_unchecked( |

1041 | slice: &'a [u8], |

1042 | ) -> Result<(DFA<&'a [u8]>, usize), DeserializeError> { |

1043 | let mut nr = 0; |

1044 | |

1045 | nr += wire::read_label(&slice[nr..], LABEL)?; |

1046 | nr += wire::read_endianness_check(&slice[nr..])?; |

1047 | nr += wire::read_version(&slice[nr..], VERSION)?; |

1048 | |

1049 | let _unused = wire::try_read_u32(&slice[nr..], "unused space")?; |

1050 | nr += size_of::<u32>(); |

1051 | |

1052 | let (flags, nread) = Flags::from_bytes(&slice[nr..])?; |

1053 | nr += nread; |

1054 | |

1055 | let (tt, nread) = Transitions::from_bytes_unchecked(&slice[nr..])?; |

1056 | nr += nread; |

1057 | |

1058 | let (st, nread) = StartTable::from_bytes_unchecked(&slice[nr..])?; |

1059 | nr += nread; |

1060 | |

1061 | let (special, nread) = Special::from_bytes(&slice[nr..])?; |

1062 | nr += nread; |

1063 | if special.max.as_usize() >= tt.sparse().len() { |

1064 | return Err(DeserializeError::generic( |

1065 | "max should not be greater than or equal to sparse bytes", |

1066 | )); |

1067 | } |

1068 | |

1069 | let (quitset, nread) = ByteSet::from_bytes(&slice[nr..])?; |

1070 | nr += nread; |

1071 | |

1072 | // Prefilters don't support serialization, so they're always absent. |

1073 | let pre = None; |

1074 | Ok((DFA { tt, st, special, pre, quitset, flags }, nr)) |

1075 | } |

1076 | } |

1077 | |

1078 | impl<T: AsRef<[u8]>> fmt::Debug for DFA<T> { |

1079 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |

1080 | writeln!(f, "sparse::DFA(")?; |

1081 | for state in self.tt.states() { |

1082 | fmt_state_indicator(f, self, state.id())?; |

1083 | writeln!(f, "{:06?}: {:?}", state.id().as_usize(), state)?; |

1084 | } |

1085 | writeln!(f, "")?; |

1086 | for (i, (start_id, anchored, sty)) in self.st.iter().enumerate() { |

1087 | if i % self.st.stride == 0 { |

1088 | match anchored { |

1089 | Anchored::No => writeln!(f, "START-GROUP(unanchored)")?, |

1090 | Anchored::Yes => writeln!(f, "START-GROUP(anchored)")?, |

1091 | Anchored::Pattern(pid) => writeln!( |

1092 | f, |

1093 | "START_GROUP(pattern: {:?})", |

1094 | pid.as_usize() |

1095 | )?, |

1096 | } |

1097 | } |

1098 | writeln!(f, " {:?} => {:06?}", sty, start_id.as_usize())?; |

1099 | } |

1100 | writeln!(f, "state length: {:?}", self.tt.state_len)?; |

1101 | writeln!(f, "pattern length: {:?}", self.pattern_len())?; |

1102 | writeln!(f, "flags: {:?}", self.flags)?; |

1103 | writeln!(f, ")")?; |

1104 | Ok(()) |

1105 | } |

1106 | } |

1107 | |

1108 | // SAFETY: We assert that our implementation of each method is correct. |

1109 | unsafe impl<T: AsRef<[u8]>> Automaton for DFA<T> { |

1110 | #[inline] |

1111 | fn is_special_state(&self, id: StateID) -> bool { |

1112 | self.special.is_special_state(id) |

1113 | } |

1114 | |

1115 | #[inline] |

1116 | fn is_dead_state(&self, id: StateID) -> bool { |

1117 | self.special.is_dead_state(id) |

1118 | } |

1119 | |

1120 | #[inline] |

1121 | fn is_quit_state(&self, id: StateID) -> bool { |

1122 | self.special.is_quit_state(id) |

1123 | } |

1124 | |

1125 | #[inline] |

1126 | fn is_match_state(&self, id: StateID) -> bool { |

1127 | self.special.is_match_state(id) |

1128 | } |

1129 | |

1130 | #[inline] |

1131 | fn is_start_state(&self, id: StateID) -> bool { |

1132 | self.special.is_start_state(id) |

1133 | } |

1134 | |

1135 | #[inline] |

1136 | fn is_accel_state(&self, id: StateID) -> bool { |

1137 | self.special.is_accel_state(id) |

1138 | } |

1139 | |

1140 | // This is marked as inline to help dramatically boost sparse searching, |

1141 | // which decodes each state it enters to follow the next transition. |

1142 | #[cfg_attr(feature = "perf-inline", inline(always))] |

1143 | fn next_state(&self, current: StateID, input: u8) -> StateID { |

1144 | let input = self.tt.classes.get(input); |

1145 | self.tt.state(current).next(input) |

1146 | } |

1147 | |

1148 | #[inline] |

1149 | unsafe fn next_state_unchecked( |

1150 | &self, |

1151 | current: StateID, |

1152 | input: u8, |

1153 | ) -> StateID { |

1154 | self.next_state(current, input) |

1155 | } |

1156 | |

1157 | #[inline] |

1158 | fn next_eoi_state(&self, current: StateID) -> StateID { |

1159 | self.tt.state(current).next_eoi() |

1160 | } |

1161 | |

1162 | #[inline] |

1163 | fn pattern_len(&self) -> usize { |

1164 | self.tt.pattern_len |

1165 | } |

1166 | |

1167 | #[inline] |

1168 | fn match_len(&self, id: StateID) -> usize { |

1169 | self.tt.state(id).pattern_len() |

1170 | } |

1171 | |

1172 | #[inline] |

1173 | fn match_pattern(&self, id: StateID, match_index: usize) -> PatternID { |

1174 | // This is an optimization for the very common case of a DFA with a |

1175 | // single pattern. This conditional avoids a somewhat more costly path |

1176 | // that finds the pattern ID from the state machine, which requires |

1177 | // a bit of slicing/pointer-chasing. This optimization tends to only |

1178 | // matter when matches are frequent. |

1179 | if self.tt.pattern_len == 1 { |

1180 | return PatternID::ZERO; |

1181 | } |

1182 | self.tt.state(id).pattern_id(match_index) |

1183 | } |

1184 | |

1185 | #[inline] |

1186 | fn has_empty(&self) -> bool { |

1187 | self.flags.has_empty |

1188 | } |

1189 | |

1190 | #[inline] |

1191 | fn is_utf8(&self) -> bool { |

1192 | self.flags.is_utf8 |

1193 | } |

1194 | |

1195 | #[inline] |

1196 | fn is_always_start_anchored(&self) -> bool { |

1197 | self.flags.is_always_start_anchored |

1198 | } |

1199 | |

1200 | #[inline] |

1201 | fn start_state( |

1202 | &self, |

1203 | config: &start::Config, |

1204 | ) -> Result<StateID, StartError> { |

1205 | let anchored = config.get_anchored(); |

1206 | let start = match config.get_look_behind() { |

1207 | None => Start::Text, |

1208 | Some(byte) => { |

1209 | if !self.quitset.is_empty() && self.quitset.contains(byte) { |

1210 | return Err(StartError::quit(byte)); |

1211 | } |

1212 | self.st.start_map.get(byte) |

1213 | } |

1214 | }; |

1215 | self.st.start(anchored, start) |

1216 | } |

1217 | |

1218 | #[inline] |

1219 | fn universal_start_state(&self, mode: Anchored) -> Option<StateID> { |

1220 | match mode { |

1221 | Anchored::No => self.st.universal_start_unanchored, |

1222 | Anchored::Yes => self.st.universal_start_anchored, |

1223 | Anchored::Pattern(_) => None, |

1224 | } |

1225 | } |

1226 | |

1227 | #[inline] |

1228 | fn accelerator(&self, id: StateID) -> &[u8] { |

1229 | self.tt.state(id).accelerator() |

1230 | } |

1231 | |

1232 | #[inline] |

1233 | fn get_prefilter(&self) -> Option<&Prefilter> { |

1234 | self.pre.as_ref() |

1235 | } |

1236 | } |

1237 | |

/// The transition table portion of a sparse DFA.
///
/// The transition table is the core part of the DFA in that it describes how
/// to move from one state to another based on the input sequence observed.
///
/// Unlike a typical dense table based DFA, states in a sparse transition
/// table have variable size. That is, states with more transitions use more
/// space than states with fewer transitions. This means that finding the next
/// transition takes more work than with a dense DFA, but also typically uses
/// much less space.
#[derive(Clone)]
struct Transitions<T> {
    /// The raw encoding of each state in this DFA.
    ///
    /// Each state has the following information:
    ///
    /// * A set of transitions to subsequent states. Transitions to the dead
    ///   state are omitted.
    /// * If the state can be accelerated, then any additional accelerator
    ///   information.
    /// * If the state is a match state, then the state contains all pattern
    ///   IDs that match when in that state.
    ///
    /// To decode a state, use `Transitions::state`.
    ///
    /// In practice, `T` is either `Vec<u8>` or `&[u8]`.
    sparse: T,
    /// A set of equivalence classes, where a single equivalence class
    /// represents a set of bytes that never discriminate between a match
    /// and a non-match in the DFA. Each equivalence class corresponds to a
    /// single character in this DFA's alphabet, where the maximum number of
    /// characters is 257 (each possible value of a byte plus the special
    /// EOI transition). Consequently, the number of equivalence classes
    /// corresponds to the number of transitions for each DFA state. Note
    /// though that the *space* used by each DFA state in the transition table
    /// is not tied to this number, since sparse states have variable size
    /// (see the type-level documentation above).
    ///
    /// The only time the number of equivalence classes is fewer than 257 is
    /// if the DFA's kind uses byte classes, which is the default. Equivalence
    /// classes should generally only be disabled when debugging, so that
    /// the transitions themselves aren't obscured. Disabling them has no
    /// other benefit, since the equivalence class map is always used while
    /// searching. In the vast majority of cases, the number of equivalence
    /// classes is substantially smaller than 257, particularly when large
    /// Unicode classes aren't used.
    ///
    /// N.B. Equivalence classes aren't particularly useful in a sparse DFA
    /// in the current implementation, since equivalence classes generally tend
    /// to correspond to continuous ranges of bytes that map to the same
    /// transition. So in a sparse DFA, equivalence classes don't really lead
    /// to a space savings. In the future, it would be good to try and remove
    /// them from sparse DFAs entirely, but that requires a bit of work since
    /// sparse DFAs are built from dense DFAs, which are in turn built on top
    /// of equivalence classes.
    classes: ByteClasses,
    /// The total number of states in this DFA. Note that a DFA always has at
    /// least one state---the dead state---even the empty DFA. In particular,
    /// the dead state always has ID 0 and is correspondingly always the first
    /// state. The dead state is never a match state.
    state_len: usize,
    /// The total number of unique patterns represented by these match states.
    pattern_len: usize,
}

1302 | |

1303 | impl<'a> Transitions<&'a [u8]> { |

1304 | unsafe fn from_bytes_unchecked( |

1305 | mut slice: &'a [u8], |

1306 | ) -> Result<(Transitions<&'a [u8]>, usize), DeserializeError> { |

1307 | let slice_start = slice.as_ptr().as_usize(); |

1308 | |

1309 | let (state_len, nr) = |

1310 | wire::try_read_u32_as_usize(&slice, "state length")?; |

1311 | slice = &slice[nr..]; |

1312 | |

1313 | let (pattern_len, nr) = |

1314 | wire::try_read_u32_as_usize(&slice, "pattern length")?; |

1315 | slice = &slice[nr..]; |

1316 | |

1317 | let (classes, nr) = ByteClasses::from_bytes(&slice)?; |

1318 | slice = &slice[nr..]; |

1319 | |

1320 | let (len, nr) = |

1321 | wire::try_read_u32_as_usize(&slice, "sparse transitions length")?; |

1322 | slice = &slice[nr..]; |

1323 | |

1324 | wire::check_slice_len(slice, len, "sparse states byte length")?; |

1325 | let sparse = &slice[..len]; |

1326 | slice = &slice[len..]; |

1327 | |

1328 | let trans = Transitions { sparse, classes, state_len, pattern_len }; |

1329 | Ok((trans, slice.as_ptr().as_usize() - slice_start)) |

1330 | } |

1331 | } |

1332 | |

1333 | impl<T: AsRef<[u8]>> Transitions<T> { |

1334 | /// Writes a serialized form of this transition table to the buffer given. |

1335 | /// If the buffer is too small, then an error is returned. To determine |

1336 | /// how big the buffer must be, use `write_to_len`. |

1337 | fn write_to<E: Endian>( |

1338 | &self, |

1339 | mut dst: &mut [u8], |

1340 | ) -> Result<usize, SerializeError> { |

1341 | let nwrite = self.write_to_len(); |

1342 | if dst.len() < nwrite { |

1343 | return Err(SerializeError::buffer_too_small( |

1344 | "sparse transition table", |

1345 | )); |

1346 | } |

1347 | dst = &mut dst[..nwrite]; |

1348 | |

1349 | // write state length |

1350 | E::write_u32(u32::try_from(self.state_len).unwrap(), dst); |

1351 | dst = &mut dst[size_of::<u32>()..]; |

1352 | |

1353 | // write pattern length |

1354 | E::write_u32(u32::try_from(self.pattern_len).unwrap(), dst); |

1355 | dst = &mut dst[size_of::<u32>()..]; |

1356 | |

1357 | // write byte class map |

1358 | let n = self.classes.write_to(dst)?; |

1359 | dst = &mut dst[n..]; |

1360 | |

1361 | // write number of bytes in sparse transitions |

1362 | E::write_u32(u32::try_from(self.sparse().len()).unwrap(), dst); |

1363 | dst = &mut dst[size_of::<u32>()..]; |

1364 | |

1365 | // write actual transitions |

1366 | let mut id = DEAD; |

1367 | while id.as_usize() < self.sparse().len() { |

1368 | let state = self.state(id); |

1369 | let n = state.write_to::<E>(&mut dst)?; |

1370 | dst = &mut dst[n..]; |

1371 | // The next ID is the offset immediately following `state`. |

1372 | id = StateID::new(id.as_usize() + state.write_to_len()).unwrap(); |

1373 | } |

1374 | Ok(nwrite) |

1375 | } |

1376 | |

1377 | /// Returns the number of bytes the serialized form of this transition |

1378 | /// table will use. |

1379 | fn write_to_len(&self) -> usize { |

1380 | size_of::<u32>() // state length |

1381 | + size_of::<u32>() // pattern length |

1382 | + self.classes.write_to_len() |

1383 | + size_of::<u32>() // sparse transitions length |

1384 | + self.sparse().len() |

1385 | } |

1386 | |

1387 | /// Validates that every state ID in this transition table is valid. |

1388 | /// |

1389 | /// That is, every state ID can be used to correctly index a state in this |

1390 | /// table. |

1391 | fn validate(&self, sp: &Special) -> Result<Seen, DeserializeError> { |

1392 | let mut verified = Seen::new(); |

1393 | // We need to make sure that we decode the correct number of states. |

1394 | // Otherwise, an empty set of transitions would validate even if the |

1395 | // recorded state length is non-empty. |

1396 | let mut len = 0; |

1397 | // We can't use the self.states() iterator because it assumes the state |

1398 | // encodings are valid. It could panic if they aren't. |

1399 | let mut id = DEAD; |

1400 | while id.as_usize() < self.sparse().len() { |

1401 | // Before we even decode the state, we check that the ID itself |

1402 | // is well formed. That is, if it's a special state then it must |

1403 | // actually be a quit, dead, accel, match or start state. |

1404 | if sp.is_special_state(id) { |

1405 | let is_actually_special = sp.is_dead_state(id) |

1406 | || sp.is_quit_state(id) |

1407 | || sp.is_match_state(id) |

1408 | || sp.is_start_state(id) |

1409 | || sp.is_accel_state(id); |

1410 | if !is_actually_special { |

1411 | // This is kind of a cryptic error message... |

1412 | return Err(DeserializeError::generic( |

1413 | "found sparse state tagged as special but \ |

1414 | wasn't actually special", |

1415 | )); |

1416 | } |

1417 | } |

1418 | let state = self.try_state(sp, id)?; |

1419 | verified.insert(id); |

1420 | // The next ID should be the offset immediately following `state`. |

1421 | id = StateID::new(wire::add( |

1422 | id.as_usize(), |

1423 | state.write_to_len(), |

1424 | "next state ID offset", |

1425 | )?) |

1426 | .map_err(|err| { |

1427 | DeserializeError::state_id_error(err, "next state ID offset") |

1428 | })?; |

1429 | len += 1; |

1430 | } |

1431 | // Now that we've checked that all top-level states are correct and |

1432 | // importantly, collected a set of valid state IDs, we have all the |

1433 | // information we need to check that all transitions are correct too. |

1434 | // |

1435 | // Note that we can't use `valid_ids` to iterate because it will |

1436 | // be empty in no-std no-alloc contexts. (And yes, that means our |

1437 | // verification isn't quite as good.) We can use `self.states()` |

1438 | // though at least, since we know that all states can at least be |

1439 | // decoded and traversed correctly. |

1440 | for state in self.states() { |

1441 | // Check that all transitions in this state are correct. |

1442 | for i in 0..state.ntrans { |

1443 | let to = state.next_at(i); |

1444 | // For no-alloc, we just check that the state can decode. It is |

1445 | // technically possible that the state ID could still point to |

1446 | // a non-existent state even if it decodes (fuzzing proved this |

1447 | // to be true), but it shouldn't result in any memory unsafety |

1448 | // or panics in non-debug mode. |

1449 | #[cfg(not(feature = "alloc"))] |

1450 | { |

1451 | let _ = self.try_state(sp, to)?; |

1452 | } |

1453 | #[cfg(feature = "alloc")] |

1454 | { |

1455 | if !verified.contains(&to) { |

1456 | return Err(DeserializeError::generic( |

1457 | "found transition that points to a \ |

1458 | non-existent state", |

1459 | )); |

1460 | } |

1461 | } |

1462 | } |

1463 | } |

1464 | if len != self.state_len { |

1465 | return Err(DeserializeError::generic( |

1466 | "mismatching sparse state length", |

1467 | )); |

1468 | } |

1469 | Ok(verified) |

1470 | } |

1471 | |

1472 | /// Converts these transitions to a borrowed value. |

1473 | fn as_ref(&self) -> Transitions<&'_ [u8]> { |

1474 | Transitions { |

1475 | sparse: self.sparse(), |

1476 | classes: self.classes.clone(), |

1477 | state_len: self.state_len, |

1478 | pattern_len: self.pattern_len, |

1479 | } |

1480 | } |

1481 | |

/// Converts these transitions to an owned value.
///
/// The sparse state bytes are copied into a freshly allocated `Vec<u8>`.
#[cfg(feature = "alloc")]
fn to_owned(&self) -> Transitions<alloc::vec::Vec<u8>> {
    let sparse = self.sparse().to_vec();
    let classes = self.classes.clone();
    Transitions {
        sparse,
        classes,
        state_len: self.state_len,
        pattern_len: self.pattern_len,
    }
}

1492 | |

1493 | /// Return a convenient representation of the given state. |

1494 | /// |

1495 | /// This panics if the state is invalid. |

1496 | /// |

1497 | /// This is marked as inline to help dramatically boost sparse searching, |

1498 | /// which decodes each state it enters to follow the next transition. Other |

1499 | /// functions involved are also inlined, which should hopefully eliminate |

1500 | /// a lot of the extraneous decoding that is never needed just to follow |

1501 | /// the next transition. |

1502 | #[cfg_attr(feature = "perf-inline", inline(always))] |

1503 | fn state(&self, id: StateID) -> State<'_> { |

1504 | let mut state = &self.sparse()[id.as_usize()..]; |

1505 | let mut ntrans = wire::read_u16(&state).as_usize(); |

1506 | let is_match = (1 << 15) & ntrans != 0; |

1507 | ntrans &= !(1 << 15); |

1508 | state = &state[2..]; |

1509 | |

1510 | let (input_ranges, state) = state.split_at(ntrans * 2); |

1511 | let (next, state) = state.split_at(ntrans * StateID::SIZE); |

1512 | let (pattern_ids, state) = if is_match { |

1513 | let npats = wire::read_u32(&state).as_usize(); |

1514 | state[4..].split_at(npats * 4) |

1515 | } else { |

1516 | (&[][..], state) |

1517 | }; |

1518 | |

1519 | let accel_len = usize::from(state[0]); |

1520 | let accel = &state[1..accel_len + 1]; |

1521 | State { id, is_match, ntrans, input_ranges, next, pattern_ids, accel } |

1522 | } |

1523 | |

1524 | /// Like `state`, but will return an error if the state encoding is |

1525 | /// invalid. This is useful for verifying states after deserialization, |

1526 | /// which is required for a safe deserialization API. |

1527 | /// |

1528 | /// Note that this only verifies that this state is decodable and that |

1529 | /// all of its data is consistent. It does not verify that its state ID |

1530 | /// transitions point to valid states themselves, nor does it verify that |

1531 | /// every pattern ID is valid. |

1532 | fn try_state( |

1533 | &self, |

1534 | sp: &Special, |

1535 | id: StateID, |

1536 | ) -> Result<State<'_>, DeserializeError> { |

1537 | if id.as_usize() > self.sparse().len() { |

1538 | return Err(DeserializeError::generic( |

1539 | "invalid caller provided sparse state ID", |

1540 | )); |

1541 | } |

1542 | let mut state = &self.sparse()[id.as_usize()..]; |

1543 | // Encoding format starts with a u16 that stores the total number of |

1544 | // transitions in this state. |

1545 | let (mut ntrans, _) = |

1546 | wire::try_read_u16_as_usize(state, "state transition length")?; |

1547 | let is_match = ((1 << 15) & ntrans) != 0; |

1548 | ntrans &= !(1 << 15); |

1549 | state = &state[2..]; |

1550 | if ntrans > 257 || ntrans == 0 { |

1551 | return Err(DeserializeError::generic( |

1552 | "invalid transition length", |

1553 | )); |

1554 | } |

1555 | if is_match && !sp.is_match_state(id) { |

1556 | return Err(DeserializeError::generic( |

1557 | "state marked as match but not in match ID range", |

1558 | )); |

1559 | } else if !is_match && sp.is_match_state(id) { |

1560 | return Err(DeserializeError::generic( |

1561 | "state in match ID range but not marked as match state", |

1562 | )); |

1563 | } |

1564 | |

1565 | // Each transition has two pieces: an inclusive range of bytes on which |

1566 | // it is defined, and the state ID that those bytes transition to. The |

1567 | // pairs come first, followed by a corresponding sequence of state IDs. |

1568 | let input_ranges_len = ntrans.checked_mul(2).unwrap(); |

1569 | wire::check_slice_len(state, input_ranges_len, "sparse byte pairs")?; |

1570 | let (input_ranges, state) = state.split_at(input_ranges_len); |

1571 | // Every range should be of the form A-B, where A<=B. |

1572 | for pair in input_ranges.chunks(2) { |

1573 | let (start, end) = (pair[0], pair[1]); |

1574 | if start > end { |

1575 | return Err(DeserializeError::generic("invalid input range")); |

1576 | } |

1577 | } |

1578 | |

1579 | // And now extract the corresponding sequence of state IDs. We leave |

1580 | // this sequence as a &[u8] instead of a &[S] because sparse DFAs do |

1581 | // not have any alignment requirements. |

1582 | let next_len = ntrans |

1583 | .checked_mul(self.id_len()) |

1584 | .expect("state size * #trans should always fit in a usize"); |

1585 | wire::check_slice_len(state, next_len, "sparse trans state IDs")?; |

1586 | let (next, state) = state.split_at(next_len); |

1587 | // We can at least verify that every state ID is in bounds. |

1588 | for idbytes in next.chunks(self.id_len()) { |

1589 | let (id, _) = |

1590 | wire::read_state_id(idbytes, "sparse state ID in try_state")?; |

1591 | wire::check_slice_len( |

1592 | self.sparse(), |

1593 | id.as_usize(), |

1594 | "invalid sparse state ID", |

1595 | )?; |

1596 | } |

1597 | |

1598 | // If this is a match state, then read the pattern IDs for this state. |

1599 | // Pattern IDs is a u32-length prefixed sequence of native endian |

1600 | // encoded 32-bit integers. |

1601 | let (pattern_ids, state) = if is_match { |

1602 | let (npats, nr) = |

1603 | wire::try_read_u32_as_usize(state, "pattern ID length")?; |

1604 | let state = &state[nr..]; |

1605 | if npats == 0 { |

1606 | return Err(DeserializeError::generic( |

1607 | "state marked as a match, but pattern length is zero", |

1608 | )); |

1609 | } |

1610 | |

1611 | let pattern_ids_len = |

1612 | wire::mul(npats, 4, "sparse pattern ID byte length")?; |

1613 | wire::check_slice_len( |

1614 | state, |

1615 | pattern_ids_len, |

1616 | "sparse pattern IDs", |

1617 | )?; |

1618 | let (pattern_ids, state) = state.split_at(pattern_ids_len); |

1619 | for patbytes in pattern_ids.chunks(PatternID::SIZE) { |

1620 | wire::read_pattern_id( |

1621 | patbytes, |

1622 | "sparse pattern ID in try_state", |

1623 | )?; |

1624 | } |

1625 | (pattern_ids, state) |

1626 | } else { |

1627 | (&[][..], state) |

1628 | }; |

1629 | if is_match && pattern_ids.is_empty() { |

1630 | return Err(DeserializeError::generic( |

1631 | "state marked as a match, but has no pattern IDs", |

1632 | )); |

1633 | } |

1634 | if sp.is_match_state(id) && pattern_ids.is_empty() { |

1635 | return Err(DeserializeError::generic( |

1636 | "state marked special as a match, but has no pattern IDs", |

1637 | )); |

1638 | } |

1639 | if sp.is_match_state(id) != is_match { |

1640 | return Err(DeserializeError::generic( |

1641 | "whether state is a match or not is inconsistent", |

1642 | )); |

1643 | } |

1644 | |

1645 | // Now read this state's accelerator info. The first byte is the length |

1646 | // of the accelerator, which is typically 0 (for no acceleration) but |

1647 | // is no bigger than 3. The length indicates the number of bytes that |

1648 | // follow, where each byte corresponds to a transition out of this |

1649 | // state. |

1650 | if state.is_empty() { |

1651 | return Err(DeserializeError::generic("no accelerator length")); |

1652 | } |

1653 | let (accel_len, state) = (usize::from(state[0]), &state[1..]); |

1654 | |

1655 | if accel_len > 3 { |

1656 | return Err(DeserializeError::generic( |

1657 | "sparse invalid accelerator length", |

1658 | )); |

1659 | } else if accel_len == 0 && sp.is_accel_state(id) { |

1660 | return Err(DeserializeError::generic( |

1661 | "got no accelerators in state, but in accelerator ID range", |

1662 | )); |

1663 | } else if accel_len > 0 && !sp.is_accel_state(id) { |

1664 | return Err(DeserializeError::generic( |

1665 | "state in accelerator ID range, but has no accelerators", |

1666 | )); |

1667 | } |

1668 | |

1669 | wire::check_slice_len( |

1670 | state, |

1671 | accel_len, |

1672 | "sparse corrupt accelerator length", |

1673 | )?; |

1674 | let (accel, _) = (&state[..accel_len], &state[accel_len..]); |

1675 | |

1676 | let state = State { |

1677 | id, |

1678 | is_match, |

1679 | ntrans, |

1680 | input_ranges, |

1681 | next, |

1682 | pattern_ids, |

1683 | accel, |

1684 | }; |

1685 | if sp.is_quit_state(state.next_at(state.ntrans - 1)) { |

1686 | return Err(DeserializeError::generic( |

1687 | "state with EOI transition to quit state is illegal", |

1688 | )); |

1689 | } |

1690 | Ok(state) |

1691 | } |

1692 | |

1693 | /// Return an iterator over all of the states in this DFA. |

1694 | /// |

1695 | /// The iterator returned yields tuples, where the first element is the |

1696 | /// state ID and the second element is the state itself. |

1697 | fn states(&self) -> StateIter<'_, T> { |

1698 | StateIter { trans: self, id: DEAD.as_usize() } |

1699 | } |

1700 | |

1701 | /// Returns the sparse transitions as raw bytes. |

1702 | fn sparse(&self) -> &[u8] { |

1703 | self.sparse.as_ref() |

1704 | } |

1705 | |

1706 | /// Returns the number of bytes represented by a single state ID. |

1707 | fn id_len(&self) -> usize { |

1708 | StateID::SIZE |

1709 | } |

1710 | |

1711 | /// Return the memory usage, in bytes, of these transitions. |

1712 | /// |

1713 | /// This does not include the size of a `Transitions` value itself. |

1714 | fn memory_usage(&self) -> usize { |

1715 | self.sparse().len() |

1716 | } |

1717 | } |

1718 | |

#[cfg(feature = "dfa-build")]
impl<T: AsMut<[u8]>> Transitions<T> {
    /// Decode the state found at the given ID into a representation whose
    /// pieces can be mutated in place.
    ///
    /// The state ID is used directly as a byte offset into the encoded
    /// transitions. This panics if the state is invalid.
    fn state_mut(&mut self, id: StateID) -> StateMut<'_> {
        // The most significant bit of the leading u16 flags a match state.
        // The remaining bits hold the number of transitions.
        const MATCH_BIT: usize = 1 << 15;

        let encoded = &mut self.sparse_mut()[id.as_usize()..];
        let header = wire::read_u16(&*encoded).as_usize();
        let is_match = header & MATCH_BIT != 0;
        let ntrans = header & !MATCH_BIT;
        let rest = &mut encoded[2..];

        // Two bytes (an inclusive range) per transition, followed by one
        // state ID per transition.
        let (input_ranges, rest) = rest.split_at_mut(ntrans * 2);
        let (next, rest) = rest.split_at_mut(ntrans * StateID::SIZE);
        // Only match states carry a u32 length-prefixed list of pattern IDs.
        let (pattern_ids, rest) = if is_match {
            let npats = wire::read_u32(&*rest).as_usize();
            rest[4..].split_at_mut(npats * 4)
        } else {
            (&mut [][..], rest)
        };

        // A single length byte followed by that many accelerator bytes.
        let accel_len = usize::from(rest[0]);
        let accel = &mut rest[1..accel_len + 1];
        StateMut {
            id,
            is_match,
            ntrans,
            input_ranges,
            next,
            pattern_ids,
            accel,
        }
    }

    /// Exposes the encoded sparse transitions as a mutable byte slice.
    fn sparse_mut(&mut self) -> &mut [u8] {
        AsMut::<[u8]>::as_mut(&mut self.sparse)
    }
}

1757 | |

/// The set of all possible starting states in a DFA.
///
/// See the eponymous type in the `dense` module for more details. This type
/// is very similar to `dense::StartTable`, except that its underlying
/// representation is a sequence of raw bytes (`&[u8]`) instead of a slice of
/// state IDs. (The latter would require sparse DFAs to be aligned, which is
/// explicitly something we do not require because we don't really need it.)
#[derive(Clone)]
struct StartTable<T> {
    /// The initial start state IDs as a contiguous table of native endian
    /// encoded integers, each occupying `StateID::SIZE` bytes.
    ///
    /// In practice, T is either Vec<u8> or &[u8] and has no alignment
    /// requirements.
    ///
    /// The first `2 * stride` (currently always 8) entries always correspond
    /// to the start states for the entire DFA, with the first 4 entries being
    /// for unanchored searches and the second 4 entries being for anchored
    /// searches. To keep things simple, we always use 8 entries even if the
    /// `StartKind` is not both.
    ///
    /// After that, there are `stride * patterns` state IDs, where `patterns`
    /// may be zero in the case of a DFA with no patterns or in the case where
    /// the DFA was built without enabling starting states for each pattern.
    table: T,
    /// The starting state configuration supported. When 'both', both
    /// unanchored and anchored searches work. When 'unanchored', anchored
    /// searches panic. When 'anchored', unanchored searches panic.
    kind: StartKind,
    /// The start state configuration for every possible byte.
    start_map: StartByteMap,
    /// The number of starting state IDs per pattern.
    stride: usize,
    /// The total number of patterns for which starting states are encoded.
    /// This is `None` for DFAs that were built without start states for each
    /// pattern. Thus, one cannot use this field to say how many patterns
    /// are in the DFA in all cases. It is specific to how many patterns are
    /// represented in this start table.
    pattern_len: Option<usize>,
    /// The universal starting state for unanchored searches. This is only
    /// present when the DFA supports unanchored searches and when all starting
    /// state IDs for an unanchored search are equivalent.
    universal_start_unanchored: Option<StateID>,
    /// The universal starting state for anchored searches. This is only
    /// present when the DFA supports anchored searches and when all starting
    /// state IDs for an anchored search are equivalent.
    universal_start_anchored: Option<StateID>,
}

1806 | |

#[cfg(feature = "dfa-build")]
impl StartTable<Vec<u8>> {
    /// Creates a start table whose every entry is zeroed.
    ///
    /// The table has room for `2 * stride` whole-DFA start states plus
    /// `stride` start states for each of the `pattern_len` patterns (none
    /// when `pattern_len` is `None`). The start kind, start byte map and
    /// universal start states are taken from the given dense DFA.
    fn new<T: AsRef<[u32]>>(
        dfa: &dense::DFA<T>,
        pattern_len: Option<usize>,
    ) -> StartTable<Vec<u8>> {
        let stride = Start::len();
        // The unwraps are OK since the only way we're here is if a dense
        // DFA could be constructed successfully, which uses the same space.
        let per_pattern_ids =
            stride.checked_mul(pattern_len.unwrap_or(0)).unwrap();
        let whole_dfa_ids = stride.checked_mul(2).unwrap();
        let len = per_pattern_ids
            .checked_add(whole_dfa_ids)
            .unwrap()
            .checked_mul(StateID::SIZE)
            .unwrap();

        let table = vec![0; len];
        let kind = dfa.start_kind();
        let start_map = dfa.start_map().clone();
        let universal_start_unanchored =
            dfa.universal_start_state(Anchored::No);
        let universal_start_anchored =
            dfa.universal_start_state(Anchored::Yes);
        StartTable {
            table,
            kind,
            start_map,
            stride,
            pattern_len,
            universal_start_unanchored,
            universal_start_anchored,
        }
    }

    /// Builds a sparse start table from the start states of a dense DFA.
    ///
    /// `remap` is indexed by the dense DFA's state index and gives the
    /// corresponding sparse state ID. As written, this always returns `Ok`.
    fn from_dense_dfa<T: AsRef<[u32]>>(
        dfa: &dense::DFA<T>,
        remap: &[StateID],
    ) -> Result<StartTable<Vec<u8>>, BuildError> {
        // Unless the DFA has start states compiled for each pattern, then
        // as far as the starting state table is concerned, there are zero
        // patterns to account for. It will instead only store starting
        // states for the entire DFA.
        let start_pattern_len =
            dfa.starts_for_each_pattern().then(|| dfa.pattern_len());
        let mut table = StartTable::new(dfa, start_pattern_len);
        for (dense_id, anchored, start_type) in dfa.starts() {
            let dense_index = dfa.to_index(dense_id);
            table.set_start(anchored, start_type, remap[dense_index]);
        }
        Ok(table)
    }
}

1856 | |

1857 | impl<'a> StartTable<&'a [u8]> { |

1858 | unsafe fn from_bytes_unchecked( |

1859 | mut slice: &'a [u8], |

1860 | ) -> Result<(StartTable<&'a [u8]>, usize), DeserializeError> { |

1861 | let slice_start = slice.as_ptr().as_usize(); |

1862 | |

1863 | let (kind, nr) = StartKind::from_bytes(slice)?; |

1864 | slice = &slice[nr..]; |

1865 | |

1866 | let (start_map, nr) = StartByteMap::from_bytes(slice)?; |

1867 | slice = &slice[nr..]; |

1868 | |

1869 | let (stride, nr) = |

1870 | wire::try_read_u32_as_usize(slice, "sparse start table stride")?; |

1871 | slice = &slice[nr..]; |

1872 | if stride != Start::len() { |

1873 | return Err(DeserializeError::generic( |

1874 | "invalid sparse starting table stride", |

1875 | )); |

1876 | } |

1877 | |

1878 | let (maybe_pattern_len, nr) = |

1879 | wire::try_read_u32_as_usize(slice, "sparse start table patterns")?; |

1880 | slice = &slice[nr..]; |

1881 | let pattern_len = if maybe_pattern_len.as_u32() == u32::MAX { |

1882 | None |

1883 | } else { |

1884 | Some(maybe_pattern_len) |

1885 | }; |

1886 | if pattern_len.map_or(false, |len| len > PatternID::LIMIT) { |

1887 | return Err(DeserializeError::generic( |

1888 | "sparse invalid number of patterns", |

1889 | )); |

1890 | } |

1891 | |

1892 | let (universal_unanchored, nr) = |

1893 | wire::try_read_u32(slice, "universal unanchored start")?; |

1894 | slice = &slice[nr..]; |

1895 | let universal_start_unanchored = if universal_unanchored == u32::MAX { |

1896 | None |

1897 | } else { |

1898 | Some(StateID::try_from(universal_unanchored).map_err(|e| { |

1899 | DeserializeError::state_id_error( |

1900 | e, |

1901 | "universal unanchored start", |

1902 | ) |

1903 | })?) |

1904 | }; |

1905 | |

1906 | let (universal_anchored, nr) = |

1907 | wire::try_read_u32(slice, "universal anchored start")?; |

1908 | slice = &slice[nr..]; |

1909 | let universal_start_anchored = if universal_anchored == u32::MAX { |

1910 | None |

1911 | } else { |

1912 | Some(StateID::try_from(universal_anchored).map_err(|e| { |

1913 | DeserializeError::state_id_error(e, "universal anchored start") |

1914 | })?) |

1915 | }; |

1916 | |

1917 | let pattern_table_size = wire::mul( |

1918 | stride, |

1919 | pattern_len.unwrap_or(0), |

1920 | "sparse invalid pattern length", |

1921 | )?; |

1922 | // Our start states always start with a single stride of start states |

1923 | // for the entire automaton which permit it to match any pattern. What |

1924 | // follows it are an optional set of start states for each pattern. |

1925 | let start_state_len = wire::add( |

1926 | wire::mul(2, stride, "start state stride too big")?, |

1927 | pattern_table_size, |

1928 | "sparse invalid 'any' pattern starts size", |

1929 | )?; |

1930 | let table_bytes_len = wire::mul( |

1931 | start_state_len, |

1932 | StateID::SIZE, |

1933 | "sparse pattern table bytes length", |

1934 | )?; |

1935 | wire::check_slice_len( |

1936 | slice, |

1937 | table_bytes_len, |

1938 | "sparse start ID table", |

1939 | )?; |

1940 | let table = &slice[..table_bytes_len]; |

1941 | slice = &slice[table_bytes_len..]; |

1942 | |

1943 | let sl = StartTable { |

1944 | table, |

1945 | kind, |

1946 | start_map, |

1947 | stride, |

1948 | pattern_len, |

1949 | universal_start_unanchored, |

1950 | universal_start_anchored, |

1951 | }; |

1952 | Ok((sl, slice.as_ptr().as_usize() - slice_start)) |

1953 | } |

1954 | } |

1955 | |

1956 | impl<T: AsRef<[u8]>> StartTable<T> { |

1957 | fn write_to<E: Endian>( |

1958 | &self, |

1959 | mut dst: &mut [u8], |

1960 | ) -> Result<usize, SerializeError> { |

1961 | let nwrite = self.write_to_len(); |

1962 | if dst.len() < nwrite { |

1963 | return Err(SerializeError::buffer_too_small( |

1964 | "sparse starting table ids", |

1965 | )); |

1966 | } |

1967 | dst = &mut dst[..nwrite]; |

1968 | |

1969 | // write start kind |

1970 | let nw = self.kind.write_to::<E>(dst)?; |

1971 | dst = &mut dst[nw..]; |

1972 | // write start byte map |

1973 | let nw = self.start_map.write_to(dst)?; |

1974 | dst = &mut dst[nw..]; |

1975 | // write stride |

1976 | E::write_u32(u32::try_from(self.stride).unwrap(), dst); |

1977 | dst = &mut dst[size_of::<u32>()..]; |

1978 | // write pattern length |

1979 | E::write_u32( |

1980 | u32::try_from(self.pattern_len.unwrap_or(0xFFFF_FFFF)).unwrap(), |

1981 | dst, |

1982 | ); |

1983 | dst = &mut dst[size_of::<u32>()..]; |

1984 | // write universal start unanchored state id, u32::MAX if absent |

1985 | E::write_u32( |

1986 | self.universal_start_unanchored |

1987 | .map_or(u32::MAX, |sid| sid.as_u32()), |

1988 | dst, |

1989 | ); |

1990 | dst = &mut dst[size_of::<u32>()..]; |

1991 | // write universal start anchored state id, u32::MAX if absent |

1992 | E::write_u32( |

1993 | self.universal_start_anchored.map_or(u32::MAX, |sid| sid.as_u32()), |

1994 | dst, |

1995 | ); |

1996 | dst = &mut dst[size_of::<u32>()..]; |

1997 | // write start IDs |

1998 | for (sid, _, _) in self.iter() { |

1999 | E::write_u32(sid.as_u32(), dst); |

2000 | dst = &mut dst[StateID::SIZE..]; |

2001 | } |

2002 | Ok(nwrite) |

2003 | } |

2004 | |

2005 | /// Returns the number of bytes the serialized form of this transition |

2006 | /// table will use. |

2007 | fn write_to_len(&self) -> usize { |

2008 | self.kind.write_to_len() |

2009 | + self.start_map.write_to_len() |

2010 | + size_of::<u32>() // stride |

2011 | + size_of::<u32>() // # patterns |

2012 | + size_of::<u32>() // universal unanchored start |

2013 | + size_of::<u32>() // universal anchored start |

2014 | + self.table().len() |

2015 | } |

2016 | |

2017 | /// Validates that every starting state ID in this table is valid. |

2018 | /// |

2019 | /// That is, every starting state ID can be used to correctly decode a |

2020 | /// state in the DFA's sparse transitions. |

2021 | fn validate( |

2022 | &self, |

2023 | sp: &Special, |

2024 | seen: &Seen, |

2025 | ) -> Result<(), DeserializeError> { |

2026 | for (id, _, _) in self.iter() { |

2027 | if !seen.contains(&id) { |

2028 | return Err(DeserializeError::generic( |

2029 | "found invalid start state ID", |

2030 | )); |

2031 | } |

2032 | if sp.is_match_state(id) { |

2033 | return Err(DeserializeError::generic( |

2034 | "start states cannot be match states", |

2035 | )); |

2036 | } |

2037 | } |

2038 | Ok(()) |

2039 | } |

2040 | |

2041 | /// Converts this start list to a borrowed value. |

2042 | fn as_ref(&self) -> StartTable<&'_ [u8]> { |

2043 | StartTable { |

2044 | table: self.table(), |

2045 | kind: self.kind, |

2046 | start_map: self.start_map.clone(), |

2047 | stride: self.stride, |

2048 | pattern_len: self.pattern_len, |

2049 | universal_start_unanchored: self.universal_start_unanchored, |

2050 | universal_start_anchored: self.universal_start_anchored, |

2051 | } |

2052 | } |

2053 | |

2054 | /// Converts this start list to an owned value. |

2055 | #[cfg(feature = "alloc")] |

2056 | fn to_owned(&self) -> StartTable<alloc::vec::Vec<u8>> { |

2057 | StartTable { |

2058 | table: self.table().to_vec(), |

2059 | kind: self.kind, |

2060 | start_map: self.start_map.clone(), |

2061 | stride: self.stride, |

2062 | pattern_len: self.pattern_len, |

2063 | universal_start_unanchored: self.universal_start_unanchored, |

2064 | universal_start_anchored: self.universal_start_anchored, |

2065 | } |

2066 | } |

2067 | |

2068 | /// Return the start state for the given index and pattern ID. If the |

2069 | /// pattern ID is None, then the corresponding start state for the entire |

2070 | /// DFA is returned. If the pattern ID is not None, then the corresponding |

2071 | /// starting state for the given pattern is returned. If this start table |

2072 | /// does not have individual starting states for each pattern, then this |

2073 | /// panics. |

2074 | fn start( |

2075 | &self, |

2076 | anchored: Anchored, |

2077 | start: Start, |

2078 | ) -> Result<StateID, StartError> { |

2079 | let start_index = start.as_usize(); |

2080 | let index = match anchored { |

2081 | Anchored::No => { |

2082 | if !self.kind.has_unanchored() { |

2083 | return Err(StartError::unsupported_anchored(anchored)); |

2084 | } |

2085 | start_index |

2086 | } |

2087 | Anchored::Yes => { |

2088 | if !self.kind.has_anchored() { |

2089 | return Err(StartError::unsupported_anchored(anchored)); |

2090 | } |

2091 | self.stride + start_index |

2092 | } |

2093 | Anchored::Pattern(pid) => { |

2094 | let len = match self.pattern_len { |

2095 | None => { |

2096 | return Err(StartError::unsupported_anchored(anchored)) |

2097 | } |

2098 | Some(len) => len, |

2099 | }; |

2100 | if pid.as_usize() >= len { |

2101 | return Ok(DEAD); |

2102 | } |

2103 | (2 * self.stride) |

2104 | + (self.stride * pid.as_usize()) |

2105 | + start_index |

2106 | } |

2107 | }; |

2108 | let start = index * StateID::SIZE; |

2109 | // This OK since we're allowed to assume that the start table contains |

2110 | // valid StateIDs. |

2111 | Ok(wire::read_state_id_unchecked(&self.table()[start..]).0) |

2112 | } |

2113 | |

2114 | /// Return an iterator over all start IDs in this table. |

2115 | fn iter(&self) -> StartStateIter<'_, T> { |

2116 | StartStateIter { st: self, i: 0 } |

2117 | } |

2118 | |

2119 | /// Returns the total number of start state IDs in this table. |

2120 | fn len(&self) -> usize { |

2121 | self.table().len() / StateID::SIZE |

2122 | } |

2123 | |

2124 | /// Returns the table as a raw slice of bytes. |

2125 | fn table(&self) -> &[u8] { |

2126 | self.table.as_ref() |

2127 | } |

2128 | |

2129 | /// Return the memory usage, in bytes, of this start list. |

2130 | /// |

2131 | /// This does not include the size of a `StartTable` value itself. |

2132 | fn memory_usage(&self) -> usize { |

2133 | self.table().len() |

2134 | } |

2135 | } |

2136 | |

#[cfg(feature = "dfa-build")]
impl<T: AsMut<[u8]>> StartTable<T> {
    /// Store `id` as the start state for the given anchored mode and start
    /// configuration.
    ///
    /// This panics if a pattern is given but this table has no start states
    /// for each pattern, if the pattern ID is out of range, or if the
    /// resulting entry lies outside the table.
    fn set_start(&mut self, anchored: Anchored, start: Start, id: StateID) {
        let offset = start.as_usize();
        let index = match anchored {
            Anchored::No => offset,
            Anchored::Yes => self.stride + offset,
            Anchored::Pattern(pid) => {
                let pid = pid.as_usize();
                let len = self
                    .pattern_len
                    .expect("start states for each pattern enabled");
                assert!(pid < len, "invalid pattern ID {:?}", pid);
                // Skip over the strides of all preceding patterns and over
                // the two whole-DFA strides.
                let preceding = self.stride.checked_mul(pid).unwrap();
                let whole_dfa = self.stride.checked_mul(2).unwrap();
                preceding
                    .checked_add(whole_dfa)
                    .unwrap()
                    .checked_add(offset)
                    .unwrap()
            }
        };
        let at = index * StateID::SIZE;
        let slot = &mut self.table.as_mut()[at..at + StateID::SIZE];
        wire::write_state_id::<wire::NE>(id, slot);
    }
}

2170 | |

/// An iterator over all start state IDs in a sparse DFA.
///
/// Each item is a start state ID together with the anchored mode and the
/// start configuration that it corresponds to.
struct StartStateIter<'a, T> {
    /// The start table being iterated over.
    st: &'a StartTable<T>,
    /// The index of the next table entry to yield.
    i: usize,
}

2176 | |

2177 | impl<'a, T: AsRef<[u8]>> Iterator for StartStateIter<'a, T> { |

2178 | type Item = (StateID, Anchored, Start); |

2179 | |

2180 | fn next(&mut self) -> Option<(StateID, Anchored, Start)> { |

2181 | let i = self.i; |

2182 | if i >= self.st.len() { |

2183 | return None; |

2184 | } |

2185 | self.i += 1; |

2186 | |

2187 | // This unwrap is okay since the stride of any DFA must always match |

2188 | // the number of start state types. |

2189 | let start_type = Start::from_usize(i % self.st.stride).unwrap(); |

2190 | let anchored = if i < self.st.stride { |

2191 | Anchored::No |

2192 | } else if i < (2 * self.st.stride) { |

2193 | Anchored::Yes |

2194 | } else { |

2195 | let pid = (i - (2 * self.st.stride)) / self.st.stride; |

2196 | Anchored::Pattern(PatternID::new(pid).unwrap()) |

2197 | }; |

2198 | let start = i * StateID::SIZE; |

2199 | let end = start + StateID::SIZE; |

2200 | let bytes = self.st.table()[start..end].try_into().unwrap(); |

2201 | // This is OK since we're allowed to assume that any IDs in this start |

2202 | // table are correct and valid for this DFA. |

2203 | let id = StateID::from_ne_bytes_unchecked(bytes); |

2204 | Some((id, anchored, start_type)) |

2205 | } |

2206 | } |

2207 | |

2208 | impl<'a, T> fmt::Debug for StartStateIter<'a, T> { |

2209 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |

2210 | f.debug_struct("StartStateIter").field( "i", &self.i).finish() |

2211 | } |

2212 | } |

2213 | |

/// An iterator over all states in a sparse DFA.
///
/// This iterator yields `State` values. Each yielded state carries its own
/// state ID.
struct StateIter<'a, T> {
    /// The transitions whose states are being iterated over.
    trans: &'a Transitions<T>,
    /// The ID of the next state to decode. It is compared against the length
    /// of the encoded transitions to detect the end of iteration.
    id: usize,
}

2222 | |

2223 | impl<'a, T: AsRef<[u8]>> Iterator for StateIter<'a, T> { |

2224 | type Item = State<'a>; |

2225 | |

2226 | fn next(&mut self) -> Option<State<'a>> { |

2227 | if self.id >= self.trans.sparse().len() { |

2228 | return None; |

2229 | } |

2230 | let state = self.trans.state(StateID::new_unchecked(self.id)); |

2231 | self.id = self.id + state.write_to_len(); |

2232 | Some(state) |

2233 | } |

2234 | } |

2235 | |

2236 | impl<'a, T> fmt::Debug for StateIter<'a, T> { |

2237 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |

2238 | f.debug_struct("StateIter").field( "id", &self.id).finish() |

2239 | } |

2240 | } |

2241 | |

/// A representation of a sparse DFA state that can be cheaply materialized
/// from a state identifier.
///
/// Every slice in this type borrows directly from the encoded transitions.
#[derive(Clone)]
struct State<'a> {
    /// The identifier of this state.
    id: StateID,
    /// Whether this is a match state or not.
    is_match: bool,
    /// The number of transitions in this state.
    ntrans: usize,
    /// Pairs of input ranges, where there is one pair for each transition.
    /// Each pair specifies an inclusive start and end byte range for the
    /// corresponding transition.
    input_ranges: &'a [u8],
    /// Transitions to the next state. This slice contains native endian
    /// encoded state identifiers, each occupying `StateID::SIZE` bytes. Thus,
    /// there are `ntrans * StateID::SIZE` bytes in this slice.
    next: &'a [u8],
    /// If this is a match state, then this contains the pattern IDs that match
    /// when the DFA is in this state.
    ///
    /// This is a contiguous sequence of 32-bit native endian encoded integers.
    pattern_ids: &'a [u8],
    /// An accelerator for this state, if present. If this state has no
    /// accelerator, then this is an empty slice. When non-empty, this slice
    /// has length at most 3 and corresponds to the exhaustive set of bytes
    /// that must be seen in order to transition out of this state.
    accel: &'a [u8],
}

2271 | |

2272 | impl<'a> State<'a> { |

2273 | /// Searches for the next transition given an input byte. If no such |

2274 | /// transition could be found, then a dead state is returned. |

2275 | /// |

2276 | /// This is marked as inline to help dramatically boost sparse searching, |

2277 | /// which decodes each state it enters to follow the next transition. |

2278 | #[cfg_attr(feature = "perf-inline", inline(always))] |

2279 | fn next(&self, input: u8) -> StateID { |

2280 | // This straight linear search was observed to be much better than |

2281 | // binary search on ASCII haystacks, likely because a binary search |

2282 | // visits the ASCII case last but a linear search sees it first. A |

2283 | // binary search does do a little better on non-ASCII haystacks, but |

2284 | // not by much. There might be a better trade off lurking here. |

2285 | for i in 0..(self.ntrans - 1) { |

2286 | let (start, end) = self.range(i); |

2287 | if start <= input && input <= end { |

2288 | return self.next_at(i); |

2289 | } |

2290 | // We could bail early with an extra branch: if input < b1, then |

2291 | // we know we'll never find a matching transition. Interestingly, |

2292 | // this extra branch seems to not help performance, or will even |

2293 | // hurt it. It's likely very dependent on the DFA itself and what |

2294 | // is being searched. |

2295 | } |

2296 | DEAD |

2297 | } |

2298 | |

2299 | /// Returns the next state ID for the special EOI transition. |

2300 | fn next_eoi(&self) -> StateID { |

2301 | self.next_at(self.ntrans - 1) |

2302 | } |

2303 | |

2304 | /// Returns the identifier for this state. |

2305 | fn id(&self) -> StateID { |

2306 | self.id |

2307 | } |

2308 | |

2309 | /// Returns the inclusive input byte range for the ith transition in this |

2310 | /// state. |

2311 | fn range(&self, i: usize) -> (u8, u8) { |

2312 | (self.input_ranges[i * 2], self.input_ranges[i * 2 + 1]) |

2313 | } |

2314 | |

2315 | /// Returns the next state for the ith transition in this state. |

2316 | fn next_at(&self, i: usize) -> StateID { |

2317 | let start = i * StateID::SIZE; |

2318 | let end = start + StateID::SIZE; |

2319 | let bytes = self.next[start..end].try_into().unwrap(); |

2320 | StateID::from_ne_bytes_unchecked(bytes) |

2321 | } |

2322 | |

2323 | /// Returns the pattern ID for the given match index. If the match index |

2324 | /// is invalid, then this panics. |

2325 | fn pattern_id(&self, match_index: usize) -> PatternID { |

2326 | let start = match_index * PatternID::SIZE; |

2327 | wire::read_pattern_id_unchecked(&self.pattern_ids[start..]).0 |

2328 | } |

2329 | |

2330 | /// Returns the total number of pattern IDs for this state. This is always |

2331 | /// zero when `is_match` is false. |

2332 | fn pattern_len(&self) -> usize { |

2333 | assert_eq!(0, self.pattern_ids.len() % 4); |

2334 | self.pattern_ids.len() / 4 |

2335 | } |

2336 | |

2337 | /// Return an accelerator for this state. |

2338 | fn accelerator(&self) -> &'a [u8] { |

2339 | self.accel |

2340 | } |

2341 | |

2342 | /// Write the raw representation of this state to the given buffer using |

2343 | /// the given endianness. |

2344 | fn write_to<E: Endian>( |

2345 | &self, |

2346 | mut dst: &mut [u8], |

2347 | ) -> Result<usize, SerializeError> { |

2348 | let nwrite = self.write_to_len(); |

2349 | if dst.len() < nwrite { |

2350 | return Err(SerializeError::buffer_too_small( |

2351 | "sparse state transitions", |

2352 | )); |

2353 | } |

2354 | |

2355 | let ntrans = |

2356 | if self.is_match { self.ntrans | (1 << 15) } else { self.ntrans }; |

2357 | E::write_u16(u16::try_from(ntrans).unwrap(), dst); |

2358 | dst = &mut dst[size_of::<u16>()..]; |

2359 | |

2360 | dst[..self.input_ranges.len()].copy_from_slice(self.input_ranges); |

2361 | dst = &mut dst[self.input_ranges.len()..]; |

2362 | |

2363 | for i in 0..self.ntrans { |

2364 | E::write_u32(self.next_at(i).as_u32(), dst); |

2365 | dst = &mut dst[StateID::SIZE..]; |

2366 | } |

2367 | |

2368 | if self.is_match { |

2369 | E::write_u32(u32::try_from(self.pattern_len()).unwrap(), dst); |

2370 | dst = &mut dst[size_of::<u32>()..]; |

2371 | for i in 0..self.pattern_len() { |

2372 | let pid = self.pattern_id(i); |

2373 | E::write_u32(pid.as_u32(), dst); |

2374 | dst = &mut dst[PatternID::SIZE..]; |

2375 | } |

2376 | } |

2377 | |

2378 | dst[0] = u8::try_from(self.accel.len()).unwrap(); |

2379 | dst[1..][..self.accel.len()].copy_from_slice(self.accel); |

2380 | |

2381 | Ok(nwrite) |

2382 | } |

2383 | |

2384 | /// Return the total number of bytes that this state consumes in its |

2385 | /// encoded form. |

2386 | fn write_to_len(&self) -> usize { |

2387 | let mut len = 2 |

2388 | + (self.ntrans * 2) |

2389 | + (self.ntrans * StateID::SIZE) |

2390 | + (1 + self.accel.len()); |

2391 | if self.is_match { |

2392 | len += size_of::<u32>() + self.pattern_ids.len(); |

2393 | } |

2394 | len |

2395 | } |

2396 | } |

2397 | |

2398 | impl<'a> fmt::Debug for State<'a> { |

2399 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |

2400 | let mut printed = false; |

2401 | for i in 0..(self.ntrans - 1) { |

2402 | let next = self.next_at(i); |

2403 | if next == DEAD { |

2404 | continue; |

2405 | } |

2406 | |

2407 | if printed { |

2408 | write!(f, ", ")?; |

2409 | } |

2410 | let (start, end) = self.range(i); |

2411 | if start == end { |

2412 | write!(f, "{:?} => {:?}", DebugByte(start), next.as_usize())?; |

2413 | } else { |

2414 | write!( |

2415 | f, |

2416 | "{:?}-{:?} => {:?}", |

2417 | DebugByte(start), |

2418 | DebugByte(end), |

2419 | next.as_usize(), |

2420 | )?; |

2421 | } |

2422 | printed = true; |

2423 | } |

2424 | let eoi = self.next_at(self.ntrans - 1); |

2425 | if eoi != DEAD { |

2426 | if printed { |

2427 | write!(f, ", ")?; |

2428 | } |

2429 | write!(f, "EOI => {:?}", eoi.as_usize())?; |

2430 | } |

2431 | Ok(()) |

2432 | } |

2433 | } |

2434 | |

2435 | /// A representation of a mutable sparse DFA state that can be cheaply |

2436 | /// materialized from a state identifier. |

2437 | #[cfg(feature = "dfa-build")] |

2438 | struct StateMut<'a> { |

2439 | /// The identifier of this state. |

2440 | id: StateID, |

2441 | /// Whether this is a match state or not. |

2442 | is_match: bool, |

2443 | /// The number of transitions in this state (the last one is for EOI). |

2444 | ntrans: usize, |

2445 | /// Pairs of input ranges, where there is one pair for each transition. |

2446 | /// Each pair specifies an inclusive start and end byte range for the |

2447 | /// corresponding transition. |

2448 | input_ranges: &'a mut [u8], |

2449 | /// Transitions to the next state. This slice contains native endian |

2450 | /// encoded state identifiers, each `StateID::SIZE` bytes wide. Thus, |

2451 | /// there are `ntrans * StateID::SIZE` bytes in this slice. |

2452 | next: &'a mut [u8], |

2453 | /// If this is a match state, then this contains the pattern IDs that match |

2454 | /// when the DFA is in this state. |

2455 | /// |

2456 | /// This is a contiguous sequence of 32-bit native endian encoded integers. |

2457 | pattern_ids: &'a [u8], |

2458 | /// An accelerator for this state, if present. If this state has no |

2459 | /// accelerator, then this is an empty slice. When non-empty, this slice |

2460 | /// has length at most 3 and corresponds to the exhaustive set of bytes |

2461 | /// that must be seen in order to transition out of this state. |

2462 | accel: &'a mut [u8], |

2463 | } |

2464 | |

#[cfg(feature = "dfa-build")]
impl<'a> StateMut<'a> {
    /// Overwrites the target of the ith transition with `next`, encoding
    /// the state ID in native endian.
    fn set_next_at(&mut self, i: usize, next: StateID) {
        let offset = i * StateID::SIZE;
        let slot = &mut self.next[offset..offset + StateID::SIZE];
        wire::write_state_id::<wire::NE>(next, slot);
    }
}

2474 | |

#[cfg(feature = "dfa-build")]
impl<'a> fmt::Debug for StateMut<'a> {
    /// Formats this state exactly like an immutable `State` by building a
    /// read-only view over the same data and delegating to its `Debug`
    /// implementation.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        fmt::Debug::fmt(
            &State {
                id: self.id,
                is_match: self.is_match,
                ntrans: self.ntrans,
                // The mutable slices are reborrowed as shared slices.
                input_ranges: &*self.input_ranges,
                next: &*self.next,
                pattern_ids: self.pattern_ids,
                accel: &*self.accel,
            },
            f,
        )
    }
}

2490 | |

2491 | // In order to validate everything, we not only need to make sure we |

2492 | // can decode every state, but that every transition in every state |

2493 | // points to a valid state. There are many duplicative transitions, so |

2494 | // we record state IDs that we've verified so that we don't redo the |

2495 | // decoding work. |

2496 | // |

2497 | // Except, when in no_std mode, we don't have dynamic memory allocation |

2498 | // available to us, so we skip this optimization. It's not clear |

2499 | // whether doing something more clever is worth it just yet. If you're |

2500 | // profiling this code and need it to run faster, please file an issue. |

2501 | // |

2502 | // OK, so we also use this to record the set of valid state IDs, since |

2503 | // it is possible for a transition to point to an invalid state ID that |

2504 | // still (somehow) deserializes to a valid state. So we need to make |

2505 | // sure our transitions are limited to actually correct state IDs. |

2506 | // The problem is, I'm not sure how to do this verification step in |

2507 | // no-std no-alloc mode. I think we'd *have* to store the set of valid |

2508 | // state IDs in the DFA itself. For now, we don't do this verification |

2509 | // in no-std no-alloc mode. The worst thing that can happen is an |

2510 | // incorrect result. But no panics or memory safety problems should |

2511 | // result. Because we still do validate that the state itself is |

2512 | // "valid" in the sense that everything it points to actually exists. |

2513 | // |

2514 | // ---AG |

2515 | #[derive(Debug)] |

2516 | struct Seen { |

2517 | #[cfg(feature = "alloc")] |

2518 | set: alloc::collections::BTreeSet<StateID>, |

2519 | #[cfg(not(feature = "alloc"))] |

2520 | set: core::marker::PhantomData<StateID>, |

2521 | } |

2522 | |

2523 | #[cfg(feature = "alloc")] |

2524 | impl Seen { |

2525 | fn new() -> Seen { |

2526 | Seen { set: alloc::collections::BTreeSet::new() } |

2527 | } |

2528 | fn insert(&mut self, id: StateID) { |

2529 | self.set.insert(id); |

2530 | } |

2531 | fn contains(&self, id: &StateID) -> bool { |

2532 | self.set.contains(id) |

2533 | } |

2534 | } |

2535 | |

2536 | #[cfg(not(feature = "alloc"))] |

2537 | impl Seen { |

2538 | fn new() -> Seen { |

2539 | Seen { set: core::marker::PhantomData } |

2540 | } |

2541 | fn insert(&mut self, _id: StateID) {} |

2542 | fn contains(&self, _id: &StateID) -> bool { |

2543 | true |

2544 | } |

2545 | } |

2546 | |

2547 | /* |

2548 | /// A binary search routine specialized specifically to a sparse DFA state's |

2549 | /// transitions. Specifically, the transitions are defined as a set of pairs |

2550 | /// of input bytes that delineate an inclusive range of bytes. If the input |

2551 | /// byte is in the range, then the corresponding transition is a match. |

2552 | /// |

2553 | /// This binary search accepts a slice of these pairs and returns the position |

2554 | /// of the matching pair (the ith transition), or None if no matching pair |

2555 | /// could be found. |

2556 | /// |

2557 | /// Note that this routine is not currently used since it was observed to |

2558 | /// either decrease performance when searching ASCII, or did not provide enough |

2559 | /// of a boost on non-ASCII haystacks to be worth it. However, we leave it here |

2560 | /// for posterity in case we can find a way to use it. |

2561 | /// |

2562 | /// In theory, we could use the standard library's search routine if we could |

2563 | /// cast a `&[u8]` to a `&[(u8, u8)]`, but I don't believe this is currently |

2564 | /// guaranteed to be safe and is thus UB (since I don't think the in-memory |

2565 | /// representation of `(u8, u8)` has been nailed down). One could define a |

2566 | /// repr(C) type, but the casting doesn't seem justified. |

2567 | #[cfg_attr(feature = "perf-inline", inline(always))] |

2568 | fn binary_search_ranges(ranges: &[u8], needle: u8) -> Option<usize> { |

2569 | debug_assert!(ranges.len() % 2 == 0, "ranges must have even length"); |

2570 | debug_assert!(ranges.len() <= 512, "ranges should be short"); |

2571 | |

2572 | let (mut left, mut right) = (0, ranges.len() / 2); |

2573 | while left < right { |

2574 | let mid = (left + right) / 2; |

2575 | let (b1, b2) = (ranges[mid * 2], ranges[mid * 2 + 1]); |

2576 | if needle < b1 { |

2577 | right = mid; |

2578 | } else if needle > b2 { |

2579 | left = mid + 1; |

2580 | } else { |

2581 | return Some(mid); |

2582 | } |

2583 | } |

2584 | None |

2585 | } |

2586 | */ |

2587 | |

#[cfg(all(test, feature = "syntax", feature = "dfa-build"))]
mod tests {
    use crate::{
        dfa::{dense::DFA, Automaton},
        nfa::thompson,
        Input, MatchError,
    };

    // See the analogous test in src/hybrid/dfa.rs and src/dfa/dense.rs.
    //
    // 'β' is encoded in UTF-8 as the bytes 0xCE 0xB2. In both haystacks the
    // search span covers only the ASCII digits, and the search is expected
    // to report a quit error for the non-ASCII byte sitting immediately
    // outside the span.
    #[test]
    fn heuristic_unicode_forward() {
        let dfa_config = DFA::config().unicode_word_boundary(true);
        let nfa_config = thompson::Config::new().reverse(true);
        let dfa = DFA::builder()
            .configure(dfa_config)
            .thompson(nfa_config)
            .build(r"\b[0-9]+\b")
            .unwrap()
            .to_sparse()
            .unwrap();

        // The byte just before the span is 0xB2, at offset 1.
        assert_eq!(
            Err(MatchError::quit(0xB2, 1)),
            dfa.try_search_fwd(&Input::new("β123").range(2..)),
        );

        // The byte just after the span is 0xCE, at offset 3.
        assert_eq!(
            Err(MatchError::quit(0xCE, 3)),
            dfa.try_search_fwd(&Input::new("123β").range(..3)),
        );
    }

    // See the analogous test in src/hybrid/dfa.rs and src/dfa/dense.rs.
    //
    // This mirrors the forward test above, but runs the reverse search
    // routine and expects the same quit errors.
    #[test]
    fn heuristic_unicode_reverse() {
        let dfa_config = DFA::config().unicode_word_boundary(true);
        let nfa_config = thompson::Config::new().reverse(true);
        let dfa = DFA::builder()
            .configure(dfa_config)
            .thompson(nfa_config)
            .build(r"\b[0-9]+\b")
            .unwrap()
            .to_sparse()
            .unwrap();

        // The byte just before the span is 0xB2, at offset 1.
        assert_eq!(
            Err(MatchError::quit(0xB2, 1)),
            dfa.try_search_rev(&Input::new("β123").range(2..)),
        );

        // The byte just after the span is 0xCE, at offset 3.
        assert_eq!(
            Err(MatchError::quit(0xCE, 3)),
            dfa.try_search_rev(&Input::new("123β").range(..3)),
        );
    }
}

2640 |