1 | #[cfg(feature = "std")] |
---|---|

2 | use core::fmt; |

3 | #[cfg(feature = "std")] |

4 | use core::iter; |

5 | use core::marker::PhantomData; |

6 | use core::mem::size_of; |

7 | #[cfg(feature = "std")] |

8 | use std::collections::HashMap; |

9 | |

10 | #[cfg(feature = "std")] |

11 | use byteorder::{BigEndian, LittleEndian}; |

12 | use byteorder::{ByteOrder, NativeEndian}; |

13 | |

14 | use classes::ByteClasses; |

15 | use dense; |

16 | use dfa::DFA; |

17 | #[cfg(feature = "std")] |

18 | use error::{Error, Result}; |

19 | #[cfg(feature = "std")] |

20 | use state_id::{dead_id, usize_to_state_id, write_state_id_bytes, StateID}; |

21 | #[cfg(not(feature = "std"))] |

22 | use state_id::{dead_id, StateID}; |

23 | |

24 | /// A sparse table-based deterministic finite automaton (DFA). |

25 | /// |

26 | /// In contrast to a [dense DFA](enum.DenseDFA.html), a sparse DFA uses a |

27 | /// more space efficient representation for its transition table. Consequently, |

28 | /// sparse DFAs can use much less memory than dense DFAs, but this comes at a |

29 | /// price. In particular, reading the more space efficient transitions takes |

30 | /// more work, and consequently, searching using a sparse DFA is typically |

31 | /// slower than a dense DFA. |

32 | /// |

33 | /// A sparse DFA can be built using the default configuration via the |

34 | /// [`SparseDFA::new`](enum.SparseDFA.html#method.new) constructor. Otherwise, |

35 | /// one can configure various aspects of a dense DFA via |

36 | /// [`dense::Builder`](dense/struct.Builder.html), and then convert a dense |

37 | /// DFA to a sparse DFA using |

38 | /// [`DenseDFA::to_sparse`](enum.DenseDFA.html#method.to_sparse). |

39 | /// |

40 | /// In general, a sparse DFA supports all the same operations as a dense DFA. |

41 | /// |

42 | /// Making the choice between a dense and sparse DFA depends on your specific |

43 | /// work load. If you can sacrifice a bit of search time performance, then a |

44 | /// sparse DFA might be the best choice. In particular, while sparse DFAs are |

45 | /// probably always slower than dense DFAs, you may find that they are easily |

46 | /// fast enough for your purposes! |

47 | /// |

48 | /// # State size |

49 | /// |

50 | /// A `SparseDFA` has two type parameters, `T` and `S`. `T` corresponds to |

51 | /// the type of the DFA's transition table while `S` corresponds to the |

52 | /// representation used for the DFA's state identifiers as described by the |

53 | /// [`StateID`](trait.StateID.html) trait. This type parameter is typically |

54 | /// `usize`, but other valid choices provided by this crate include `u8`, |

55 | /// `u16`, `u32` and `u64`. The primary reason for choosing a different state |

56 | /// identifier representation than the default is to reduce the amount of |

57 | /// memory used by a DFA. Note though, that if the chosen representation cannot |

58 | /// accommodate the size of your DFA, then building the DFA will fail and |

59 | /// return an error. |

60 | /// |

61 | /// While the reduction in heap memory used by a DFA is one reason for choosing |

62 | /// a smaller state identifier representation, another possible reason is for |

63 | /// decreasing the serialization size of a DFA, as returned by |

64 | /// [`to_bytes_little_endian`](enum.SparseDFA.html#method.to_bytes_little_endian), |

65 | /// [`to_bytes_big_endian`](enum.SparseDFA.html#method.to_bytes_big_endian) |

66 | /// or |

67 | /// [`to_bytes_native_endian`](enum.DenseDFA.html#method.to_bytes_native_endian). |

68 | /// |

69 | /// The type of the transition table is typically either `Vec<u8>` or `&[u8]`, |

70 | /// depending on where the transition table is stored. Note that this is |

71 | /// different than a dense DFA, whose transition table is typically |

72 | /// `Vec<S>` or `&[S]`. The reason for this is that a sparse DFA always reads |

73 | /// its transition table from raw bytes because the table is compactly packed. |

74 | /// |

75 | /// # Variants |

76 | /// |

77 | /// This DFA is defined as a non-exhaustive enumeration of different types of |

78 | /// dense DFAs. All of the variants use the same internal representation |

79 | /// for the transition table, but they vary in how the transition table is |

80 | /// read. A DFA's specific variant depends on the configuration options set via |

81 | /// [`dense::Builder`](dense/struct.Builder.html). The default variant is |

82 | /// `ByteClass`. |

83 | /// |

84 | /// # The `DFA` trait |

85 | /// |

86 | /// This type implements the [`DFA`](trait.DFA.html) trait, which means it |

87 | /// can be used for searching. For example: |

88 | /// |

89 | /// ``` |

90 | /// use regex_automata::{DFA, SparseDFA}; |

91 | /// |

92 | /// # fn example() -> Result<(), regex_automata::Error> { |

93 | /// let dfa = SparseDFA::new("foo[0-9]+")?; |

94 | /// assert_eq!(Some(8), dfa.find(b"foo12345")); |

95 | /// # Ok(()) }; example().unwrap() |

96 | /// ``` |

97 | /// |

98 | /// The `DFA` trait also provides an assortment of other lower level methods |

99 | /// for DFAs, such as `start_state` and `next_state`. While these are correctly |

100 | /// implemented, it is an anti-pattern to use them in performance sensitive |

101 | /// code on the `SparseDFA` type directly. Namely, each implementation requires |

102 | /// a branch to determine which type of sparse DFA is being used. Instead, |

103 | /// this branch should be pushed up a layer in the code since walking the |

104 | /// transitions of a DFA is usually a hot path. If you do need to use these |

105 | /// lower level methods in performance critical code, then you should match on |

106 | /// the variants of this DFA and use each variant's implementation of the `DFA` |

107 | /// trait directly. |

108 | #[derive(Clone, Debug)] |

109 | pub enum SparseDFA<T: AsRef<[u8]>, S: StateID = usize> { |

110 | /// A standard DFA that does not use byte classes. |

111 | Standard(Standard<T, S>), |

112 | /// A DFA that shrinks its alphabet to a set of equivalence classes instead |

113 | /// of using all possible byte values. Any two bytes belong to the same |

114 | /// equivalence class if and only if they can be used interchangeably |

115 | /// anywhere in the DFA while never discriminating between a match and a |

116 | /// non-match. |

117 | /// |

118 | /// Unlike dense DFAs, sparse DFAs do not tend to benefit nearly as much |

119 | /// from using byte classes. In some cases, using byte classes can even |

120 | /// marginally increase the size of a sparse DFA's transition table. The |

121 | /// reason for this is that a sparse DFA already compacts each state's |

122 | /// transitions separate from whether byte classes are used. |

123 | ByteClass(ByteClass<T, S>), |

124 | /// Hints that destructuring should not be exhaustive. |

125 | /// |

126 | /// This enum may grow additional variants, so this makes sure clients |

127 | /// don't count on exhaustive matching. (Otherwise, adding a new variant |

128 | /// could break existing code.) |

129 | #[doc(hidden)] |

130 | __Nonexhaustive, |

131 | } |

132 | |

133 | #[cfg(feature = "std")] |

134 | impl SparseDFA<Vec<u8>, usize> { |

135 | /// Parse the given regular expression using a default configuration and |

136 | /// return the corresponding sparse DFA. |

137 | /// |

138 | /// The default configuration uses `usize` for state IDs and reduces the |

139 | /// alphabet size by splitting bytes into equivalence classes. The |

140 | /// resulting DFA is *not* minimized. |

141 | /// |

142 | /// If you want a non-default configuration, then use the |

143 | /// [`dense::Builder`](dense/struct.Builder.html) |

144 | /// to set your own configuration, and then call |

145 | /// [`DenseDFA::to_sparse`](enum.DenseDFA.html#method.to_sparse) |

146 | /// to create a sparse DFA. |

147 | /// |

148 | /// # Example |

149 | /// |

150 | /// ``` |

151 | /// use regex_automata::{DFA, SparseDFA}; |

152 | /// |

153 | /// # fn example() -> Result<(), regex_automata::Error> { |

154 | /// let dfa = SparseDFA::new("foo[0-9]+bar")?; |

155 | /// assert_eq!(Some(11), dfa.find(b"foo12345bar")); |

156 | /// # Ok(()) }; example().unwrap() |

157 | /// ``` |

158 | pub fn new(pattern: &str) -> Result<SparseDFA<Vec<u8>, usize>> { |

159 | dense::Builder::new() |

160 | .build(pattern) |

161 | .and_then(|dense| dense.to_sparse()) |

162 | } |

163 | } |

164 | |

165 | #[cfg(feature = "std")] |

166 | impl<S: StateID> SparseDFA<Vec<u8>, S> { |

167 | /// Create a new empty sparse DFA that never matches any input. |

168 | /// |

169 | /// # Example |

170 | /// |

171 | /// In order to build an empty DFA, callers must provide a type hint |

172 | /// indicating their choice of state identifier representation. |

173 | /// |

174 | /// ``` |

175 | /// use regex_automata::{DFA, SparseDFA}; |

176 | /// |

177 | /// # fn example() -> Result<(), regex_automata::Error> { |

178 | /// let dfa: SparseDFA<Vec<u8>, usize> = SparseDFA::empty(); |

179 | /// assert_eq!(None, dfa.find(b"")); |

180 | /// assert_eq!(None, dfa.find(b"foo")); |

181 | /// # Ok(()) }; example().unwrap() |

182 | /// ``` |

183 | pub fn empty() -> SparseDFA<Vec<u8>, S> { |

184 | dense::DenseDFA::empty().to_sparse().unwrap() |

185 | } |

186 | |

187 | pub(crate) fn from_dense_sized<T: AsRef<[S]>, A: StateID>( |

188 | dfa: &dense::Repr<T, S>, |

189 | ) -> Result<SparseDFA<Vec<u8>, A>> { |

190 | Repr::from_dense_sized(dfa).map(|r| r.into_sparse_dfa()) |

191 | } |

192 | } |

193 | |

194 | impl<T: AsRef<[u8]>, S: StateID> SparseDFA<T, S> { |

195 | /// Cheaply return a borrowed version of this sparse DFA. Specifically, the |

196 | /// DFA returned always uses `&[u8]` for its transition table while keeping |

197 | /// the same state identifier representation. |

198 | pub fn as_ref<'a>(&'a self) -> SparseDFA<&'a [u8], S> { |

199 | match *self { |

200 | SparseDFA::Standard(Standard(ref r)) => { |

201 | SparseDFA::Standard(Standard(r.as_ref())) |

202 | } |

203 | SparseDFA::ByteClass(ByteClass(ref r)) => { |

204 | SparseDFA::ByteClass(ByteClass(r.as_ref())) |

205 | } |

206 | SparseDFA::__Nonexhaustive => unreachable!(), |

207 | } |

208 | } |

209 | |

210 | /// Return an owned version of this sparse DFA. Specifically, the DFA |

211 | /// returned always uses `Vec<u8>` for its transition table while keeping |

212 | /// the same state identifier representation. |

213 | /// |

214 | /// Effectively, this returns a sparse DFA whose transition table lives |

215 | /// on the heap. |

216 | #[cfg(feature = "std")] |

217 | pub fn to_owned(&self) -> SparseDFA<Vec<u8>, S> { |

218 | match *self { |

219 | SparseDFA::Standard(Standard(ref r)) => { |

220 | SparseDFA::Standard(Standard(r.to_owned())) |

221 | } |

222 | SparseDFA::ByteClass(ByteClass(ref r)) => { |

223 | SparseDFA::ByteClass(ByteClass(r.to_owned())) |

224 | } |

225 | SparseDFA::__Nonexhaustive => unreachable!(), |

226 | } |

227 | } |

228 | |

229 | /// Returns the memory usage, in bytes, of this DFA. |

230 | /// |

231 | /// The memory usage is computed based on the number of bytes used to |

232 | /// represent this DFA's transition table. This typically corresponds to |

233 | /// heap memory usage. |

234 | /// |

235 | /// This does **not** include the stack size used up by this DFA. To |

236 | /// compute that, used `std::mem::size_of::<SparseDFA>()`. |

237 | pub fn memory_usage(&self) -> usize { |

238 | self.repr().memory_usage() |

239 | } |

240 | |

241 | fn repr(&self) -> &Repr<T, S> { |

242 | match *self { |

243 | SparseDFA::Standard(ref r) => &r.0, |

244 | SparseDFA::ByteClass(ref r) => &r.0, |

245 | SparseDFA::__Nonexhaustive => unreachable!(), |

246 | } |

247 | } |

248 | } |

249 | |

250 | /// Routines for converting a sparse DFA to other representations, such as |

251 | /// smaller state identifiers or raw bytes suitable for persistent storage. |

252 | #[cfg(feature = "std")] |

253 | impl<T: AsRef<[u8]>, S: StateID> SparseDFA<T, S> { |

254 | /// Create a new sparse DFA whose match semantics are equivalent to |

255 | /// this DFA, but attempt to use `u8` for the representation of state |

256 | /// identifiers. If `u8` is insufficient to represent all state identifiers |

257 | /// in this DFA, then this returns an error. |

258 | /// |

259 | /// This is a convenience routine for `to_sized::<u8>()`. |

260 | pub fn to_u8(&self) -> Result<SparseDFA<Vec<u8>, u8>> { |

261 | self.to_sized() |

262 | } |

263 | |

264 | /// Create a new sparse DFA whose match semantics are equivalent to |

265 | /// this DFA, but attempt to use `u16` for the representation of state |

266 | /// identifiers. If `u16` is insufficient to represent all state |

267 | /// identifiers in this DFA, then this returns an error. |

268 | /// |

269 | /// This is a convenience routine for `to_sized::<u16>()`. |

270 | pub fn to_u16(&self) -> Result<SparseDFA<Vec<u8>, u16>> { |

271 | self.to_sized() |

272 | } |

273 | |

274 | /// Create a new sparse DFA whose match semantics are equivalent to |

275 | /// this DFA, but attempt to use `u32` for the representation of state |

276 | /// identifiers. If `u32` is insufficient to represent all state |

277 | /// identifiers in this DFA, then this returns an error. |

278 | /// |

279 | /// This is a convenience routine for `to_sized::<u32>()`. |

280 | #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))] |

281 | pub fn to_u32(&self) -> Result<SparseDFA<Vec<u8>, u32>> { |

282 | self.to_sized() |

283 | } |

284 | |

285 | /// Create a new sparse DFA whose match semantics are equivalent to |

286 | /// this DFA, but attempt to use `u64` for the representation of state |

287 | /// identifiers. If `u64` is insufficient to represent all state |

288 | /// identifiers in this DFA, then this returns an error. |

289 | /// |

290 | /// This is a convenience routine for `to_sized::<u64>()`. |

291 | #[cfg(target_pointer_width = "64")] |

292 | pub fn to_u64(&self) -> Result<SparseDFA<Vec<u8>, u64>> { |

293 | self.to_sized() |

294 | } |

295 | |

296 | /// Create a new sparse DFA whose match semantics are equivalent to |

297 | /// this DFA, but attempt to use `A` for the representation of state |

298 | /// identifiers. If `A` is insufficient to represent all state identifiers |

299 | /// in this DFA, then this returns an error. |

300 | /// |

301 | /// An alternative way to construct such a DFA is to use |

302 | /// [`DenseDFA::to_sparse_sized`](enum.DenseDFA.html#method.to_sparse_sized). |

303 | /// In general, picking the appropriate size upon initial construction of |

304 | /// a sparse DFA is preferred, since it will do the conversion in one |

305 | /// step instead of two. |

306 | pub fn to_sized<A: StateID>(&self) -> Result<SparseDFA<Vec<u8>, A>> { |

307 | self.repr().to_sized().map(|r| r.into_sparse_dfa()) |

308 | } |

309 | |

310 | /// Serialize a sparse DFA to raw bytes in little endian format. |

311 | /// |

312 | /// If the state identifier representation of this DFA has a size different |

313 | /// than 1, 2, 4 or 8 bytes, then this returns an error. All |

314 | /// implementations of `StateID` provided by this crate satisfy this |

315 | /// requirement. |

316 | pub fn to_bytes_little_endian(&self) -> Result<Vec<u8>> { |

317 | self.repr().to_bytes::<LittleEndian>() |

318 | } |

319 | |

320 | /// Serialize a sparse DFA to raw bytes in big endian format. |

321 | /// |

322 | /// If the state identifier representation of this DFA has a size different |

323 | /// than 1, 2, 4 or 8 bytes, then this returns an error. All |

324 | /// implementations of `StateID` provided by this crate satisfy this |

325 | /// requirement. |

326 | pub fn to_bytes_big_endian(&self) -> Result<Vec<u8>> { |

327 | self.repr().to_bytes::<BigEndian>() |

328 | } |

329 | |

330 | /// Serialize a sparse DFA to raw bytes in native endian format. |

331 | /// Generally, it is better to pick an explicit endianness using either |

332 | /// `to_bytes_little_endian` or `to_bytes_big_endian`. This routine is |

333 | /// useful in tests where the DFA is serialized and deserialized on the |

334 | /// same platform. |

335 | /// |

336 | /// If the state identifier representation of this DFA has a size different |

337 | /// than 1, 2, 4 or 8 bytes, then this returns an error. All |

338 | /// implementations of `StateID` provided by this crate satisfy this |

339 | /// requirement. |

340 | pub fn to_bytes_native_endian(&self) -> Result<Vec<u8>> { |

341 | self.repr().to_bytes::<NativeEndian>() |

342 | } |

343 | } |

344 | |

345 | impl<'a, S: StateID> SparseDFA<&'a [u8], S> { |

346 | /// Deserialize a sparse DFA with a specific state identifier |

347 | /// representation. |

348 | /// |

349 | /// Deserializing a DFA using this routine will never allocate heap memory. |

350 | /// This is also guaranteed to be a constant time operation that does not |

351 | /// vary with the size of the DFA. |

352 | /// |

353 | /// The bytes given should be generated by the serialization of a DFA with |

354 | /// either the |

355 | /// [`to_bytes_little_endian`](enum.DenseDFA.html#method.to_bytes_little_endian) |

356 | /// method or the |

357 | /// [`to_bytes_big_endian`](enum.DenseDFA.html#method.to_bytes_big_endian) |

358 | /// endian, depending on the endianness of the machine you are |

359 | /// deserializing this DFA from. |

360 | /// |

361 | /// If the state identifier representation is `usize`, then deserialization |

362 | /// is dependent on the pointer size. For this reason, it is best to |

363 | /// serialize DFAs using a fixed size representation for your state |

364 | /// identifiers, such as `u8`, `u16`, `u32` or `u64`. |

365 | /// |

366 | /// # Panics |

367 | /// |

368 | /// The bytes given should be *trusted*. In particular, if the bytes |

369 | /// are not a valid serialization of a DFA, or if the endianness of the |

370 | /// serialized bytes is different than the endianness of the machine that |

371 | /// is deserializing the DFA, then this routine will panic. Moreover, it |

372 | /// is possible for this deserialization routine to succeed even if the |

373 | /// given bytes do not represent a valid serialized sparse DFA. |

374 | /// |

375 | /// # Safety |

376 | /// |

377 | /// This routine is unsafe because it permits callers to provide an |

378 | /// arbitrary transition table with possibly incorrect transitions. While |

379 | /// the various serialization routines will never return an incorrect |

380 | /// transition table, there is no guarantee that the bytes provided here |

381 | /// are correct. While deserialization does many checks (as documented |

382 | /// above in the panic conditions), this routine does not check that the |

383 | /// transition table is correct. Given an incorrect transition table, it is |

384 | /// possible for the search routines to access out-of-bounds memory because |

385 | /// of explicit bounds check elision. |

386 | /// |

387 | /// # Example |

388 | /// |

389 | /// This example shows how to serialize a DFA to raw bytes, deserialize it |

390 | /// and then use it for searching. Note that we first convert the DFA to |

391 | /// using `u16` for its state identifier representation before serializing |

392 | /// it. While this isn't strictly necessary, it's good practice in order to |

393 | /// decrease the size of the DFA and to avoid platform specific pitfalls |

394 | /// such as differing pointer sizes. |

395 | /// |

396 | /// ``` |

397 | /// use regex_automata::{DFA, DenseDFA, SparseDFA}; |

398 | /// |

399 | /// # fn example() -> Result<(), regex_automata::Error> { |

400 | /// let sparse = SparseDFA::new("foo[0-9]+")?; |

401 | /// let bytes = sparse.to_u16()?.to_bytes_native_endian()?; |

402 | /// |

403 | /// let dfa: SparseDFA<&[u8], u16> = unsafe { |

404 | /// SparseDFA::from_bytes(&bytes) |

405 | /// }; |

406 | /// |

407 | /// assert_eq!(Some(8), dfa.find(b"foo12345")); |

408 | /// # Ok(()) }; example().unwrap() |

409 | /// ``` |

410 | pub unsafe fn from_bytes(buf: &'a [u8]) -> SparseDFA<&'a [u8], S> { |

411 | Repr::from_bytes(buf).into_sparse_dfa() |

412 | } |

413 | } |

414 | |

415 | impl<T: AsRef<[u8]>, S: StateID> DFA for SparseDFA<T, S> { |

416 | type ID = S; |

417 | |

418 | #[inline] |

419 | fn start_state(&self) -> S { |

420 | self.repr().start_state() |

421 | } |

422 | |

423 | #[inline] |

424 | fn is_match_state(&self, id: S) -> bool { |

425 | self.repr().is_match_state(id) |

426 | } |

427 | |

428 | #[inline] |

429 | fn is_dead_state(&self, id: S) -> bool { |

430 | self.repr().is_dead_state(id) |

431 | } |

432 | |

433 | #[inline] |

434 | fn is_match_or_dead_state(&self, id: S) -> bool { |

435 | self.repr().is_match_or_dead_state(id) |

436 | } |

437 | |

438 | #[inline] |

439 | fn is_anchored(&self) -> bool { |

440 | self.repr().is_anchored() |

441 | } |

442 | |

443 | #[inline] |

444 | fn next_state(&self, current: S, input: u8) -> S { |

445 | match *self { |

446 | SparseDFA::Standard(ref r) => r.next_state(current, input), |

447 | SparseDFA::ByteClass(ref r) => r.next_state(current, input), |

448 | SparseDFA::__Nonexhaustive => unreachable!(), |

449 | } |

450 | } |

451 | |

452 | #[inline] |

453 | unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S { |

454 | self.next_state(current, input) |

455 | } |

456 | |

457 | // We specialize the following methods because it lets us lift the |

458 | // case analysis between the different types of sparse DFAs. Instead of |

459 | // doing the case analysis for every transition, we do it once before |

460 | // searching. For sparse DFAs, this doesn't seem to benefit performance as |

461 | // much as it does for the dense DFAs, but it's easy to do so we might as |

462 | // well do it. |

463 | |

464 | #[inline] |

465 | fn is_match_at(&self, bytes: &[u8], start: usize) -> bool { |

466 | match *self { |

467 | SparseDFA::Standard(ref r) => r.is_match_at(bytes, start), |

468 | SparseDFA::ByteClass(ref r) => r.is_match_at(bytes, start), |

469 | SparseDFA::__Nonexhaustive => unreachable!(), |

470 | } |

471 | } |

472 | |

473 | #[inline] |

474 | fn shortest_match_at(&self, bytes: &[u8], start: usize) -> Option<usize> { |

475 | match *self { |

476 | SparseDFA::Standard(ref r) => r.shortest_match_at(bytes, start), |

477 | SparseDFA::ByteClass(ref r) => r.shortest_match_at(bytes, start), |

478 | SparseDFA::__Nonexhaustive => unreachable!(), |

479 | } |

480 | } |

481 | |

482 | #[inline] |

483 | fn find_at(&self, bytes: &[u8], start: usize) -> Option<usize> { |

484 | match *self { |

485 | SparseDFA::Standard(ref r) => r.find_at(bytes, start), |

486 | SparseDFA::ByteClass(ref r) => r.find_at(bytes, start), |

487 | SparseDFA::__Nonexhaustive => unreachable!(), |

488 | } |

489 | } |

490 | |

491 | #[inline] |

492 | fn rfind_at(&self, bytes: &[u8], start: usize) -> Option<usize> { |

493 | match *self { |

494 | SparseDFA::Standard(ref r) => r.rfind_at(bytes, start), |

495 | SparseDFA::ByteClass(ref r) => r.rfind_at(bytes, start), |

496 | SparseDFA::__Nonexhaustive => unreachable!(), |

497 | } |

498 | } |

499 | } |

500 | |

501 | /// A standard sparse DFA that does not use premultiplication or byte classes. |

502 | /// |

503 | /// Generally, it isn't necessary to use this type directly, since a |

504 | /// `SparseDFA` can be used for searching directly. One possible reason why |

505 | /// one might want to use this type directly is if you are implementing your |

506 | /// own search routines by walking a DFA's transitions directly. In that case, |

507 | /// you'll want to use this type (or any of the other DFA variant types) |

508 | /// directly, since they implement `next_state` more efficiently. |

509 | #[derive(Clone, Debug)] |

510 | pub struct Standard<T: AsRef<[u8]>, S: StateID = usize>(Repr<T, S>); |

511 | |

512 | impl<T: AsRef<[u8]>, S: StateID> DFA for Standard<T, S> { |

513 | type ID = S; |

514 | |

515 | #[inline] |

516 | fn start_state(&self) -> S { |

517 | self.0.start_state() |

518 | } |

519 | |

520 | #[inline] |

521 | fn is_match_state(&self, id: S) -> bool { |

522 | self.0.is_match_state(id) |

523 | } |

524 | |

525 | #[inline] |

526 | fn is_dead_state(&self, id: S) -> bool { |

527 | self.0.is_dead_state(id) |

528 | } |

529 | |

530 | #[inline] |

531 | fn is_match_or_dead_state(&self, id: S) -> bool { |

532 | self.0.is_match_or_dead_state(id) |

533 | } |

534 | |

535 | #[inline] |

536 | fn is_anchored(&self) -> bool { |

537 | self.0.is_anchored() |

538 | } |

539 | |

540 | #[inline] |

541 | fn next_state(&self, current: S, input: u8) -> S { |

542 | self.0.state(current).next(input) |

543 | } |

544 | |

545 | #[inline] |

546 | unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S { |

547 | self.next_state(current, input) |

548 | } |

549 | } |

550 | |

551 | /// A sparse DFA that shrinks its alphabet. |

552 | /// |

553 | /// Alphabet shrinking is achieved by using a set of equivalence classes |

554 | /// instead of using all possible byte values. Any two bytes belong to the same |

555 | /// equivalence class if and only if they can be used interchangeably anywhere |

556 | /// in the DFA while never discriminating between a match and a non-match. |

557 | /// |

558 | /// Unlike dense DFAs, sparse DFAs do not tend to benefit nearly as much from |

559 | /// using byte classes. In some cases, using byte classes can even marginally |

560 | /// increase the size of a sparse DFA's transition table. The reason for this |

561 | /// is that a sparse DFA already compacts each state's transitions separate |

562 | /// from whether byte classes are used. |

563 | /// |

564 | /// Generally, it isn't necessary to use this type directly, since a |

565 | /// `SparseDFA` can be used for searching directly. One possible reason why |

566 | /// one might want to use this type directly is if you are implementing your |

567 | /// own search routines by walking a DFA's transitions directly. In that case, |

568 | /// you'll want to use this type (or any of the other DFA variant types) |

569 | /// directly, since they implement `next_state` more efficiently. |

570 | #[derive(Clone, Debug)] |

571 | pub struct ByteClass<T: AsRef<[u8]>, S: StateID = usize>(Repr<T, S>); |

572 | |

573 | impl<T: AsRef<[u8]>, S: StateID> DFA for ByteClass<T, S> { |

574 | type ID = S; |

575 | |

576 | #[inline] |

577 | fn start_state(&self) -> S { |

578 | self.0.start_state() |

579 | } |

580 | |

581 | #[inline] |

582 | fn is_match_state(&self, id: S) -> bool { |

583 | self.0.is_match_state(id) |

584 | } |

585 | |

586 | #[inline] |

587 | fn is_dead_state(&self, id: S) -> bool { |

588 | self.0.is_dead_state(id) |

589 | } |

590 | |

591 | #[inline] |

592 | fn is_match_or_dead_state(&self, id: S) -> bool { |

593 | self.0.is_match_or_dead_state(id) |

594 | } |

595 | |

596 | #[inline] |

597 | fn is_anchored(&self) -> bool { |

598 | self.0.is_anchored() |

599 | } |

600 | |

601 | #[inline] |

602 | fn next_state(&self, current: S, input: u8) -> S { |

603 | let input = self.0.byte_classes.get(input); |

604 | self.0.state(current).next(input) |

605 | } |

606 | |

607 | #[inline] |

608 | unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S { |

609 | self.next_state(current, input) |

610 | } |

611 | } |

612 | |

613 | /// The underlying representation of a sparse DFA. This is shared by all of |

614 | /// the different variants of a sparse DFA. |

615 | #[derive(Clone)] |

616 | #[cfg_attr(not(feature = "std"), derive(Debug))] |

617 | struct Repr<T: AsRef<[u8]>, S: StateID = usize> { |

618 | anchored: bool, |

619 | start: S, |

620 | state_count: usize, |

621 | max_match: S, |

622 | byte_classes: ByteClasses, |

623 | trans: T, |

624 | } |

625 | |

626 | impl<T: AsRef<[u8]>, S: StateID> Repr<T, S> { |

627 | fn into_sparse_dfa(self) -> SparseDFA<T, S> { |

628 | if self.byte_classes.is_singleton() { |

629 | SparseDFA::Standard(Standard(self)) |

630 | } else { |

631 | SparseDFA::ByteClass(ByteClass(self)) |

632 | } |

633 | } |

634 | |

635 | fn as_ref<'a>(&'a self) -> Repr<&'a [u8], S> { |

636 | Repr { |

637 | anchored: self.anchored, |

638 | start: self.start, |

639 | state_count: self.state_count, |

640 | max_match: self.max_match, |

641 | byte_classes: self.byte_classes.clone(), |

642 | trans: self.trans(), |

643 | } |

644 | } |

645 | |

646 | #[cfg(feature = "std")] |

647 | fn to_owned(&self) -> Repr<Vec<u8>, S> { |

648 | Repr { |

649 | anchored: self.anchored, |

650 | start: self.start, |

651 | state_count: self.state_count, |

652 | max_match: self.max_match, |

653 | byte_classes: self.byte_classes.clone(), |

654 | trans: self.trans().to_vec(), |

655 | } |

656 | } |

657 | |

658 | /// Return a convenient representation of the given state. |

659 | /// |

660 | /// This is marked as inline because it doesn't seem to get inlined |

661 | /// otherwise, which leads to a fairly significant performance loss (~25%). |

662 | #[inline] |

663 | fn state<'a>(&'a self, id: S) -> State<'a, S> { |

664 | let mut pos = id.to_usize(); |

665 | let ntrans = NativeEndian::read_u16(&self.trans()[pos..]) as usize; |

666 | pos += 2; |

667 | let input_ranges = &self.trans()[pos..pos + (ntrans * 2)]; |

668 | pos += 2 * ntrans; |

669 | let next = &self.trans()[pos..pos + (ntrans * size_of::<S>())]; |

670 | State { _state_id_repr: PhantomData, ntrans, input_ranges, next } |

671 | } |

672 | |

673 | /// Return an iterator over all of the states in this DFA. |

674 | /// |

675 | /// The iterator returned yields tuples, where the first element is the |

676 | /// state ID and the second element is the state itself. |

677 | #[cfg(feature = "std")] |

678 | fn states<'a>(&'a self) -> StateIter<'a, T, S> { |

679 | StateIter { dfa: self, id: dead_id() } |

680 | } |

681 | |

682 | fn memory_usage(&self) -> usize { |

683 | self.trans().len() |

684 | } |

685 | |

686 | fn start_state(&self) -> S { |

687 | self.start |

688 | } |

689 | |

690 | fn is_match_state(&self, id: S) -> bool { |

691 | self.is_match_or_dead_state(id) && !self.is_dead_state(id) |

692 | } |

693 | |

694 | fn is_dead_state(&self, id: S) -> bool { |

695 | id == dead_id() |

696 | } |

697 | |

698 | fn is_match_or_dead_state(&self, id: S) -> bool { |

699 | id <= self.max_match |

700 | } |

701 | |

702 | fn is_anchored(&self) -> bool { |

703 | self.anchored |

704 | } |

705 | |

706 | fn trans(&self) -> &[u8] { |

707 | self.trans.as_ref() |

708 | } |

709 | |

710 | /// Create a new sparse DFA whose match semantics are equivalent to this |

711 | /// DFA, but attempt to use `A` for the representation of state |

712 | /// identifiers. If `A` is insufficient to represent all state identifiers |

713 | /// in this DFA, then this returns an error. |

714 | #[cfg(feature = "std")] |

715 | fn to_sized<A: StateID>(&self) -> Result<Repr<Vec<u8>, A>> { |

716 | // To build the new DFA, we proceed much like the initial construction |

717 | // of the sparse DFA. Namely, since the state ID size is changing, |

718 | // we don't actually know all of our state IDs until we've allocated |

719 | // all necessary space. So we do one pass that allocates all of the |

720 | // storage we need, and then another pass to fill in the transitions. |

721 | |

722 | let mut trans = Vec::with_capacity(size_of::<A>() * self.state_count); |

723 | let mut map: HashMap<S, A> = HashMap::with_capacity(self.state_count); |

724 | for (old_id, state) in self.states() { |

725 | let pos = trans.len(); |

726 | map.insert(old_id, usize_to_state_id(pos)?); |

727 | |

728 | let n = state.ntrans; |

729 | let zeros = 2 + (n * 2) + (n * size_of::<A>()); |

730 | trans.extend(iter::repeat(0).take(zeros)); |

731 | |

732 | NativeEndian::write_u16(&mut trans[pos..], n as u16); |

733 | let (s, e) = (pos + 2, pos + 2 + (n * 2)); |

734 | trans[s..e].copy_from_slice(state.input_ranges); |

735 | } |

736 | |

737 | let mut new = Repr { |

738 | anchored: self.anchored, |

739 | start: map[&self.start], |

740 | state_count: self.state_count, |

741 | max_match: map[&self.max_match], |

742 | byte_classes: self.byte_classes.clone(), |

743 | trans, |

744 | }; |

745 | for (&old_id, &new_id) in map.iter() { |

746 | let old_state = self.state(old_id); |

747 | let mut new_state = new.state_mut(new_id); |

748 | for i in 0..new_state.ntrans { |

749 | let next = map[&old_state.next_at(i)]; |

750 | new_state.set_next_at(i, usize_to_state_id(next.to_usize())?); |

751 | } |

752 | } |

753 | new.start = map[&self.start]; |

754 | new.max_match = map[&self.max_match]; |

755 | Ok(new) |

756 | } |

757 | |

758 | /// Serialize a sparse DFA to raw bytes using the provided endianness. |

759 | /// |

760 | /// If the state identifier representation of this DFA has a size different |

761 | /// than 1, 2, 4 or 8 bytes, then this returns an error. All |

762 | /// implementations of `StateID` provided by this crate satisfy this |

763 | /// requirement. |

764 | /// |

765 | /// Unlike dense DFAs, the result is not necessarily aligned since a |

766 | /// sparse DFA's transition table is always read as a sequence of bytes. |

767 | #[cfg(feature = "std")] |

768 | fn to_bytes<A: ByteOrder>(&self) -> Result<Vec<u8>> { |

769 | let label = b"rust-regex-automata-sparse-dfa \x00"; |

770 | let size = |

771 | // For human readable label. |

772 | label.len() |

773 | // endiannes check, must be equal to 0xFEFF for native endian |

774 | + 2 |

775 | // For version number. |

776 | + 2 |

777 | // Size of state ID representation, in bytes. |

778 | // Must be 1, 2, 4 or 8. |

779 | + 2 |

780 | // For DFA misc options. (Currently unused.) |

781 | + 2 |

782 | // For start state. |

783 | + 8 |

784 | // For state count. |

785 | + 8 |

786 | // For max match state. |

787 | + 8 |

788 | // For byte class map. |

789 | + 256 |

790 | // For transition table. |

791 | + self.trans().len(); |

792 | |

793 | let mut i = 0; |

794 | let mut buf = vec![0; size]; |

795 | |

796 | // write label |

797 | for &b in label { |

798 | buf[i] = b; |

799 | i += 1; |

800 | } |

801 | // endianness check |

802 | A::write_u16(&mut buf[i..], 0xFEFF); |

803 | i += 2; |

804 | // version number |

805 | A::write_u16(&mut buf[i..], 1); |

806 | i += 2; |

807 | // size of state ID |

808 | let state_size = size_of::<S>(); |

809 | if ![1, 2, 4, 8].contains(&state_size) { |

810 | return Err(Error::serialize(&format!( |

811 | "state size of {} not supported, must be 1, 2, 4 or 8", |

812 | state_size |

813 | ))); |

814 | } |

815 | A::write_u16(&mut buf[i..], state_size as u16); |

816 | i += 2; |

817 | // DFA misc options |

818 | let mut options = 0u16; |

819 | if self.anchored { |

820 | options |= dense::MASK_ANCHORED; |

821 | } |

822 | A::write_u16(&mut buf[i..], options); |

823 | i += 2; |

824 | // start state |

825 | A::write_u64(&mut buf[i..], self.start.to_usize() as u64); |

826 | i += 8; |

827 | // state count |

828 | A::write_u64(&mut buf[i..], self.state_count as u64); |

829 | i += 8; |

830 | // max match state |

831 | A::write_u64(&mut buf[i..], self.max_match.to_usize() as u64); |

832 | i += 8; |

833 | // byte class map |

834 | for b in (0..256).map(|b| b as u8) { |

835 | buf[i] = self.byte_classes.get(b); |

836 | i += 1; |

837 | } |

838 | // transition table |

839 | for (_, state) in self.states() { |

840 | A::write_u16(&mut buf[i..], state.ntrans as u16); |

841 | i += 2; |

842 | buf[i..i + (state.ntrans * 2)].copy_from_slice(state.input_ranges); |

843 | i += state.ntrans * 2; |

844 | for j in 0..state.ntrans { |

845 | write_state_id_bytes::<A, _>(&mut buf[i..], state.next_at(j)); |

846 | i += size_of::<S>(); |

847 | } |

848 | } |

849 | |

850 | assert_eq!(size, i, "expected to consume entire buffer"); |

851 | |

852 | Ok(buf) |

853 | } |

854 | } |

855 | |

856 | impl<'a, S: StateID> Repr<&'a [u8], S> { |

857 | /// The implementation for deserializing a sparse DFA from raw bytes. |

858 | unsafe fn from_bytes(mut buf: &'a [u8]) -> Repr<&'a [u8], S> { |

859 | // skip over label |

860 | match buf.iter().position(|&b| b == b' \x00') { |

861 | None => panic!("could not find label"), |

862 | Some(i) => buf = &buf[i + 1..], |

863 | } |

864 | |

865 | // check that current endianness is same as endianness of DFA |

866 | let endian_check = NativeEndian::read_u16(buf); |

867 | buf = &buf[2..]; |

868 | if endian_check != 0xFEFF { |

869 | panic!( |

870 | "endianness mismatch, expected 0xFEFF but got 0x{:X}. \ |

871 | are you trying to load a SparseDFA serialized with a \ |

872 | different endianness?", |

873 | endian_check, |

874 | ); |

875 | } |

876 | |

877 | // check that the version number is supported |

878 | let version = NativeEndian::read_u16(buf); |

879 | buf = &buf[2..]; |

880 | if version != 1 { |

881 | panic!( |

882 | "expected version 1, but found unsupported version {}", |

883 | version, |

884 | ); |

885 | } |

886 | |

887 | // read size of state |

888 | let state_size = NativeEndian::read_u16(buf) as usize; |

889 | if state_size != size_of::<S>() { |

890 | panic!( |

891 | "state size of SparseDFA ({}) does not match \ |

892 | requested state size ({})", |

893 | state_size, |

894 | size_of::<S>(), |

895 | ); |

896 | } |

897 | buf = &buf[2..]; |

898 | |

899 | // read miscellaneous options |

900 | let opts = NativeEndian::read_u16(buf); |

901 | buf = &buf[2..]; |

902 | |

903 | // read start state |

904 | let start = S::from_usize(NativeEndian::read_u64(buf) as usize); |

905 | buf = &buf[8..]; |

906 | |

907 | // read state count |

908 | let state_count = NativeEndian::read_u64(buf) as usize; |

909 | buf = &buf[8..]; |

910 | |

911 | // read max match state |

912 | let max_match = S::from_usize(NativeEndian::read_u64(buf) as usize); |

913 | buf = &buf[8..]; |

914 | |

915 | // read byte classes |

916 | let byte_classes = ByteClasses::from_slice(&buf[..256]); |

917 | buf = &buf[256..]; |

918 | |

919 | Repr { |

920 | anchored: opts & dense::MASK_ANCHORED > 0, |

921 | start, |

922 | state_count, |

923 | max_match, |

924 | byte_classes, |

925 | trans: buf, |

926 | } |

927 | } |

928 | } |

929 | |

930 | #[cfg(feature = "std")] |

931 | impl<S: StateID> Repr<Vec<u8>, S> { |

932 | /// The implementation for constructing a sparse DFA from a dense DFA. |

933 | fn from_dense_sized<T: AsRef<[S]>, A: StateID>( |

934 | dfa: &dense::Repr<T, S>, |

935 | ) -> Result<Repr<Vec<u8>, A>> { |

936 | // In order to build the transition table, we need to be able to write |

937 | // state identifiers for each of the "next" transitions in each state. |

938 | // Our state identifiers correspond to the byte offset in the |

939 | // transition table at which the state is encoded. Therefore, we do not |

940 | // actually know what the state identifiers are until we've allocated |

941 | // exactly as much space as we need for each state. Thus, construction |

942 | // of the transition table happens in two passes. |

943 | // |

944 | // In the first pass, we fill out the shell of each state, which |

945 | // includes the transition count, the input byte ranges and zero-filled |

946 | // space for the transitions. In this first pass, we also build up a |

947 | // map from the state identifier index of the dense DFA to the state |

948 | // identifier in this sparse DFA. |

949 | // |

950 | // In the second pass, we fill in the transitions based on the map |

951 | // built in the first pass. |

952 | |

953 | let mut trans = Vec::with_capacity(size_of::<A>() * dfa.state_count()); |

954 | let mut remap: Vec<A> = vec![dead_id(); dfa.state_count()]; |

955 | for (old_id, state) in dfa.states() { |

956 | let pos = trans.len(); |

957 | |

958 | remap[dfa.state_id_to_index(old_id)] = usize_to_state_id(pos)?; |

959 | // zero-filled space for the transition count |

960 | trans.push(0); |

961 | trans.push(0); |

962 | |

963 | let mut trans_count = 0; |

964 | for (b1, b2, _) in state.sparse_transitions() { |

965 | trans_count += 1; |

966 | trans.push(b1); |

967 | trans.push(b2); |

968 | } |

969 | // fill in the transition count |

970 | NativeEndian::write_u16(&mut trans[pos..], trans_count); |

971 | |

972 | // zero-fill the actual transitions |

973 | let zeros = trans_count as usize * size_of::<A>(); |

974 | trans.extend(iter::repeat(0).take(zeros)); |

975 | } |

976 | |

977 | let mut new = Repr { |

978 | anchored: dfa.is_anchored(), |

979 | start: remap[dfa.state_id_to_index(dfa.start_state())], |

980 | state_count: dfa.state_count(), |

981 | max_match: remap[dfa.state_id_to_index(dfa.max_match_state())], |

982 | byte_classes: dfa.byte_classes().clone(), |

983 | trans, |

984 | }; |

985 | for (old_id, old_state) in dfa.states() { |

986 | let new_id = remap[dfa.state_id_to_index(old_id)]; |

987 | let mut new_state = new.state_mut(new_id); |

988 | let sparse = old_state.sparse_transitions(); |

989 | for (i, (_, _, next)) in sparse.enumerate() { |

990 | let next = remap[dfa.state_id_to_index(next)]; |

991 | new_state.set_next_at(i, next); |

992 | } |

993 | } |

994 | Ok(new) |

995 | } |

996 | |

997 | /// Return a convenient mutable representation of the given state. |

998 | fn state_mut<'a>(&'a mut self, id: S) -> StateMut<'a, S> { |

999 | let mut pos = id.to_usize(); |

1000 | let ntrans = NativeEndian::read_u16(&self.trans[pos..]) as usize; |

1001 | pos += 2; |

1002 | |

1003 | let size = (ntrans * 2) + (ntrans * size_of::<S>()); |

1004 | let ranges_and_next = &mut self.trans[pos..pos + size]; |

1005 | let (input_ranges, next) = ranges_and_next.split_at_mut(ntrans * 2); |

1006 | StateMut { _state_id_repr: PhantomData, ntrans, input_ranges, next } |

1007 | } |

1008 | } |

1009 | |

1010 | #[cfg(feature = "std")] |

1011 | impl<T: AsRef<[u8]>, S: StateID> fmt::Debug for Repr<T, S> { |

1012 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |

1013 | fn state_status<T: AsRef<[u8]>, S: StateID>( |

1014 | dfa: &Repr<T, S>, |

1015 | id: S, |

1016 | ) -> &'static str { |

1017 | if id == dead_id() { |

1018 | if dfa.is_match_state(id) { |

1019 | "D*" |

1020 | } else { |

1021 | "D " |

1022 | } |

1023 | } else if id == dfa.start_state() { |

1024 | if dfa.is_match_state(id) { |

1025 | ">*" |

1026 | } else { |

1027 | "> " |

1028 | } |

1029 | } else { |

1030 | if dfa.is_match_state(id) { |

1031 | " *" |

1032 | } else { |

1033 | " " |

1034 | } |

1035 | } |

1036 | } |

1037 | |

1038 | writeln!(f, "SparseDFA(")?; |

1039 | for (id, state) in self.states() { |

1040 | let status = state_status(self, id); |

1041 | writeln!(f, "{}{:06}: {:?}", status, id.to_usize(), state)?; |

1042 | } |

1043 | writeln!(f, ")")?; |

1044 | Ok(()) |

1045 | } |

1046 | } |

1047 | |

1048 | /// An iterator over all states in a sparse DFA. |

1049 | /// |

1050 | /// This iterator yields tuples, where the first element is the state ID and |

1051 | /// the second element is the state itself. |

1052 | #[cfg(feature = "std")] |

1053 | #[derive(Debug)] |

1054 | struct StateIter<'a, T: AsRef<[u8]> + 'a, S: StateID + 'a = usize> { |

1055 | dfa: &'a Repr<T, S>, |

1056 | id: S, |

1057 | } |

1058 | |

1059 | #[cfg(feature = "std")] |

1060 | impl<'a, T: AsRef<[u8]>, S: StateID> Iterator for StateIter<'a, T, S> { |

1061 | type Item = (S, State<'a, S>); |

1062 | |

1063 | fn next(&mut self) -> Option<(S, State<'a, S>)> { |

1064 | if self.id.to_usize() >= self.dfa.trans().len() { |

1065 | return None; |

1066 | } |

1067 | let id = self.id; |

1068 | let state = self.dfa.state(id); |

1069 | self.id = S::from_usize(self.id.to_usize() + state.bytes()); |

1070 | Some((id, state)) |

1071 | } |

1072 | } |

1073 | |

1074 | /// A representation of a sparse DFA state that can be cheaply materialized |

1075 | /// from a state identifier. |

1076 | #[derive(Clone)] |

1077 | struct State<'a, S: StateID = usize> { |

1078 | /// The state identifier representation used by the DFA from which this |

1079 | /// state was extracted. Since our transition table is compacted in a |

1080 | /// &[u8], we don't actually use the state ID type parameter explicitly |

1081 | /// anywhere, so we fake it. This prevents callers from using an incorrect |

1082 | /// state ID representation to read from this state. |

1083 | _state_id_repr: PhantomData<S>, |

1084 | /// The number of transitions in this state. |

1085 | ntrans: usize, |

1086 | /// Pairs of input ranges, where there is one pair for each transition. |

1087 | /// Each pair specifies an inclusive start and end byte range for the |

1088 | /// corresponding transition. |

1089 | input_ranges: &'a [u8], |

1090 | /// Transitions to the next state. This slice contains native endian |

1091 | /// encoded state identifiers, with `S` as the representation. Thus, there |

1092 | /// are `ntrans * size_of::<S>()` bytes in this slice. |

1093 | next: &'a [u8], |

1094 | } |

1095 | |

1096 | impl<'a, S: StateID> State<'a, S> { |

1097 | /// Searches for the next transition given an input byte. If no such |

1098 | /// transition could be found, then a dead state is returned. |

1099 | fn next(&self, input: u8) -> S { |

1100 | // This straight linear search was observed to be much better than |

1101 | // binary search on ASCII haystacks, likely because a binary search |

1102 | // visits the ASCII case last but a linear search sees it first. A |

1103 | // binary search does do a little better on non-ASCII haystacks, but |

1104 | // not by much. There might be a better trade off lurking here. |

1105 | for i in 0..self.ntrans { |

1106 | let (start, end) = self.range(i); |

1107 | if start <= input && input <= end { |

1108 | return self.next_at(i); |

1109 | } |

1110 | // We could bail early with an extra branch: if input < b1, then |

1111 | // we know we'll never find a matching transition. Interestingly, |

1112 | // this extra branch seems to not help performance, or will even |

1113 | // hurt it. It's likely very dependent on the DFA itself and what |

1114 | // is being searched. |

1115 | } |

1116 | dead_id() |

1117 | } |

1118 | |

1119 | /// Returns the inclusive input byte range for the ith transition in this |

1120 | /// state. |

1121 | fn range(&self, i: usize) -> (u8, u8) { |

1122 | (self.input_ranges[i * 2], self.input_ranges[i * 2 + 1]) |

1123 | } |

1124 | |

1125 | /// Returns the next state for the ith transition in this state. |

1126 | fn next_at(&self, i: usize) -> S { |

1127 | S::read_bytes(&self.next[i * size_of::<S>()..]) |

1128 | } |

1129 | |

1130 | /// Return the total number of bytes that this state consumes in its |

1131 | /// encoded form. |

1132 | #[cfg(feature = "std")] |

1133 | fn bytes(&self) -> usize { |

1134 | 2 + (self.ntrans * 2) + (self.ntrans * size_of::<S>()) |

1135 | } |

1136 | } |

1137 | |

1138 | #[cfg(feature = "std")] |

1139 | impl<'a, S: StateID> fmt::Debug for State<'a, S> { |

1140 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |

1141 | let mut transitions = vec![]; |

1142 | for i in 0..self.ntrans { |

1143 | let next = self.next_at(i); |

1144 | if next == dead_id() { |

1145 | continue; |

1146 | } |

1147 | |

1148 | let (start, end) = self.range(i); |

1149 | if start == end { |

1150 | transitions.push(format!( |

1151 | "{} => {}", |

1152 | escape(start), |

1153 | next.to_usize() |

1154 | )); |

1155 | } else { |

1156 | transitions.push(format!( |

1157 | "{}-{} => {}", |

1158 | escape(start), |

1159 | escape(end), |

1160 | next.to_usize(), |

1161 | )); |

1162 | } |

1163 | } |

1164 | write!(f, "{}", transitions.join( ", ")) |

1165 | } |

1166 | } |

1167 | |

1168 | /// A representation of a mutable sparse DFA state that can be cheaply |

1169 | /// materialized from a state identifier. |

1170 | #[cfg(feature = "std")] |

1171 | struct StateMut<'a, S: StateID = usize> { |

1172 | /// The state identifier representation used by the DFA from which this |

1173 | /// state was extracted. Since our transition table is compacted in a |

1174 | /// &[u8], we don't actually use the state ID type parameter explicitly |

1175 | /// anywhere, so we fake it. This prevents callers from using an incorrect |

1176 | /// state ID representation to read from this state. |

1177 | _state_id_repr: PhantomData<S>, |

1178 | /// The number of transitions in this state. |

1179 | ntrans: usize, |

1180 | /// Pairs of input ranges, where there is one pair for each transition. |

1181 | /// Each pair specifies an inclusive start and end byte range for the |

1182 | /// corresponding transition. |

1183 | input_ranges: &'a mut [u8], |

1184 | /// Transitions to the next state. This slice contains native endian |

1185 | /// encoded state identifiers, with `S` as the representation. Thus, there |

1186 | /// are `ntrans * size_of::<S>()` bytes in this slice. |

1187 | next: &'a mut [u8], |

1188 | } |

1189 | |

1190 | #[cfg(feature = "std")] |

1191 | impl<'a, S: StateID> StateMut<'a, S> { |

1192 | /// Sets the ith transition to the given state. |

1193 | fn set_next_at(&mut self, i: usize, next: S) { |

1194 | next.write_bytes(&mut self.next[i * size_of::<S>()..]); |

1195 | } |

1196 | } |

1197 | |

1198 | #[cfg(feature = "std")] |

1199 | impl<'a, S: StateID> fmt::Debug for StateMut<'a, S> { |

1200 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |

1201 | let state = State { |

1202 | _state_id_repr: self._state_id_repr, |

1203 | ntrans: self.ntrans, |

1204 | input_ranges: self.input_ranges, |

1205 | next: self.next, |

1206 | }; |

1207 | fmt::Debug::fmt(&state, f) |

1208 | } |

1209 | } |

1210 | |

1211 | /// Return the given byte as its escaped string form. |

1212 | #[cfg(feature = "std")] |

1213 | fn escape(b: u8) -> String { |

1214 | use std::ascii; |

1215 | |

1216 | String::from_utf8(ascii::escape_default(b).collect::<Vec<_>>()).unwrap() |

1217 | } |

1218 | |

1219 | /// A binary search routine specialized specifically to a sparse DFA state's |

1220 | /// transitions. Specifically, the transitions are defined as a set of pairs |

1221 | /// of input bytes that delineate an inclusive range of bytes. If the input |

1222 | /// byte is in the range, then the corresponding transition is a match. |

1223 | /// |

1224 | /// This binary search accepts a slice of these pairs and returns the position |

1225 | /// of the matching pair (the ith transition), or None if no matching pair |

1226 | /// could be found. |

1227 | /// |

1228 | /// Note that this routine is not currently used since it was observed to |

1229 | /// either decrease performance when searching ASCII, or did not provide enough |

1230 | /// of a boost on non-ASCII haystacks to be worth it. However, we leave it here |

1231 | /// for posterity in case we can find a way to use it. |

1232 | /// |

1233 | /// In theory, we could use the standard library's search routine if we could |

1234 | /// cast a `&[u8]` to a `&[(u8, u8)]`, but I don't believe this is currently |

1235 | /// guaranteed to be safe and is thus UB (since I don't think the in-memory |

1236 | /// representation of `(u8, u8)` has been nailed down). |

1237 | #[inline(always)] |

1238 | #[allow(dead_code)] |

1239 | fn binary_search_ranges(ranges: &[u8], needle: u8) -> Option<usize> { |

1240 | debug_assert!(ranges.len() % 2 == 0, "ranges must have even length"); |

1241 | debug_assert!(ranges.len() <= 512, "ranges should be short"); |

1242 | |

1243 | let (mut left, mut right) = (0, ranges.len() / 2); |

1244 | while left < right { |

1245 | let mid = (left + right) / 2; |

1246 | let (b1, b2) = (ranges[mid * 2], ranges[mid * 2 + 1]); |

1247 | if needle < b1 { |

1248 | right = mid; |

1249 | } else if needle > b2 { |

1250 | left = mid + 1; |

1251 | } else { |

1252 | return Some(mid); |

1253 | } |

1254 | } |

1255 | None |

1256 | } |

1257 |