1 | /*! |
---|---|

2 | An implementation of the [Two-Way substring search algorithm][two-way]. |

3 | |

4 | [`Finder`] can be built for forward searches, while [`FinderRev`] can be built |

5 | for reverse searches. |

6 | |

7 | Two-Way makes for a nice general purpose substring search algorithm because of |

8 | its time and space complexity properties. It also performs well in practice. |

9 | Namely, with `m = len(needle)` and `n = len(haystack)`, Two-Way takes `O(m)` |

10 | time to create a finder, `O(1)` space and `O(n)` search time. In other words, |

11 | the preprocessing step is quick, doesn't require any heap memory and the worst |

12 | case search time is guaranteed to be linear in the haystack regardless of the |

13 | size of the needle. |

14 | |

15 | While vector algorithms will usually beat Two-Way handedly, vector algorithms |

16 | also usually have pathological or edge cases that are better handled by Two-Way. |

17 | Moreover, not all targets support vector algorithms or implementations for them |

18 | simply may not exist yet. |

19 | |

20 | Two-Way can be found in the `memmem` implementations in at least [GNU libc] and |

21 | [musl]. |

22 | |

23 | [two-way]: https://en.wikipedia.org/wiki/Two-way_string-matching_algorithm |

24 | [GNU libc]: https://www.gnu.org/software/libc/ |

25 | [musl]: https://www.musl-libc.org/ |

26 | */ |

27 | |

28 | use core::cmp; |

29 | |

30 | use crate::{ |

31 | arch::all::{is_prefix, is_suffix}, |

32 | memmem::Pre, |

33 | }; |

34 | |

35 | /// A forward substring searcher that uses the Two-Way algorithm. |

36 | #[derive(Clone, Copy, Debug)] |

37 | pub struct Finder(TwoWay); |

38 | |

39 | /// A reverse substring searcher that uses the Two-Way algorithm. |

40 | #[derive(Clone, Copy, Debug)] |

41 | pub struct FinderRev(TwoWay); |

42 | |

43 | /// An implementation of the TwoWay substring search algorithm. |

44 | /// |

45 | /// This searcher supports forward and reverse search, although not |

46 | /// simultaneously. It runs in `O(n + m)` time and `O(1)` space, where |

47 | /// `n ~ len(needle)` and `m ~ len(haystack)`. |

48 | /// |

49 | /// The implementation here roughly matches that which was developed by |

50 | /// Crochemore and Perrin in their 1991 paper "Two-way string-matching." The |

51 | /// changes in this implementation are 1) the use of zero-based indices, 2) a |

52 | /// heuristic skip table based on the last byte (borrowed from Rust's standard |

53 | /// library) and 3) the addition of heuristics for a fast skip loop. For (3), |

54 | /// callers can pass any kind of prefilter they want, but usually it's one |

55 | /// based on a heuristic that uses an approximate background frequency of bytes |

56 | /// to choose rare bytes to quickly look for candidate match positions. Note |

57 | /// though that currently, this prefilter functionality is not exposed directly |

58 | /// in the public API. (File an issue if you want it and provide a use case |

59 | /// please.) |

60 | /// |

61 | /// The heuristic for fast skipping is automatically shut off if it's |

62 | /// detected to be ineffective at search time. Generally, this only occurs in |

63 | /// pathological cases. But this is generally necessary in order to preserve |

64 | /// a `O(n + m)` time bound. |

65 | /// |

66 | /// The code below is fairly complex and not obviously correct at all. It's |

67 | /// likely necessary to read the Two-Way paper cited above in order to fully |

68 | /// grok this code. The essence of it is: |

69 | /// |

70 | /// 1. Do something to detect a "critical" position in the needle. |

71 | /// 2. For the current position in the haystack, look if `needle[critical..]` |

72 | /// matches at that position. |

73 | /// 3. If so, look if `needle[..critical]` matches. |

74 | /// 4. If a mismatch occurs, shift the search by some amount based on the |

75 | /// critical position and a pre-computed shift. |

76 | /// |

77 | /// This type is wrapped in the forward and reverse finders that expose |

78 | /// consistent forward or reverse APIs. |

79 | #[derive(Clone, Copy, Debug)] |

80 | struct TwoWay { |

81 | /// A small bitset used as a quick prefilter (in addition to any prefilter |

82 | /// given by the caller). Namely, a bit `i` is set if and only if `b%64==i` |

83 | /// for any `b == needle[i]`. |

84 | /// |

85 | /// When used as a prefilter, if the last byte at the current candidate |

86 | /// position is NOT in this set, then we can skip that entire candidate |

87 | /// position (the length of the needle). This is essentially the shift |

88 | /// trick found in Boyer-Moore, but only applied to bytes that don't appear |

89 | /// in the needle. |

90 | /// |

91 | /// N.B. This trick was inspired by something similar in std's |

92 | /// implementation of Two-Way. |

93 | byteset: ApproximateByteSet, |

94 | /// A critical position in needle. Specifically, this position corresponds |

95 | /// to beginning of either the minimal or maximal suffix in needle. (N.B. |

96 | /// See SuffixType below for why "minimal" isn't quite the correct word |

97 | /// here.) |

98 | /// |

99 | /// This is the position at which every search begins. Namely, search |

100 | /// starts by scanning text to the right of this position, and only if |

101 | /// there's a match does the text to the left of this position get scanned. |

102 | critical_pos: usize, |

103 | /// The amount we shift by in the Two-Way search algorithm. This |

104 | /// corresponds to the "small period" and "large period" cases. |

105 | shift: Shift, |

106 | } |

107 | |

108 | impl Finder { |

109 | /// Create a searcher that finds occurrences of the given `needle`. |

110 | /// |

111 | /// An empty `needle` results in a match at every position in a haystack, |

112 | /// including at `haystack.len()`. |

113 | #[inline] |

114 | pub fn new(needle: &[u8]) -> Finder { |

115 | let byteset = ApproximateByteSet::new(needle); |

116 | let min_suffix = Suffix::forward(needle, SuffixKind::Minimal); |

117 | let max_suffix = Suffix::forward(needle, SuffixKind::Maximal); |

118 | let (period_lower_bound, critical_pos) = |

119 | if min_suffix.pos > max_suffix.pos { |

120 | (min_suffix.period, min_suffix.pos) |

121 | } else { |

122 | (max_suffix.period, max_suffix.pos) |

123 | }; |

124 | let shift = Shift::forward(needle, period_lower_bound, critical_pos); |

125 | Finder(TwoWay { byteset, critical_pos, shift }) |

126 | } |

127 | |

128 | /// Returns the first occurrence of `needle` in the given `haystack`, or |

129 | /// `None` if no such occurrence could be found. |

130 | /// |

131 | /// The `needle` given must be the same as the `needle` provided to |

132 | /// [`Finder::new`]. |

133 | /// |

134 | /// An empty `needle` results in a match at every position in a haystack, |

135 | /// including at `haystack.len()`. |

136 | #[inline] |

137 | pub fn find(&self, haystack: &[u8], needle: &[u8]) -> Option<usize> { |

138 | self.find_with_prefilter(None, haystack, needle) |

139 | } |

140 | |

141 | /// This is like [`Finder::find`], but it accepts a prefilter for |

142 | /// accelerating searches. |

143 | /// |

144 | /// Currently this is not exposed in the public API because, at the time |

145 | /// of writing, I didn't want to spend time thinking about how to expose |

146 | /// the prefilter infrastructure (if at all). If you have a compelling use |

147 | /// case for exposing this routine, please create an issue. Do *not* open |

148 | /// a PR that just exposes `Pre` and friends. Exporting this routine will |

149 | /// require API design. |

150 | #[inline(always)] |

151 | pub(crate) fn find_with_prefilter( |

152 | &self, |

153 | pre: Option<Pre<'_>>, |

154 | haystack: &[u8], |

155 | needle: &[u8], |

156 | ) -> Option<usize> { |

157 | match self.0.shift { |

158 | Shift::Small { period } => { |

159 | self.find_small_imp(pre, haystack, needle, period) |

160 | } |

161 | Shift::Large { shift } => { |

162 | self.find_large_imp(pre, haystack, needle, shift) |

163 | } |

164 | } |

165 | } |

166 | |

167 | // Each of the two search implementations below can be accelerated by a |

168 | // prefilter, but it is not always enabled. To avoid its overhead when |

169 | // its disabled, we explicitly inline each search implementation based on |

170 | // whether a prefilter will be used or not. The decision on which to use |

171 | // is made in the parent meta searcher. |

172 | |

173 | #[inline(always)] |

174 | fn find_small_imp( |

175 | &self, |

176 | mut pre: Option<Pre<'_>>, |

177 | haystack: &[u8], |

178 | needle: &[u8], |

179 | period: usize, |

180 | ) -> Option<usize> { |

181 | let mut pos = 0; |

182 | let mut shift = 0; |

183 | let last_byte_pos = match needle.len().checked_sub(1) { |

184 | None => return Some(pos), |

185 | Some(last_byte) => last_byte, |

186 | }; |

187 | while pos + needle.len() <= haystack.len() { |

188 | let mut i = cmp::max(self.0.critical_pos, shift); |

189 | if let Some(pre) = pre.as_mut() { |

190 | if pre.is_effective() { |

191 | pos += pre.find(&haystack[pos..])?; |

192 | shift = 0; |

193 | i = self.0.critical_pos; |

194 | if pos + needle.len() > haystack.len() { |

195 | return None; |

196 | } |

197 | } |

198 | } |

199 | if !self.0.byteset.contains(haystack[pos + last_byte_pos]) { |

200 | pos += needle.len(); |

201 | shift = 0; |

202 | continue; |

203 | } |

204 | while i < needle.len() && needle[i] == haystack[pos + i] { |

205 | i += 1; |

206 | } |

207 | if i < needle.len() { |

208 | pos += i - self.0.critical_pos + 1; |

209 | shift = 0; |

210 | } else { |

211 | let mut j = self.0.critical_pos; |

212 | while j > shift && needle[j] == haystack[pos + j] { |

213 | j -= 1; |

214 | } |

215 | if j <= shift && needle[shift] == haystack[pos + shift] { |

216 | return Some(pos); |

217 | } |

218 | pos += period; |

219 | shift = needle.len() - period; |

220 | } |

221 | } |

222 | None |

223 | } |

224 | |

225 | #[inline(always)] |

226 | fn find_large_imp( |

227 | &self, |

228 | mut pre: Option<Pre<'_>>, |

229 | haystack: &[u8], |

230 | needle: &[u8], |

231 | shift: usize, |

232 | ) -> Option<usize> { |

233 | let mut pos = 0; |

234 | let last_byte_pos = match needle.len().checked_sub(1) { |

235 | None => return Some(pos), |

236 | Some(last_byte) => last_byte, |

237 | }; |

238 | 'outer: while pos + needle.len() <= haystack.len() { |

239 | if let Some(pre) = pre.as_mut() { |

240 | if pre.is_effective() { |

241 | pos += pre.find(&haystack[pos..])?; |

242 | if pos + needle.len() > haystack.len() { |

243 | return None; |

244 | } |

245 | } |

246 | } |

247 | |

248 | if !self.0.byteset.contains(haystack[pos + last_byte_pos]) { |

249 | pos += needle.len(); |

250 | continue; |

251 | } |

252 | let mut i = self.0.critical_pos; |

253 | while i < needle.len() && needle[i] == haystack[pos + i] { |

254 | i += 1; |

255 | } |

256 | if i < needle.len() { |

257 | pos += i - self.0.critical_pos + 1; |

258 | } else { |

259 | for j in (0..self.0.critical_pos).rev() { |

260 | if needle[j] != haystack[pos + j] { |

261 | pos += shift; |

262 | continue 'outer; |

263 | } |

264 | } |

265 | return Some(pos); |

266 | } |

267 | } |

268 | None |

269 | } |

270 | } |

271 | |

272 | impl FinderRev { |

273 | /// Create a searcher that finds occurrences of the given `needle`. |

274 | /// |

275 | /// An empty `needle` results in a match at every position in a haystack, |

276 | /// including at `haystack.len()`. |

277 | #[inline] |

278 | pub fn new(needle: &[u8]) -> FinderRev { |

279 | let byteset = ApproximateByteSet::new(needle); |

280 | let min_suffix = Suffix::reverse(needle, SuffixKind::Minimal); |

281 | let max_suffix = Suffix::reverse(needle, SuffixKind::Maximal); |

282 | let (period_lower_bound, critical_pos) = |

283 | if min_suffix.pos < max_suffix.pos { |

284 | (min_suffix.period, min_suffix.pos) |

285 | } else { |

286 | (max_suffix.period, max_suffix.pos) |

287 | }; |

288 | let shift = Shift::reverse(needle, period_lower_bound, critical_pos); |

289 | FinderRev(TwoWay { byteset, critical_pos, shift }) |

290 | } |

291 | |

292 | /// Returns the last occurrence of `needle` in the given `haystack`, or |

293 | /// `None` if no such occurrence could be found. |

294 | /// |

295 | /// The `needle` given must be the same as the `needle` provided to |

296 | /// [`FinderRev::new`]. |

297 | /// |

298 | /// An empty `needle` results in a match at every position in a haystack, |

299 | /// including at `haystack.len()`. |

300 | #[inline] |

301 | pub fn rfind(&self, haystack: &[u8], needle: &[u8]) -> Option<usize> { |

302 | // For the reverse case, we don't use a prefilter. It's plausible that |

303 | // perhaps we should, but it's a lot of additional code to do it, and |

304 | // it's not clear that it's actually worth it. If you have a really |

305 | // compelling use case for this, please file an issue. |

306 | match self.0.shift { |

307 | Shift::Small { period } => { |

308 | self.rfind_small_imp(haystack, needle, period) |

309 | } |

310 | Shift::Large { shift } => { |

311 | self.rfind_large_imp(haystack, needle, shift) |

312 | } |

313 | } |

314 | } |

315 | |

316 | #[inline(always)] |

317 | fn rfind_small_imp( |

318 | &self, |

319 | haystack: &[u8], |

320 | needle: &[u8], |

321 | period: usize, |

322 | ) -> Option<usize> { |

323 | let nlen = needle.len(); |

324 | let mut pos = haystack.len(); |

325 | let mut shift = nlen; |

326 | let first_byte = match needle.get(0) { |

327 | None => return Some(pos), |

328 | Some(&first_byte) => first_byte, |

329 | }; |

330 | while pos >= nlen { |

331 | if !self.0.byteset.contains(haystack[pos - nlen]) { |

332 | pos -= nlen; |

333 | shift = nlen; |

334 | continue; |

335 | } |

336 | let mut i = cmp::min(self.0.critical_pos, shift); |

337 | while i > 0 && needle[i - 1] == haystack[pos - nlen + i - 1] { |

338 | i -= 1; |

339 | } |

340 | if i > 0 || first_byte != haystack[pos - nlen] { |

341 | pos -= self.0.critical_pos - i + 1; |

342 | shift = nlen; |

343 | } else { |

344 | let mut j = self.0.critical_pos; |

345 | while j < shift && needle[j] == haystack[pos - nlen + j] { |

346 | j += 1; |

347 | } |

348 | if j >= shift { |

349 | return Some(pos - nlen); |

350 | } |

351 | pos -= period; |

352 | shift = period; |

353 | } |

354 | } |

355 | None |

356 | } |

357 | |

358 | #[inline(always)] |

359 | fn rfind_large_imp( |

360 | &self, |

361 | haystack: &[u8], |

362 | needle: &[u8], |

363 | shift: usize, |

364 | ) -> Option<usize> { |

365 | let nlen = needle.len(); |

366 | let mut pos = haystack.len(); |

367 | let first_byte = match needle.get(0) { |

368 | None => return Some(pos), |

369 | Some(&first_byte) => first_byte, |

370 | }; |

371 | while pos >= nlen { |

372 | if !self.0.byteset.contains(haystack[pos - nlen]) { |

373 | pos -= nlen; |

374 | continue; |

375 | } |

376 | let mut i = self.0.critical_pos; |

377 | while i > 0 && needle[i - 1] == haystack[pos - nlen + i - 1] { |

378 | i -= 1; |

379 | } |

380 | if i > 0 || first_byte != haystack[pos - nlen] { |

381 | pos -= self.0.critical_pos - i + 1; |

382 | } else { |

383 | let mut j = self.0.critical_pos; |

384 | while j < nlen && needle[j] == haystack[pos - nlen + j] { |

385 | j += 1; |

386 | } |

387 | if j == nlen { |

388 | return Some(pos - nlen); |

389 | } |

390 | pos -= shift; |

391 | } |

392 | } |

393 | None |

394 | } |

395 | } |

396 | |

397 | /// A representation of the amount we're allowed to shift by during Two-Way |

398 | /// search. |

399 | /// |

400 | /// When computing a critical factorization of the needle, we find the position |

401 | /// of the critical factorization by finding the needle's maximal (or minimal) |

402 | /// suffix, along with the period of that suffix. It turns out that the period |

403 | /// of that suffix is a lower bound on the period of the needle itself. |

404 | /// |

405 | /// This lower bound is equivalent to the actual period of the needle in |

406 | /// some cases. To describe that case, we denote the needle as `x` where |

407 | /// `x = uv` and `v` is the lexicographic maximal suffix of `v`. The lower |

408 | /// bound given here is always the period of `v`, which is `<= period(x)`. The |

409 | /// case where `period(v) == period(x)` occurs when `len(u) < (len(x) / 2)` and |

410 | /// where `u` is a suffix of `v[0..period(v)]`. |

411 | /// |

412 | /// This case is important because the search algorithm for when the |

413 | /// periods are equivalent is slightly different than the search algorithm |

414 | /// for when the periods are not equivalent. In particular, when they aren't |

415 | /// equivalent, we know that the period of the needle is no less than half its |

416 | /// length. In this case, we shift by an amount less than or equal to the |

417 | /// period of the needle (determined by the maximum length of the components |

418 | /// of the critical factorization of `x`, i.e., `max(len(u), len(v))`).. |

419 | /// |

420 | /// The above two cases are represented by the variants below. Each entails |

421 | /// a different instantiation of the Two-Way search algorithm. |

422 | /// |

423 | /// N.B. If we could find a way to compute the exact period in all cases, |

424 | /// then we could collapse this case analysis and simplify the algorithm. The |

425 | /// Two-Way paper suggests this is possible, but more reading is required to |

426 | /// grok why the authors didn't pursue that path. |

427 | #[derive(Clone, Copy, Debug)] |

428 | enum Shift { |

429 | Small { period: usize }, |

430 | Large { shift: usize }, |

431 | } |

432 | |

433 | impl Shift { |

434 | /// Compute the shift for a given needle in the forward direction. |

435 | /// |

436 | /// This requires a lower bound on the period and a critical position. |

437 | /// These can be computed by extracting both the minimal and maximal |

438 | /// lexicographic suffixes, and choosing the right-most starting position. |

439 | /// The lower bound on the period is then the period of the chosen suffix. |

440 | fn forward( |

441 | needle: &[u8], |

442 | period_lower_bound: usize, |

443 | critical_pos: usize, |

444 | ) -> Shift { |

445 | let large = cmp::max(critical_pos, needle.len() - critical_pos); |

446 | if critical_pos * 2 >= needle.len() { |

447 | return Shift::Large { shift: large }; |

448 | } |

449 | |

450 | let (u, v) = needle.split_at(critical_pos); |

451 | if !is_suffix(&v[..period_lower_bound], u) { |

452 | return Shift::Large { shift: large }; |

453 | } |

454 | Shift::Small { period: period_lower_bound } |

455 | } |

456 | |

457 | /// Compute the shift for a given needle in the reverse direction. |

458 | /// |

459 | /// This requires a lower bound on the period and a critical position. |

460 | /// These can be computed by extracting both the minimal and maximal |

461 | /// lexicographic suffixes, and choosing the left-most starting position. |

462 | /// The lower bound on the period is then the period of the chosen suffix. |

463 | fn reverse( |

464 | needle: &[u8], |

465 | period_lower_bound: usize, |

466 | critical_pos: usize, |

467 | ) -> Shift { |

468 | let large = cmp::max(critical_pos, needle.len() - critical_pos); |

469 | if (needle.len() - critical_pos) * 2 >= needle.len() { |

470 | return Shift::Large { shift: large }; |

471 | } |

472 | |

473 | let (v, u) = needle.split_at(critical_pos); |

474 | if !is_prefix(&v[v.len() - period_lower_bound..], u) { |

475 | return Shift::Large { shift: large }; |

476 | } |

477 | Shift::Small { period: period_lower_bound } |

478 | } |

479 | } |

480 | |

481 | /// A suffix extracted from a needle along with its period. |

482 | #[derive(Debug)] |

483 | struct Suffix { |

484 | /// The starting position of this suffix. |

485 | /// |

486 | /// If this is a forward suffix, then `&bytes[pos..]` can be used. If this |

487 | /// is a reverse suffix, then `&bytes[..pos]` can be used. That is, for |

488 | /// forward suffixes, this is an inclusive starting position, where as for |

489 | /// reverse suffixes, this is an exclusive ending position. |

490 | pos: usize, |

491 | /// The period of this suffix. |

492 | /// |

493 | /// Note that this is NOT necessarily the period of the string from which |

494 | /// this suffix comes from. (It is always less than or equal to the period |

495 | /// of the original string.) |

496 | period: usize, |

497 | } |

498 | |

499 | impl Suffix { |

500 | fn forward(needle: &[u8], kind: SuffixKind) -> Suffix { |

501 | // suffix represents our maximal (or minimal) suffix, along with |

502 | // its period. |

503 | let mut suffix = Suffix { pos: 0, period: 1 }; |

504 | // The start of a suffix in `needle` that we are considering as a |

505 | // more maximal (or minimal) suffix than what's in `suffix`. |

506 | let mut candidate_start = 1; |

507 | // The current offset of our suffixes that we're comparing. |

508 | // |

509 | // When the characters at this offset are the same, then we mush on |

510 | // to the next position since no decision is possible. When the |

511 | // candidate's character is greater (or lesser) than the corresponding |

512 | // character than our current maximal (or minimal) suffix, then the |

513 | // current suffix is changed over to the candidate and we restart our |

514 | // search. Otherwise, the candidate suffix is no good and we restart |

515 | // our search on the next candidate. |

516 | // |

517 | // The three cases above correspond to the three cases in the loop |

518 | // below. |

519 | let mut offset = 0; |

520 | |

521 | while candidate_start + offset < needle.len() { |

522 | let current = needle[suffix.pos + offset]; |

523 | let candidate = needle[candidate_start + offset]; |

524 | match kind.cmp(current, candidate) { |

525 | SuffixOrdering::Accept => { |

526 | suffix = Suffix { pos: candidate_start, period: 1 }; |

527 | candidate_start += 1; |

528 | offset = 0; |

529 | } |

530 | SuffixOrdering::Skip => { |

531 | candidate_start += offset + 1; |

532 | offset = 0; |

533 | suffix.period = candidate_start - suffix.pos; |

534 | } |

535 | SuffixOrdering::Push => { |

536 | if offset + 1 == suffix.period { |

537 | candidate_start += suffix.period; |

538 | offset = 0; |

539 | } else { |

540 | offset += 1; |

541 | } |

542 | } |

543 | } |

544 | } |

545 | suffix |

546 | } |

547 | |

548 | fn reverse(needle: &[u8], kind: SuffixKind) -> Suffix { |

549 | // See the comments in `forward` for how this works. |

550 | let mut suffix = Suffix { pos: needle.len(), period: 1 }; |

551 | if needle.len() == 1 { |

552 | return suffix; |

553 | } |

554 | let mut candidate_start = match needle.len().checked_sub(1) { |

555 | None => return suffix, |

556 | Some(candidate_start) => candidate_start, |

557 | }; |

558 | let mut offset = 0; |

559 | |

560 | while offset < candidate_start { |

561 | let current = needle[suffix.pos - offset - 1]; |

562 | let candidate = needle[candidate_start - offset - 1]; |

563 | match kind.cmp(current, candidate) { |

564 | SuffixOrdering::Accept => { |

565 | suffix = Suffix { pos: candidate_start, period: 1 }; |

566 | candidate_start -= 1; |

567 | offset = 0; |

568 | } |

569 | SuffixOrdering::Skip => { |

570 | candidate_start -= offset + 1; |

571 | offset = 0; |

572 | suffix.period = suffix.pos - candidate_start; |

573 | } |

574 | SuffixOrdering::Push => { |

575 | if offset + 1 == suffix.period { |

576 | candidate_start -= suffix.period; |

577 | offset = 0; |

578 | } else { |

579 | offset += 1; |

580 | } |

581 | } |

582 | } |

583 | } |

584 | suffix |

585 | } |

586 | } |

587 | |

588 | /// The kind of suffix to extract. |

589 | #[derive(Clone, Copy, Debug)] |

590 | enum SuffixKind { |

591 | /// Extract the smallest lexicographic suffix from a string. |

592 | /// |

593 | /// Technically, this doesn't actually pick the smallest lexicographic |

594 | /// suffix. e.g., Given the choice between `a` and `aa`, this will choose |

595 | /// the latter over the former, even though `a < aa`. The reasoning for |

596 | /// this isn't clear from the paper, but it still smells like a minimal |

597 | /// suffix. |

598 | Minimal, |

599 | /// Extract the largest lexicographic suffix from a string. |

600 | /// |

601 | /// Unlike `Minimal`, this really does pick the maximum suffix. e.g., Given |

602 | /// the choice between `z` and `zz`, this will choose the latter over the |

603 | /// former. |

604 | Maximal, |

605 | } |

606 | |

607 | /// The result of comparing corresponding bytes between two suffixes. |

608 | #[derive(Clone, Copy, Debug)] |

609 | enum SuffixOrdering { |

610 | /// This occurs when the given candidate byte indicates that the candidate |

611 | /// suffix is better than the current maximal (or minimal) suffix. That is, |

612 | /// the current candidate suffix should supplant the current maximal (or |

613 | /// minimal) suffix. |

614 | Accept, |

615 | /// This occurs when the given candidate byte excludes the candidate suffix |

616 | /// from being better than the current maximal (or minimal) suffix. That |

617 | /// is, the current candidate suffix should be dropped and the next one |

618 | /// should be considered. |

619 | Skip, |

620 | /// This occurs when no decision to accept or skip the candidate suffix |

621 | /// can be made, e.g., when corresponding bytes are equivalent. In this |

622 | /// case, the next corresponding bytes should be compared. |

623 | Push, |

624 | } |

625 | |

626 | impl SuffixKind { |

627 | /// Returns true if and only if the given candidate byte indicates that |

628 | /// it should replace the current suffix as the maximal (or minimal) |

629 | /// suffix. |

630 | fn cmp(self, current: u8, candidate: u8) -> SuffixOrdering { |

631 | use self::SuffixOrdering::*; |

632 | |

633 | match self { |

634 | SuffixKind::Minimal if candidate < current => Accept, |

635 | SuffixKind::Minimal if candidate > current => Skip, |

636 | SuffixKind::Minimal => Push, |

637 | SuffixKind::Maximal if candidate > current => Accept, |

638 | SuffixKind::Maximal if candidate < current => Skip, |

639 | SuffixKind::Maximal => Push, |

640 | } |

641 | } |

642 | } |

643 | |

644 | /// A bitset used to track whether a particular byte exists in a needle or not. |

645 | /// |

646 | /// Namely, bit 'i' is set if and only if byte%64==i for any byte in the |

647 | /// needle. If a particular byte in the haystack is NOT in this set, then one |

648 | /// can conclude that it is also not in the needle, and thus, one can advance |

649 | /// in the haystack by needle.len() bytes. |

650 | #[derive(Clone, Copy, Debug)] |

651 | struct ApproximateByteSet(u64); |

652 | |

653 | impl ApproximateByteSet { |

654 | /// Create a new set from the given needle. |

655 | fn new(needle: &[u8]) -> ApproximateByteSet { |

656 | let mut bits = 0; |

657 | for &b in needle { |

658 | bits |= 1 << (b % 64); |

659 | } |

660 | ApproximateByteSet(bits) |

661 | } |

662 | |

663 | /// Return true if and only if the given byte might be in this set. This |

664 | /// may return a false positive, but will never return a false negative. |

665 | #[inline(always)] |

666 | fn contains(&self, byte: u8) -> bool { |

667 | self.0 & (1 << (byte % 64)) != 0 |

668 | } |

669 | } |

670 | |

671 | #[cfg(test)] |

672 | mod tests { |

673 | use alloc::vec::Vec; |

674 | |

675 | use super::*; |

676 | |

677 | /// Convenience wrapper for computing the suffix as a byte string. |

678 | fn get_suffix_forward(needle: &[u8], kind: SuffixKind) -> (&[u8], usize) { |

679 | let s = Suffix::forward(needle, kind); |

680 | (&needle[s.pos..], s.period) |

681 | } |

682 | |

683 | /// Convenience wrapper for computing the reverse suffix as a byte string. |

684 | fn get_suffix_reverse(needle: &[u8], kind: SuffixKind) -> (&[u8], usize) { |

685 | let s = Suffix::reverse(needle, kind); |

686 | (&needle[..s.pos], s.period) |

687 | } |

688 | |

689 | /// Return all of the non-empty suffixes in the given byte string. |

690 | fn suffixes(bytes: &[u8]) -> Vec<&[u8]> { |

691 | (0..bytes.len()).map(|i| &bytes[i..]).collect() |

692 | } |

693 | |

694 | /// Return the lexicographically maximal suffix of the given byte string. |

695 | fn naive_maximal_suffix_forward(needle: &[u8]) -> &[u8] { |

696 | let mut sufs = suffixes(needle); |

697 | sufs.sort(); |

698 | sufs.pop().unwrap() |

699 | } |

700 | |

701 | /// Return the lexicographically maximal suffix of the reverse of the given |

702 | /// byte string. |

703 | fn naive_maximal_suffix_reverse(needle: &[u8]) -> Vec<u8> { |

704 | let mut reversed = needle.to_vec(); |

705 | reversed.reverse(); |

706 | let mut got = naive_maximal_suffix_forward(&reversed).to_vec(); |

707 | got.reverse(); |

708 | got |

709 | } |

710 | |

711 | define_substring_forward_quickcheck!(|h, n| Some( |

712 | Finder::new(n).find(h, n) |

713 | )); |

714 | define_substring_reverse_quickcheck!(|h, n| Some( |

715 | FinderRev::new(n).rfind(h, n) |

716 | )); |

717 | |

718 | #[test] |

719 | fn forward() { |

720 | crate::tests::substring::Runner::new() |

721 | .fwd(|h, n| Some(Finder::new(n).find(h, n))) |

722 | .run(); |

723 | } |

724 | |

725 | #[test] |

726 | fn reverse() { |

727 | crate::tests::substring::Runner::new() |

728 | .rev(|h, n| Some(FinderRev::new(n).rfind(h, n))) |

729 | .run(); |

730 | } |

731 | |

732 | #[test] |

733 | fn suffix_forward() { |

734 | macro_rules! assert_suffix_min { |

735 | ($given:expr, $expected:expr, $period:expr) => { |

736 | let (got_suffix, got_period) = |

737 | get_suffix_forward($given.as_bytes(), SuffixKind::Minimal); |

738 | let got_suffix = core::str::from_utf8(got_suffix).unwrap(); |

739 | assert_eq!(($expected, $period), (got_suffix, got_period)); |

740 | }; |

741 | } |

742 | |

743 | macro_rules! assert_suffix_max { |

744 | ($given:expr, $expected:expr, $period:expr) => { |

745 | let (got_suffix, got_period) = |

746 | get_suffix_forward($given.as_bytes(), SuffixKind::Maximal); |

747 | let got_suffix = core::str::from_utf8(got_suffix).unwrap(); |

748 | assert_eq!(($expected, $period), (got_suffix, got_period)); |

749 | }; |

750 | } |

751 | |

752 | assert_suffix_min!("a", "a", 1); |

753 | assert_suffix_max!("a", "a", 1); |

754 | |

755 | assert_suffix_min!("ab", "ab", 2); |

756 | assert_suffix_max!("ab", "b", 1); |

757 | |

758 | assert_suffix_min!("ba", "a", 1); |

759 | assert_suffix_max!("ba", "ba", 2); |

760 | |

761 | assert_suffix_min!("abc", "abc", 3); |

762 | assert_suffix_max!("abc", "c", 1); |

763 | |

764 | assert_suffix_min!("acb", "acb", 3); |

765 | assert_suffix_max!("acb", "cb", 2); |

766 | |

767 | assert_suffix_min!("cba", "a", 1); |

768 | assert_suffix_max!("cba", "cba", 3); |

769 | |

770 | assert_suffix_min!("abcabc", "abcabc", 3); |

771 | assert_suffix_max!("abcabc", "cabc", 3); |

772 | |

773 | assert_suffix_min!("abcabcabc", "abcabcabc", 3); |

774 | assert_suffix_max!("abcabcabc", "cabcabc", 3); |

775 | |

776 | assert_suffix_min!("abczz", "abczz", 5); |

777 | assert_suffix_max!("abczz", "zz", 1); |

778 | |

779 | assert_suffix_min!("zzabc", "abc", 3); |

780 | assert_suffix_max!("zzabc", "zzabc", 5); |

781 | |

782 | assert_suffix_min!("aaa", "aaa", 1); |

783 | assert_suffix_max!("aaa", "aaa", 1); |

784 | |

785 | assert_suffix_min!("foobar", "ar", 2); |

786 | assert_suffix_max!("foobar", "r", 1); |

787 | } |

788 | |

789 | #[test] |

790 | fn suffix_reverse() { |

791 | macro_rules! assert_suffix_min { |

792 | ($given:expr, $expected:expr, $period:expr) => { |

793 | let (got_suffix, got_period) = |

794 | get_suffix_reverse($given.as_bytes(), SuffixKind::Minimal); |

795 | let got_suffix = core::str::from_utf8(got_suffix).unwrap(); |

796 | assert_eq!(($expected, $period), (got_suffix, got_period)); |

797 | }; |

798 | } |

799 | |

800 | macro_rules! assert_suffix_max { |

801 | ($given:expr, $expected:expr, $period:expr) => { |

802 | let (got_suffix, got_period) = |

803 | get_suffix_reverse($given.as_bytes(), SuffixKind::Maximal); |

804 | let got_suffix = core::str::from_utf8(got_suffix).unwrap(); |

805 | assert_eq!(($expected, $period), (got_suffix, got_period)); |

806 | }; |

807 | } |

808 | |

809 | assert_suffix_min!("a", "a", 1); |

810 | assert_suffix_max!("a", "a", 1); |

811 | |

812 | assert_suffix_min!("ab", "a", 1); |

813 | assert_suffix_max!("ab", "ab", 2); |

814 | |

815 | assert_suffix_min!("ba", "ba", 2); |

816 | assert_suffix_max!("ba", "b", 1); |

817 | |

818 | assert_suffix_min!("abc", "a", 1); |

819 | assert_suffix_max!("abc", "abc", 3); |

820 | |

821 | assert_suffix_min!("acb", "a", 1); |

822 | assert_suffix_max!("acb", "ac", 2); |

823 | |

824 | assert_suffix_min!("cba", "cba", 3); |

825 | assert_suffix_max!("cba", "c", 1); |

826 | |

827 | assert_suffix_min!("abcabc", "abca", 3); |

828 | assert_suffix_max!("abcabc", "abcabc", 3); |

829 | |

830 | assert_suffix_min!("abcabcabc", "abcabca", 3); |

831 | assert_suffix_max!("abcabcabc", "abcabcabc", 3); |

832 | |

833 | assert_suffix_min!("abczz", "a", 1); |

834 | assert_suffix_max!("abczz", "abczz", 5); |

835 | |

836 | assert_suffix_min!("zzabc", "zza", 3); |

837 | assert_suffix_max!("zzabc", "zz", 1); |

838 | |

839 | assert_suffix_min!("aaa", "aaa", 1); |

840 | assert_suffix_max!("aaa", "aaa", 1); |

841 | } |

842 | |

843 | #[cfg(not(miri))] |

844 | quickcheck::quickcheck! { |

845 | fn qc_suffix_forward_maximal(bytes: Vec<u8>) -> bool { |

846 | if bytes.is_empty() { |

847 | return true; |

848 | } |

849 | |

850 | let (got, _) = get_suffix_forward(&bytes, SuffixKind::Maximal); |

851 | let expected = naive_maximal_suffix_forward(&bytes); |

852 | got == expected |

853 | } |

854 | |

855 | fn qc_suffix_reverse_maximal(bytes: Vec<u8>) -> bool { |

856 | if bytes.is_empty() { |

857 | return true; |

858 | } |

859 | |

860 | let (got, _) = get_suffix_reverse(&bytes, SuffixKind::Maximal); |

861 | let expected = naive_maximal_suffix_reverse(&bytes); |

862 | expected == got |

863 | } |

864 | } |

865 | |

866 | // This is a regression test caught by quickcheck that exercised a bug in |

867 | // the reverse small period handling. The bug was that we were using 'if j |

868 | // == shift' to determine if a match occurred, but the correct guard is 'if |

869 | // j >= shift', which matches the corresponding guard in the forward impl. |

870 | #[test] |

871 | fn regression_rev_small_period() { |

872 | let rfind = |h, n| FinderRev::new(n).rfind(h, n); |

873 | let haystack = "ababaz"; |

874 | let needle = "abab"; |

875 | assert_eq!(Some(0), rfind(haystack.as_bytes(), needle.as_bytes())); |

876 | } |

877 | } |

878 |