1/*!
2An implementation of the [Two-Way substring search algorithm][two-way].
3
4[`Finder`] can be built for forward searches, while [`FinderRev`] can be built
5for reverse searches.
6
7Two-Way makes for a nice general purpose substring search algorithm because of
8its time and space complexity properties. It also performs well in practice.
9Namely, with `m = len(needle)` and `n = len(haystack)`, Two-Way takes `O(m)`
10time to create a finder, `O(1)` space and `O(n)` search time. In other words,
11the preprocessing step is quick, doesn't require any heap memory and the worst
12case search time is guaranteed to be linear in the haystack regardless of the
13size of the needle.
14
15While vector algorithms will usually beat Two-Way handedly, vector algorithms
16also usually have pathological or edge cases that are better handled by Two-Way.
17Moreover, not all targets support vector algorithms or implementations for them
18simply may not exist yet.
19
20Two-Way can be found in the `memmem` implementations in at least [GNU libc] and
21[musl].
22
23[two-way]: https://en.wikipedia.org/wiki/Two-way_string-matching_algorithm
24[GNU libc]: https://www.gnu.org/software/libc/
25[musl]: https://www.musl-libc.org/
26*/
27
28use core::cmp;
29
30use crate::{
31 arch::all::{is_prefix, is_suffix},
32 memmem::Pre,
33};
34
35/// A forward substring searcher that uses the Two-Way algorithm.
36#[derive(Clone, Copy, Debug)]
37pub struct Finder(TwoWay);
38
39/// A reverse substring searcher that uses the Two-Way algorithm.
40#[derive(Clone, Copy, Debug)]
41pub struct FinderRev(TwoWay);
42
43/// An implementation of the TwoWay substring search algorithm.
44///
45/// This searcher supports forward and reverse search, although not
46/// simultaneously. It runs in `O(n + m)` time and `O(1)` space, where
47/// `n ~ len(needle)` and `m ~ len(haystack)`.
48///
49/// The implementation here roughly matches that which was developed by
50/// Crochemore and Perrin in their 1991 paper "Two-way string-matching." The
51/// changes in this implementation are 1) the use of zero-based indices, 2) a
52/// heuristic skip table based on the last byte (borrowed from Rust's standard
53/// library) and 3) the addition of heuristics for a fast skip loop. For (3),
54/// callers can pass any kind of prefilter they want, but usually it's one
55/// based on a heuristic that uses an approximate background frequency of bytes
56/// to choose rare bytes to quickly look for candidate match positions. Note
57/// though that currently, this prefilter functionality is not exposed directly
58/// in the public API. (File an issue if you want it and provide a use case
59/// please.)
60///
61/// The heuristic for fast skipping is automatically shut off if it's
62/// detected to be ineffective at search time. Generally, this only occurs in
63/// pathological cases. But this is generally necessary in order to preserve
64/// a `O(n + m)` time bound.
65///
66/// The code below is fairly complex and not obviously correct at all. It's
67/// likely necessary to read the Two-Way paper cited above in order to fully
68/// grok this code. The essence of it is:
69///
70/// 1. Do something to detect a "critical" position in the needle.
71/// 2. For the current position in the haystack, look if `needle[critical..]`
72/// matches at that position.
73/// 3. If so, look if `needle[..critical]` matches.
74/// 4. If a mismatch occurs, shift the search by some amount based on the
75/// critical position and a pre-computed shift.
76///
77/// This type is wrapped in the forward and reverse finders that expose
78/// consistent forward or reverse APIs.
79#[derive(Clone, Copy, Debug)]
80struct TwoWay {
81 /// A small bitset used as a quick prefilter (in addition to any prefilter
82 /// given by the caller). Namely, a bit `i` is set if and only if `b%64==i`
83 /// for any `b == needle[i]`.
84 ///
85 /// When used as a prefilter, if the last byte at the current candidate
86 /// position is NOT in this set, then we can skip that entire candidate
87 /// position (the length of the needle). This is essentially the shift
88 /// trick found in Boyer-Moore, but only applied to bytes that don't appear
89 /// in the needle.
90 ///
91 /// N.B. This trick was inspired by something similar in std's
92 /// implementation of Two-Way.
93 byteset: ApproximateByteSet,
94 /// A critical position in needle. Specifically, this position corresponds
95 /// to beginning of either the minimal or maximal suffix in needle. (N.B.
96 /// See SuffixType below for why "minimal" isn't quite the correct word
97 /// here.)
98 ///
99 /// This is the position at which every search begins. Namely, search
100 /// starts by scanning text to the right of this position, and only if
101 /// there's a match does the text to the left of this position get scanned.
102 critical_pos: usize,
103 /// The amount we shift by in the Two-Way search algorithm. This
104 /// corresponds to the "small period" and "large period" cases.
105 shift: Shift,
106}
107
108impl Finder {
109 /// Create a searcher that finds occurrences of the given `needle`.
110 ///
111 /// An empty `needle` results in a match at every position in a haystack,
112 /// including at `haystack.len()`.
113 #[inline]
114 pub fn new(needle: &[u8]) -> Finder {
115 let byteset = ApproximateByteSet::new(needle);
116 let min_suffix = Suffix::forward(needle, SuffixKind::Minimal);
117 let max_suffix = Suffix::forward(needle, SuffixKind::Maximal);
118 let (period_lower_bound, critical_pos) =
119 if min_suffix.pos > max_suffix.pos {
120 (min_suffix.period, min_suffix.pos)
121 } else {
122 (max_suffix.period, max_suffix.pos)
123 };
124 let shift = Shift::forward(needle, period_lower_bound, critical_pos);
125 Finder(TwoWay { byteset, critical_pos, shift })
126 }
127
128 /// Returns the first occurrence of `needle` in the given `haystack`, or
129 /// `None` if no such occurrence could be found.
130 ///
131 /// The `needle` given must be the same as the `needle` provided to
132 /// [`Finder::new`].
133 ///
134 /// An empty `needle` results in a match at every position in a haystack,
135 /// including at `haystack.len()`.
136 #[inline]
137 pub fn find(&self, haystack: &[u8], needle: &[u8]) -> Option<usize> {
138 self.find_with_prefilter(None, haystack, needle)
139 }
140
141 /// This is like [`Finder::find`], but it accepts a prefilter for
142 /// accelerating searches.
143 ///
144 /// Currently this is not exposed in the public API because, at the time
145 /// of writing, I didn't want to spend time thinking about how to expose
146 /// the prefilter infrastructure (if at all). If you have a compelling use
147 /// case for exposing this routine, please create an issue. Do *not* open
148 /// a PR that just exposes `Pre` and friends. Exporting this routine will
149 /// require API design.
150 #[inline(always)]
151 pub(crate) fn find_with_prefilter(
152 &self,
153 pre: Option<Pre<'_>>,
154 haystack: &[u8],
155 needle: &[u8],
156 ) -> Option<usize> {
157 match self.0.shift {
158 Shift::Small { period } => {
159 self.find_small_imp(pre, haystack, needle, period)
160 }
161 Shift::Large { shift } => {
162 self.find_large_imp(pre, haystack, needle, shift)
163 }
164 }
165 }
166
167 // Each of the two search implementations below can be accelerated by a
168 // prefilter, but it is not always enabled. To avoid its overhead when
169 // its disabled, we explicitly inline each search implementation based on
170 // whether a prefilter will be used or not. The decision on which to use
171 // is made in the parent meta searcher.
172
173 #[inline(always)]
174 fn find_small_imp(
175 &self,
176 mut pre: Option<Pre<'_>>,
177 haystack: &[u8],
178 needle: &[u8],
179 period: usize,
180 ) -> Option<usize> {
181 let mut pos = 0;
182 let mut shift = 0;
183 let last_byte_pos = match needle.len().checked_sub(1) {
184 None => return Some(pos),
185 Some(last_byte) => last_byte,
186 };
187 while pos + needle.len() <= haystack.len() {
188 let mut i = cmp::max(self.0.critical_pos, shift);
189 if let Some(pre) = pre.as_mut() {
190 if pre.is_effective() {
191 pos += pre.find(&haystack[pos..])?;
192 shift = 0;
193 i = self.0.critical_pos;
194 if pos + needle.len() > haystack.len() {
195 return None;
196 }
197 }
198 }
199 if !self.0.byteset.contains(haystack[pos + last_byte_pos]) {
200 pos += needle.len();
201 shift = 0;
202 continue;
203 }
204 while i < needle.len() && needle[i] == haystack[pos + i] {
205 i += 1;
206 }
207 if i < needle.len() {
208 pos += i - self.0.critical_pos + 1;
209 shift = 0;
210 } else {
211 let mut j = self.0.critical_pos;
212 while j > shift && needle[j] == haystack[pos + j] {
213 j -= 1;
214 }
215 if j <= shift && needle[shift] == haystack[pos + shift] {
216 return Some(pos);
217 }
218 pos += period;
219 shift = needle.len() - period;
220 }
221 }
222 None
223 }
224
225 #[inline(always)]
226 fn find_large_imp(
227 &self,
228 mut pre: Option<Pre<'_>>,
229 haystack: &[u8],
230 needle: &[u8],
231 shift: usize,
232 ) -> Option<usize> {
233 let mut pos = 0;
234 let last_byte_pos = match needle.len().checked_sub(1) {
235 None => return Some(pos),
236 Some(last_byte) => last_byte,
237 };
238 'outer: while pos + needle.len() <= haystack.len() {
239 if let Some(pre) = pre.as_mut() {
240 if pre.is_effective() {
241 pos += pre.find(&haystack[pos..])?;
242 if pos + needle.len() > haystack.len() {
243 return None;
244 }
245 }
246 }
247
248 if !self.0.byteset.contains(haystack[pos + last_byte_pos]) {
249 pos += needle.len();
250 continue;
251 }
252 let mut i = self.0.critical_pos;
253 while i < needle.len() && needle[i] == haystack[pos + i] {
254 i += 1;
255 }
256 if i < needle.len() {
257 pos += i - self.0.critical_pos + 1;
258 } else {
259 for j in (0..self.0.critical_pos).rev() {
260 if needle[j] != haystack[pos + j] {
261 pos += shift;
262 continue 'outer;
263 }
264 }
265 return Some(pos);
266 }
267 }
268 None
269 }
270}
271
272impl FinderRev {
273 /// Create a searcher that finds occurrences of the given `needle`.
274 ///
275 /// An empty `needle` results in a match at every position in a haystack,
276 /// including at `haystack.len()`.
277 #[inline]
278 pub fn new(needle: &[u8]) -> FinderRev {
279 let byteset = ApproximateByteSet::new(needle);
280 let min_suffix = Suffix::reverse(needle, SuffixKind::Minimal);
281 let max_suffix = Suffix::reverse(needle, SuffixKind::Maximal);
282 let (period_lower_bound, critical_pos) =
283 if min_suffix.pos < max_suffix.pos {
284 (min_suffix.period, min_suffix.pos)
285 } else {
286 (max_suffix.period, max_suffix.pos)
287 };
288 let shift = Shift::reverse(needle, period_lower_bound, critical_pos);
289 FinderRev(TwoWay { byteset, critical_pos, shift })
290 }
291
292 /// Returns the last occurrence of `needle` in the given `haystack`, or
293 /// `None` if no such occurrence could be found.
294 ///
295 /// The `needle` given must be the same as the `needle` provided to
296 /// [`FinderRev::new`].
297 ///
298 /// An empty `needle` results in a match at every position in a haystack,
299 /// including at `haystack.len()`.
300 #[inline]
301 pub fn rfind(&self, haystack: &[u8], needle: &[u8]) -> Option<usize> {
302 // For the reverse case, we don't use a prefilter. It's plausible that
303 // perhaps we should, but it's a lot of additional code to do it, and
304 // it's not clear that it's actually worth it. If you have a really
305 // compelling use case for this, please file an issue.
306 match self.0.shift {
307 Shift::Small { period } => {
308 self.rfind_small_imp(haystack, needle, period)
309 }
310 Shift::Large { shift } => {
311 self.rfind_large_imp(haystack, needle, shift)
312 }
313 }
314 }
315
316 #[inline(always)]
317 fn rfind_small_imp(
318 &self,
319 haystack: &[u8],
320 needle: &[u8],
321 period: usize,
322 ) -> Option<usize> {
323 let nlen = needle.len();
324 let mut pos = haystack.len();
325 let mut shift = nlen;
326 let first_byte = match needle.get(0) {
327 None => return Some(pos),
328 Some(&first_byte) => first_byte,
329 };
330 while pos >= nlen {
331 if !self.0.byteset.contains(haystack[pos - nlen]) {
332 pos -= nlen;
333 shift = nlen;
334 continue;
335 }
336 let mut i = cmp::min(self.0.critical_pos, shift);
337 while i > 0 && needle[i - 1] == haystack[pos - nlen + i - 1] {
338 i -= 1;
339 }
340 if i > 0 || first_byte != haystack[pos - nlen] {
341 pos -= self.0.critical_pos - i + 1;
342 shift = nlen;
343 } else {
344 let mut j = self.0.critical_pos;
345 while j < shift && needle[j] == haystack[pos - nlen + j] {
346 j += 1;
347 }
348 if j >= shift {
349 return Some(pos - nlen);
350 }
351 pos -= period;
352 shift = period;
353 }
354 }
355 None
356 }
357
358 #[inline(always)]
359 fn rfind_large_imp(
360 &self,
361 haystack: &[u8],
362 needle: &[u8],
363 shift: usize,
364 ) -> Option<usize> {
365 let nlen = needle.len();
366 let mut pos = haystack.len();
367 let first_byte = match needle.get(0) {
368 None => return Some(pos),
369 Some(&first_byte) => first_byte,
370 };
371 while pos >= nlen {
372 if !self.0.byteset.contains(haystack[pos - nlen]) {
373 pos -= nlen;
374 continue;
375 }
376 let mut i = self.0.critical_pos;
377 while i > 0 && needle[i - 1] == haystack[pos - nlen + i - 1] {
378 i -= 1;
379 }
380 if i > 0 || first_byte != haystack[pos - nlen] {
381 pos -= self.0.critical_pos - i + 1;
382 } else {
383 let mut j = self.0.critical_pos;
384 while j < nlen && needle[j] == haystack[pos - nlen + j] {
385 j += 1;
386 }
387 if j == nlen {
388 return Some(pos - nlen);
389 }
390 pos -= shift;
391 }
392 }
393 None
394 }
395}
396
397/// A representation of the amount we're allowed to shift by during Two-Way
398/// search.
399///
400/// When computing a critical factorization of the needle, we find the position
401/// of the critical factorization by finding the needle's maximal (or minimal)
402/// suffix, along with the period of that suffix. It turns out that the period
403/// of that suffix is a lower bound on the period of the needle itself.
404///
405/// This lower bound is equivalent to the actual period of the needle in
406/// some cases. To describe that case, we denote the needle as `x` where
407/// `x = uv` and `v` is the lexicographic maximal suffix of `v`. The lower
408/// bound given here is always the period of `v`, which is `<= period(x)`. The
409/// case where `period(v) == period(x)` occurs when `len(u) < (len(x) / 2)` and
410/// where `u` is a suffix of `v[0..period(v)]`.
411///
412/// This case is important because the search algorithm for when the
413/// periods are equivalent is slightly different than the search algorithm
414/// for when the periods are not equivalent. In particular, when they aren't
415/// equivalent, we know that the period of the needle is no less than half its
416/// length. In this case, we shift by an amount less than or equal to the
417/// period of the needle (determined by the maximum length of the components
418/// of the critical factorization of `x`, i.e., `max(len(u), len(v))`)..
419///
420/// The above two cases are represented by the variants below. Each entails
421/// a different instantiation of the Two-Way search algorithm.
422///
423/// N.B. If we could find a way to compute the exact period in all cases,
424/// then we could collapse this case analysis and simplify the algorithm. The
425/// Two-Way paper suggests this is possible, but more reading is required to
426/// grok why the authors didn't pursue that path.
427#[derive(Clone, Copy, Debug)]
428enum Shift {
429 Small { period: usize },
430 Large { shift: usize },
431}
432
433impl Shift {
434 /// Compute the shift for a given needle in the forward direction.
435 ///
436 /// This requires a lower bound on the period and a critical position.
437 /// These can be computed by extracting both the minimal and maximal
438 /// lexicographic suffixes, and choosing the right-most starting position.
439 /// The lower bound on the period is then the period of the chosen suffix.
440 fn forward(
441 needle: &[u8],
442 period_lower_bound: usize,
443 critical_pos: usize,
444 ) -> Shift {
445 let large = cmp::max(critical_pos, needle.len() - critical_pos);
446 if critical_pos * 2 >= needle.len() {
447 return Shift::Large { shift: large };
448 }
449
450 let (u, v) = needle.split_at(critical_pos);
451 if !is_suffix(&v[..period_lower_bound], u) {
452 return Shift::Large { shift: large };
453 }
454 Shift::Small { period: period_lower_bound }
455 }
456
457 /// Compute the shift for a given needle in the reverse direction.
458 ///
459 /// This requires a lower bound on the period and a critical position.
460 /// These can be computed by extracting both the minimal and maximal
461 /// lexicographic suffixes, and choosing the left-most starting position.
462 /// The lower bound on the period is then the period of the chosen suffix.
463 fn reverse(
464 needle: &[u8],
465 period_lower_bound: usize,
466 critical_pos: usize,
467 ) -> Shift {
468 let large = cmp::max(critical_pos, needle.len() - critical_pos);
469 if (needle.len() - critical_pos) * 2 >= needle.len() {
470 return Shift::Large { shift: large };
471 }
472
473 let (v, u) = needle.split_at(critical_pos);
474 if !is_prefix(&v[v.len() - period_lower_bound..], u) {
475 return Shift::Large { shift: large };
476 }
477 Shift::Small { period: period_lower_bound }
478 }
479}
480
481/// A suffix extracted from a needle along with its period.
482#[derive(Debug)]
483struct Suffix {
484 /// The starting position of this suffix.
485 ///
486 /// If this is a forward suffix, then `&bytes[pos..]` can be used. If this
487 /// is a reverse suffix, then `&bytes[..pos]` can be used. That is, for
488 /// forward suffixes, this is an inclusive starting position, where as for
489 /// reverse suffixes, this is an exclusive ending position.
490 pos: usize,
491 /// The period of this suffix.
492 ///
493 /// Note that this is NOT necessarily the period of the string from which
494 /// this suffix comes from. (It is always less than or equal to the period
495 /// of the original string.)
496 period: usize,
497}
498
499impl Suffix {
500 fn forward(needle: &[u8], kind: SuffixKind) -> Suffix {
501 // suffix represents our maximal (or minimal) suffix, along with
502 // its period.
503 let mut suffix = Suffix { pos: 0, period: 1 };
504 // The start of a suffix in `needle` that we are considering as a
505 // more maximal (or minimal) suffix than what's in `suffix`.
506 let mut candidate_start = 1;
507 // The current offset of our suffixes that we're comparing.
508 //
509 // When the characters at this offset are the same, then we mush on
510 // to the next position since no decision is possible. When the
511 // candidate's character is greater (or lesser) than the corresponding
512 // character than our current maximal (or minimal) suffix, then the
513 // current suffix is changed over to the candidate and we restart our
514 // search. Otherwise, the candidate suffix is no good and we restart
515 // our search on the next candidate.
516 //
517 // The three cases above correspond to the three cases in the loop
518 // below.
519 let mut offset = 0;
520
521 while candidate_start + offset < needle.len() {
522 let current = needle[suffix.pos + offset];
523 let candidate = needle[candidate_start + offset];
524 match kind.cmp(current, candidate) {
525 SuffixOrdering::Accept => {
526 suffix = Suffix { pos: candidate_start, period: 1 };
527 candidate_start += 1;
528 offset = 0;
529 }
530 SuffixOrdering::Skip => {
531 candidate_start += offset + 1;
532 offset = 0;
533 suffix.period = candidate_start - suffix.pos;
534 }
535 SuffixOrdering::Push => {
536 if offset + 1 == suffix.period {
537 candidate_start += suffix.period;
538 offset = 0;
539 } else {
540 offset += 1;
541 }
542 }
543 }
544 }
545 suffix
546 }
547
548 fn reverse(needle: &[u8], kind: SuffixKind) -> Suffix {
549 // See the comments in `forward` for how this works.
550 let mut suffix = Suffix { pos: needle.len(), period: 1 };
551 if needle.len() == 1 {
552 return suffix;
553 }
554 let mut candidate_start = match needle.len().checked_sub(1) {
555 None => return suffix,
556 Some(candidate_start) => candidate_start,
557 };
558 let mut offset = 0;
559
560 while offset < candidate_start {
561 let current = needle[suffix.pos - offset - 1];
562 let candidate = needle[candidate_start - offset - 1];
563 match kind.cmp(current, candidate) {
564 SuffixOrdering::Accept => {
565 suffix = Suffix { pos: candidate_start, period: 1 };
566 candidate_start -= 1;
567 offset = 0;
568 }
569 SuffixOrdering::Skip => {
570 candidate_start -= offset + 1;
571 offset = 0;
572 suffix.period = suffix.pos - candidate_start;
573 }
574 SuffixOrdering::Push => {
575 if offset + 1 == suffix.period {
576 candidate_start -= suffix.period;
577 offset = 0;
578 } else {
579 offset += 1;
580 }
581 }
582 }
583 }
584 suffix
585 }
586}
587
588/// The kind of suffix to extract.
589#[derive(Clone, Copy, Debug)]
590enum SuffixKind {
591 /// Extract the smallest lexicographic suffix from a string.
592 ///
593 /// Technically, this doesn't actually pick the smallest lexicographic
594 /// suffix. e.g., Given the choice between `a` and `aa`, this will choose
595 /// the latter over the former, even though `a < aa`. The reasoning for
596 /// this isn't clear from the paper, but it still smells like a minimal
597 /// suffix.
598 Minimal,
599 /// Extract the largest lexicographic suffix from a string.
600 ///
601 /// Unlike `Minimal`, this really does pick the maximum suffix. e.g., Given
602 /// the choice between `z` and `zz`, this will choose the latter over the
603 /// former.
604 Maximal,
605}
606
607/// The result of comparing corresponding bytes between two suffixes.
608#[derive(Clone, Copy, Debug)]
609enum SuffixOrdering {
610 /// This occurs when the given candidate byte indicates that the candidate
611 /// suffix is better than the current maximal (or minimal) suffix. That is,
612 /// the current candidate suffix should supplant the current maximal (or
613 /// minimal) suffix.
614 Accept,
615 /// This occurs when the given candidate byte excludes the candidate suffix
616 /// from being better than the current maximal (or minimal) suffix. That
617 /// is, the current candidate suffix should be dropped and the next one
618 /// should be considered.
619 Skip,
620 /// This occurs when no decision to accept or skip the candidate suffix
621 /// can be made, e.g., when corresponding bytes are equivalent. In this
622 /// case, the next corresponding bytes should be compared.
623 Push,
624}
625
626impl SuffixKind {
627 /// Returns true if and only if the given candidate byte indicates that
628 /// it should replace the current suffix as the maximal (or minimal)
629 /// suffix.
630 fn cmp(self, current: u8, candidate: u8) -> SuffixOrdering {
631 use self::SuffixOrdering::*;
632
633 match self {
634 SuffixKind::Minimal if candidate < current => Accept,
635 SuffixKind::Minimal if candidate > current => Skip,
636 SuffixKind::Minimal => Push,
637 SuffixKind::Maximal if candidate > current => Accept,
638 SuffixKind::Maximal if candidate < current => Skip,
639 SuffixKind::Maximal => Push,
640 }
641 }
642}
643
644/// A bitset used to track whether a particular byte exists in a needle or not.
645///
646/// Namely, bit 'i' is set if and only if byte%64==i for any byte in the
647/// needle. If a particular byte in the haystack is NOT in this set, then one
648/// can conclude that it is also not in the needle, and thus, one can advance
649/// in the haystack by needle.len() bytes.
650#[derive(Clone, Copy, Debug)]
651struct ApproximateByteSet(u64);
652
653impl ApproximateByteSet {
654 /// Create a new set from the given needle.
655 fn new(needle: &[u8]) -> ApproximateByteSet {
656 let mut bits = 0;
657 for &b in needle {
658 bits |= 1 << (b % 64);
659 }
660 ApproximateByteSet(bits)
661 }
662
663 /// Return true if and only if the given byte might be in this set. This
664 /// may return a false positive, but will never return a false negative.
665 #[inline(always)]
666 fn contains(&self, byte: u8) -> bool {
667 self.0 & (1 << (byte % 64)) != 0
668 }
669}
670
671#[cfg(test)]
672mod tests {
673 use alloc::vec::Vec;
674
675 use super::*;
676
677 /// Convenience wrapper for computing the suffix as a byte string.
678 fn get_suffix_forward(needle: &[u8], kind: SuffixKind) -> (&[u8], usize) {
679 let s = Suffix::forward(needle, kind);
680 (&needle[s.pos..], s.period)
681 }
682
683 /// Convenience wrapper for computing the reverse suffix as a byte string.
684 fn get_suffix_reverse(needle: &[u8], kind: SuffixKind) -> (&[u8], usize) {
685 let s = Suffix::reverse(needle, kind);
686 (&needle[..s.pos], s.period)
687 }
688
689 /// Return all of the non-empty suffixes in the given byte string.
690 fn suffixes(bytes: &[u8]) -> Vec<&[u8]> {
691 (0..bytes.len()).map(|i| &bytes[i..]).collect()
692 }
693
694 /// Return the lexicographically maximal suffix of the given byte string.
695 fn naive_maximal_suffix_forward(needle: &[u8]) -> &[u8] {
696 let mut sufs = suffixes(needle);
697 sufs.sort();
698 sufs.pop().unwrap()
699 }
700
701 /// Return the lexicographically maximal suffix of the reverse of the given
702 /// byte string.
703 fn naive_maximal_suffix_reverse(needle: &[u8]) -> Vec<u8> {
704 let mut reversed = needle.to_vec();
705 reversed.reverse();
706 let mut got = naive_maximal_suffix_forward(&reversed).to_vec();
707 got.reverse();
708 got
709 }
710
711 define_substring_forward_quickcheck!(|h, n| Some(
712 Finder::new(n).find(h, n)
713 ));
714 define_substring_reverse_quickcheck!(|h, n| Some(
715 FinderRev::new(n).rfind(h, n)
716 ));
717
718 #[test]
719 fn forward() {
720 crate::tests::substring::Runner::new()
721 .fwd(|h, n| Some(Finder::new(n).find(h, n)))
722 .run();
723 }
724
725 #[test]
726 fn reverse() {
727 crate::tests::substring::Runner::new()
728 .rev(|h, n| Some(FinderRev::new(n).rfind(h, n)))
729 .run();
730 }
731
732 #[test]
733 fn suffix_forward() {
734 macro_rules! assert_suffix_min {
735 ($given:expr, $expected:expr, $period:expr) => {
736 let (got_suffix, got_period) =
737 get_suffix_forward($given.as_bytes(), SuffixKind::Minimal);
738 let got_suffix = core::str::from_utf8(got_suffix).unwrap();
739 assert_eq!(($expected, $period), (got_suffix, got_period));
740 };
741 }
742
743 macro_rules! assert_suffix_max {
744 ($given:expr, $expected:expr, $period:expr) => {
745 let (got_suffix, got_period) =
746 get_suffix_forward($given.as_bytes(), SuffixKind::Maximal);
747 let got_suffix = core::str::from_utf8(got_suffix).unwrap();
748 assert_eq!(($expected, $period), (got_suffix, got_period));
749 };
750 }
751
752 assert_suffix_min!("a", "a", 1);
753 assert_suffix_max!("a", "a", 1);
754
755 assert_suffix_min!("ab", "ab", 2);
756 assert_suffix_max!("ab", "b", 1);
757
758 assert_suffix_min!("ba", "a", 1);
759 assert_suffix_max!("ba", "ba", 2);
760
761 assert_suffix_min!("abc", "abc", 3);
762 assert_suffix_max!("abc", "c", 1);
763
764 assert_suffix_min!("acb", "acb", 3);
765 assert_suffix_max!("acb", "cb", 2);
766
767 assert_suffix_min!("cba", "a", 1);
768 assert_suffix_max!("cba", "cba", 3);
769
770 assert_suffix_min!("abcabc", "abcabc", 3);
771 assert_suffix_max!("abcabc", "cabc", 3);
772
773 assert_suffix_min!("abcabcabc", "abcabcabc", 3);
774 assert_suffix_max!("abcabcabc", "cabcabc", 3);
775
776 assert_suffix_min!("abczz", "abczz", 5);
777 assert_suffix_max!("abczz", "zz", 1);
778
779 assert_suffix_min!("zzabc", "abc", 3);
780 assert_suffix_max!("zzabc", "zzabc", 5);
781
782 assert_suffix_min!("aaa", "aaa", 1);
783 assert_suffix_max!("aaa", "aaa", 1);
784
785 assert_suffix_min!("foobar", "ar", 2);
786 assert_suffix_max!("foobar", "r", 1);
787 }
788
789 #[test]
790 fn suffix_reverse() {
791 macro_rules! assert_suffix_min {
792 ($given:expr, $expected:expr, $period:expr) => {
793 let (got_suffix, got_period) =
794 get_suffix_reverse($given.as_bytes(), SuffixKind::Minimal);
795 let got_suffix = core::str::from_utf8(got_suffix).unwrap();
796 assert_eq!(($expected, $period), (got_suffix, got_period));
797 };
798 }
799
800 macro_rules! assert_suffix_max {
801 ($given:expr, $expected:expr, $period:expr) => {
802 let (got_suffix, got_period) =
803 get_suffix_reverse($given.as_bytes(), SuffixKind::Maximal);
804 let got_suffix = core::str::from_utf8(got_suffix).unwrap();
805 assert_eq!(($expected, $period), (got_suffix, got_period));
806 };
807 }
808
809 assert_suffix_min!("a", "a", 1);
810 assert_suffix_max!("a", "a", 1);
811
812 assert_suffix_min!("ab", "a", 1);
813 assert_suffix_max!("ab", "ab", 2);
814
815 assert_suffix_min!("ba", "ba", 2);
816 assert_suffix_max!("ba", "b", 1);
817
818 assert_suffix_min!("abc", "a", 1);
819 assert_suffix_max!("abc", "abc", 3);
820
821 assert_suffix_min!("acb", "a", 1);
822 assert_suffix_max!("acb", "ac", 2);
823
824 assert_suffix_min!("cba", "cba", 3);
825 assert_suffix_max!("cba", "c", 1);
826
827 assert_suffix_min!("abcabc", "abca", 3);
828 assert_suffix_max!("abcabc", "abcabc", 3);
829
830 assert_suffix_min!("abcabcabc", "abcabca", 3);
831 assert_suffix_max!("abcabcabc", "abcabcabc", 3);
832
833 assert_suffix_min!("abczz", "a", 1);
834 assert_suffix_max!("abczz", "abczz", 5);
835
836 assert_suffix_min!("zzabc", "zza", 3);
837 assert_suffix_max!("zzabc", "zz", 1);
838
839 assert_suffix_min!("aaa", "aaa", 1);
840 assert_suffix_max!("aaa", "aaa", 1);
841 }
842
843 #[cfg(not(miri))]
844 quickcheck::quickcheck! {
845 fn qc_suffix_forward_maximal(bytes: Vec<u8>) -> bool {
846 if bytes.is_empty() {
847 return true;
848 }
849
850 let (got, _) = get_suffix_forward(&bytes, SuffixKind::Maximal);
851 let expected = naive_maximal_suffix_forward(&bytes);
852 got == expected
853 }
854
855 fn qc_suffix_reverse_maximal(bytes: Vec<u8>) -> bool {
856 if bytes.is_empty() {
857 return true;
858 }
859
860 let (got, _) = get_suffix_reverse(&bytes, SuffixKind::Maximal);
861 let expected = naive_maximal_suffix_reverse(&bytes);
862 expected == got
863 }
864 }
865
866 // This is a regression test caught by quickcheck that exercised a bug in
867 // the reverse small period handling. The bug was that we were using 'if j
868 // == shift' to determine if a match occurred, but the correct guard is 'if
869 // j >= shift', which matches the corresponding guard in the forward impl.
870 #[test]
871 fn regression_rev_small_period() {
872 let rfind = |h, n| FinderRev::new(n).rfind(h, n);
873 let haystack = "ababaz";
874 let needle = "abab";
875 assert_eq!(Some(0), rfind(haystack.as_bytes(), needle.as_bytes()));
876 }
877}
878