1use core::cmp;
2
3use crate::memmem::{prefilter::Pre, util};
4
5/// Two-Way search in the forward direction.
6#[derive(Clone, Copy, Debug)]
7pub(crate) struct Forward(TwoWay);
8
9/// Two-Way search in the reverse direction.
10#[derive(Clone, Copy, Debug)]
11pub(crate) struct Reverse(TwoWay);
12
13/// An implementation of the TwoWay substring search algorithm, with heuristics
14/// for accelerating search based on frequency analysis.
15///
16/// This searcher supports forward and reverse search, although not
17/// simultaneously. It runs in O(n + m) time and O(1) space, where
18/// `n ~ len(needle)` and `m ~ len(haystack)`.
19///
20/// The implementation here roughly matches that which was developed by
21/// Crochemore and Perrin in their 1991 paper "Two-way string-matching." The
22/// changes in this implementation are 1) the use of zero-based indices, 2) a
23/// heuristic skip table based on the last byte (borrowed from Rust's standard
24/// library) and 3) the addition of heuristics for a fast skip loop. That is,
25/// (3) this will detect bytes that are believed to be rare in the needle and
26/// use fast vectorized instructions to find their occurrences quickly. The
27/// Two-Way algorithm is then used to confirm whether a match at that location
28/// occurred.
29///
30/// The heuristic for fast skipping is automatically shut off if it's
31/// detected to be ineffective at search time. Generally, this only occurs in
32/// pathological cases. But this is generally necessary in order to preserve
33/// a `O(n + m)` time bound.
34///
35/// The code below is fairly complex and not obviously correct at all. It's
36/// likely necessary to read the Two-Way paper cited above in order to fully
37/// grok this code. The essence of it is:
38///
39/// 1) Do something to detect a "critical" position in the needle.
40/// 2) For the current position in the haystack, look if needle[critical..]
41/// matches at that position.
42/// 3) If so, look if needle[..critical] matches.
43/// 4) If a mismatch occurs, shift the search by some amount based on the
44/// critical position and a pre-computed shift.
45///
46/// This type is wrapped in Forward and Reverse types that expose consistent
47/// forward or reverse APIs.
48#[derive(Clone, Copy, Debug)]
49struct TwoWay {
50 /// A small bitset used as a quick prefilter (in addition to the faster
51 /// SIMD based prefilter). Namely, a bit 'i' is set if and only if b%64==i
52 /// for any b in the needle.
53 ///
54 /// When used as a prefilter, if the last byte at the current candidate
55 /// position is NOT in this set, then we can skip that entire candidate
56 /// position (the length of the needle). This is essentially the shift
57 /// trick found in Boyer-Moore, but only applied to bytes that don't appear
58 /// in the needle.
59 ///
60 /// N.B. This trick was inspired by something similar in std's
61 /// implementation of Two-Way.
62 byteset: ApproximateByteSet,
63 /// A critical position in needle. Specifically, this position corresponds
64 /// to beginning of either the minimal or maximal suffix in needle. (N.B.
65 /// See SuffixType below for why "minimal" isn't quite the correct word
66 /// here.)
67 ///
68 /// This is the position at which every search begins. Namely, search
69 /// starts by scanning text to the right of this position, and only if
70 /// there's a match does the text to the left of this position get scanned.
71 critical_pos: usize,
72 /// The amount we shift by in the Two-Way search algorithm. This
73 /// corresponds to the "small period" and "large period" cases.
74 shift: Shift,
75}
76
77impl Forward {
78 /// Create a searcher that uses the Two-Way algorithm by searching forwards
79 /// through any haystack.
80 pub(crate) fn new(needle: &[u8]) -> Forward {
81 if needle.is_empty() {
82 return Forward(TwoWay::empty());
83 }
84
85 let byteset = ApproximateByteSet::new(needle);
86 let min_suffix = Suffix::forward(needle, SuffixKind::Minimal);
87 let max_suffix = Suffix::forward(needle, SuffixKind::Maximal);
88 let (period_lower_bound, critical_pos) =
89 if min_suffix.pos > max_suffix.pos {
90 (min_suffix.period, min_suffix.pos)
91 } else {
92 (max_suffix.period, max_suffix.pos)
93 };
94 let shift = Shift::forward(needle, period_lower_bound, critical_pos);
95 Forward(TwoWay { byteset, critical_pos, shift })
96 }
97
98 /// Find the position of the first occurrence of this searcher's needle in
99 /// the given haystack. If one does not exist, then return None.
100 ///
101 /// This accepts prefilter state that is useful when using the same
102 /// searcher multiple times, such as in an iterator.
103 ///
104 /// Callers must guarantee that the needle is non-empty and its length is
105 /// <= the haystack's length.
106 #[inline(always)]
107 pub(crate) fn find(
108 &self,
109 pre: Option<&mut Pre<'_>>,
110 haystack: &[u8],
111 needle: &[u8],
112 ) -> Option<usize> {
113 debug_assert!(!needle.is_empty(), "needle should not be empty");
114 debug_assert!(needle.len() <= haystack.len(), "haystack too short");
115
116 match self.0.shift {
117 Shift::Small { period } => {
118 self.find_small_imp(pre, haystack, needle, period)
119 }
120 Shift::Large { shift } => {
121 self.find_large_imp(pre, haystack, needle, shift)
122 }
123 }
124 }
125
126 /// Like find, but handles the degenerate substring test cases. This is
127 /// only useful for conveniently testing this substring implementation in
128 /// isolation.
129 #[cfg(test)]
130 fn find_general(
131 &self,
132 pre: Option<&mut Pre<'_>>,
133 haystack: &[u8],
134 needle: &[u8],
135 ) -> Option<usize> {
136 if needle.is_empty() {
137 Some(0)
138 } else if haystack.len() < needle.len() {
139 None
140 } else {
141 self.find(pre, haystack, needle)
142 }
143 }
144
145 // Each of the two search implementations below can be accelerated by a
146 // prefilter, but it is not always enabled. To avoid its overhead when
147 // its disabled, we explicitly inline each search implementation based on
148 // whether a prefilter will be used or not. The decision on which to use
149 // is made in the parent meta searcher.
150
151 #[inline(always)]
152 fn find_small_imp(
153 &self,
154 mut pre: Option<&mut Pre<'_>>,
155 haystack: &[u8],
156 needle: &[u8],
157 period: usize,
158 ) -> Option<usize> {
159 let last_byte = needle.len() - 1;
160 let mut pos = 0;
161 let mut shift = 0;
162 while pos + needle.len() <= haystack.len() {
163 let mut i = cmp::max(self.0.critical_pos, shift);
164 if let Some(pre) = pre.as_mut() {
165 if pre.should_call() {
166 pos += pre.call(&haystack[pos..], needle)?;
167 shift = 0;
168 i = self.0.critical_pos;
169 if pos + needle.len() > haystack.len() {
170 return None;
171 }
172 }
173 }
174 if !self.0.byteset.contains(haystack[pos + last_byte]) {
175 pos += needle.len();
176 shift = 0;
177 continue;
178 }
179 while i < needle.len() && needle[i] == haystack[pos + i] {
180 i += 1;
181 }
182 if i < needle.len() {
183 pos += i - self.0.critical_pos + 1;
184 shift = 0;
185 } else {
186 let mut j = self.0.critical_pos;
187 while j > shift && needle[j] == haystack[pos + j] {
188 j -= 1;
189 }
190 if j <= shift && needle[shift] == haystack[pos + shift] {
191 return Some(pos);
192 }
193 pos += period;
194 shift = needle.len() - period;
195 }
196 }
197 None
198 }
199
200 #[inline(always)]
201 fn find_large_imp(
202 &self,
203 mut pre: Option<&mut Pre<'_>>,
204 haystack: &[u8],
205 needle: &[u8],
206 shift: usize,
207 ) -> Option<usize> {
208 let last_byte = needle.len() - 1;
209 let mut pos = 0;
210 'outer: while pos + needle.len() <= haystack.len() {
211 if let Some(pre) = pre.as_mut() {
212 if pre.should_call() {
213 pos += pre.call(&haystack[pos..], needle)?;
214 if pos + needle.len() > haystack.len() {
215 return None;
216 }
217 }
218 }
219
220 if !self.0.byteset.contains(haystack[pos + last_byte]) {
221 pos += needle.len();
222 continue;
223 }
224 let mut i = self.0.critical_pos;
225 while i < needle.len() && needle[i] == haystack[pos + i] {
226 i += 1;
227 }
228 if i < needle.len() {
229 pos += i - self.0.critical_pos + 1;
230 } else {
231 for j in (0..self.0.critical_pos).rev() {
232 if needle[j] != haystack[pos + j] {
233 pos += shift;
234 continue 'outer;
235 }
236 }
237 return Some(pos);
238 }
239 }
240 None
241 }
242}
243
244impl Reverse {
245 /// Create a searcher that uses the Two-Way algorithm by searching in
246 /// reverse through any haystack.
247 pub(crate) fn new(needle: &[u8]) -> Reverse {
248 if needle.is_empty() {
249 return Reverse(TwoWay::empty());
250 }
251
252 let byteset = ApproximateByteSet::new(needle);
253 let min_suffix = Suffix::reverse(needle, SuffixKind::Minimal);
254 let max_suffix = Suffix::reverse(needle, SuffixKind::Maximal);
255 let (period_lower_bound, critical_pos) =
256 if min_suffix.pos < max_suffix.pos {
257 (min_suffix.period, min_suffix.pos)
258 } else {
259 (max_suffix.period, max_suffix.pos)
260 };
261 // let critical_pos = needle.len() - critical_pos;
262 let shift = Shift::reverse(needle, period_lower_bound, critical_pos);
263 Reverse(TwoWay { byteset, critical_pos, shift })
264 }
265
266 /// Find the position of the last occurrence of this searcher's needle
267 /// in the given haystack. If one does not exist, then return None.
268 ///
269 /// This will automatically initialize prefilter state. This should only
270 /// be used for one-off searches.
271 ///
272 /// Callers must guarantee that the needle is non-empty and its length is
273 /// <= the haystack's length.
274 #[inline(always)]
275 pub(crate) fn rfind(
276 &self,
277 haystack: &[u8],
278 needle: &[u8],
279 ) -> Option<usize> {
280 debug_assert!(!needle.is_empty(), "needle should not be empty");
281 debug_assert!(needle.len() <= haystack.len(), "haystack too short");
282 // For the reverse case, we don't use a prefilter. It's plausible that
283 // perhaps we should, but it's a lot of additional code to do it, and
284 // it's not clear that it's actually worth it. If you have a really
285 // compelling use case for this, please file an issue.
286 match self.0.shift {
287 Shift::Small { period } => {
288 self.rfind_small_imp(haystack, needle, period)
289 }
290 Shift::Large { shift } => {
291 self.rfind_large_imp(haystack, needle, shift)
292 }
293 }
294 }
295
296 /// Like rfind, but handles the degenerate substring test cases. This is
297 /// only useful for conveniently testing this substring implementation in
298 /// isolation.
299 #[cfg(test)]
300 fn rfind_general(&self, haystack: &[u8], needle: &[u8]) -> Option<usize> {
301 if needle.is_empty() {
302 Some(haystack.len())
303 } else if haystack.len() < needle.len() {
304 None
305 } else {
306 self.rfind(haystack, needle)
307 }
308 }
309
310 #[inline(always)]
311 fn rfind_small_imp(
312 &self,
313 haystack: &[u8],
314 needle: &[u8],
315 period: usize,
316 ) -> Option<usize> {
317 let nlen = needle.len();
318 let mut pos = haystack.len();
319 let mut shift = nlen;
320 while pos >= nlen {
321 if !self.0.byteset.contains(haystack[pos - nlen]) {
322 pos -= nlen;
323 shift = nlen;
324 continue;
325 }
326 let mut i = cmp::min(self.0.critical_pos, shift);
327 while i > 0 && needle[i - 1] == haystack[pos - nlen + i - 1] {
328 i -= 1;
329 }
330 if i > 0 || needle[0] != haystack[pos - nlen] {
331 pos -= self.0.critical_pos - i + 1;
332 shift = nlen;
333 } else {
334 let mut j = self.0.critical_pos;
335 while j < shift && needle[j] == haystack[pos - nlen + j] {
336 j += 1;
337 }
338 if j >= shift {
339 return Some(pos - nlen);
340 }
341 pos -= period;
342 shift = period;
343 }
344 }
345 None
346 }
347
348 #[inline(always)]
349 fn rfind_large_imp(
350 &self,
351 haystack: &[u8],
352 needle: &[u8],
353 shift: usize,
354 ) -> Option<usize> {
355 let nlen = needle.len();
356 let mut pos = haystack.len();
357 while pos >= nlen {
358 if !self.0.byteset.contains(haystack[pos - nlen]) {
359 pos -= nlen;
360 continue;
361 }
362 let mut i = self.0.critical_pos;
363 while i > 0 && needle[i - 1] == haystack[pos - nlen + i - 1] {
364 i -= 1;
365 }
366 if i > 0 || needle[0] != haystack[pos - nlen] {
367 pos -= self.0.critical_pos - i + 1;
368 } else {
369 let mut j = self.0.critical_pos;
370 while j < nlen && needle[j] == haystack[pos - nlen + j] {
371 j += 1;
372 }
373 if j == nlen {
374 return Some(pos - nlen);
375 }
376 pos -= shift;
377 }
378 }
379 None
380 }
381}
382
383impl TwoWay {
384 fn empty() -> TwoWay {
385 TwoWay {
386 byteset: ApproximateByteSet::new(needle:b""),
387 critical_pos: 0,
388 shift: Shift::Large { shift: 0 },
389 }
390 }
391}
392
393/// A representation of the amount we're allowed to shift by during Two-Way
394/// search.
395///
396/// When computing a critical factorization of the needle, we find the position
397/// of the critical factorization by finding the needle's maximal (or minimal)
398/// suffix, along with the period of that suffix. It turns out that the period
399/// of that suffix is a lower bound on the period of the needle itself.
400///
401/// This lower bound is equivalent to the actual period of the needle in
402/// some cases. To describe that case, we denote the needle as `x` where
403/// `x = uv` and `v` is the lexicographic maximal suffix of `v`. The lower
404/// bound given here is always the period of `v`, which is `<= period(x)`. The
405/// case where `period(v) == period(x)` occurs when `len(u) < (len(x) / 2)` and
406/// where `u` is a suffix of `v[0..period(v)]`.
407///
408/// This case is important because the search algorithm for when the
409/// periods are equivalent is slightly different than the search algorithm
410/// for when the periods are not equivalent. In particular, when they aren't
411/// equivalent, we know that the period of the needle is no less than half its
412/// length. In this case, we shift by an amount less than or equal to the
413/// period of the needle (determined by the maximum length of the components
414/// of the critical factorization of `x`, i.e., `max(len(u), len(v))`)..
415///
416/// The above two cases are represented by the variants below. Each entails
417/// a different instantiation of the Two-Way search algorithm.
418///
419/// N.B. If we could find a way to compute the exact period in all cases,
420/// then we could collapse this case analysis and simplify the algorithm. The
421/// Two-Way paper suggests this is possible, but more reading is required to
422/// grok why the authors didn't pursue that path.
423#[derive(Clone, Copy, Debug)]
424enum Shift {
425 Small { period: usize },
426 Large { shift: usize },
427}
428
429impl Shift {
430 /// Compute the shift for a given needle in the forward direction.
431 ///
432 /// This requires a lower bound on the period and a critical position.
433 /// These can be computed by extracting both the minimal and maximal
434 /// lexicographic suffixes, and choosing the right-most starting position.
435 /// The lower bound on the period is then the period of the chosen suffix.
436 fn forward(
437 needle: &[u8],
438 period_lower_bound: usize,
439 critical_pos: usize,
440 ) -> Shift {
441 let large = cmp::max(critical_pos, needle.len() - critical_pos);
442 if critical_pos * 2 >= needle.len() {
443 return Shift::Large { shift: large };
444 }
445
446 let (u, v) = needle.split_at(critical_pos);
447 if !util::is_suffix(&v[..period_lower_bound], u) {
448 return Shift::Large { shift: large };
449 }
450 Shift::Small { period: period_lower_bound }
451 }
452
453 /// Compute the shift for a given needle in the reverse direction.
454 ///
455 /// This requires a lower bound on the period and a critical position.
456 /// These can be computed by extracting both the minimal and maximal
457 /// lexicographic suffixes, and choosing the left-most starting position.
458 /// The lower bound on the period is then the period of the chosen suffix.
459 fn reverse(
460 needle: &[u8],
461 period_lower_bound: usize,
462 critical_pos: usize,
463 ) -> Shift {
464 let large = cmp::max(critical_pos, needle.len() - critical_pos);
465 if (needle.len() - critical_pos) * 2 >= needle.len() {
466 return Shift::Large { shift: large };
467 }
468
469 let (v, u) = needle.split_at(critical_pos);
470 if !util::is_prefix(&v[v.len() - period_lower_bound..], u) {
471 return Shift::Large { shift: large };
472 }
473 Shift::Small { period: period_lower_bound }
474 }
475}
476
477/// A suffix extracted from a needle along with its period.
478#[derive(Debug)]
479struct Suffix {
480 /// The starting position of this suffix.
481 ///
482 /// If this is a forward suffix, then `&bytes[pos..]` can be used. If this
483 /// is a reverse suffix, then `&bytes[..pos]` can be used. That is, for
484 /// forward suffixes, this is an inclusive starting position, where as for
485 /// reverse suffixes, this is an exclusive ending position.
486 pos: usize,
487 /// The period of this suffix.
488 ///
489 /// Note that this is NOT necessarily the period of the string from which
490 /// this suffix comes from. (It is always less than or equal to the period
491 /// of the original string.)
492 period: usize,
493}
494
495impl Suffix {
496 fn forward(needle: &[u8], kind: SuffixKind) -> Suffix {
497 debug_assert!(!needle.is_empty());
498
499 // suffix represents our maximal (or minimal) suffix, along with
500 // its period.
501 let mut suffix = Suffix { pos: 0, period: 1 };
502 // The start of a suffix in `needle` that we are considering as a
503 // more maximal (or minimal) suffix than what's in `suffix`.
504 let mut candidate_start = 1;
505 // The current offset of our suffixes that we're comparing.
506 //
507 // When the characters at this offset are the same, then we mush on
508 // to the next position since no decision is possible. When the
509 // candidate's character is greater (or lesser) than the corresponding
510 // character than our current maximal (or minimal) suffix, then the
511 // current suffix is changed over to the candidate and we restart our
512 // search. Otherwise, the candidate suffix is no good and we restart
513 // our search on the next candidate.
514 //
515 // The three cases above correspond to the three cases in the loop
516 // below.
517 let mut offset = 0;
518
519 while candidate_start + offset < needle.len() {
520 let current = needle[suffix.pos + offset];
521 let candidate = needle[candidate_start + offset];
522 match kind.cmp(current, candidate) {
523 SuffixOrdering::Accept => {
524 suffix = Suffix { pos: candidate_start, period: 1 };
525 candidate_start += 1;
526 offset = 0;
527 }
528 SuffixOrdering::Skip => {
529 candidate_start += offset + 1;
530 offset = 0;
531 suffix.period = candidate_start - suffix.pos;
532 }
533 SuffixOrdering::Push => {
534 if offset + 1 == suffix.period {
535 candidate_start += suffix.period;
536 offset = 0;
537 } else {
538 offset += 1;
539 }
540 }
541 }
542 }
543 suffix
544 }
545
546 fn reverse(needle: &[u8], kind: SuffixKind) -> Suffix {
547 debug_assert!(!needle.is_empty());
548
549 // See the comments in `forward` for how this works.
550 let mut suffix = Suffix { pos: needle.len(), period: 1 };
551 if needle.len() == 1 {
552 return suffix;
553 }
554 let mut candidate_start = needle.len() - 1;
555 let mut offset = 0;
556
557 while offset < candidate_start {
558 let current = needle[suffix.pos - offset - 1];
559 let candidate = needle[candidate_start - offset - 1];
560 match kind.cmp(current, candidate) {
561 SuffixOrdering::Accept => {
562 suffix = Suffix { pos: candidate_start, period: 1 };
563 candidate_start -= 1;
564 offset = 0;
565 }
566 SuffixOrdering::Skip => {
567 candidate_start -= offset + 1;
568 offset = 0;
569 suffix.period = suffix.pos - candidate_start;
570 }
571 SuffixOrdering::Push => {
572 if offset + 1 == suffix.period {
573 candidate_start -= suffix.period;
574 offset = 0;
575 } else {
576 offset += 1;
577 }
578 }
579 }
580 }
581 suffix
582 }
583}
584
585/// The kind of suffix to extract.
586#[derive(Clone, Copy, Debug)]
587enum SuffixKind {
588 /// Extract the smallest lexicographic suffix from a string.
589 ///
590 /// Technically, this doesn't actually pick the smallest lexicographic
591 /// suffix. e.g., Given the choice between `a` and `aa`, this will choose
592 /// the latter over the former, even though `a < aa`. The reasoning for
593 /// this isn't clear from the paper, but it still smells like a minimal
594 /// suffix.
595 Minimal,
596 /// Extract the largest lexicographic suffix from a string.
597 ///
598 /// Unlike `Minimal`, this really does pick the maximum suffix. e.g., Given
599 /// the choice between `z` and `zz`, this will choose the latter over the
600 /// former.
601 Maximal,
602}
603
604/// The result of comparing corresponding bytes between two suffixes.
605#[derive(Clone, Copy, Debug)]
606enum SuffixOrdering {
607 /// This occurs when the given candidate byte indicates that the candidate
608 /// suffix is better than the current maximal (or minimal) suffix. That is,
609 /// the current candidate suffix should supplant the current maximal (or
610 /// minimal) suffix.
611 Accept,
612 /// This occurs when the given candidate byte excludes the candidate suffix
613 /// from being better than the current maximal (or minimal) suffix. That
614 /// is, the current candidate suffix should be dropped and the next one
615 /// should be considered.
616 Skip,
617 /// This occurs when no decision to accept or skip the candidate suffix
618 /// can be made, e.g., when corresponding bytes are equivalent. In this
619 /// case, the next corresponding bytes should be compared.
620 Push,
621}
622
623impl SuffixKind {
624 /// Returns true if and only if the given candidate byte indicates that
625 /// it should replace the current suffix as the maximal (or minimal)
626 /// suffix.
627 fn cmp(self, current: u8, candidate: u8) -> SuffixOrdering {
628 use self::SuffixOrdering::*;
629
630 match self {
631 SuffixKind::Minimal if candidate < current => Accept,
632 SuffixKind::Minimal if candidate > current => Skip,
633 SuffixKind::Minimal => Push,
634 SuffixKind::Maximal if candidate > current => Accept,
635 SuffixKind::Maximal if candidate < current => Skip,
636 SuffixKind::Maximal => Push,
637 }
638 }
639}
640
641/// A bitset used to track whether a particular byte exists in a needle or not.
642///
643/// Namely, bit 'i' is set if and only if byte%64==i for any byte in the
644/// needle. If a particular byte in the haystack is NOT in this set, then one
645/// can conclude that it is also not in the needle, and thus, one can advance
646/// in the haystack by needle.len() bytes.
647#[derive(Clone, Copy, Debug)]
648struct ApproximateByteSet(u64);
649
650impl ApproximateByteSet {
651 /// Create a new set from the given needle.
652 fn new(needle: &[u8]) -> ApproximateByteSet {
653 let mut bits: u64 = 0;
654 for &b: u8 in needle {
655 bits |= 1 << (b % 64);
656 }
657 ApproximateByteSet(bits)
658 }
659
660 /// Return true if and only if the given byte might be in this set. This
661 /// may return a false positive, but will never return a false negative.
662 #[inline(always)]
663 fn contains(&self, byte: u8) -> bool {
664 self.0 & (1 << (byte % 64)) != 0
665 }
666}
667
668#[cfg(all(test, feature = "std", not(miri)))]
669mod tests {
670 use quickcheck::quickcheck;
671
672 use super::*;
673
674 define_memmem_quickcheck_tests!(
675 super::simpletests::twoway_find,
676 super::simpletests::twoway_rfind
677 );
678
679 /// Convenience wrapper for computing the suffix as a byte string.
680 fn get_suffix_forward(needle: &[u8], kind: SuffixKind) -> (&[u8], usize) {
681 let s = Suffix::forward(needle, kind);
682 (&needle[s.pos..], s.period)
683 }
684
685 /// Convenience wrapper for computing the reverse suffix as a byte string.
686 fn get_suffix_reverse(needle: &[u8], kind: SuffixKind) -> (&[u8], usize) {
687 let s = Suffix::reverse(needle, kind);
688 (&needle[..s.pos], s.period)
689 }
690
691 /// Return all of the non-empty suffixes in the given byte string.
692 fn suffixes(bytes: &[u8]) -> Vec<&[u8]> {
693 (0..bytes.len()).map(|i| &bytes[i..]).collect()
694 }
695
696 /// Return the lexicographically maximal suffix of the given byte string.
697 fn naive_maximal_suffix_forward(needle: &[u8]) -> &[u8] {
698 let mut sufs = suffixes(needle);
699 sufs.sort();
700 sufs.pop().unwrap()
701 }
702
703 /// Return the lexicographically maximal suffix of the reverse of the given
704 /// byte string.
705 fn naive_maximal_suffix_reverse(needle: &[u8]) -> Vec<u8> {
706 let mut reversed = needle.to_vec();
707 reversed.reverse();
708 let mut got = naive_maximal_suffix_forward(&reversed).to_vec();
709 got.reverse();
710 got
711 }
712
713 #[test]
714 fn suffix_forward() {
715 macro_rules! assert_suffix_min {
716 ($given:expr, $expected:expr, $period:expr) => {
717 let (got_suffix, got_period) =
718 get_suffix_forward($given.as_bytes(), SuffixKind::Minimal);
719 let got_suffix = std::str::from_utf8(got_suffix).unwrap();
720 assert_eq!(($expected, $period), (got_suffix, got_period));
721 };
722 }
723
724 macro_rules! assert_suffix_max {
725 ($given:expr, $expected:expr, $period:expr) => {
726 let (got_suffix, got_period) =
727 get_suffix_forward($given.as_bytes(), SuffixKind::Maximal);
728 let got_suffix = std::str::from_utf8(got_suffix).unwrap();
729 assert_eq!(($expected, $period), (got_suffix, got_period));
730 };
731 }
732
733 assert_suffix_min!("a", "a", 1);
734 assert_suffix_max!("a", "a", 1);
735
736 assert_suffix_min!("ab", "ab", 2);
737 assert_suffix_max!("ab", "b", 1);
738
739 assert_suffix_min!("ba", "a", 1);
740 assert_suffix_max!("ba", "ba", 2);
741
742 assert_suffix_min!("abc", "abc", 3);
743 assert_suffix_max!("abc", "c", 1);
744
745 assert_suffix_min!("acb", "acb", 3);
746 assert_suffix_max!("acb", "cb", 2);
747
748 assert_suffix_min!("cba", "a", 1);
749 assert_suffix_max!("cba", "cba", 3);
750
751 assert_suffix_min!("abcabc", "abcabc", 3);
752 assert_suffix_max!("abcabc", "cabc", 3);
753
754 assert_suffix_min!("abcabcabc", "abcabcabc", 3);
755 assert_suffix_max!("abcabcabc", "cabcabc", 3);
756
757 assert_suffix_min!("abczz", "abczz", 5);
758 assert_suffix_max!("abczz", "zz", 1);
759
760 assert_suffix_min!("zzabc", "abc", 3);
761 assert_suffix_max!("zzabc", "zzabc", 5);
762
763 assert_suffix_min!("aaa", "aaa", 1);
764 assert_suffix_max!("aaa", "aaa", 1);
765
766 assert_suffix_min!("foobar", "ar", 2);
767 assert_suffix_max!("foobar", "r", 1);
768 }
769
770 #[test]
771 fn suffix_reverse() {
772 macro_rules! assert_suffix_min {
773 ($given:expr, $expected:expr, $period:expr) => {
774 let (got_suffix, got_period) =
775 get_suffix_reverse($given.as_bytes(), SuffixKind::Minimal);
776 let got_suffix = std::str::from_utf8(got_suffix).unwrap();
777 assert_eq!(($expected, $period), (got_suffix, got_period));
778 };
779 }
780
781 macro_rules! assert_suffix_max {
782 ($given:expr, $expected:expr, $period:expr) => {
783 let (got_suffix, got_period) =
784 get_suffix_reverse($given.as_bytes(), SuffixKind::Maximal);
785 let got_suffix = std::str::from_utf8(got_suffix).unwrap();
786 assert_eq!(($expected, $period), (got_suffix, got_period));
787 };
788 }
789
790 assert_suffix_min!("a", "a", 1);
791 assert_suffix_max!("a", "a", 1);
792
793 assert_suffix_min!("ab", "a", 1);
794 assert_suffix_max!("ab", "ab", 2);
795
796 assert_suffix_min!("ba", "ba", 2);
797 assert_suffix_max!("ba", "b", 1);
798
799 assert_suffix_min!("abc", "a", 1);
800 assert_suffix_max!("abc", "abc", 3);
801
802 assert_suffix_min!("acb", "a", 1);
803 assert_suffix_max!("acb", "ac", 2);
804
805 assert_suffix_min!("cba", "cba", 3);
806 assert_suffix_max!("cba", "c", 1);
807
808 assert_suffix_min!("abcabc", "abca", 3);
809 assert_suffix_max!("abcabc", "abcabc", 3);
810
811 assert_suffix_min!("abcabcabc", "abcabca", 3);
812 assert_suffix_max!("abcabcabc", "abcabcabc", 3);
813
814 assert_suffix_min!("abczz", "a", 1);
815 assert_suffix_max!("abczz", "abczz", 5);
816
817 assert_suffix_min!("zzabc", "zza", 3);
818 assert_suffix_max!("zzabc", "zz", 1);
819
820 assert_suffix_min!("aaa", "aaa", 1);
821 assert_suffix_max!("aaa", "aaa", 1);
822 }
823
824 quickcheck! {
825 fn qc_suffix_forward_maximal(bytes: Vec<u8>) -> bool {
826 if bytes.is_empty() {
827 return true;
828 }
829
830 let (got, _) = get_suffix_forward(&bytes, SuffixKind::Maximal);
831 let expected = naive_maximal_suffix_forward(&bytes);
832 got == expected
833 }
834
835 fn qc_suffix_reverse_maximal(bytes: Vec<u8>) -> bool {
836 if bytes.is_empty() {
837 return true;
838 }
839
840 let (got, _) = get_suffix_reverse(&bytes, SuffixKind::Maximal);
841 let expected = naive_maximal_suffix_reverse(&bytes);
842 expected == got
843 }
844 }
845}
846
847#[cfg(test)]
848mod simpletests {
849 use super::*;
850
851 pub(crate) fn twoway_find(
852 haystack: &[u8],
853 needle: &[u8],
854 ) -> Option<usize> {
855 Forward::new(needle).find_general(None, haystack, needle)
856 }
857
858 pub(crate) fn twoway_rfind(
859 haystack: &[u8],
860 needle: &[u8],
861 ) -> Option<usize> {
862 Reverse::new(needle).rfind_general(haystack, needle)
863 }
864
865 define_memmem_simple_tests!(twoway_find, twoway_rfind);
866
867 // This is a regression test caught by quickcheck that exercised a bug in
868 // the reverse small period handling. The bug was that we were using 'if j
869 // == shift' to determine if a match occurred, but the correct guard is 'if
870 // j >= shift', which matches the corresponding guard in the forward impl.
871 #[test]
872 fn regression_rev_small_period() {
873 let rfind = super::simpletests::twoway_rfind;
874 let haystack = "ababaz";
875 let needle = "abab";
876 assert_eq!(Some(0), rfind(haystack.as_bytes(), needle.as_bytes()));
877 }
878}
879