1 | use core::cmp; |
2 | |
3 | use crate::memmem::{prefilter::Pre, util}; |
4 | |
5 | /// Two-Way search in the forward direction. |
6 | #[derive (Clone, Copy, Debug)] |
7 | pub(crate) struct Forward(TwoWay); |
8 | |
9 | /// Two-Way search in the reverse direction. |
10 | #[derive (Clone, Copy, Debug)] |
11 | pub(crate) struct Reverse(TwoWay); |
12 | |
13 | /// An implementation of the TwoWay substring search algorithm, with heuristics |
14 | /// for accelerating search based on frequency analysis. |
15 | /// |
16 | /// This searcher supports forward and reverse search, although not |
17 | /// simultaneously. It runs in O(n + m) time and O(1) space, where |
18 | /// `n ~ len(needle)` and `m ~ len(haystack)`. |
19 | /// |
20 | /// The implementation here roughly matches that which was developed by |
21 | /// Crochemore and Perrin in their 1991 paper "Two-way string-matching." The |
22 | /// changes in this implementation are 1) the use of zero-based indices, 2) a |
23 | /// heuristic skip table based on the last byte (borrowed from Rust's standard |
24 | /// library) and 3) the addition of heuristics for a fast skip loop. That is, |
25 | /// (3) this will detect bytes that are believed to be rare in the needle and |
26 | /// use fast vectorized instructions to find their occurrences quickly. The |
27 | /// Two-Way algorithm is then used to confirm whether a match at that location |
28 | /// occurred. |
29 | /// |
30 | /// The heuristic for fast skipping is automatically shut off if it's |
31 | /// detected to be ineffective at search time. Generally, this only occurs in |
32 | /// pathological cases. But this is generally necessary in order to preserve |
33 | /// a `O(n + m)` time bound. |
34 | /// |
35 | /// The code below is fairly complex and not obviously correct at all. It's |
36 | /// likely necessary to read the Two-Way paper cited above in order to fully |
37 | /// grok this code. The essence of it is: |
38 | /// |
39 | /// 1) Do something to detect a "critical" position in the needle. |
40 | /// 2) For the current position in the haystack, look if needle[critical..] |
41 | /// matches at that position. |
42 | /// 3) If so, look if needle[..critical] matches. |
43 | /// 4) If a mismatch occurs, shift the search by some amount based on the |
44 | /// critical position and a pre-computed shift. |
45 | /// |
46 | /// This type is wrapped in Forward and Reverse types that expose consistent |
47 | /// forward or reverse APIs. |
48 | #[derive (Clone, Copy, Debug)] |
49 | struct TwoWay { |
50 | /// A small bitset used as a quick prefilter (in addition to the faster |
51 | /// SIMD based prefilter). Namely, a bit 'i' is set if and only if b%64==i |
52 | /// for any b in the needle. |
53 | /// |
54 | /// When used as a prefilter, if the last byte at the current candidate |
55 | /// position is NOT in this set, then we can skip that entire candidate |
56 | /// position (the length of the needle). This is essentially the shift |
57 | /// trick found in Boyer-Moore, but only applied to bytes that don't appear |
58 | /// in the needle. |
59 | /// |
60 | /// N.B. This trick was inspired by something similar in std's |
61 | /// implementation of Two-Way. |
62 | byteset: ApproximateByteSet, |
63 | /// A critical position in needle. Specifically, this position corresponds |
64 | /// to beginning of either the minimal or maximal suffix in needle. (N.B. |
65 | /// See SuffixType below for why "minimal" isn't quite the correct word |
66 | /// here.) |
67 | /// |
68 | /// This is the position at which every search begins. Namely, search |
69 | /// starts by scanning text to the right of this position, and only if |
70 | /// there's a match does the text to the left of this position get scanned. |
71 | critical_pos: usize, |
72 | /// The amount we shift by in the Two-Way search algorithm. This |
73 | /// corresponds to the "small period" and "large period" cases. |
74 | shift: Shift, |
75 | } |
76 | |
77 | impl Forward { |
78 | /// Create a searcher that uses the Two-Way algorithm by searching forwards |
79 | /// through any haystack. |
80 | pub(crate) fn new(needle: &[u8]) -> Forward { |
81 | if needle.is_empty() { |
82 | return Forward(TwoWay::empty()); |
83 | } |
84 | |
85 | let byteset = ApproximateByteSet::new(needle); |
86 | let min_suffix = Suffix::forward(needle, SuffixKind::Minimal); |
87 | let max_suffix = Suffix::forward(needle, SuffixKind::Maximal); |
88 | let (period_lower_bound, critical_pos) = |
89 | if min_suffix.pos > max_suffix.pos { |
90 | (min_suffix.period, min_suffix.pos) |
91 | } else { |
92 | (max_suffix.period, max_suffix.pos) |
93 | }; |
94 | let shift = Shift::forward(needle, period_lower_bound, critical_pos); |
95 | Forward(TwoWay { byteset, critical_pos, shift }) |
96 | } |
97 | |
98 | /// Find the position of the first occurrence of this searcher's needle in |
99 | /// the given haystack. If one does not exist, then return None. |
100 | /// |
101 | /// This accepts prefilter state that is useful when using the same |
102 | /// searcher multiple times, such as in an iterator. |
103 | /// |
104 | /// Callers must guarantee that the needle is non-empty and its length is |
105 | /// <= the haystack's length. |
106 | #[inline (always)] |
107 | pub(crate) fn find( |
108 | &self, |
109 | pre: Option<&mut Pre<'_>>, |
110 | haystack: &[u8], |
111 | needle: &[u8], |
112 | ) -> Option<usize> { |
113 | debug_assert!(!needle.is_empty(), "needle should not be empty" ); |
114 | debug_assert!(needle.len() <= haystack.len(), "haystack too short" ); |
115 | |
116 | match self.0.shift { |
117 | Shift::Small { period } => { |
118 | self.find_small_imp(pre, haystack, needle, period) |
119 | } |
120 | Shift::Large { shift } => { |
121 | self.find_large_imp(pre, haystack, needle, shift) |
122 | } |
123 | } |
124 | } |
125 | |
126 | /// Like find, but handles the degenerate substring test cases. This is |
127 | /// only useful for conveniently testing this substring implementation in |
128 | /// isolation. |
129 | #[cfg (test)] |
130 | fn find_general( |
131 | &self, |
132 | pre: Option<&mut Pre<'_>>, |
133 | haystack: &[u8], |
134 | needle: &[u8], |
135 | ) -> Option<usize> { |
136 | if needle.is_empty() { |
137 | Some(0) |
138 | } else if haystack.len() < needle.len() { |
139 | None |
140 | } else { |
141 | self.find(pre, haystack, needle) |
142 | } |
143 | } |
144 | |
145 | // Each of the two search implementations below can be accelerated by a |
146 | // prefilter, but it is not always enabled. To avoid its overhead when |
147 | // its disabled, we explicitly inline each search implementation based on |
148 | // whether a prefilter will be used or not. The decision on which to use |
149 | // is made in the parent meta searcher. |
150 | |
151 | #[inline (always)] |
152 | fn find_small_imp( |
153 | &self, |
154 | mut pre: Option<&mut Pre<'_>>, |
155 | haystack: &[u8], |
156 | needle: &[u8], |
157 | period: usize, |
158 | ) -> Option<usize> { |
159 | let last_byte = needle.len() - 1; |
160 | let mut pos = 0; |
161 | let mut shift = 0; |
162 | while pos + needle.len() <= haystack.len() { |
163 | let mut i = cmp::max(self.0.critical_pos, shift); |
164 | if let Some(pre) = pre.as_mut() { |
165 | if pre.should_call() { |
166 | pos += pre.call(&haystack[pos..], needle)?; |
167 | shift = 0; |
168 | i = self.0.critical_pos; |
169 | if pos + needle.len() > haystack.len() { |
170 | return None; |
171 | } |
172 | } |
173 | } |
174 | if !self.0.byteset.contains(haystack[pos + last_byte]) { |
175 | pos += needle.len(); |
176 | shift = 0; |
177 | continue; |
178 | } |
179 | while i < needle.len() && needle[i] == haystack[pos + i] { |
180 | i += 1; |
181 | } |
182 | if i < needle.len() { |
183 | pos += i - self.0.critical_pos + 1; |
184 | shift = 0; |
185 | } else { |
186 | let mut j = self.0.critical_pos; |
187 | while j > shift && needle[j] == haystack[pos + j] { |
188 | j -= 1; |
189 | } |
190 | if j <= shift && needle[shift] == haystack[pos + shift] { |
191 | return Some(pos); |
192 | } |
193 | pos += period; |
194 | shift = needle.len() - period; |
195 | } |
196 | } |
197 | None |
198 | } |
199 | |
200 | #[inline (always)] |
201 | fn find_large_imp( |
202 | &self, |
203 | mut pre: Option<&mut Pre<'_>>, |
204 | haystack: &[u8], |
205 | needle: &[u8], |
206 | shift: usize, |
207 | ) -> Option<usize> { |
208 | let last_byte = needle.len() - 1; |
209 | let mut pos = 0; |
210 | 'outer: while pos + needle.len() <= haystack.len() { |
211 | if let Some(pre) = pre.as_mut() { |
212 | if pre.should_call() { |
213 | pos += pre.call(&haystack[pos..], needle)?; |
214 | if pos + needle.len() > haystack.len() { |
215 | return None; |
216 | } |
217 | } |
218 | } |
219 | |
220 | if !self.0.byteset.contains(haystack[pos + last_byte]) { |
221 | pos += needle.len(); |
222 | continue; |
223 | } |
224 | let mut i = self.0.critical_pos; |
225 | while i < needle.len() && needle[i] == haystack[pos + i] { |
226 | i += 1; |
227 | } |
228 | if i < needle.len() { |
229 | pos += i - self.0.critical_pos + 1; |
230 | } else { |
231 | for j in (0..self.0.critical_pos).rev() { |
232 | if needle[j] != haystack[pos + j] { |
233 | pos += shift; |
234 | continue 'outer; |
235 | } |
236 | } |
237 | return Some(pos); |
238 | } |
239 | } |
240 | None |
241 | } |
242 | } |
243 | |
244 | impl Reverse { |
245 | /// Create a searcher that uses the Two-Way algorithm by searching in |
246 | /// reverse through any haystack. |
247 | pub(crate) fn new(needle: &[u8]) -> Reverse { |
248 | if needle.is_empty() { |
249 | return Reverse(TwoWay::empty()); |
250 | } |
251 | |
252 | let byteset = ApproximateByteSet::new(needle); |
253 | let min_suffix = Suffix::reverse(needle, SuffixKind::Minimal); |
254 | let max_suffix = Suffix::reverse(needle, SuffixKind::Maximal); |
255 | let (period_lower_bound, critical_pos) = |
256 | if min_suffix.pos < max_suffix.pos { |
257 | (min_suffix.period, min_suffix.pos) |
258 | } else { |
259 | (max_suffix.period, max_suffix.pos) |
260 | }; |
261 | // let critical_pos = needle.len() - critical_pos; |
262 | let shift = Shift::reverse(needle, period_lower_bound, critical_pos); |
263 | Reverse(TwoWay { byteset, critical_pos, shift }) |
264 | } |
265 | |
266 | /// Find the position of the last occurrence of this searcher's needle |
267 | /// in the given haystack. If one does not exist, then return None. |
268 | /// |
269 | /// This will automatically initialize prefilter state. This should only |
270 | /// be used for one-off searches. |
271 | /// |
272 | /// Callers must guarantee that the needle is non-empty and its length is |
273 | /// <= the haystack's length. |
274 | #[inline (always)] |
275 | pub(crate) fn rfind( |
276 | &self, |
277 | haystack: &[u8], |
278 | needle: &[u8], |
279 | ) -> Option<usize> { |
280 | debug_assert!(!needle.is_empty(), "needle should not be empty" ); |
281 | debug_assert!(needle.len() <= haystack.len(), "haystack too short" ); |
282 | // For the reverse case, we don't use a prefilter. It's plausible that |
283 | // perhaps we should, but it's a lot of additional code to do it, and |
284 | // it's not clear that it's actually worth it. If you have a really |
285 | // compelling use case for this, please file an issue. |
286 | match self.0.shift { |
287 | Shift::Small { period } => { |
288 | self.rfind_small_imp(haystack, needle, period) |
289 | } |
290 | Shift::Large { shift } => { |
291 | self.rfind_large_imp(haystack, needle, shift) |
292 | } |
293 | } |
294 | } |
295 | |
296 | /// Like rfind, but handles the degenerate substring test cases. This is |
297 | /// only useful for conveniently testing this substring implementation in |
298 | /// isolation. |
299 | #[cfg (test)] |
300 | fn rfind_general(&self, haystack: &[u8], needle: &[u8]) -> Option<usize> { |
301 | if needle.is_empty() { |
302 | Some(haystack.len()) |
303 | } else if haystack.len() < needle.len() { |
304 | None |
305 | } else { |
306 | self.rfind(haystack, needle) |
307 | } |
308 | } |
309 | |
310 | #[inline (always)] |
311 | fn rfind_small_imp( |
312 | &self, |
313 | haystack: &[u8], |
314 | needle: &[u8], |
315 | period: usize, |
316 | ) -> Option<usize> { |
317 | let nlen = needle.len(); |
318 | let mut pos = haystack.len(); |
319 | let mut shift = nlen; |
320 | while pos >= nlen { |
321 | if !self.0.byteset.contains(haystack[pos - nlen]) { |
322 | pos -= nlen; |
323 | shift = nlen; |
324 | continue; |
325 | } |
326 | let mut i = cmp::min(self.0.critical_pos, shift); |
327 | while i > 0 && needle[i - 1] == haystack[pos - nlen + i - 1] { |
328 | i -= 1; |
329 | } |
330 | if i > 0 || needle[0] != haystack[pos - nlen] { |
331 | pos -= self.0.critical_pos - i + 1; |
332 | shift = nlen; |
333 | } else { |
334 | let mut j = self.0.critical_pos; |
335 | while j < shift && needle[j] == haystack[pos - nlen + j] { |
336 | j += 1; |
337 | } |
338 | if j >= shift { |
339 | return Some(pos - nlen); |
340 | } |
341 | pos -= period; |
342 | shift = period; |
343 | } |
344 | } |
345 | None |
346 | } |
347 | |
348 | #[inline (always)] |
349 | fn rfind_large_imp( |
350 | &self, |
351 | haystack: &[u8], |
352 | needle: &[u8], |
353 | shift: usize, |
354 | ) -> Option<usize> { |
355 | let nlen = needle.len(); |
356 | let mut pos = haystack.len(); |
357 | while pos >= nlen { |
358 | if !self.0.byteset.contains(haystack[pos - nlen]) { |
359 | pos -= nlen; |
360 | continue; |
361 | } |
362 | let mut i = self.0.critical_pos; |
363 | while i > 0 && needle[i - 1] == haystack[pos - nlen + i - 1] { |
364 | i -= 1; |
365 | } |
366 | if i > 0 || needle[0] != haystack[pos - nlen] { |
367 | pos -= self.0.critical_pos - i + 1; |
368 | } else { |
369 | let mut j = self.0.critical_pos; |
370 | while j < nlen && needle[j] == haystack[pos - nlen + j] { |
371 | j += 1; |
372 | } |
373 | if j == nlen { |
374 | return Some(pos - nlen); |
375 | } |
376 | pos -= shift; |
377 | } |
378 | } |
379 | None |
380 | } |
381 | } |
382 | |
383 | impl TwoWay { |
384 | fn empty() -> TwoWay { |
385 | TwoWay { |
386 | byteset: ApproximateByteSet::new(needle:b"" ), |
387 | critical_pos: 0, |
388 | shift: Shift::Large { shift: 0 }, |
389 | } |
390 | } |
391 | } |
392 | |
393 | /// A representation of the amount we're allowed to shift by during Two-Way |
394 | /// search. |
395 | /// |
396 | /// When computing a critical factorization of the needle, we find the position |
397 | /// of the critical factorization by finding the needle's maximal (or minimal) |
398 | /// suffix, along with the period of that suffix. It turns out that the period |
399 | /// of that suffix is a lower bound on the period of the needle itself. |
400 | /// |
401 | /// This lower bound is equivalent to the actual period of the needle in |
402 | /// some cases. To describe that case, we denote the needle as `x` where |
403 | /// `x = uv` and `v` is the lexicographic maximal suffix of `v`. The lower |
404 | /// bound given here is always the period of `v`, which is `<= period(x)`. The |
405 | /// case where `period(v) == period(x)` occurs when `len(u) < (len(x) / 2)` and |
406 | /// where `u` is a suffix of `v[0..period(v)]`. |
407 | /// |
408 | /// This case is important because the search algorithm for when the |
409 | /// periods are equivalent is slightly different than the search algorithm |
410 | /// for when the periods are not equivalent. In particular, when they aren't |
411 | /// equivalent, we know that the period of the needle is no less than half its |
412 | /// length. In this case, we shift by an amount less than or equal to the |
413 | /// period of the needle (determined by the maximum length of the components |
414 | /// of the critical factorization of `x`, i.e., `max(len(u), len(v))`).. |
415 | /// |
416 | /// The above two cases are represented by the variants below. Each entails |
417 | /// a different instantiation of the Two-Way search algorithm. |
418 | /// |
419 | /// N.B. If we could find a way to compute the exact period in all cases, |
420 | /// then we could collapse this case analysis and simplify the algorithm. The |
421 | /// Two-Way paper suggests this is possible, but more reading is required to |
422 | /// grok why the authors didn't pursue that path. |
423 | #[derive (Clone, Copy, Debug)] |
424 | enum Shift { |
425 | Small { period: usize }, |
426 | Large { shift: usize }, |
427 | } |
428 | |
429 | impl Shift { |
430 | /// Compute the shift for a given needle in the forward direction. |
431 | /// |
432 | /// This requires a lower bound on the period and a critical position. |
433 | /// These can be computed by extracting both the minimal and maximal |
434 | /// lexicographic suffixes, and choosing the right-most starting position. |
435 | /// The lower bound on the period is then the period of the chosen suffix. |
436 | fn forward( |
437 | needle: &[u8], |
438 | period_lower_bound: usize, |
439 | critical_pos: usize, |
440 | ) -> Shift { |
441 | let large = cmp::max(critical_pos, needle.len() - critical_pos); |
442 | if critical_pos * 2 >= needle.len() { |
443 | return Shift::Large { shift: large }; |
444 | } |
445 | |
446 | let (u, v) = needle.split_at(critical_pos); |
447 | if !util::is_suffix(&v[..period_lower_bound], u) { |
448 | return Shift::Large { shift: large }; |
449 | } |
450 | Shift::Small { period: period_lower_bound } |
451 | } |
452 | |
453 | /// Compute the shift for a given needle in the reverse direction. |
454 | /// |
455 | /// This requires a lower bound on the period and a critical position. |
456 | /// These can be computed by extracting both the minimal and maximal |
457 | /// lexicographic suffixes, and choosing the left-most starting position. |
458 | /// The lower bound on the period is then the period of the chosen suffix. |
459 | fn reverse( |
460 | needle: &[u8], |
461 | period_lower_bound: usize, |
462 | critical_pos: usize, |
463 | ) -> Shift { |
464 | let large = cmp::max(critical_pos, needle.len() - critical_pos); |
465 | if (needle.len() - critical_pos) * 2 >= needle.len() { |
466 | return Shift::Large { shift: large }; |
467 | } |
468 | |
469 | let (v, u) = needle.split_at(critical_pos); |
470 | if !util::is_prefix(&v[v.len() - period_lower_bound..], u) { |
471 | return Shift::Large { shift: large }; |
472 | } |
473 | Shift::Small { period: period_lower_bound } |
474 | } |
475 | } |
476 | |
477 | /// A suffix extracted from a needle along with its period. |
478 | #[derive (Debug)] |
479 | struct Suffix { |
480 | /// The starting position of this suffix. |
481 | /// |
482 | /// If this is a forward suffix, then `&bytes[pos..]` can be used. If this |
483 | /// is a reverse suffix, then `&bytes[..pos]` can be used. That is, for |
484 | /// forward suffixes, this is an inclusive starting position, where as for |
485 | /// reverse suffixes, this is an exclusive ending position. |
486 | pos: usize, |
487 | /// The period of this suffix. |
488 | /// |
489 | /// Note that this is NOT necessarily the period of the string from which |
490 | /// this suffix comes from. (It is always less than or equal to the period |
491 | /// of the original string.) |
492 | period: usize, |
493 | } |
494 | |
495 | impl Suffix { |
496 | fn forward(needle: &[u8], kind: SuffixKind) -> Suffix { |
497 | debug_assert!(!needle.is_empty()); |
498 | |
499 | // suffix represents our maximal (or minimal) suffix, along with |
500 | // its period. |
501 | let mut suffix = Suffix { pos: 0, period: 1 }; |
502 | // The start of a suffix in `needle` that we are considering as a |
503 | // more maximal (or minimal) suffix than what's in `suffix`. |
504 | let mut candidate_start = 1; |
505 | // The current offset of our suffixes that we're comparing. |
506 | // |
507 | // When the characters at this offset are the same, then we mush on |
508 | // to the next position since no decision is possible. When the |
509 | // candidate's character is greater (or lesser) than the corresponding |
510 | // character than our current maximal (or minimal) suffix, then the |
511 | // current suffix is changed over to the candidate and we restart our |
512 | // search. Otherwise, the candidate suffix is no good and we restart |
513 | // our search on the next candidate. |
514 | // |
515 | // The three cases above correspond to the three cases in the loop |
516 | // below. |
517 | let mut offset = 0; |
518 | |
519 | while candidate_start + offset < needle.len() { |
520 | let current = needle[suffix.pos + offset]; |
521 | let candidate = needle[candidate_start + offset]; |
522 | match kind.cmp(current, candidate) { |
523 | SuffixOrdering::Accept => { |
524 | suffix = Suffix { pos: candidate_start, period: 1 }; |
525 | candidate_start += 1; |
526 | offset = 0; |
527 | } |
528 | SuffixOrdering::Skip => { |
529 | candidate_start += offset + 1; |
530 | offset = 0; |
531 | suffix.period = candidate_start - suffix.pos; |
532 | } |
533 | SuffixOrdering::Push => { |
534 | if offset + 1 == suffix.period { |
535 | candidate_start += suffix.period; |
536 | offset = 0; |
537 | } else { |
538 | offset += 1; |
539 | } |
540 | } |
541 | } |
542 | } |
543 | suffix |
544 | } |
545 | |
546 | fn reverse(needle: &[u8], kind: SuffixKind) -> Suffix { |
547 | debug_assert!(!needle.is_empty()); |
548 | |
549 | // See the comments in `forward` for how this works. |
550 | let mut suffix = Suffix { pos: needle.len(), period: 1 }; |
551 | if needle.len() == 1 { |
552 | return suffix; |
553 | } |
554 | let mut candidate_start = needle.len() - 1; |
555 | let mut offset = 0; |
556 | |
557 | while offset < candidate_start { |
558 | let current = needle[suffix.pos - offset - 1]; |
559 | let candidate = needle[candidate_start - offset - 1]; |
560 | match kind.cmp(current, candidate) { |
561 | SuffixOrdering::Accept => { |
562 | suffix = Suffix { pos: candidate_start, period: 1 }; |
563 | candidate_start -= 1; |
564 | offset = 0; |
565 | } |
566 | SuffixOrdering::Skip => { |
567 | candidate_start -= offset + 1; |
568 | offset = 0; |
569 | suffix.period = suffix.pos - candidate_start; |
570 | } |
571 | SuffixOrdering::Push => { |
572 | if offset + 1 == suffix.period { |
573 | candidate_start -= suffix.period; |
574 | offset = 0; |
575 | } else { |
576 | offset += 1; |
577 | } |
578 | } |
579 | } |
580 | } |
581 | suffix |
582 | } |
583 | } |
584 | |
585 | /// The kind of suffix to extract. |
586 | #[derive (Clone, Copy, Debug)] |
587 | enum SuffixKind { |
588 | /// Extract the smallest lexicographic suffix from a string. |
589 | /// |
590 | /// Technically, this doesn't actually pick the smallest lexicographic |
591 | /// suffix. e.g., Given the choice between `a` and `aa`, this will choose |
592 | /// the latter over the former, even though `a < aa`. The reasoning for |
593 | /// this isn't clear from the paper, but it still smells like a minimal |
594 | /// suffix. |
595 | Minimal, |
596 | /// Extract the largest lexicographic suffix from a string. |
597 | /// |
598 | /// Unlike `Minimal`, this really does pick the maximum suffix. e.g., Given |
599 | /// the choice between `z` and `zz`, this will choose the latter over the |
600 | /// former. |
601 | Maximal, |
602 | } |
603 | |
604 | /// The result of comparing corresponding bytes between two suffixes. |
605 | #[derive (Clone, Copy, Debug)] |
606 | enum SuffixOrdering { |
607 | /// This occurs when the given candidate byte indicates that the candidate |
608 | /// suffix is better than the current maximal (or minimal) suffix. That is, |
609 | /// the current candidate suffix should supplant the current maximal (or |
610 | /// minimal) suffix. |
611 | Accept, |
612 | /// This occurs when the given candidate byte excludes the candidate suffix |
613 | /// from being better than the current maximal (or minimal) suffix. That |
614 | /// is, the current candidate suffix should be dropped and the next one |
615 | /// should be considered. |
616 | Skip, |
617 | /// This occurs when no decision to accept or skip the candidate suffix |
618 | /// can be made, e.g., when corresponding bytes are equivalent. In this |
619 | /// case, the next corresponding bytes should be compared. |
620 | Push, |
621 | } |
622 | |
623 | impl SuffixKind { |
624 | /// Returns true if and only if the given candidate byte indicates that |
625 | /// it should replace the current suffix as the maximal (or minimal) |
626 | /// suffix. |
627 | fn cmp(self, current: u8, candidate: u8) -> SuffixOrdering { |
628 | use self::SuffixOrdering::*; |
629 | |
630 | match self { |
631 | SuffixKind::Minimal if candidate < current => Accept, |
632 | SuffixKind::Minimal if candidate > current => Skip, |
633 | SuffixKind::Minimal => Push, |
634 | SuffixKind::Maximal if candidate > current => Accept, |
635 | SuffixKind::Maximal if candidate < current => Skip, |
636 | SuffixKind::Maximal => Push, |
637 | } |
638 | } |
639 | } |
640 | |
641 | /// A bitset used to track whether a particular byte exists in a needle or not. |
642 | /// |
643 | /// Namely, bit 'i' is set if and only if byte%64==i for any byte in the |
644 | /// needle. If a particular byte in the haystack is NOT in this set, then one |
645 | /// can conclude that it is also not in the needle, and thus, one can advance |
646 | /// in the haystack by needle.len() bytes. |
647 | #[derive (Clone, Copy, Debug)] |
648 | struct ApproximateByteSet(u64); |
649 | |
650 | impl ApproximateByteSet { |
651 | /// Create a new set from the given needle. |
652 | fn new(needle: &[u8]) -> ApproximateByteSet { |
653 | let mut bits: u64 = 0; |
654 | for &b: u8 in needle { |
655 | bits |= 1 << (b % 64); |
656 | } |
657 | ApproximateByteSet(bits) |
658 | } |
659 | |
660 | /// Return true if and only if the given byte might be in this set. This |
661 | /// may return a false positive, but will never return a false negative. |
662 | #[inline (always)] |
663 | fn contains(&self, byte: u8) -> bool { |
664 | self.0 & (1 << (byte % 64)) != 0 |
665 | } |
666 | } |
667 | |
668 | #[cfg (all(test, feature = "std" , not(miri)))] |
669 | mod tests { |
670 | use quickcheck::quickcheck; |
671 | |
672 | use super::*; |
673 | |
674 | define_memmem_quickcheck_tests!( |
675 | super::simpletests::twoway_find, |
676 | super::simpletests::twoway_rfind |
677 | ); |
678 | |
679 | /// Convenience wrapper for computing the suffix as a byte string. |
680 | fn get_suffix_forward(needle: &[u8], kind: SuffixKind) -> (&[u8], usize) { |
681 | let s = Suffix::forward(needle, kind); |
682 | (&needle[s.pos..], s.period) |
683 | } |
684 | |
685 | /// Convenience wrapper for computing the reverse suffix as a byte string. |
686 | fn get_suffix_reverse(needle: &[u8], kind: SuffixKind) -> (&[u8], usize) { |
687 | let s = Suffix::reverse(needle, kind); |
688 | (&needle[..s.pos], s.period) |
689 | } |
690 | |
691 | /// Return all of the non-empty suffixes in the given byte string. |
692 | fn suffixes(bytes: &[u8]) -> Vec<&[u8]> { |
693 | (0..bytes.len()).map(|i| &bytes[i..]).collect() |
694 | } |
695 | |
696 | /// Return the lexicographically maximal suffix of the given byte string. |
697 | fn naive_maximal_suffix_forward(needle: &[u8]) -> &[u8] { |
698 | let mut sufs = suffixes(needle); |
699 | sufs.sort(); |
700 | sufs.pop().unwrap() |
701 | } |
702 | |
703 | /// Return the lexicographically maximal suffix of the reverse of the given |
704 | /// byte string. |
705 | fn naive_maximal_suffix_reverse(needle: &[u8]) -> Vec<u8> { |
706 | let mut reversed = needle.to_vec(); |
707 | reversed.reverse(); |
708 | let mut got = naive_maximal_suffix_forward(&reversed).to_vec(); |
709 | got.reverse(); |
710 | got |
711 | } |
712 | |
713 | #[test ] |
714 | fn suffix_forward() { |
715 | macro_rules! assert_suffix_min { |
716 | ($given:expr, $expected:expr, $period:expr) => { |
717 | let (got_suffix, got_period) = |
718 | get_suffix_forward($given.as_bytes(), SuffixKind::Minimal); |
719 | let got_suffix = std::str::from_utf8(got_suffix).unwrap(); |
720 | assert_eq!(($expected, $period), (got_suffix, got_period)); |
721 | }; |
722 | } |
723 | |
724 | macro_rules! assert_suffix_max { |
725 | ($given:expr, $expected:expr, $period:expr) => { |
726 | let (got_suffix, got_period) = |
727 | get_suffix_forward($given.as_bytes(), SuffixKind::Maximal); |
728 | let got_suffix = std::str::from_utf8(got_suffix).unwrap(); |
729 | assert_eq!(($expected, $period), (got_suffix, got_period)); |
730 | }; |
731 | } |
732 | |
733 | assert_suffix_min!("a" , "a" , 1); |
734 | assert_suffix_max!("a" , "a" , 1); |
735 | |
736 | assert_suffix_min!("ab" , "ab" , 2); |
737 | assert_suffix_max!("ab" , "b" , 1); |
738 | |
739 | assert_suffix_min!("ba" , "a" , 1); |
740 | assert_suffix_max!("ba" , "ba" , 2); |
741 | |
742 | assert_suffix_min!("abc" , "abc" , 3); |
743 | assert_suffix_max!("abc" , "c" , 1); |
744 | |
745 | assert_suffix_min!("acb" , "acb" , 3); |
746 | assert_suffix_max!("acb" , "cb" , 2); |
747 | |
748 | assert_suffix_min!("cba" , "a" , 1); |
749 | assert_suffix_max!("cba" , "cba" , 3); |
750 | |
751 | assert_suffix_min!("abcabc" , "abcabc" , 3); |
752 | assert_suffix_max!("abcabc" , "cabc" , 3); |
753 | |
754 | assert_suffix_min!("abcabcabc" , "abcabcabc" , 3); |
755 | assert_suffix_max!("abcabcabc" , "cabcabc" , 3); |
756 | |
757 | assert_suffix_min!("abczz" , "abczz" , 5); |
758 | assert_suffix_max!("abczz" , "zz" , 1); |
759 | |
760 | assert_suffix_min!("zzabc" , "abc" , 3); |
761 | assert_suffix_max!("zzabc" , "zzabc" , 5); |
762 | |
763 | assert_suffix_min!("aaa" , "aaa" , 1); |
764 | assert_suffix_max!("aaa" , "aaa" , 1); |
765 | |
766 | assert_suffix_min!("foobar" , "ar" , 2); |
767 | assert_suffix_max!("foobar" , "r" , 1); |
768 | } |
769 | |
770 | #[test ] |
771 | fn suffix_reverse() { |
772 | macro_rules! assert_suffix_min { |
773 | ($given:expr, $expected:expr, $period:expr) => { |
774 | let (got_suffix, got_period) = |
775 | get_suffix_reverse($given.as_bytes(), SuffixKind::Minimal); |
776 | let got_suffix = std::str::from_utf8(got_suffix).unwrap(); |
777 | assert_eq!(($expected, $period), (got_suffix, got_period)); |
778 | }; |
779 | } |
780 | |
781 | macro_rules! assert_suffix_max { |
782 | ($given:expr, $expected:expr, $period:expr) => { |
783 | let (got_suffix, got_period) = |
784 | get_suffix_reverse($given.as_bytes(), SuffixKind::Maximal); |
785 | let got_suffix = std::str::from_utf8(got_suffix).unwrap(); |
786 | assert_eq!(($expected, $period), (got_suffix, got_period)); |
787 | }; |
788 | } |
789 | |
790 | assert_suffix_min!("a" , "a" , 1); |
791 | assert_suffix_max!("a" , "a" , 1); |
792 | |
793 | assert_suffix_min!("ab" , "a" , 1); |
794 | assert_suffix_max!("ab" , "ab" , 2); |
795 | |
796 | assert_suffix_min!("ba" , "ba" , 2); |
797 | assert_suffix_max!("ba" , "b" , 1); |
798 | |
799 | assert_suffix_min!("abc" , "a" , 1); |
800 | assert_suffix_max!("abc" , "abc" , 3); |
801 | |
802 | assert_suffix_min!("acb" , "a" , 1); |
803 | assert_suffix_max!("acb" , "ac" , 2); |
804 | |
805 | assert_suffix_min!("cba" , "cba" , 3); |
806 | assert_suffix_max!("cba" , "c" , 1); |
807 | |
808 | assert_suffix_min!("abcabc" , "abca" , 3); |
809 | assert_suffix_max!("abcabc" , "abcabc" , 3); |
810 | |
811 | assert_suffix_min!("abcabcabc" , "abcabca" , 3); |
812 | assert_suffix_max!("abcabcabc" , "abcabcabc" , 3); |
813 | |
814 | assert_suffix_min!("abczz" , "a" , 1); |
815 | assert_suffix_max!("abczz" , "abczz" , 5); |
816 | |
817 | assert_suffix_min!("zzabc" , "zza" , 3); |
818 | assert_suffix_max!("zzabc" , "zz" , 1); |
819 | |
820 | assert_suffix_min!("aaa" , "aaa" , 1); |
821 | assert_suffix_max!("aaa" , "aaa" , 1); |
822 | } |
823 | |
824 | quickcheck! { |
825 | fn qc_suffix_forward_maximal(bytes: Vec<u8>) -> bool { |
826 | if bytes.is_empty() { |
827 | return true; |
828 | } |
829 | |
830 | let (got, _) = get_suffix_forward(&bytes, SuffixKind::Maximal); |
831 | let expected = naive_maximal_suffix_forward(&bytes); |
832 | got == expected |
833 | } |
834 | |
835 | fn qc_suffix_reverse_maximal(bytes: Vec<u8>) -> bool { |
836 | if bytes.is_empty() { |
837 | return true; |
838 | } |
839 | |
840 | let (got, _) = get_suffix_reverse(&bytes, SuffixKind::Maximal); |
841 | let expected = naive_maximal_suffix_reverse(&bytes); |
842 | expected == got |
843 | } |
844 | } |
845 | } |
846 | |
847 | #[cfg (test)] |
848 | mod simpletests { |
849 | use super::*; |
850 | |
851 | pub(crate) fn twoway_find( |
852 | haystack: &[u8], |
853 | needle: &[u8], |
854 | ) -> Option<usize> { |
855 | Forward::new(needle).find_general(None, haystack, needle) |
856 | } |
857 | |
858 | pub(crate) fn twoway_rfind( |
859 | haystack: &[u8], |
860 | needle: &[u8], |
861 | ) -> Option<usize> { |
862 | Reverse::new(needle).rfind_general(haystack, needle) |
863 | } |
864 | |
865 | define_memmem_simple_tests!(twoway_find, twoway_rfind); |
866 | |
867 | // This is a regression test caught by quickcheck that exercised a bug in |
868 | // the reverse small period handling. The bug was that we were using 'if j |
869 | // == shift' to determine if a match occurred, but the correct guard is 'if |
870 | // j >= shift', which matches the corresponding guard in the forward impl. |
871 | #[test ] |
872 | fn regression_rev_small_period() { |
873 | let rfind = super::simpletests::twoway_rfind; |
874 | let haystack = "ababaz" ; |
875 | let needle = "abab" ; |
876 | assert_eq!(Some(0), rfind(haystack.as_bytes(), needle.as_bytes())); |
877 | } |
878 | } |
879 | |