1 | /*! |
2 | An implementation of the [Shift-Or substring search algorithm][shiftor]. |
3 | |
4 | [shiftor]: https://en.wikipedia.org/wiki/Bitap_algorithm |
5 | */ |
6 | |
7 | use alloc::boxed::Box; |
8 | |
9 | /// The type of our mask. |
10 | /// |
11 | /// While we don't expose anyway to configure this in the public API, if one |
12 | /// really needs less memory usage or support for longer needles, then it is |
13 | /// suggested to copy the code from this module and modify it to fit your |
14 | /// needs. The code below is written to be correct regardless of whether Mask |
15 | /// is a u8, u16, u32, u64 or u128. |
16 | type Mask = u16; |
17 | |
18 | /// A forward substring searcher using the Shift-Or algorithm. |
19 | #[derive(Debug)] |
20 | pub struct Finder { |
21 | masks: Box<[Mask; 256]>, |
22 | needle_len: usize, |
23 | } |
24 | |
25 | impl Finder { |
26 | const MAX_NEEDLE_LEN: usize = (Mask::BITS - 1) as usize; |
27 | |
28 | /// Create a new Shift-Or forward searcher for the given `needle`. |
29 | /// |
30 | /// The needle may be empty. The empty needle matches at every byte offset. |
31 | #[inline ] |
32 | pub fn new(needle: &[u8]) -> Option<Finder> { |
33 | let needle_len = needle.len(); |
34 | if needle_len > Finder::MAX_NEEDLE_LEN { |
35 | // A match is found when bit 7 is set in 'result' in the search |
36 | // routine below. So our needle can't be bigger than 7. We could |
37 | // permit bigger needles by using u16, u32 or u64 for our mask |
38 | // entries. But this is all we need for this example. |
39 | return None; |
40 | } |
41 | let mut searcher = Finder { masks: Box::from([!0; 256]), needle_len }; |
42 | for (i, &byte) in needle.iter().enumerate() { |
43 | searcher.masks[usize::from(byte)] &= !(1 << i); |
44 | } |
45 | Some(searcher) |
46 | } |
47 | |
48 | /// Return the first occurrence of the needle given to `Finder::new` in |
49 | /// the `haystack` given. If no such occurrence exists, then `None` is |
50 | /// returned. |
51 | /// |
52 | /// Unlike most other substring search implementations in this crate, this |
53 | /// finder does not require passing the needle at search time. A match can |
54 | /// be determined without the needle at all since the required information |
55 | /// is already encoded into this finder at construction time. |
56 | /// |
57 | /// The maximum value this can return is `haystack.len()`, which can only |
58 | /// occur when the needle and haystack both have length zero. Otherwise, |
59 | /// for non-empty haystacks, the maximum value is `haystack.len() - 1`. |
60 | #[inline ] |
61 | pub fn find(&self, haystack: &[u8]) -> Option<usize> { |
62 | if self.needle_len == 0 { |
63 | return Some(0); |
64 | } |
65 | let mut result = !1; |
66 | for (i, &byte) in haystack.iter().enumerate() { |
67 | result |= self.masks[usize::from(byte)]; |
68 | result <<= 1; |
69 | if result & (1 << self.needle_len) == 0 { |
70 | return Some(i + 1 - self.needle_len); |
71 | } |
72 | } |
73 | None |
74 | } |
75 | } |
76 | |
77 | #[cfg (test)] |
78 | mod tests { |
79 | use super::*; |
80 | |
81 | define_substring_forward_quickcheck!(|h, n| Some(Finder::new(n)?.find(h))); |
82 | |
83 | #[test] |
84 | fn forward() { |
85 | crate::tests::substring::Runner::new() |
86 | .fwd(|h, n| Some(Finder::new(n)?.find(h))) |
87 | .run(); |
88 | } |
89 | } |
90 | |