1 | use alloc::{sync::Arc, vec, vec::Vec}; |
---|---|

2 | |

3 | use crate::{packed::pattern::Patterns, util::search::Match, PatternID}; |

4 | |

5 | /// The type of the rolling hash used in the Rabin-Karp algorithm. |

6 | type Hash = usize; |

7 | |

8 | /// The number of buckets to store our patterns in. We don't want this to be |

9 | /// too big in order to avoid wasting memory, but we don't want it to be too |

10 | /// small either to avoid spending too much time confirming literals. |

11 | /// |

12 | /// The number of buckets MUST be a power of two. Otherwise, determining the |

13 | /// bucket from a hash will slow down the code considerably. Using a power |

14 | /// of two means `hash % NUM_BUCKETS` can compile down to a simple `and` |

15 | /// instruction. |

16 | const NUM_BUCKETS: usize = 64; |

17 | |

18 | /// An implementation of the Rabin-Karp algorithm. The main idea of this |

19 | /// algorithm is to maintain a rolling hash as it moves through the input, and |

20 | /// then check whether that hash corresponds to the same hash for any of the |

21 | /// patterns we're looking for. |

22 | /// |

23 | /// A draw back of naively scaling Rabin-Karp to multiple patterns is that |

24 | /// it requires all of the patterns to be the same length, which in turn |

25 | /// corresponds to the number of bytes to hash. We adapt this to work for |

26 | /// multiple patterns of varying size by fixing the number of bytes to hash |

27 | /// to be the length of the smallest pattern. We also split the patterns into |

28 | /// several buckets to hopefully make the confirmation step faster. |

29 | /// |

30 | /// Wikipedia has a decent explanation, if a bit heavy on the theory: |

31 | /// https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm |

32 | /// |

33 | /// But ESMAJ provides something a bit more concrete: |

34 | /// https://www-igm.univ-mlv.fr/~lecroq/string/node5.html |

35 | #[derive(Clone, Debug)] |

36 | pub(crate) struct RabinKarp { |

37 | /// The patterns we're searching for. |

38 | patterns: Arc<Patterns>, |

39 | /// The order of patterns in each bucket is significant. Namely, they are |

40 | /// arranged such that the first one to match is the correct match. This |

41 | /// may not necessarily correspond to the order provided by the caller. |

42 | /// For example, if leftmost-longest semantics are used, then the patterns |

43 | /// are sorted by their length in descending order. If leftmost-first |

44 | /// semantics are used, then the patterns are sorted by their pattern ID |

45 | /// in ascending order (which corresponds to the caller's order). |

46 | buckets: Vec<Vec<(Hash, PatternID)>>, |

47 | /// The length of the hashing window. Generally, this corresponds to the |

48 | /// length of the smallest pattern. |

49 | hash_len: usize, |

50 | /// The factor to subtract out of a hash before updating it with a new |

51 | /// byte. |

52 | hash_2pow: usize, |

53 | } |

54 | |

55 | impl RabinKarp { |

56 | /// Compile a new Rabin-Karp matcher from the patterns given. |

57 | /// |

58 | /// This panics if any of the patterns in the collection are empty, or if |

59 | /// the collection is itself empty. |

60 | pub(crate) fn new(patterns: &Arc<Patterns>) -> RabinKarp { |

61 | assert!(patterns.len() >= 1); |

62 | let hash_len = patterns.minimum_len(); |

63 | assert!(hash_len >= 1); |

64 | |

65 | let mut hash_2pow = 1usize; |

66 | for _ in 1..hash_len { |

67 | hash_2pow = hash_2pow.wrapping_shl(1); |

68 | } |

69 | |

70 | let mut rk = RabinKarp { |

71 | patterns: Arc::clone(patterns), |

72 | buckets: vec![vec![]; NUM_BUCKETS], |

73 | hash_len, |

74 | hash_2pow, |

75 | }; |

76 | for (id, pat) in patterns.iter() { |

77 | let hash = rk.hash(&pat.bytes()[..rk.hash_len]); |

78 | let bucket = hash % NUM_BUCKETS; |

79 | rk.buckets[bucket].push((hash, id)); |

80 | } |

81 | rk |

82 | } |

83 | |

84 | /// Return the first matching pattern in the given haystack, begining the |

85 | /// search at `at`. |

86 | pub(crate) fn find_at( |

87 | &self, |

88 | haystack: &[u8], |

89 | mut at: usize, |

90 | ) -> Option<Match> { |

91 | assert_eq!(NUM_BUCKETS, self.buckets.len()); |

92 | |

93 | if at + self.hash_len > haystack.len() { |

94 | return None; |

95 | } |

96 | let mut hash = self.hash(&haystack[at..at + self.hash_len]); |

97 | loop { |

98 | let bucket = &self.buckets[hash % NUM_BUCKETS]; |

99 | for &(phash, pid) in bucket { |

100 | if phash == hash { |

101 | if let Some(c) = self.verify(pid, haystack, at) { |

102 | return Some(c); |

103 | } |

104 | } |

105 | } |

106 | if at + self.hash_len >= haystack.len() { |

107 | return None; |

108 | } |

109 | hash = self.update_hash( |

110 | hash, |

111 | haystack[at], |

112 | haystack[at + self.hash_len], |

113 | ); |

114 | at += 1; |

115 | } |

116 | } |

117 | |

118 | /// Returns the approximate total amount of heap used by this searcher, in |

119 | /// units of bytes. |

120 | pub(crate) fn memory_usage(&self) -> usize { |

121 | self.buckets.len() * core::mem::size_of::<Vec<(Hash, PatternID)>>() |

122 | + self.patterns.len() * core::mem::size_of::<(Hash, PatternID)>() |

123 | } |

124 | |

125 | /// Verify whether the pattern with the given id matches at |

126 | /// `haystack[at..]`. |

127 | /// |

128 | /// We tag this function as `cold` because it helps improve codegen. |

129 | /// Intuitively, it would seem like inlining it would be better. However, |

130 | /// the only time this is called and a match is not found is when there |

131 | /// there is a hash collision, or when a prefix of a pattern matches but |

132 | /// the entire pattern doesn't match. This is hopefully fairly rare, and |

133 | /// if it does occur a lot, it's going to be slow no matter what we do. |

134 | #[cold] |

135 | fn verify( |

136 | &self, |

137 | id: PatternID, |

138 | haystack: &[u8], |

139 | at: usize, |

140 | ) -> Option<Match> { |

141 | let pat = self.patterns.get(id); |

142 | if pat.is_prefix(&haystack[at..]) { |

143 | Some(Match::new(id, at..at + pat.len())) |

144 | } else { |

145 | None |

146 | } |

147 | } |

148 | |

149 | /// Hash the given bytes. |

150 | fn hash(&self, bytes: &[u8]) -> Hash { |

151 | assert_eq!(self.hash_len, bytes.len()); |

152 | |

153 | let mut hash = 0usize; |

154 | for &b in bytes { |

155 | hash = hash.wrapping_shl(1).wrapping_add(b as usize); |

156 | } |

157 | hash |

158 | } |

159 | |

160 | /// Update the hash given based on removing `old_byte` at the beginning |

161 | /// of some byte string, and appending `new_byte` to the end of that same |

162 | /// byte string. |

163 | fn update_hash(&self, prev: Hash, old_byte: u8, new_byte: u8) -> Hash { |

164 | prev.wrapping_sub((old_byte as usize).wrapping_mul(self.hash_2pow)) |

165 | .wrapping_shl(1) |

166 | .wrapping_add(new_byte as usize) |

167 | } |

168 | } |

169 |