1 | use crate::UnalignedBuffer; |
2 | use core::{cmp, hash::Hasher}; |
3 | |
4 | #[cfg (feature = "serialize" )] |
5 | use serde::{Deserialize, Serialize}; |
6 | |
/// Number of bytes accumulated before one full round of the core
/// algorithm runs (four little-endian u32 lanes).
const CHUNK_SIZE: usize = 16;

// The five 32-bit primes defined by the xxHash32 specification.
pub const PRIME_1: u32 = 2_654_435_761;
pub const PRIME_2: u32 = 2_246_822_519;
pub const PRIME_3: u32 = 3_266_489_917;
pub const PRIME_4: u32 = 668_265_263;
pub const PRIME_5: u32 = 374_761_393;
14 | |
/// The four rolling accumulators ("lanes") used by the xxHash32 core
/// loop while ingesting full 16-byte chunks.
#[cfg_attr (feature = "serialize" , derive(Deserialize, Serialize))]
#[derive (Copy, Clone, PartialEq)]
struct XxCore {
    v1: u32,
    v2: u32,
    v3: u32,
    v4: u32,
}
23 | |
/// Calculates the 32-bit hash. Care should be taken when using this
/// hash.
///
/// Although this struct implements `Hasher`, it only calculates a
/// 32-bit number, leaving the upper bits as 0. This means it is
/// unlikely to be correct to use this in places like a `HashMap`.
#[cfg_attr (feature = "serialize" , derive(Deserialize, Serialize))]
#[derive (Debug, Copy, Clone, PartialEq)]
pub struct XxHash32 {
    // Total bytes written so far; u64 so inputs over 4 GiB don't wrap.
    total_len: u64,
    // Seed supplied at construction; re-used on the short-input path.
    seed: u32,
    // The four rolling accumulators fed by full 16-byte chunks.
    core: XxCore,
    // Holds the trailing partial chunk (< 16 bytes) between writes.
    #[cfg_attr (feature = "serialize" , serde(flatten))]
    buffer: Buffer,
}
39 | |
40 | impl XxCore { |
41 | fn with_seed(seed: u32) -> XxCore { |
42 | XxCore { |
43 | v1: seed.wrapping_add(PRIME_1).wrapping_add(PRIME_2), |
44 | v2: seed.wrapping_add(PRIME_2), |
45 | v3: seed, |
46 | v4: seed.wrapping_sub(PRIME_1), |
47 | } |
48 | } |
49 | |
50 | #[inline (always)] |
51 | fn ingest_chunks<I>(&mut self, values: I) |
52 | where |
53 | I: IntoIterator<Item = [u32; 4]>, |
54 | { |
55 | #[inline (always)] |
56 | fn ingest_one_number(mut current_value: u32, mut value: u32) -> u32 { |
57 | value = value.wrapping_mul(PRIME_2); |
58 | current_value = current_value.wrapping_add(value); |
59 | current_value = current_value.rotate_left(13); |
60 | current_value.wrapping_mul(PRIME_1) |
61 | } |
62 | |
63 | // By drawing these out, we can avoid going back and forth to |
64 | // memory. It only really helps for large files, when we need |
65 | // to iterate multiple times here. |
66 | |
67 | let mut v1 = self.v1; |
68 | let mut v2 = self.v2; |
69 | let mut v3 = self.v3; |
70 | let mut v4 = self.v4; |
71 | |
72 | for [n1, n2, n3, n4] in values { |
73 | v1 = ingest_one_number(v1, n1.to_le()); |
74 | v2 = ingest_one_number(v2, n2.to_le()); |
75 | v3 = ingest_one_number(v3, n3.to_le()); |
76 | v4 = ingest_one_number(v4, n4.to_le()); |
77 | } |
78 | |
79 | self.v1 = v1; |
80 | self.v2 = v2; |
81 | self.v3 = v3; |
82 | self.v4 = v4; |
83 | } |
84 | |
85 | #[inline (always)] |
86 | fn finish(&self) -> u32 { |
87 | // The original code pulls out local vars for v[1234] |
88 | // here. Performance tests did not show that to be effective |
89 | // here, presumably because this method is not called in a |
90 | // tight loop. |
91 | |
92 | #[allow (unknown_lints, clippy::needless_late_init)] // keeping things parallel |
93 | let mut hash; |
94 | |
95 | hash = self.v1.rotate_left(1); |
96 | hash = hash.wrapping_add(self.v2.rotate_left(7)); |
97 | hash = hash.wrapping_add(self.v3.rotate_left(12)); |
98 | hash = hash.wrapping_add(self.v4.rotate_left(18)); |
99 | |
100 | hash |
101 | } |
102 | } |
103 | |
104 | impl core::fmt::Debug for XxCore { |
105 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> Result<(), core::fmt::Error> { |
106 | write!( |
107 | f, |
108 | "XxCore {{ {:016x} {:016x} {:016x} {:016x} }}" , |
109 | self.v1, self.v2, self.v3, self.v4 |
110 | ) |
111 | } |
112 | } |
113 | |
/// Forces 4-byte alignment on the wrapped value so the buffered bytes
/// can be reinterpreted as u32s without misalignment.
#[cfg_attr (feature = "serialize" , derive(Serialize, Deserialize))]
#[derive (Debug, Copy, Clone, Default, PartialEq)]
#[repr (align(4))]
#[cfg_attr (feature = "serialize" , serde(transparent))]
struct AlignToU32<T>(T);
119 | |
/// Fixed-size staging area for the trailing partial chunk of input.
#[cfg_attr (feature = "serialize" , derive(Serialize, Deserialize))]
#[derive (Debug, Copy, Clone, Default, PartialEq)]
struct Buffer {
    // Backing storage for up to one chunk; serde names keep the wire
    // format stable (see the serialization stability test).
    #[cfg_attr (feature = "serialize" , serde(rename = "buffer" ))]
    data: AlignToU32<[u8; CHUNK_SIZE]>,
    // Number of valid bytes currently held in `data`.
    #[cfg_attr (feature = "serialize" , serde(rename = "buffer_usage" ))]
    len: usize,
}
128 | |
129 | impl Buffer { |
130 | fn data(&self) -> &[u8] { |
131 | &self.data.0[..self.len] |
132 | } |
133 | |
134 | /// Consumes as much of the parameter as it can, returning the unused part. |
135 | fn consume<'a>(&mut self, data: &'a [u8]) -> &'a [u8] { |
136 | let to_use = cmp::min(self.available(), data.len()); |
137 | let (data, remaining) = data.split_at(to_use); |
138 | self.data.0[self.len..][..to_use].copy_from_slice(data); |
139 | self.len += to_use; |
140 | remaining |
141 | } |
142 | |
143 | fn set_data(&mut self, data: &[u8]) { |
144 | debug_assert!(self.is_empty()); |
145 | debug_assert!(data.len() < CHUNK_SIZE); |
146 | self.data.0[..data.len()].copy_from_slice(data); |
147 | self.len = data.len(); |
148 | } |
149 | |
150 | fn available(&self) -> usize { |
151 | CHUNK_SIZE - self.len |
152 | } |
153 | |
154 | fn is_empty(&self) -> bool { |
155 | self.len == 0 |
156 | } |
157 | |
158 | fn is_full(&self) -> bool { |
159 | self.len == CHUNK_SIZE |
160 | } |
161 | } |
162 | |
impl XxHash32 {
    /// Constructs the hash with an initial seed
    pub fn with_seed(seed: u32) -> XxHash32 {
        XxHash32 {
            total_len: 0,
            seed,
            core: XxCore::with_seed(seed),
            buffer: Buffer::default(),
        }
    }

    /// Streams `bytes` into the hasher. Buffered bytes are topped up and
    /// flushed first; whole 16-byte chunks are then ingested directly and
    /// any tail shorter than a chunk is re-buffered for the next call.
    pub(crate) fn write(&mut self, bytes: &[u8]) {
        let remaining = self.maybe_consume_bytes(bytes);
        if !remaining.is_empty() {
            // `UnalignedBuffer` yields `[u32; 4]` chunks and exposes the
            // leftover (< 16 bytes) via `remaining()` after iteration.
            let mut remaining = UnalignedBuffer::new(remaining);
            self.core.ingest_chunks(&mut remaining);
            self.buffer.set_data(remaining.remaining());
        }
        // u64 byte count so inputs longer than 4 GiB don't wrap.
        self.total_len += bytes.len() as u64;
    }

    // Consume bytes and try to make `self.buffer` empty.
    // If there are not enough bytes, `self.buffer` can be non-empty, and this
    // function returns an empty slice.
    fn maybe_consume_bytes<'a>(&mut self, data: &'a [u8]) -> &'a [u8] {
        if self.buffer.is_empty() {
            data
        } else {
            let data = self.buffer.consume(data);
            if self.buffer.is_full() {
                // The buffer now holds exactly one chunk; feed it to the
                // core and mark the buffer empty again.
                let mut u32s = UnalignedBuffer::new(self.buffer.data());
                self.core.ingest_chunks(&mut u32s);
                debug_assert!(u32s.remaining().is_empty());
                self.buffer.len = 0;
            }
            data
        }
    }

    /// Produces the 32-bit digest of everything written so far. Does not
    /// reset the hasher, so more data may be written afterwards.
    pub(crate) fn finish(&self) -> u32 {
        let mut hash = if self.total_len >= CHUNK_SIZE as u64 {
            // We have processed at least one full chunk
            self.core.finish()
        } else {
            // Short-input path defined by the algorithm: the accumulators
            // were never used, so start from the seed instead.
            self.seed.wrapping_add(PRIME_5)
        };

        hash = hash.wrapping_add(self.total_len as u32);

        // Mix in the buffered tail, first as 4-byte words...
        let mut buffered_u32s = UnalignedBuffer::<u32>::new(self.buffer.data());
        for buffered_u32 in &mut buffered_u32s {
            let k1 = buffered_u32.to_le().wrapping_mul(PRIME_3);
            hash = hash.wrapping_add(k1);
            hash = hash.rotate_left(17);
            hash = hash.wrapping_mul(PRIME_4);
        }

        // ...then byte-by-byte for the final 0-3 bytes.
        let buffered_u8s = buffered_u32s.remaining();
        for &buffered_u8 in buffered_u8s {
            let k1 = u32::from(buffered_u8).wrapping_mul(PRIME_5);
            hash = hash.wrapping_add(k1);
            hash = hash.rotate_left(11);
            hash = hash.wrapping_mul(PRIME_1);
        }

        // The final intermixing ("avalanche") spreads entropy to all bits.
        hash ^= hash >> 15;
        hash = hash.wrapping_mul(PRIME_2);
        hash ^= hash >> 13;
        hash = hash.wrapping_mul(PRIME_3);
        hash ^= hash >> 16;

        hash
    }

    /// The seed this hasher was created with.
    pub fn seed(&self) -> u32 {
        self.seed
    }

    /// Get the total number of bytes hashed, truncated to 32 bits.
    /// For the full 64-bit byte count, use `total_len_64`
    pub fn total_len(&self) -> u32 {
        self.total_len as u32
    }

    /// Get the total number of bytes hashed.
    pub fn total_len_64(&self) -> u64 {
        self.total_len
    }
}
253 | |
254 | impl Default for XxHash32 { |
255 | fn default() -> XxHash32 { |
256 | XxHash32::with_seed(0) |
257 | } |
258 | } |
259 | |
260 | impl Hasher for XxHash32 { |
261 | fn finish(&self) -> u64 { |
262 | u64::from(XxHash32::finish(self)) |
263 | } |
264 | |
265 | fn write(&mut self, bytes: &[u8]) { |
266 | XxHash32::write(self, bytes) |
267 | } |
268 | } |
269 | |
270 | #[cfg (feature = "std" )] |
271 | pub use crate::std_support::thirty_two::RandomXxHashBuilder32; |
272 | |
#[cfg(test)]
mod test {
    use super::{RandomXxHashBuilder32, XxHash32};
    use std::collections::HashMap;
    use std::hash::BuildHasherDefault;
    use std::prelude::v1::*;

    // Streaming one byte at a time must leave the core in the same state
    // as hashing the same data in a single call.
    #[test]
    fn ingesting_byte_by_byte_is_equivalent_to_large_chunks() {
        let bytes: Vec<_> = (0..32).map(|_| 0).collect();

        let mut byte_by_byte = XxHash32::with_seed(0);
        for byte in bytes.chunks(1) {
            byte_by_byte.write(byte);
        }

        let mut one_chunk = XxHash32::with_seed(0);
        one_chunk.write(&bytes);

        assert_eq!(byte_by_byte.core, one_chunk.core);
    }

    // The expected values below come from the reference C implementation
    // of xxHash32.
    #[test]
    fn hash_of_nothing_matches_c_implementation() {
        let mut hasher = XxHash32::with_seed(0);
        hasher.write(&[]);
        assert_eq!(hasher.finish(), 0x02cc_5d05);
    }

    #[test]
    fn hash_of_single_byte_matches_c_implementation() {
        let mut hasher = XxHash32::with_seed(0);
        hasher.write(&[42]);
        assert_eq!(hasher.finish(), 0xe0fe_705f);
    }

    #[test]
    fn hash_of_multiple_bytes_matches_c_implementation() {
        let mut hasher = XxHash32::with_seed(0);
        hasher.write(b"Hello, world!\0");
        assert_eq!(hasher.finish(), 0x9e5e_7e93);
    }

    #[test]
    fn hash_of_multiple_chunks_matches_c_implementation() {
        let bytes: Vec<_> = (0..100).collect();
        let mut hasher = XxHash32::with_seed(0);
        hasher.write(&bytes);
        assert_eq!(hasher.finish(), 0x7f89_ba44);
    }

    #[test]
    fn hash_with_different_seed_matches_c_implementation() {
        let mut hasher = XxHash32::with_seed(0x42c9_1977);
        hasher.write(&[]);
        assert_eq!(hasher.finish(), 0xd6bf_8459);
    }

    #[test]
    fn hash_with_different_seed_and_multiple_chunks_matches_c_implementation() {
        let bytes: Vec<_> = (0..100).collect();
        let mut hasher = XxHash32::with_seed(0x42c9_1977);
        hasher.write(&bytes);
        assert_eq!(hasher.finish(), 0x6d2f_6c17);
    }

    #[test]
    fn can_be_used_in_a_hashmap_with_a_default_seed() {
        let mut hash: HashMap<_, _, BuildHasherDefault<XxHash32>> = Default::default();
        hash.insert(42, "the answer");
        assert_eq!(hash.get(&42), Some(&"the answer"));
    }

    #[test]
    fn can_be_used_in_a_hashmap_with_a_random_seed() {
        let mut hash: HashMap<_, _, RandomXxHashBuilder32> = Default::default();
        hash.insert(42, "the answer");
        assert_eq!(hash.get(&42), Some(&"the answer"));
    }

    #[cfg(feature = "serialize")]
    type TestResult<T = ()> = Result<T, Box<dyn std::error::Error>>;

    // Serializing and deserializing a mid-stream hasher must round-trip
    // exactly.
    #[cfg(feature = "serialize")]
    #[test]
    fn test_serialization_cycle() -> TestResult {
        let mut hasher = XxHash32::with_seed(0);
        hasher.write(b"Hello, world!\0");
        hasher.finish();

        let serialized = serde_json::to_string(&hasher)?;
        let unserialized: XxHash32 = serde_json::from_str(&serialized)?;
        assert_eq!(hasher, unserialized);
        Ok(())
    }

    // Pins the on-the-wire JSON shape (field names, flattened buffer) so
    // serialized hashers stay loadable across versions.
    #[cfg(feature = "serialize")]
    #[test]
    fn test_serialization_stability() -> TestResult {
        let mut hasher = XxHash32::with_seed(0);
        hasher.write(b"Hello, world!\0");
        hasher.finish();

        let serialized = r#"{
            "total_len": 14,
            "seed": 0,
            "core": {
                "v1": 606290984,
                "v2": 2246822519,
                "v3": 0,
                "v4": 1640531535
            },
            "buffer": [
                72, 101, 108, 108, 111, 44, 32, 119,
                111, 114, 108, 100, 33, 0, 0, 0
            ],
            "buffer_usage": 14
        }"#;

        let unserialized: XxHash32 = serde_json::from_str(serialized).unwrap();
        assert_eq!(hasher, unserialized);
        Ok(())
    }

    // This test validates wraparound/truncation behavior for very large inputs
    // of a 32-bit hash, but runs very slowly in the normal "cargo test"
    // build config since it hashes 4.3GB of data. It runs reasonably quick
    // under "cargo test --release".
    /*
    #[test]
    fn len_overflow_32bit() {
        // Hash 4.3 billion (4_300_000_000) bytes, which overflows a u32.
        let bytes200: Vec<u8> = (0..200).collect();
        let mut hasher = XxHash32::with_seed(0);
        for _ in 0..(4_300_000_000u64 / 200u64) {
            hasher.write(&bytes200);
        }
        assert_eq!(hasher.total_len_64(), 0x0000_0001_004c_cb00);
        assert_eq!(hasher.total_len(), 0x004c_cb00);
        // result is tested against the C implementation
        assert_eq!(hasher.finish(), 0x1522_4ca7);
    }
    */
}
417 | |