use crate::UnalignedBuffer;
use core::{cmp, hash::Hasher};

#[cfg(feature = "serialize")]
use serde::{Deserialize, Serialize};

const CHUNK_SIZE: usize = 16;

pub const PRIME_1: u32 = 2_654_435_761;
pub const PRIME_2: u32 = 2_246_822_519;
pub const PRIME_3: u32 = 3_266_489_917;
pub const PRIME_4: u32 = 668_265_263;
pub const PRIME_5: u32 = 374_761_393;

#[cfg_attr(feature = "serialize", derive(Deserialize, Serialize))]
#[derive(Copy, Clone, PartialEq)]
struct XxCore {
    v1: u32,
    v2: u32,
    v3: u32,
    v4: u32,
}

/// Calculates the 32-bit hash. Care should be taken when using this
/// hash.
///
/// Although this struct implements `Hasher`, it only calculates a
/// 32-bit number, leaving the upper bits as 0. This means it is
/// unlikely to be correct to use this in places like a `HashMap`.
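///
/// The zero-extension is observable through the `Hasher` trait. A small
/// doc-test sketch (the crate path `twox_hash` is an assumption here):
///
/// ```
/// use core::hash::Hasher as _;
/// # use twox_hash::XxHash32;
/// let mut hasher = XxHash32::with_seed(0);
/// hasher.write(b"some bytes");
/// assert_eq!(hasher.finish() >> 32, 0); // the upper 32 bits are always zero
/// ```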
#[cfg_attr(feature = "serialize", derive(Deserialize, Serialize))]
#[derive(Debug, Copy, Clone, PartialEq)]
pub struct XxHash32 {
    total_len: u64,
    seed: u32,
    core: XxCore,
    #[cfg_attr(feature = "serialize", serde(flatten))]
    buffer: Buffer,
}

impl XxCore {
    fn with_seed(seed: u32) -> XxCore {
        XxCore {
            v1: seed.wrapping_add(PRIME_1).wrapping_add(PRIME_2),
            v2: seed.wrapping_add(PRIME_2),
            v3: seed,
            v4: seed.wrapping_sub(PRIME_1),
        }
    }

    #[inline(always)]
    fn ingest_chunks<I>(&mut self, values: I)
    where
        I: IntoIterator<Item = [u32; 4]>,
    {
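        // This helper is the per-lane round function from the reference
        // xxHash32: acc = rotl32(acc + lane * PRIME_2, 13) * PRIME_1.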
        #[inline(always)]
        fn ingest_one_number(mut current_value: u32, mut value: u32) -> u32 {
            value = value.wrapping_mul(PRIME_2);
            current_value = current_value.wrapping_add(value);
            current_value = current_value.rotate_left(13);
            current_value.wrapping_mul(PRIME_1)
        }

        // By drawing these out, we can avoid going back and forth to
        // memory. It only really helps for large files, when we need
        // to iterate multiple times here.

        let mut v1 = self.v1;
        let mut v2 = self.v2;
        let mut v3 = self.v3;
        let mut v4 = self.v4;

        for [n1, n2, n3, n4] in values {
            v1 = ingest_one_number(v1, n1.to_le());
            v2 = ingest_one_number(v2, n2.to_le());
            v3 = ingest_one_number(v3, n3.to_le());
            v4 = ingest_one_number(v4, n4.to_le());
        }

        self.v1 = v1;
        self.v2 = v2;
        self.v3 = v3;
        self.v4 = v4;
    }

    #[inline(always)]
    fn finish(&self) -> u32 {
        // The original code pulls out local vars for v[1234]
        // here. Performance tests did not show that to be effective
        // here, presumably because this method is not called in a
        // tight loop.

        #[allow(unknown_lints, clippy::needless_late_init)] // keeping things parallel
        let mut hash;

        hash = self.v1.rotate_left(1);
        hash = hash.wrapping_add(self.v2.rotate_left(7));
        hash = hash.wrapping_add(self.v3.rotate_left(12));
        hash = hash.wrapping_add(self.v4.rotate_left(18));

        hash
    }
}

impl core::fmt::Debug for XxCore {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> Result<(), core::fmt::Error> {
        write!(
            f,
108 "XxCore {{ {:016x} {:016x} {:016x} {:016x} }}",
            self.v1, self.v2, self.v3, self.v4
        )
    }
}

#[cfg_attr(feature = "serialize", derive(Serialize, Deserialize))]
#[derive(Debug, Copy, Clone, Default, PartialEq)]
#[repr(align(4))]
#[cfg_attr(feature = "serialize", serde(transparent))]
struct AlignToU32<T>(T);

#[cfg_attr(feature = "serialize", derive(Serialize, Deserialize))]
#[derive(Debug, Copy, Clone, Default, PartialEq)]
struct Buffer {
    #[cfg_attr(feature = "serialize", serde(rename = "buffer"))]
    data: AlignToU32<[u8; CHUNK_SIZE]>,
    #[cfg_attr(feature = "serialize", serde(rename = "buffer_usage"))]
    len: usize,
}

impl Buffer {
    fn data(&self) -> &[u8] {
        &self.data.0[..self.len]
    }

    /// Consumes as much of the parameter as it can, returning the unused part.
    fn consume<'a>(&mut self, data: &'a [u8]) -> &'a [u8] {
        let to_use = cmp::min(self.available(), data.len());
        let (data, remaining) = data.split_at(to_use);
        self.data.0[self.len..][..to_use].copy_from_slice(data);
        self.len += to_use;
        remaining
    }

    fn set_data(&mut self, data: &[u8]) {
        debug_assert!(self.is_empty());
        debug_assert!(data.len() < CHUNK_SIZE);
        self.data.0[..data.len()].copy_from_slice(data);
        self.len = data.len();
    }

    fn available(&self) -> usize {
        CHUNK_SIZE - self.len
    }

    fn is_empty(&self) -> bool {
        self.len == 0
    }

    fn is_full(&self) -> bool {
        self.len == CHUNK_SIZE
    }
}

impl XxHash32 {
    /// Constructs the hash with an initial seed
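    ///
    /// A streaming-usage sketch (the crate path `twox_hash` is an
    /// assumption here); incremental writes produce the same hash as a
    /// single write of the whole input:
    ///
    /// ```
    /// use core::hash::Hasher as _;
    /// # use twox_hash::XxHash32;
    /// let mut streaming = XxHash32::with_seed(0xdead_beef);
    /// streaming.write(b"hello ");
    /// streaming.write(b"world");
    ///
    /// let mut oneshot = XxHash32::with_seed(0xdead_beef);
    /// oneshot.write(b"hello world");
    ///
    /// assert_eq!(streaming.finish(), oneshot.finish());
    /// ```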
    pub fn with_seed(seed: u32) -> XxHash32 {
        XxHash32 {
            total_len: 0,
            seed,
            core: XxCore::with_seed(seed),
            buffer: Buffer::default(),
        }
    }

    pub(crate) fn write(&mut self, bytes: &[u8]) {
        let remaining = self.maybe_consume_bytes(bytes);
        if !remaining.is_empty() {
            let mut remaining = UnalignedBuffer::new(remaining);
            self.core.ingest_chunks(&mut remaining);
            self.buffer.set_data(remaining.remaining());
        }
        self.total_len += bytes.len() as u64;
    }

    // Consume bytes from `data` to fill and then flush `self.buffer`.
    // If there are not enough bytes to fill the buffer, `self.buffer`
    // stays non-empty and this function returns an empty slice.
    fn maybe_consume_bytes<'a>(&mut self, data: &'a [u8]) -> &'a [u8] {
        if self.buffer.is_empty() {
            data
        } else {
            let data = self.buffer.consume(data);
            if self.buffer.is_full() {
                let mut u32s = UnalignedBuffer::new(self.buffer.data());
                self.core.ingest_chunks(&mut u32s);
                debug_assert!(u32s.remaining().is_empty());
                self.buffer.len = 0;
            }
            data
        }
    }

    pub(crate) fn finish(&self) -> u32 {
        let mut hash = if self.total_len >= CHUNK_SIZE as u64 {
            // We have processed at least one full chunk
            self.core.finish()
        } else {
            self.seed.wrapping_add(PRIME_5)
        };

        hash = hash.wrapping_add(self.total_len as u32);

        let mut buffered_u32s = UnalignedBuffer::<u32>::new(self.buffer.data());
        for buffered_u32 in &mut buffered_u32s {
            let k1 = buffered_u32.to_le().wrapping_mul(PRIME_3);
            hash = hash.wrapping_add(k1);
            hash = hash.rotate_left(17);
            hash = hash.wrapping_mul(PRIME_4);
        }

        let buffered_u8s = buffered_u32s.remaining();
        for &buffered_u8 in buffered_u8s {
            let k1 = u32::from(buffered_u8).wrapping_mul(PRIME_5);
            hash = hash.wrapping_add(k1);
            hash = hash.rotate_left(11);
            hash = hash.wrapping_mul(PRIME_1);
        }

        // The final intermixing
        hash ^= hash >> 15;
        hash = hash.wrapping_mul(PRIME_2);
        hash ^= hash >> 13;
        hash = hash.wrapping_mul(PRIME_3);
        hash ^= hash >> 16;

        hash
    }

    pub fn seed(&self) -> u32 {
        self.seed
    }

    /// Get the total number of bytes hashed, truncated to 32 bits.
    /// For the full 64-bit byte count, use `total_len_64`.
    pub fn total_len(&self) -> u32 {
        self.total_len as u32
    }

    /// Get the total number of bytes hashed.
    pub fn total_len_64(&self) -> u64 {
        self.total_len
    }
}

impl Default for XxHash32 {
    fn default() -> XxHash32 {
        XxHash32::with_seed(0)
    }
}

impl Hasher for XxHash32 {
    fn finish(&self) -> u64 {
        u64::from(XxHash32::finish(self))
    }

    fn write(&mut self, bytes: &[u8]) {
        XxHash32::write(self, bytes)
    }
}

#[cfg(feature = "std")]
pub use crate::std_support::thirty_two::RandomXxHashBuilder32;

#[cfg(test)]
mod test {
    use super::{RandomXxHashBuilder32, XxHash32};
    use std::collections::HashMap;
    use std::hash::BuildHasherDefault;
    use std::prelude::v1::*;

    #[test]
    fn ingesting_byte_by_byte_is_equivalent_to_large_chunks() {
        let bytes: Vec<_> = (0..32).map(|_| 0).collect();

        let mut byte_by_byte = XxHash32::with_seed(0);
        for byte in bytes.chunks(1) {
            byte_by_byte.write(byte);
        }

        let mut one_chunk = XxHash32::with_seed(0);
        one_chunk.write(&bytes);

        assert_eq!(byte_by_byte.core, one_chunk.core);
    }
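
    // A companion property check (not a C reference vector): splitting the
    // input at an offset that is not a multiple of the 16-byte chunk size
    // must still produce the same final hash as a single write.
    #[test]
    fn writing_in_two_uneven_parts_is_equivalent_to_one_write() {
        let bytes: Vec<_> = (0..100).collect();

        let (front, back) = bytes.split_at(33); // deliberately mid-chunk
        let mut split = XxHash32::with_seed(0);
        split.write(front);
        split.write(back);

        let mut whole = XxHash32::with_seed(0);
        whole.write(&bytes);

        assert_eq!(split.finish(), whole.finish());
    }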

    #[test]
    fn hash_of_nothing_matches_c_implementation() {
        let mut hasher = XxHash32::with_seed(0);
        hasher.write(&[]);
        assert_eq!(hasher.finish(), 0x02cc_5d05);
    }

    #[test]
    fn hash_of_single_byte_matches_c_implementation() {
        let mut hasher = XxHash32::with_seed(0);
        hasher.write(&[42]);
        assert_eq!(hasher.finish(), 0xe0fe_705f);
    }

    #[test]
    fn hash_of_multiple_bytes_matches_c_implementation() {
        let mut hasher = XxHash32::with_seed(0);
        hasher.write(b"Hello, world!\0");
        assert_eq!(hasher.finish(), 0x9e5e_7e93);
    }

    #[test]
    fn hash_of_multiple_chunks_matches_c_implementation() {
        let bytes: Vec<_> = (0..100).collect();
        let mut hasher = XxHash32::with_seed(0);
        hasher.write(&bytes);
        assert_eq!(hasher.finish(), 0x7f89_ba44);
    }

    #[test]
    fn hash_with_different_seed_matches_c_implementation() {
        let mut hasher = XxHash32::with_seed(0x42c9_1977);
        hasher.write(&[]);
        assert_eq!(hasher.finish(), 0xd6bf_8459);
    }

    #[test]
    fn hash_with_different_seed_and_multiple_chunks_matches_c_implementation() {
        let bytes: Vec<_> = (0..100).collect();
        let mut hasher = XxHash32::with_seed(0x42c9_1977);
        hasher.write(&bytes);
        assert_eq!(hasher.finish(), 0x6d2f_6c17);
    }
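
    // A bookkeeping sketch: `total_len` / `total_len_64` report how many
    // bytes have been written so far, independent of chunking.
    #[test]
    fn total_len_tracks_the_number_of_bytes_written() {
        let mut hasher = XxHash32::with_seed(0);
        hasher.write(&[0; 10]);
        hasher.write(&[0; 20]);
        assert_eq!(hasher.total_len(), 30);
        assert_eq!(hasher.total_len_64(), 30);
    }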

    #[test]
    fn can_be_used_in_a_hashmap_with_a_default_seed() {
        let mut hash: HashMap<_, _, BuildHasherDefault<XxHash32>> = Default::default();
        hash.insert(42, "the answer");
        assert_eq!(hash.get(&42), Some(&"the answer"));
    }

    #[test]
    fn can_be_used_in_a_hashmap_with_a_random_seed() {
        let mut hash: HashMap<_, _, RandomXxHashBuilder32> = Default::default();
        hash.insert(42, "the answer");
        assert_eq!(hash.get(&42), Some(&"the answer"));
    }
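
    // A property check rather than a reference vector: the `Hasher`
    // implementation zero-extends the 32-bit result, so the upper 32 bits
    // of `Hasher::finish` are always zero (see the type-level docs).
    #[test]
    fn hasher_finish_zero_extends_the_32_bit_hash() {
        use core::hash::Hasher;

        let mut hasher = XxHash32::with_seed(0);
        Hasher::write(&mut hasher, b"some bytes");
        assert_eq!(Hasher::finish(&hasher) >> 32, 0);
    }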

    #[cfg(feature = "serialize")]
    type TestResult<T = ()> = Result<T, Box<dyn std::error::Error>>;

    #[cfg(feature = "serialize")]
    #[test]
    fn test_serialization_cycle() -> TestResult {
        let mut hasher = XxHash32::with_seed(0);
        hasher.write(b"Hello, world!\0");
        hasher.finish();

        let serialized = serde_json::to_string(&hasher)?;
        let unserialized: XxHash32 = serde_json::from_str(&serialized)?;
        assert_eq!(hasher, unserialized);
        Ok(())
    }

    #[cfg(feature = "serialize")]
    #[test]
    fn test_serialization_stability() -> TestResult {
        let mut hasher = XxHash32::with_seed(0);
        hasher.write(b"Hello, world!\0");
        hasher.finish();

        let serialized = r#"{
            "total_len": 14,
            "seed": 0,
            "core": {
                "v1": 606290984,
                "v2": 2246822519,
                "v3": 0,
                "v4": 1640531535
            },
            "buffer": [
                72, 101, 108, 108, 111, 44, 32, 119,
                111, 114, 108, 100, 33, 0, 0, 0
            ],
            "buffer_usage": 14
        }"#;

        let unserialized: XxHash32 = serde_json::from_str(serialized).unwrap();
        assert_eq!(hasher, unserialized);
        Ok(())
    }

    // This test validates wraparound/truncation behavior for very large
    // inputs to a 32-bit hash, but it runs very slowly in the normal
    // "cargo test" build config since it hashes 4.3 GB of data. It runs
    // reasonably quickly under "cargo test --release".
    /*
    #[test]
    fn len_overflow_32bit() {
        // Hash 4.3 billion (4_300_000_000) bytes, which overflows a u32.
        let bytes200: Vec<u8> = (0..200).collect();
        let mut hasher = XxHash32::with_seed(0);
        for _ in 0..(4_300_000_000u64 / 200u64) {
            hasher.write(&bytes200);
        }
        assert_eq!(hasher.total_len_64(), 0x0000_0001_004c_cb00);
        assert_eq!(hasher.total_len(), 0x004c_cb00);
        // result is tested against the C implementation
        assert_eq!(hasher.finish(), 0x1522_4ca7);
    }
    */
}