1 | use crate::UnalignedBuffer; |
2 | use core::{cmp, hash::Hasher}; |
3 | |
4 | #[cfg (feature = "serialize" )] |
5 | use serde::{Deserialize, Serialize}; |
6 | |
/// Number of bytes accumulated before one full round of the core
/// algorithm runs (four little-endian u32 lanes).
const CHUNK_SIZE: usize = 16;

// The five 32-bit primes defined by the xxHash32 specification.
pub const PRIME_1: u32 = 2_654_435_761;
pub const PRIME_2: u32 = 2_246_822_519;
pub const PRIME_3: u32 = 3_266_489_917;
pub const PRIME_4: u32 = 668_265_263;
pub const PRIME_5: u32 = 374_761_393;
14 | |
/// The four rolling accumulators ("lanes") used by the xxHash32 core
/// loop while ingesting full 16-byte chunks.
#[cfg_attr (feature = "serialize" , derive(Deserialize, Serialize))]
#[derive (Copy, Clone, PartialEq)]
struct XxCore {
    v1: u32,
    v2: u32,
    v3: u32,
    v4: u32,
}
23 | |
/// Calculates the 32-bit hash. Care should be taken when using this
/// hash.
///
/// Although this struct implements `Hasher`, it only calculates a
/// 32-bit number, leaving the upper bits as 0. This means it is
/// unlikely to be correct to use this in places like a `HashMap`.
#[cfg_attr (feature = "serialize" , derive(Deserialize, Serialize))]
#[derive (Debug, Copy, Clone, PartialEq)]
pub struct XxHash32 {
    // Total bytes written so far; u64 so inputs over 4 GiB don't wrap.
    total_len: u64,
    // Seed supplied at construction; re-used on the short-input path.
    seed: u32,
    // The four rolling accumulators fed by full 16-byte chunks.
    core: XxCore,
    // Holds the trailing partial chunk (< 16 bytes) between writes.
    #[cfg_attr (feature = "serialize" , serde(flatten))]
    buffer: Buffer,
}
39 | |
40 | impl XxCore { |
41 | fn with_seed(seed: u32) -> XxCore { |
42 | XxCore { |
43 | v1: seed.wrapping_add(PRIME_1).wrapping_add(PRIME_2), |
44 | v2: seed.wrapping_add(PRIME_2), |
45 | v3: seed, |
46 | v4: seed.wrapping_sub(PRIME_1), |
47 | } |
48 | } |
49 | |
50 | #[inline (always)] |
51 | fn ingest_chunks<I>(&mut self, values: I) |
52 | where |
53 | I: IntoIterator<Item = [u32; 4]>, |
54 | { |
55 | #[inline (always)] |
56 | fn ingest_one_number(mut current_value: u32, mut value: u32) -> u32 { |
57 | value = value.wrapping_mul(PRIME_2); |
58 | current_value = current_value.wrapping_add(value); |
59 | current_value = current_value.rotate_left(13); |
60 | current_value.wrapping_mul(PRIME_1) |
61 | } |
62 | |
63 | // By drawing these out, we can avoid going back and forth to |
64 | // memory. It only really helps for large files, when we need |
65 | // to iterate multiple times here. |
66 | |
67 | let mut v1 = self.v1; |
68 | let mut v2 = self.v2; |
69 | let mut v3 = self.v3; |
70 | let mut v4 = self.v4; |
71 | |
72 | for [n1, n2, n3, n4] in values { |
73 | v1 = ingest_one_number(v1, n1.to_le()); |
74 | v2 = ingest_one_number(v2, n2.to_le()); |
75 | v3 = ingest_one_number(v3, n3.to_le()); |
76 | v4 = ingest_one_number(v4, n4.to_le()); |
77 | } |
78 | |
79 | self.v1 = v1; |
80 | self.v2 = v2; |
81 | self.v3 = v3; |
82 | self.v4 = v4; |
83 | } |
84 | |
85 | #[inline (always)] |
86 | fn finish(&self) -> u32 { |
87 | // The original code pulls out local vars for v[1234] |
88 | // here. Performance tests did not show that to be effective |
89 | // here, presumably because this method is not called in a |
90 | // tight loop. |
91 | |
92 | #[allow (unknown_lints, clippy::needless_late_init)] // keeping things parallel |
93 | let mut hash; |
94 | |
95 | hash = self.v1.rotate_left(1); |
96 | hash = hash.wrapping_add(self.v2.rotate_left(7)); |
97 | hash = hash.wrapping_add(self.v3.rotate_left(12)); |
98 | hash = hash.wrapping_add(self.v4.rotate_left(18)); |
99 | |
100 | hash |
101 | } |
102 | } |
103 | |
104 | impl core::fmt::Debug for XxCore { |
105 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> Result<(), core::fmt::Error> { |
106 | write!( |
107 | f, |
108 | "XxCore {{ {:016x} {:016x} {:016x} {:016x} }}" , |
109 | self.v1, self.v2, self.v3, self.v4 |
110 | ) |
111 | } |
112 | } |
113 | |
/// Forces 4-byte alignment on the wrapped value so the buffered bytes
/// can be reinterpreted as u32s without misalignment.
#[cfg_attr (feature = "serialize" , derive(Serialize, Deserialize))]
#[derive (Debug, Copy, Clone, Default, PartialEq)]
#[repr (align(4))]
#[cfg_attr (feature = "serialize" , serde(transparent))]
struct AlignToU32<T>(T);
119 | |
/// Fixed-size staging area for the trailing partial chunk of input.
#[cfg_attr (feature = "serialize" , derive(Serialize, Deserialize))]
#[derive (Debug, Copy, Clone, Default, PartialEq)]
struct Buffer {
    // Backing storage for up to one chunk; serde names keep the wire
    // format stable (see the serialization stability test).
    #[cfg_attr (feature = "serialize" , serde(rename = "buffer" ))]
    data: AlignToU32<[u8; CHUNK_SIZE]>,
    // Number of valid bytes currently held in `data`.
    #[cfg_attr (feature = "serialize" , serde(rename = "buffer_usage" ))]
    len: usize,
}
128 | |
129 | impl Buffer { |
130 | fn data(&self) -> &[u8] { |
131 | &self.data.0[..self.len] |
132 | } |
133 | |
134 | /// Consumes as much of the parameter as it can, returning the unused part. |
135 | fn consume<'a>(&mut self, data: &'a [u8]) -> &'a [u8] { |
136 | let to_use = cmp::min(self.available(), data.len()); |
137 | let (data, remaining) = data.split_at(to_use); |
138 | self.data.0[self.len..][..to_use].copy_from_slice(data); |
139 | self.len += to_use; |
140 | remaining |
141 | } |
142 | |
143 | fn set_data(&mut self, data: &[u8]) { |
144 | debug_assert!(self.is_empty()); |
145 | debug_assert!(data.len() < CHUNK_SIZE); |
146 | self.data.0[..data.len()].copy_from_slice(data); |
147 | self.len = data.len(); |
148 | } |
149 | |
150 | fn available(&self) -> usize { |
151 | CHUNK_SIZE - self.len |
152 | } |
153 | |
154 | fn is_empty(&self) -> bool { |
155 | self.len == 0 |
156 | } |
157 | |
158 | fn is_full(&self) -> bool { |
159 | self.len == CHUNK_SIZE |
160 | } |
161 | } |
162 | |
impl XxHash32 {
    /// Constructs the hash with an initial seed
    pub fn with_seed(seed: u32) -> XxHash32 {
        XxHash32 {
            total_len: 0,
            seed,
            core: XxCore::with_seed(seed),
            buffer: Buffer::default(),
        }
    }

    /// Streams `bytes` into the hasher. Buffered bytes are topped up and
    /// flushed first; whole 16-byte chunks are then ingested directly and
    /// any tail shorter than a chunk is re-buffered for the next call.
    pub(crate) fn write(&mut self, bytes: &[u8]) {
        let remaining = self.maybe_consume_bytes(bytes);
        if !remaining.is_empty() {
            // `UnalignedBuffer` yields `[u32; 4]` chunks and exposes the
            // leftover (< 16 bytes) via `remaining()` after iteration.
            let mut remaining = UnalignedBuffer::new(remaining);
            self.core.ingest_chunks(&mut remaining);
            self.buffer.set_data(remaining.remaining());
        }
        // u64 byte count so inputs longer than 4 GiB don't wrap.
        self.total_len += bytes.len() as u64;
    }

    // Consume bytes and try to make `self.buffer` empty.
    // If there are not enough bytes, `self.buffer` can be non-empty, and this
    // function returns an empty slice.
    fn maybe_consume_bytes<'a>(&mut self, data: &'a [u8]) -> &'a [u8] {
        if self.buffer.is_empty() {
            data
        } else {
            let data = self.buffer.consume(data);
            if self.buffer.is_full() {
                // The buffer now holds exactly one chunk; feed it to the
                // core and mark the buffer empty again.
                let mut u32s = UnalignedBuffer::new(self.buffer.data());
                self.core.ingest_chunks(&mut u32s);
                debug_assert!(u32s.remaining().is_empty());
                self.buffer.len = 0;
            }
            data
        }
    }

    /// Produces the 32-bit digest of everything written so far. Does not
    /// reset the hasher, so more data may be written afterwards.
    pub(crate) fn finish(&self) -> u32 {
        let mut hash = if self.total_len >= CHUNK_SIZE as u64 {
            // We have processed at least one full chunk
            self.core.finish()
        } else {
            // Short-input path defined by the algorithm: the accumulators
            // were never used, so start from the seed instead.
            self.seed.wrapping_add(PRIME_5)
        };

        hash = hash.wrapping_add(self.total_len as u32);

        // Mix in the buffered tail, first as 4-byte words...
        let mut buffered_u32s = UnalignedBuffer::<u32>::new(self.buffer.data());
        for buffered_u32 in &mut buffered_u32s {
            let k1 = buffered_u32.to_le().wrapping_mul(PRIME_3);
            hash = hash.wrapping_add(k1);
            hash = hash.rotate_left(17);
            hash = hash.wrapping_mul(PRIME_4);
        }

        // ...then byte-by-byte for the final 0-3 bytes.
        let buffered_u8s = buffered_u32s.remaining();
        for &buffered_u8 in buffered_u8s {
            let k1 = u32::from(buffered_u8).wrapping_mul(PRIME_5);
            hash = hash.wrapping_add(k1);
            hash = hash.rotate_left(11);
            hash = hash.wrapping_mul(PRIME_1);
        }

        // The final intermixing ("avalanche") spreads entropy to all bits.
        hash ^= hash >> 15;
        hash = hash.wrapping_mul(PRIME_2);
        hash ^= hash >> 13;
        hash = hash.wrapping_mul(PRIME_3);
        hash ^= hash >> 16;

        hash
    }

    /// The seed this hasher was created with.
    pub fn seed(&self) -> u32 {
        self.seed
    }

    /// Get the total number of bytes hashed, truncated to 32 bits.
    /// For the full 64-bit byte count, use `total_len_64`
    pub fn total_len(&self) -> u32 {
        self.total_len as u32
    }

    /// Get the total number of bytes hashed.
    pub fn total_len_64(&self) -> u64 {
        self.total_len
    }
}
253 | |
254 | impl Default for XxHash32 { |
255 | fn default() -> XxHash32 { |
256 | XxHash32::with_seed(0) |
257 | } |
258 | } |
259 | |
260 | impl Hasher for XxHash32 { |
261 | fn finish(&self) -> u64 { |
262 | u64::from(XxHash32::finish(self)) |
263 | } |
264 | |
265 | fn write(&mut self, bytes: &[u8]) { |
266 | XxHash32::write(self, bytes) |
267 | } |
268 | } |
269 | |
270 | #[cfg (feature = "std" )] |
271 | pub use crate::std_support::thirty_two::RandomXxHashBuilder32; |
272 | |
#[cfg(test)]
mod test {
    use super::{RandomXxHashBuilder32, XxHash32};
    use std::collections::HashMap;
    use std::hash::BuildHasherDefault;
    use std::prelude::v1::*;

    // Streaming one byte at a time must leave the core in the same state
    // as hashing the same data in a single call.
    #[test]
    fn ingesting_byte_by_byte_is_equivalent_to_large_chunks() {
        let bytes: Vec<_> = (0..32).map(|_| 0).collect();

        let mut byte_by_byte = XxHash32::with_seed(0);
        for byte in bytes.chunks(1) {
            byte_by_byte.write(byte);
        }

        let mut one_chunk = XxHash32::with_seed(0);
        one_chunk.write(&bytes);

        assert_eq!(byte_by_byte.core, one_chunk.core);
    }

    // The expected values below come from the reference C implementation
    // of xxHash32.
    #[test]
    fn hash_of_nothing_matches_c_implementation() {
        let mut hasher = XxHash32::with_seed(0);
        hasher.write(&[]);
        assert_eq!(hasher.finish(), 0x02cc_5d05);
    }

    #[test]
    fn hash_of_single_byte_matches_c_implementation() {
        let mut hasher = XxHash32::with_seed(0);
        hasher.write(&[42]);
        assert_eq!(hasher.finish(), 0xe0fe_705f);
    }

    #[test]
    fn hash_of_multiple_bytes_matches_c_implementation() {
        let mut hasher = XxHash32::with_seed(0);
        hasher.write(b"Hello, world!\0");
        assert_eq!(hasher.finish(), 0x9e5e_7e93);
    }

    #[test]
    fn hash_of_multiple_chunks_matches_c_implementation() {
        let bytes: Vec<_> = (0..100).collect();
        let mut hasher = XxHash32::with_seed(0);
        hasher.write(&bytes);
        assert_eq!(hasher.finish(), 0x7f89_ba44);
    }

    #[test]
    fn hash_with_different_seed_matches_c_implementation() {
        let mut hasher = XxHash32::with_seed(0x42c9_1977);
        hasher.write(&[]);
        assert_eq!(hasher.finish(), 0xd6bf_8459);
    }

    #[test]
    fn hash_with_different_seed_and_multiple_chunks_matches_c_implementation() {
        let bytes: Vec<_> = (0..100).collect();
        let mut hasher = XxHash32::with_seed(0x42c9_1977);
        hasher.write(&bytes);
        assert_eq!(hasher.finish(), 0x6d2f_6c17);
    }

    #[test]
    fn can_be_used_in_a_hashmap_with_a_default_seed() {
        let mut hash: HashMap<_, _, BuildHasherDefault<XxHash32>> = Default::default();
        hash.insert(42, "the answer");
        assert_eq!(hash.get(&42), Some(&"the answer"));
    }

    #[test]
    fn can_be_used_in_a_hashmap_with_a_random_seed() {
        let mut hash: HashMap<_, _, RandomXxHashBuilder32> = Default::default();
        hash.insert(42, "the answer");
        assert_eq!(hash.get(&42), Some(&"the answer"));
    }

    #[cfg(feature = "serialize")]
    type TestResult<T = ()> = Result<T, Box<dyn std::error::Error>>;

    // Serializing and deserializing a mid-stream hasher must round-trip
    // exactly.
    #[cfg(feature = "serialize")]
    #[test]
    fn test_serialization_cycle() -> TestResult {
        let mut hasher = XxHash32::with_seed(0);
        hasher.write(b"Hello, world!\0");
        hasher.finish();

        let serialized = serde_json::to_string(&hasher)?;
        let unserialized: XxHash32 = serde_json::from_str(&serialized)?;
        assert_eq!(hasher, unserialized);
        Ok(())
    }

    // Pins the on-the-wire JSON shape (field names, flattened buffer) so
    // serialized hashers stay loadable across versions.
    #[cfg(feature = "serialize")]
    #[test]
    fn test_serialization_stability() -> TestResult {
        let mut hasher = XxHash32::with_seed(0);
        hasher.write(b"Hello, world!\0");
        hasher.finish();

        let serialized = r#"{
            "total_len": 14,
            "seed": 0,
            "core": {
                "v1": 606290984,
                "v2": 2246822519,
                "v3": 0,
                "v4": 1640531535
            },
            "buffer": [
                72, 101, 108, 108, 111, 44, 32, 119,
                111, 114, 108, 100, 33, 0, 0, 0
            ],
            "buffer_usage": 14
        }"#;

        let unserialized: XxHash32 = serde_json::from_str(serialized).unwrap();
        assert_eq!(hasher, unserialized);
        Ok(())
    }

    // This test validates wraparound/truncation behavior for very large inputs
    // of a 32-bit hash, but runs very slowly in the normal "cargo test"
    // build config since it hashes 4.3GB of data. It runs reasonably quick
    // under "cargo test --release".
    /*
    #[test]
    fn len_overflow_32bit() {
        // Hash 4.3 billion (4_300_000_000) bytes, which overflows a u32.
        let bytes200: Vec<u8> = (0..200).collect();
        let mut hasher = XxHash32::with_seed(0);
        for _ in 0..(4_300_000_000u64 / 200u64) {
            hasher.write(&bytes200);
        }
        assert_eq!(hasher.total_len_64(), 0x0000_0001_004c_cb00);
        assert_eq!(hasher.total_len(), 0x004c_cb00);
        // result is tested against the C implementation
        assert_eq!(hasher.finish(), 0x1522_4ca7);
    }
    */
}
417 | |