lib.rs source code [crates/foldhash/src/lib.rs]

1	//! This crate provides foldhash, a fast, non-cryptographic, minimally
2	//! DoS-resistant hashing algorithm designed for computational uses such as
3	//! hashmaps, bloom filters, count sketching, etc.
4	//!
5	//! When should you not* use foldhash:*
6	//!
7	//! - You are afraid of people studying your long-running program's behavior
8	//! to reverse engineer its internal random state and using this knowledge to
9	//! create many colliding inputs for computational complexity attacks.
10	//!
11	//! - You expect foldhash to have a consistent output across versions or
12	//! platforms, such as for persistent file formats or communication protocols.
13	//!
14	//! - You are relying on foldhash's properties for any kind of security.
15	//! Foldhash is not appropriate for any cryptographic purpose.
16	//!
17	//! Foldhash has two variants, one optimized for speed which is ideal for data
18	//! structures such as hash maps and bloom filters, and one optimized for
19	//! statistical quality which is ideal for algorithms such as
20	//! [HyperLogLog](https://en.wikipedia.org/wiki/HyperLogLog) and
21	//! [MinHash](https://en.wikipedia.org/wiki/MinHash).
22	//!
23	//! Foldhash can be used in a `#![no_std]` environment by disabling its default
24	//! `"std"` feature.
25	//!
26	//! # Usage
27	//!
28	//! The easiest way to use this crate with the standard library [`HashMap`] or
29	//! [`HashSet`] is to import them from `foldhash` instead, along with the
30	//! extension traits to make [`HashMap::new`] and [`HashMap::with_capacity`]
31	//! work out-of-the-box:
32	//!
33	//! ```rust
34	//! use foldhash::{HashMap, HashMapExt};
35	//!
36	//! let mut hm = HashMap::new();
37	//! hm.insert(`42`, "hello");
38	//! ```
39	//!
40	//! You can also avoid the convenience types and do it manually by initializing
41	//! a [`RandomState`](fast::RandomState), for example if you are using a different hash map
42	//! implementation like [`hashbrown`](https://docs.rs/hashbrown/):
43	//!
44	//! ```rust
45	//! use hashbrown::HashMap;
46	//! use foldhash::fast::RandomState;
47	//!
48	//! let mut hm = HashMap::with_hasher(RandomState::default());
49	//! hm.insert("foo", "bar");
50	//! ```
51	//!
52	//! The above methods are the recommended way to use foldhash, which will
53	//! automatically generate a randomly generated hasher instance for you. If you
54	//! absolutely must have determinism you can use [`FixedState`](fast::FixedState)
55	//! instead, but note that this makes you trivially vulnerable to HashDoS
56	//! attacks and might lead to quadratic runtime when moving data from one
57	//! hashmap/set into another:
58	//!
59	//! ```rust
60	//! use std::collections::HashSet;
61	//! use foldhash::fast::FixedState;
62	//!
63	//! let mut hm = HashSet::with_hasher(FixedState::with_seed(`42`));
64	//! hm.insert([`1`, `10`, `100`]);
65	//! ```
66	//!
67	//! If you rely on statistical properties of the hash for the correctness of
68	//! your algorithm, such as in [HyperLogLog](https://en.wikipedia.org/wiki/HyperLogLog),
69	//! it is suggested to use the [`RandomState`](quality::RandomState)
70	//! or [`FixedState`](quality::FixedState) from the [`quality`] module instead
71	//! of the [`fast`] module. The latter is optimized purely for speed in hash
72	//! tables and has known statistical imperfections.
73	//!
74	//! Finally, you can also directly use the [`RandomState`](quality::RandomState)
75	//! or [`FixedState`](quality::FixedState) to manually hash items using the
76	//! [`BuildHasher`](std::hash::BuildHasher) trait:
77	//! ```rust
78	//! use std::hash::BuildHasher;
79	//! use foldhash::quality::RandomState;
80	//!
81	//! let random_state = RandomState::default();
82	//! let hash = random_state.hash_one("hello world");
83	//! ```
84
85	#![cfg_attr(all(not(test), not(feature = "std")), no_std)]
86	#![warn(missing_docs)]
87
88	use core::hash::Hasher;
89
90	#[cfg(feature = "std")]
91	mod convenience;
92	mod seed;
93
94	#[cfg(feature = "std")]
95	pub use convenience::*;
96
97	// Arbitrary constants with high entropy. Hexadecimal digits of pi were used.
98	const ARBITRARY0: u64 = `0x243f6a8885a308d3`;
99	const ARBITRARY1: u64 = `0x13198a2e03707344`;
100	const ARBITRARY2: u64 = `0xa4093822299f31d0`;
101	const ARBITRARY3: u64 = `0x082efa98ec4e6c89`;
102	const ARBITRARY4: u64 = `0x452821e638d01377`;
103	const ARBITRARY5: u64 = `0xbe5466cf34e90c6c`;
104	const ARBITRARY6: u64 = `0xc0ac29b7c97c50dd`;
105	const ARBITRARY7: u64 = `0x3f84d5b5b5470917`;
106	const ARBITRARY8: u64 = `0x9216d5d98979fb1b`;
107	const ARBITRARY9: u64 = `0xd1310ba698dfb5ac`;
108
109	#[inline(always)]
110	const fn folded_multiply(x: u64, y: u64) -> u64 {
111	#[cfg(target_pointer_width = "64")]
112	{
113	// We compute the full u64 x u64 -> u128 product, this is a single mul
114	// instruction on x86-64, one mul plus one mulhi on ARM64.
115	let full = (x as u128) * (y as u128);
116	let lo = full as u64;
117	let hi = (full >> `64`) as u64;
118
119	// The middle bits of the full product fluctuate the most with small
120	// changes in the input. This is the top bits of lo and the bottom bits
121	// of hi. We can thus make the entire output fluctuate with small
122	// changes to the input by XOR'ing these two halves.
123	lo ^ hi
124	}
125
126	#[cfg(target_pointer_width = "32")]
127	{
128	// u64 x u64 -> u128 product is prohibitively expensive on 32-bit.
129	// Decompose into 32-bit parts.
130	let lx = x as u32;
131	let ly = y as u32;
132	let hx = (x >> `32`) as u32;
133	let hy = (y >> `32`) as u32;
134
135	// u32 x u32 -> u64 the low bits of one with the high bits of the other.
136	let afull = (lx as u64) * (hy as u64);
137	let bfull = (hx as u64) * (ly as u64);
138
139	// Combine, swapping low/high of one of them so the upper bits of the
140	// product of one combine with the lower bits of the other.
141	afull ^ bfull.rotate_right(`32`)
142	}
143	}
144
145	/// The foldhash implementation optimized for speed.
146	pub mod fast {
147	use super::*;
148
149	pub use seed::fast::{FixedState, RandomState};
150
151	/// A [`Hasher`] instance implementing foldhash, optimized for speed.
152	///
153	/// It can't be created directly, see [`RandomState`] or [`FixedState`].
154	#[derive(Clone)]
155	pub struct FoldHasher {
156	accumulator: u64,
157	sponge: u128,
158	sponge_len: u8,
159	fold_seed: u64,
160	expand_seed: u64,
161	expand_seed2: u64,
162	expand_seed3: u64,
163	}
164
165	impl FoldHasher {
166	#[inline]
167	pub(crate) fn with_seed(per_hasher_seed: u64, global_seed: &[u64; `4`]) -> FoldHasher {
168	FoldHasher {
169	accumulator: per_hasher_seed,
170	sponge: `0`,
171	sponge_len: `0`,
172	fold_seed: global_seed[`0`],
173	expand_seed: global_seed[`1`],
174	expand_seed2: global_seed[`2`],
175	expand_seed3: global_seed[`3`],
176	}
177	}
178
179	#[inline(always)]
180	fn write_num<T: Into<u128>>(&mut self, x: T) {
181	let bits: usize = `8` * core::mem::size_of::<T>();
182	if self.sponge_len as usize + bits > `128` {
183	let lo = self.sponge as u64;
184	let hi = (self.sponge >> `64`) as u64;
185	self.accumulator = folded_multiply(lo ^ self.accumulator, hi ^ self.fold_seed);
186	self.sponge = x.into();
187	self.sponge_len = bits as u8;
188	} else {
189	self.sponge \|= x.into() << self.sponge_len;
190	self.sponge_len += bits as u8;
191	}
192	}
193	}
194
195	impl Hasher for FoldHasher {
196	#[inline(always)]
197	fn write(&mut self, bytes: &[u8]) {
198	let mut s0 = self.accumulator;
199	let mut s1 = self.expand_seed;
200	let len = bytes.len();
201	if len <= `16` {
202	// XOR the input into s0, s1, then multiply and fold.
203	if len >= `8` {
204	s0 ^= u64::from_ne_bytes(bytes[`0`..`8`].try_into().unwrap());
205	s1 ^= u64::from_ne_bytes(bytes[len - `8`..].try_into().unwrap());
206	} else if len >= `4` {
207	s0 ^= u32::from_ne_bytes(bytes[`0`..`4`].try_into().unwrap()) as u64;
208	s1 ^= u32::from_ne_bytes(bytes[len - `4`..].try_into().unwrap()) as u64;
209	} else if len > `0` {
210	let lo = bytes[`0`];
211	let mid = bytes[len / `2`];
212	let hi = bytes[len - `1`];
213	s0 ^= lo as u64;
214	s1 ^= ((hi as u64) << `8`) \| mid as u64;
215	}
216	self.accumulator = folded_multiply(s0, s1);
217	} else if len < `256` {
218	self.accumulator = hash_bytes_medium(bytes, s0, s1, self.fold_seed);
219	} else {
220	self.accumulator = hash_bytes_long(
221	bytes,
222	s0,
223	s1,
224	self.expand_seed2,
225	self.expand_seed3,
226	self.fold_seed,
227	);
228	}
229	}
230
231	#[inline(always)]
232	fn write_u8(&mut self, i: u8) {
233	self.write_num(i);
234	}
235
236	#[inline(always)]
237	fn write_u16(&mut self, i: u16) {
238	self.write_num(i);
239	}
240
241	#[inline(always)]
242	fn write_u32(&mut self, i: u32) {
243	self.write_num(i);
244	}
245
246	#[inline(always)]
247	fn write_u64(&mut self, i: u64) {
248	self.write_num(i);
249	}
250
251	#[inline(always)]
252	fn write_u128(&mut self, i: u128) {
253	let lo = i as u64;
254	let hi = (i >> `64`) as u64;
255	self.accumulator = folded_multiply(lo ^ self.accumulator, hi ^ self.fold_seed);
256	}
257
258	#[inline(always)]
259	fn write_usize(&mut self, i: usize) {
260	// u128 doesn't implement From<usize>.
261	#[cfg(target_pointer_width = "32")]
262	self.write_num(i as u32);
263	#[cfg(target_pointer_width = "64")]
264	self.write_num(i as u64);
265	}
266
267	#[inline(always)]
268	fn finish(&self) -> u64 {
269	if self.sponge_len > `0` {
270	let lo = self.sponge as u64;
271	let hi = (self.sponge >> `64`) as u64;
272	folded_multiply(lo ^ self.accumulator, hi ^ self.fold_seed)
273	} else {
274	self.accumulator
275	}
276	}
277	}
278	}
279
280	/// The foldhash implementation optimized for quality.
281	pub mod quality {
282	use super::*;
283
284	pub use seed::quality::{FixedState, RandomState};
285
286	/// A [`Hasher`] instance implementing foldhash, optimized for quality.
287	///
288	/// It can't be created directly, see [`RandomState`] or [`FixedState`].
289	#[derive(Clone)]
290	pub struct FoldHasher {
291	pub(crate) inner: fast::FoldHasher,
292	}
293
294	impl Hasher for FoldHasher {
295	#[inline(always)]
296	fn write(&mut self, bytes: &[u8]) {
297	self.inner.write(bytes);
298	}
299
300	#[inline(always)]
301	fn write_u8(&mut self, i: u8) {
302	self.inner.write_u8(i);
303	}
304
305	#[inline(always)]
306	fn write_u16(&mut self, i: u16) {
307	self.inner.write_u16(i);
308	}
309
310	#[inline(always)]
311	fn write_u32(&mut self, i: u32) {
312	self.inner.write_u32(i);
313	}
314
315	#[inline(always)]
316	fn write_u64(&mut self, i: u64) {
317	self.inner.write_u64(i);
318	}
319
320	#[inline(always)]
321	fn write_u128(&mut self, i: u128) {
322	self.inner.write_u128(i);
323	}
324
325	#[inline(always)]
326	fn write_usize(&mut self, i: usize) {
327	self.inner.write_usize(i);
328	}
329
330	#[inline(always)]
331	fn finish(&self) -> u64 {
332	folded_multiply(self.inner.finish(), ARBITRARY0)
333	}
334	}
335	}
336
337	/// Hashes strings >= 16 bytes, has unspecified behavior when bytes.len() < 16.
338	fn hash_bytes_medium(bytes: &[u8], mut s0: u64, mut s1: u64, fold_seed: u64) -> u64 {
339	// Process 32 bytes per iteration, 16 bytes from the start, 16 bytes from
340	// the end. On the last iteration these two chunks can overlap, but that is
341	// perfectly fine.
342	let left_to_right: ChunksExact<'_, u8> = bytes.chunks_exact(chunk_size:`16`);
343	let mut right_to_left: RChunksExact<'_, u8> = bytes.rchunks_exact(chunk_size:`16`);
344	for lo: &[u8] in left_to_right {
345	let hi: &[u8] = right_to_left.next().unwrap();
346	let unconsumed_start: *const u8 = lo.as_ptr();
347	let unconsumed_end: *const u8 = hi.as_ptr_range().end;
348	if unconsumed_start >= unconsumed_end {
349	break;
350	}
351
352	let a: u64 = u64::from_ne_bytes(lo[`0`..`8`].try_into().unwrap());
353	let b: u64 = u64::from_ne_bytes(lo[`8`..`16`].try_into().unwrap());
354	let c: u64 = u64::from_ne_bytes(hi[`0`..`8`].try_into().unwrap());
355	let d: u64 = u64::from_ne_bytes(hi[`8`..`16`].try_into().unwrap());
356	s0 = folded_multiply(x:a ^ s0, y:c ^ fold_seed);
357	s1 = folded_multiply(x:b ^ s1, y:d ^ fold_seed);
358	}
359
360	s0 ^ s1
361	}
362
363	/// Hashes strings >= 16 bytes, has unspecified behavior when bytes.len() < 16.
364	#[cold]
365	#[inline(never)]
366	fn hash_bytes_long(
367	bytes: &[u8],
368	mut s0: u64,
369	mut s1: u64,
370	mut s2: u64,
371	mut s3: u64,
372	fold_seed: u64,
373	) -> u64 {
374	let chunks = bytes.chunks_exact(`64`);
375	let remainder = chunks.remainder().len();
376	for chunk in chunks {
377	let a = u64::from_ne_bytes(chunk[`0`..`8`].try_into().unwrap());
378	let b = u64::from_ne_bytes(chunk[`8`..`16`].try_into().unwrap());
379	let c = u64::from_ne_bytes(chunk[`16`..`24`].try_into().unwrap());
380	let d = u64::from_ne_bytes(chunk[`24`..`32`].try_into().unwrap());
381	let e = u64::from_ne_bytes(chunk[`32`..`40`].try_into().unwrap());
382	let f = u64::from_ne_bytes(chunk[`40`..`48`].try_into().unwrap());
383	let g = u64::from_ne_bytes(chunk[`48`..`56`].try_into().unwrap());
384	let h = u64::from_ne_bytes(chunk[`56`..`64`].try_into().unwrap());
385	s0 = folded_multiply(a ^ s0, e ^ fold_seed);
386	s1 = folded_multiply(b ^ s1, f ^ fold_seed);
387	s2 = folded_multiply(c ^ s2, g ^ fold_seed);
388	s3 = folded_multiply(d ^ s3, h ^ fold_seed);
389	}
390	s0 ^= s2;
391	s1 ^= s3;
392
393	if remainder > `0` {
394	hash_bytes_medium(&bytes[bytes.len() - remainder.max(`16`)..], s0, s1, fold_seed)
395	} else {
396	s0 ^ s1
397	}
398	}
399