| 1 | use crate::UnalignedBuffer; |
| 2 | use core::{cmp, hash::Hasher}; |
| 3 | |
| 4 | #[cfg (feature = "serialize" )] |
| 5 | use serde::{Deserialize, Serialize}; |
| 6 | |
/// Number of bytes consumed per accumulator round: four u64 lanes.
const CHUNK_SIZE: usize = 32;

// The five pseudorandom primes from the XXH64 reference
// implementation; wrapping multiplication by them mixes bits well.
pub const PRIME_1: u64 = 11_400_714_785_074_694_791;
pub const PRIME_2: u64 = 14_029_467_366_897_019_727;
pub const PRIME_3: u64 = 1_609_587_929_392_839_161;
pub const PRIME_4: u64 = 9_650_029_242_287_828_579;
pub const PRIME_5: u64 = 2_870_177_450_012_600_261;
| 14 | |
/// The four 64-bit lane accumulators that absorb full 32-byte chunks.
#[cfg_attr (feature = "serialize" , derive(Deserialize, Serialize))]
#[derive (Copy, Clone, PartialEq)]
struct XxCore {
    v1: u64,
    v2: u64,
    v3: u64,
    v4: u64,
}
| 23 | |
/// Calculates the 64-bit hash.
#[cfg_attr (feature = "serialize" , derive(Deserialize, Serialize))]
#[derive (Debug, Copy, Clone, PartialEq)]
pub struct XxHash64 {
    /// Total bytes written so far; folded into the final hash.
    total_len: u64,
    /// The seed supplied at construction time.
    seed: u64,
    /// Lane accumulators; only meaningful once a full chunk was seen.
    core: XxCore,
    // `flatten` keeps the serialized form identical to the historical
    // layout where the buffer fields lived directly on the hasher.
    #[cfg_attr (feature = "serialize" , serde(flatten))]
    buffer: Buffer,
}
| 34 | |
| 35 | impl XxCore { |
| 36 | fn with_seed(seed: u64) -> XxCore { |
| 37 | XxCore { |
| 38 | v1: seed.wrapping_add(PRIME_1).wrapping_add(PRIME_2), |
| 39 | v2: seed.wrapping_add(PRIME_2), |
| 40 | v3: seed, |
| 41 | v4: seed.wrapping_sub(PRIME_1), |
| 42 | } |
| 43 | } |
| 44 | |
| 45 | #[inline (always)] |
| 46 | fn ingest_chunks<I>(&mut self, values: I) |
| 47 | where |
| 48 | I: IntoIterator<Item = [u64; 4]>, |
| 49 | { |
| 50 | #[inline (always)] |
| 51 | fn ingest_one_number(mut current_value: u64, mut value: u64) -> u64 { |
| 52 | value = value.wrapping_mul(PRIME_2); |
| 53 | current_value = current_value.wrapping_add(value); |
| 54 | current_value = current_value.rotate_left(31); |
| 55 | current_value.wrapping_mul(PRIME_1) |
| 56 | } |
| 57 | |
| 58 | // By drawing these out, we can avoid going back and forth to |
| 59 | // memory. It only really helps for large files, when we need |
| 60 | // to iterate multiple times here. |
| 61 | |
| 62 | let mut v1 = self.v1; |
| 63 | let mut v2 = self.v2; |
| 64 | let mut v3 = self.v3; |
| 65 | let mut v4 = self.v4; |
| 66 | |
| 67 | for [n1, n2, n3, n4] in values { |
| 68 | v1 = ingest_one_number(v1, n1.to_le()); |
| 69 | v2 = ingest_one_number(v2, n2.to_le()); |
| 70 | v3 = ingest_one_number(v3, n3.to_le()); |
| 71 | v4 = ingest_one_number(v4, n4.to_le()); |
| 72 | } |
| 73 | |
| 74 | self.v1 = v1; |
| 75 | self.v2 = v2; |
| 76 | self.v3 = v3; |
| 77 | self.v4 = v4; |
| 78 | } |
| 79 | |
| 80 | #[inline (always)] |
| 81 | fn finish(&self) -> u64 { |
| 82 | // The original code pulls out local vars for v[1234] |
| 83 | // here. Performance tests did not show that to be effective |
| 84 | // here, presumably because this method is not called in a |
| 85 | // tight loop. |
| 86 | |
| 87 | #[allow (unknown_lints, clippy::needless_late_init)] // keeping things parallel |
| 88 | let mut hash; |
| 89 | |
| 90 | hash = self.v1.rotate_left(1); |
| 91 | hash = hash.wrapping_add(self.v2.rotate_left(7)); |
| 92 | hash = hash.wrapping_add(self.v3.rotate_left(12)); |
| 93 | hash = hash.wrapping_add(self.v4.rotate_left(18)); |
| 94 | |
| 95 | #[inline (always)] |
| 96 | fn mix_one(mut hash: u64, mut value: u64) -> u64 { |
| 97 | value = value.wrapping_mul(PRIME_2); |
| 98 | value = value.rotate_left(31); |
| 99 | value = value.wrapping_mul(PRIME_1); |
| 100 | hash ^= value; |
| 101 | hash = hash.wrapping_mul(PRIME_1); |
| 102 | hash.wrapping_add(PRIME_4) |
| 103 | } |
| 104 | |
| 105 | hash = mix_one(hash, self.v1); |
| 106 | hash = mix_one(hash, self.v2); |
| 107 | hash = mix_one(hash, self.v3); |
| 108 | hash = mix_one(hash, self.v4); |
| 109 | |
| 110 | hash |
| 111 | } |
| 112 | } |
| 113 | |
| 114 | impl core::fmt::Debug for XxCore { |
| 115 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> Result<(), core::fmt::Error> { |
| 116 | write!( |
| 117 | f, |
| 118 | "XxCore {{ {:016x} {:016x} {:016x} {:016x} }}" , |
| 119 | self.v1, self.v2, self.v3, self.v4 |
| 120 | ) |
| 121 | } |
| 122 | } |
| 123 | |
// Forces 8-byte alignment of the wrapped value; presumably so the byte
// buffer below can be read as u64s without unaligned loads — the
// actual u64 reads happen through `UnalignedBuffer` elsewhere.
#[cfg_attr (feature = "serialize" , derive(Serialize, Deserialize))]
#[derive (Debug, Copy, Clone, Default, PartialEq)]
#[repr (align(8))]
#[cfg_attr (feature = "serialize" , serde(transparent))]
struct AlignToU64<T>(T);
| 129 | |
/// Holds up to one chunk (32 bytes) of input that has not yet been
/// absorbed into the accumulators.
#[cfg_attr (feature = "serialize" , derive(Serialize, Deserialize))]
#[derive (Debug, Copy, Clone, Default, PartialEq)]
struct Buffer {
    // Renames keep the serialized field names stable ("buffer" /
    // "buffer_usage"), matching the flattened layout on XxHash64.
    #[cfg_attr (feature = "serialize" , serde(rename = "buffer" ))]
    data: AlignToU64<[u8; CHUNK_SIZE]>,
    #[cfg_attr (feature = "serialize" , serde(rename = "buffer_usage" ))]
    len: usize,
}
| 138 | |
| 139 | impl Buffer { |
| 140 | fn data(&self) -> &[u8] { |
| 141 | &self.data.0[..self.len] |
| 142 | } |
| 143 | |
| 144 | /// Consumes as much of the parameter as it can, returning the unused part. |
| 145 | fn consume<'a>(&mut self, data: &'a [u8]) -> &'a [u8] { |
| 146 | let to_use = cmp::min(self.available(), data.len()); |
| 147 | let (data, remaining) = data.split_at(to_use); |
| 148 | self.data.0[self.len..][..to_use].copy_from_slice(data); |
| 149 | self.len += to_use; |
| 150 | remaining |
| 151 | } |
| 152 | |
| 153 | fn set_data(&mut self, data: &[u8]) { |
| 154 | debug_assert!(self.is_empty()); |
| 155 | debug_assert!(data.len() < CHUNK_SIZE); |
| 156 | self.data.0[..data.len()].copy_from_slice(data); |
| 157 | self.len = data.len(); |
| 158 | } |
| 159 | |
| 160 | fn available(&self) -> usize { |
| 161 | CHUNK_SIZE - self.len |
| 162 | } |
| 163 | |
| 164 | fn is_empty(&self) -> bool { |
| 165 | self.len == 0 |
| 166 | } |
| 167 | |
| 168 | fn is_full(&self) -> bool { |
| 169 | self.len == CHUNK_SIZE |
| 170 | } |
| 171 | } |
| 172 | |
| 173 | impl XxHash64 { |
| 174 | /// Constructs the hash with an initial seed |
| 175 | pub fn with_seed(seed: u64) -> XxHash64 { |
| 176 | XxHash64 { |
| 177 | total_len: 0, |
| 178 | seed, |
| 179 | core: XxCore::with_seed(seed), |
| 180 | buffer: Buffer::default(), |
| 181 | } |
| 182 | } |
| 183 | |
| 184 | pub(crate) fn write(&mut self, bytes: &[u8]) { |
| 185 | let remaining = self.maybe_consume_bytes(bytes); |
| 186 | if !remaining.is_empty() { |
| 187 | let mut remaining = UnalignedBuffer::new(remaining); |
| 188 | self.core.ingest_chunks(&mut remaining); |
| 189 | self.buffer.set_data(remaining.remaining()); |
| 190 | } |
| 191 | self.total_len += bytes.len() as u64; |
| 192 | } |
| 193 | |
| 194 | // Consume bytes and try to make `self.buffer` empty. |
| 195 | // If there are not enough bytes, `self.buffer` can be non-empty, and this |
| 196 | // function returns an empty slice. |
| 197 | fn maybe_consume_bytes<'a>(&mut self, data: &'a [u8]) -> &'a [u8] { |
| 198 | if self.buffer.is_empty() { |
| 199 | data |
| 200 | } else { |
| 201 | let data = self.buffer.consume(data); |
| 202 | if self.buffer.is_full() { |
| 203 | let mut u64s = UnalignedBuffer::new(self.buffer.data()); |
| 204 | self.core.ingest_chunks(&mut u64s); |
| 205 | debug_assert!(u64s.remaining().is_empty()); |
| 206 | self.buffer.len = 0; |
| 207 | } |
| 208 | data |
| 209 | } |
| 210 | } |
| 211 | |
| 212 | pub(crate) fn finish(&self) -> u64 { |
| 213 | let mut hash = if self.total_len >= CHUNK_SIZE as u64 { |
| 214 | // We have processed at least one full chunk |
| 215 | self.core.finish() |
| 216 | } else { |
| 217 | self.seed.wrapping_add(PRIME_5) |
| 218 | }; |
| 219 | |
| 220 | hash = hash.wrapping_add(self.total_len); |
| 221 | |
| 222 | let mut buffered_u64s = UnalignedBuffer::<u64>::new(self.buffer.data()); |
| 223 | for buffered_u64 in &mut buffered_u64s { |
| 224 | let mut k1 = buffered_u64.to_le().wrapping_mul(PRIME_2); |
| 225 | k1 = k1.rotate_left(31); |
| 226 | k1 = k1.wrapping_mul(PRIME_1); |
| 227 | hash ^= k1; |
| 228 | hash = hash.rotate_left(27); |
| 229 | hash = hash.wrapping_mul(PRIME_1); |
| 230 | hash = hash.wrapping_add(PRIME_4); |
| 231 | } |
| 232 | |
| 233 | let mut buffered_u32s = UnalignedBuffer::<u32>::new(buffered_u64s.remaining()); |
| 234 | for buffered_u32 in &mut buffered_u32s { |
| 235 | let k1 = u64::from(buffered_u32.to_le()).wrapping_mul(PRIME_1); |
| 236 | hash ^= k1; |
| 237 | hash = hash.rotate_left(23); |
| 238 | hash = hash.wrapping_mul(PRIME_2); |
| 239 | hash = hash.wrapping_add(PRIME_3); |
| 240 | } |
| 241 | |
| 242 | let buffered_u8s = buffered_u32s.remaining(); |
| 243 | for &buffered_u8 in buffered_u8s { |
| 244 | let k1 = u64::from(buffered_u8).wrapping_mul(PRIME_5); |
| 245 | hash ^= k1; |
| 246 | hash = hash.rotate_left(11); |
| 247 | hash = hash.wrapping_mul(PRIME_1); |
| 248 | } |
| 249 | |
| 250 | // The final intermixing |
| 251 | hash ^= hash >> 33; |
| 252 | hash = hash.wrapping_mul(PRIME_2); |
| 253 | hash ^= hash >> 29; |
| 254 | hash = hash.wrapping_mul(PRIME_3); |
| 255 | hash ^= hash >> 32; |
| 256 | |
| 257 | hash |
| 258 | } |
| 259 | |
| 260 | pub fn seed(&self) -> u64 { |
| 261 | self.seed |
| 262 | } |
| 263 | |
| 264 | pub fn total_len(&self) -> u64 { |
| 265 | self.total_len |
| 266 | } |
| 267 | } |
| 268 | |
| 269 | impl Default for XxHash64 { |
| 270 | fn default() -> XxHash64 { |
| 271 | XxHash64::with_seed(0) |
| 272 | } |
| 273 | } |
| 274 | |
impl Hasher for XxHash64 {
    // Delegates to the inherent methods. The fully-qualified
    // `XxHash64::…` paths make the inherent target explicit and avoid
    // any confusion with the trait methods being defined right here.
    fn finish(&self) -> u64 {
        XxHash64::finish(self)
    }

    fn write(&mut self, bytes: &[u8]) {
        XxHash64::write(self, bytes)
    }
}
| 284 | |
| 285 | #[cfg (feature = "std" )] |
| 286 | pub use crate::std_support::sixty_four::RandomXxHashBuilder64; |
| 287 | |
#[cfg (test)]
mod test {
    use super::{RandomXxHashBuilder64, XxHash64};
    use std::collections::HashMap;
    use std::hash::BuildHasherDefault;
    use std::prelude::v1::*;

    // Feeding input one byte at a time must leave the core state
    // identical to feeding it as one large write.
    #[test ]
    fn ingesting_byte_by_byte_is_equivalent_to_large_chunks() {
        let bytes: Vec<_> = (0..32).map(|_| 0).collect();

        let mut byte_by_byte = XxHash64::with_seed(0);
        for byte in bytes.chunks(1) {
            byte_by_byte.write(byte);
        }

        let mut one_chunk = XxHash64::with_seed(0);
        one_chunk.write(&bytes);

        assert_eq!(byte_by_byte.core, one_chunk.core);
    }

    // The expected values in the following tests come from the
    // reference C implementation of xxHash.
    #[test ]
    fn hash_of_nothing_matches_c_implementation() {
        let mut hasher = XxHash64::with_seed(0);
        hasher.write(&[]);
        assert_eq!(hasher.finish(), 0xef46_db37_51d8_e999);
    }

    #[test ]
    fn hash_of_single_byte_matches_c_implementation() {
        let mut hasher = XxHash64::with_seed(0);
        hasher.write(&[42]);
        assert_eq!(hasher.finish(), 0x0a9e_dece_beb0_3ae4);
    }

    #[test ]
    fn hash_of_multiple_bytes_matches_c_implementation() {
        let mut hasher = XxHash64::with_seed(0);
        hasher.write(b"Hello, world! \0" );
        assert_eq!(hasher.finish(), 0x7b06_c531_ea43_e89f);
    }

    // 100 bytes exercises the full-chunk path several times plus a tail.
    #[test ]
    fn hash_of_multiple_chunks_matches_c_implementation() {
        let bytes: Vec<_> = (0..100).collect();
        let mut hasher = XxHash64::with_seed(0);
        hasher.write(&bytes);
        assert_eq!(hasher.finish(), 0x6ac1_e580_3216_6597);
    }

    #[test ]
    fn hash_with_different_seed_matches_c_implementation() {
        let mut hasher = XxHash64::with_seed(0xae05_4331_1b70_2d91);
        hasher.write(&[]);
        assert_eq!(hasher.finish(), 0x4b6a_04fc_df7a_4672);
    }

    #[test ]
    fn hash_with_different_seed_and_multiple_chunks_matches_c_implementation() {
        let bytes: Vec<_> = (0..100).collect();
        let mut hasher = XxHash64::with_seed(0xae05_4331_1b70_2d91);
        hasher.write(&bytes);
        assert_eq!(hasher.finish(), 0x567e_355e_0682_e1f1);
    }

    #[test ]
    fn can_be_used_in_a_hashmap_with_a_default_seed() {
        let mut hash: HashMap<_, _, BuildHasherDefault<XxHash64>> = Default::default();
        hash.insert(42, "the answer" );
        assert_eq!(hash.get(&42), Some(&"the answer" ));
    }

    #[test ]
    fn can_be_used_in_a_hashmap_with_a_random_seed() {
        let mut hash: HashMap<_, _, RandomXxHashBuilder64> = Default::default();
        hash.insert(42, "the answer" );
        assert_eq!(hash.get(&42), Some(&"the answer" ));
    }

    #[cfg (feature = "serialize" )]
    type TestResult<T = ()> = Result<T, Box<dyn std::error::Error>>;

    // Round-tripping through serde JSON must reproduce an equal hasher.
    #[cfg (feature = "serialize" )]
    #[test ]
    fn test_serialization_cycle() -> TestResult {
        let mut hasher = XxHash64::with_seed(0);
        hasher.write(b"Hello, world! \0" );
        hasher.finish();

        let serialized = serde_json::to_string(&hasher)?;
        let unserialized: XxHash64 = serde_json::from_str(&serialized)?;
        assert_eq!(hasher, unserialized);
        Ok(())
    }

    // Pins the on-the-wire serde format so accidental layout changes
    // (field names, flattening) are caught.
    #[cfg (feature = "serialize" )]
    #[test ]
    fn test_serialization_stability() -> TestResult {
        let mut hasher = XxHash64::with_seed(0);
        hasher.write(b"Hello, world! \0" );
        hasher.finish();

        let serialized = r#"{
            "total_len": 14,
            "seed": 0,
            "core": {
              "v1": 6983438078262162902,
              "v2": 14029467366897019727,
              "v3": 0,
              "v4": 7046029288634856825
            },
            "buffer": [
              72, 101, 108, 108, 111, 44, 32, 119,
              111, 114, 108, 100, 33, 0, 0, 0,
              0, 0, 0, 0, 0, 0, 0, 0,
              0, 0, 0, 0, 0, 0, 0, 0
            ],
            "buffer_usage": 14
        }"# ;

        let unserialized: XxHash64 = serde_json::from_str(serialized).unwrap();
        assert_eq!(hasher, unserialized);
        Ok(())
    }
}
| 414 | |