1 | //===-- Portable string hash function ---------------------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #ifndef LLVM_LIBC_SRC___SUPPORT_HASH_H |
10 | #define LLVM_LIBC_SRC___SUPPORT_HASH_H |
11 | |
12 | #include "src/__support/CPP/bit.h" // rotl |
13 | #include "src/__support/CPP/limits.h" // numeric_limits |
14 | #include "src/__support/macros/attributes.h" // LIBC_INLINE |
15 | #include "src/__support/uint128.h" // UInt128 |
16 | #include <stdint.h> // For uint64_t |
17 | |
18 | namespace LIBC_NAMESPACE { |
19 | namespace internal { |
20 | |
21 | // Folded multiplication. |
22 | // This function multiplies two 64-bit integers and xor the high and |
23 | // low 64-bit parts of the result. |
24 | LIBC_INLINE uint64_t folded_multiply(uint64_t x, uint64_t y) { |
25 | UInt128 p = static_cast<UInt128>(x) * static_cast<UInt128>(y); |
26 | uint64_t low = static_cast<uint64_t>(p); |
27 | uint64_t high = static_cast<uint64_t>(p >> 64); |
28 | return low ^ high; |
29 | } |
30 | |
31 | // Read as little endian. |
32 | // Shift-and-or implementation does not give a satisfactory code on aarch64. |
33 | // Therefore, we use a union to read the value. |
34 | template <typename T> LIBC_INLINE T read_little_endian(const void *ptr) { |
35 | const uint8_t *bytes = static_cast<const uint8_t *>(ptr); |
36 | union { |
37 | T value; |
38 | uint8_t buffer[sizeof(T)]; |
39 | } data; |
40 | #if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ |
41 | // Compiler should able to optimize this as a load followed by a byte swap. |
42 | // On aarch64 (-mbig-endian), this compiles to the following for int: |
43 | // ldr w0, [x0] |
44 | // rev w0, w0 |
45 | // ret |
46 | for (size_t i = 0; i < sizeof(T); ++i) { |
47 | data.buffer[i] = bytes[sizeof(T) - i - 1]; |
48 | } |
49 | #else |
50 | for (size_t i = 0; i < sizeof(T); ++i) { |
51 | data.buffer[i] = bytes[i]; |
52 | } |
53 | #endif |
54 | return data.value; |
55 | } |
56 | |
57 | // Specialized read functions for small values. size must be <= 8. |
58 | LIBC_INLINE void read_small_values(const void *ptr, size_t size, uint64_t &low, |
59 | uint64_t &high) { |
60 | const uint8_t *bytes = static_cast<const uint8_t *>(ptr); |
61 | if (size >= 2) { |
62 | if (size >= 4) { |
63 | low = static_cast<uint64_t>(read_little_endian<uint32_t>(ptr: &bytes[0])); |
64 | high = |
65 | static_cast<uint64_t>(read_little_endian<uint32_t>(ptr: &bytes[size - 4])); |
66 | } else { |
67 | low = static_cast<uint64_t>(read_little_endian<uint16_t>(ptr: &bytes[0])); |
68 | high = static_cast<uint64_t>(bytes[size - 1]); |
69 | } |
70 | } else { |
71 | if (size > 0) { |
72 | low = static_cast<uint64_t>(bytes[0]); |
73 | high = static_cast<uint64_t>(bytes[0]); |
74 | } else { |
75 | low = 0; |
76 | high = 0; |
77 | } |
78 | } |
79 | } |
80 | |
81 | // This constant comes from Kunth's prng (it empirically works well). |
82 | LIBC_INLINE_VAR constexpr uint64_t MULTIPLE = 6364136223846793005; |
83 | // Rotation amount for mixing. |
84 | LIBC_INLINE_VAR constexpr uint64_t ROTATE = 23; |
85 | |
86 | // Randomly generated values. For now, we use the same values as in aHash as |
87 | // they are widely tested. |
88 | // https://github.com/tkaitchuck/aHash/blob/9f6a2ad8b721fd28da8dc1d0b7996677b374357c/src/random_state.rs#L38 |
89 | LIBC_INLINE_VAR constexpr uint64_t RANDOMNESS[2][4] = { |
90 | {0x243f6a8885a308d3, 0x13198a2e03707344, 0xa4093822299f31d0, |
91 | 0x082efa98ec4e6c89}, |
92 | {0x452821e638d01377, 0xbe5466cf34e90c6c, 0xc0ac29b7c97c50dd, |
93 | 0x3f84d5b5b5470917}, |
94 | }; |
95 | |
96 | // This is a portable string hasher. It is not cryptographically secure. |
97 | // The quality of the hash is good enough to pass all tests in SMHasher. |
98 | // The implementation is derived from the generic routine of aHash. |
99 | class HashState { |
100 | uint64_t buffer; |
101 | uint64_t pad; |
102 | uint64_t [2]; |
103 | LIBC_INLINE void update(uint64_t low, uint64_t high) { |
104 | uint64_t combined = |
105 | folded_multiply(x: low ^ extra_keys[0], y: high ^ extra_keys[1]); |
106 | buffer = (buffer + pad) ^ combined; |
107 | buffer = cpp::rotl(value: buffer, rotate: ROTATE); |
108 | } |
109 | LIBC_INLINE static uint64_t mix(uint64_t seed) { |
110 | HashState mixer{RANDOMNESS[0][0], RANDOMNESS[0][1], RANDOMNESS[0][2], |
111 | RANDOMNESS[0][3]}; |
112 | mixer.update(low: seed, high: 0); |
113 | return mixer.finish(); |
114 | } |
115 | |
116 | public: |
117 | LIBC_INLINE constexpr HashState(uint64_t a, uint64_t b, uint64_t c, |
118 | uint64_t d) |
119 | : buffer(a), pad(b), extra_keys{c, d} {} |
120 | LIBC_INLINE HashState(uint64_t seed) { |
121 | // Mix one more round of the seed to make it stronger. |
122 | uint64_t mixed = mix(seed); |
123 | buffer = RANDOMNESS[1][0] ^ mixed; |
124 | pad = RANDOMNESS[1][1] ^ mixed; |
125 | extra_keys[0] = RANDOMNESS[1][2] ^ mixed; |
126 | extra_keys[1] = RANDOMNESS[1][3] ^ mixed; |
127 | } |
128 | LIBC_INLINE void update(const void *ptr, size_t size) { |
129 | uint8_t const *bytes = static_cast<const uint8_t *>(ptr); |
130 | buffer = (buffer + size) * MULTIPLE; |
131 | uint64_t low, high; |
132 | if (size > 8) { |
133 | if (size > 16) { |
134 | // update tail |
135 | low = read_little_endian<uint64_t>(ptr: &bytes[size - 16]); |
136 | high = read_little_endian<uint64_t>(ptr: &bytes[size - 8]); |
137 | update(low, high); |
138 | while (size > 16) { |
139 | low = read_little_endian<uint64_t>(ptr: &bytes[0]); |
140 | high = read_little_endian<uint64_t>(ptr: &bytes[8]); |
141 | update(low, high); |
142 | bytes += 16; |
143 | size -= 16; |
144 | } |
145 | } else { |
146 | low = read_little_endian<uint64_t>(ptr: &bytes[0]); |
147 | high = read_little_endian<uint64_t>(ptr: &bytes[size - 8]); |
148 | update(low, high); |
149 | } |
150 | } else { |
151 | read_small_values(ptr, size, low, high); |
152 | update(low, high); |
153 | } |
154 | } |
155 | LIBC_INLINE uint64_t finish() { |
156 | int rot = buffer & 63; |
157 | uint64_t folded = folded_multiply(x: buffer, y: pad); |
158 | return cpp::rotl(value: folded, rotate: rot); |
159 | } |
160 | }; |
161 | |
162 | } // namespace internal |
163 | } // namespace LIBC_NAMESPACE |
164 | |
165 | #endif // LLVM_LIBC_SRC___SUPPORT_HASH_H |
166 | |