#include "blake3_impl.h"

#include <immintrin.h>
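
// DEGREE is the number of inputs this implementation hashes in parallel:
// four, since a 128-bit SSE2 vector holds four 32-bit words.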
#define DEGREE 4

#define _mm_shuffle_ps2(a, b, c)                                               \
  (_mm_castps_si128(                                                           \
      _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c))))
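
// _mm_shuffle_ps only operates on float vectors, so the wrapper above casts
// the integer inputs to float, shuffles (selecting two 32-bit lanes from each
// operand), and casts back. The bit patterns pass through the casts unchanged.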

INLINE __m128i loadu(const uint8_t src[16]) {
  return _mm_loadu_si128((const __m128i *)src);
}

INLINE void storeu(__m128i src, uint8_t dest[16]) {
  _mm_storeu_si128((__m128i *)dest, src);
}

INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); }

// Note that clang-format doesn't like the name "xor" for some reason.
INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a, b); }

INLINE __m128i set1(uint32_t x) { return _mm_set1_epi32((int32_t)x); }

INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
  return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d);
}

INLINE __m128i rot16(__m128i x) {
  return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0xB1), 0xB1);
}
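
// SSE2 has no 32-bit rotate instruction, but a rotate by exactly 16 bits can
// use two 16-bit shuffles instead of the shift/shift/xor combination below:
// 0xB1 is _MM_SHUFFLE(2, 3, 0, 1), which swaps adjacent 16-bit words within
// the low and high halves of the vector.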

INLINE __m128i rot12(__m128i x) {
  return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12));
}

INLINE __m128i rot8(__m128i x) {
  return xorv(_mm_srli_epi32(x, 8), _mm_slli_epi32(x, 32 - 8));
}

INLINE __m128i rot7(__m128i x) {
  return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7));
}
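
// The other rotation amounts use the generic construction
// rotr32(x, n) == (x >> n) ^ (x << (32 - n)); xor and or are interchangeable
// here because the two shifted bit ranges never overlap.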

INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
               __m128i m) {
  *row0 = addv(addv(*row0, m), *row1);
  *row3 = xorv(*row3, *row0);
  *row3 = rot16(*row3);
  *row2 = addv(*row2, *row3);
  *row1 = xorv(*row1, *row2);
  *row1 = rot12(*row1);
}

INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
               __m128i m) {
  *row0 = addv(addv(*row0, m), *row1);
  *row3 = xorv(*row3, *row0);
  *row3 = rot8(*row3);
  *row2 = addv(*row2, *row3);
  *row1 = xorv(*row1, *row2);
  *row1 = rot7(*row1);
}
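
// g1 and g2 together make up the BLAKE3 G function, applied to all four
// columns (or diagonals) of the state at once. Each half mixes in one vector
// of message words: g1 performs the rot16/rot12 half and g2 the rot8/rot7
// half.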

// Note the optimization here of leaving row1 as the unrotated row, rather than
// row0. All the message loads below are adjusted to compensate for this. See
// discussion at https://github.com/sneves/blake2-avx2/pull/4
INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
  *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3));
  *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
  *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1));
}

INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
  *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1));
  *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
  *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
}
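
// diagonalize rotates three of the four rows so that the state's diagonals
// line up in columns, letting the same column-wise g1/g2 calls mix the
// diagonals; row1 stays in place per the note above, and undiagonalize
// applies the inverse rotations.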

INLINE __m128i blend_epi16(__m128i a, __m128i b, const int16_t imm8) {
  const __m128i bits =
      _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
  __m128i mask = _mm_set1_epi16(imm8);
  mask = _mm_and_si128(mask, bits);
  mask = _mm_cmpeq_epi16(mask, bits);
  return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a));
}
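
// A portable stand-in for the SSE4.1 _mm_blend_epi16 intrinsic: broadcast
// imm8, compare each 16-bit lane against its own selector bit to build an
// all-ones/all-zeros mask, then select from b where the bit is set and from a
// where it is clear.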

INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8],
                         const uint8_t block[BLAKE3_BLOCK_LEN],
                         uint8_t block_len, uint64_t counter, uint8_t flags) {
  rows[0] = loadu((uint8_t *)&cv[0]);
  rows[1] = loadu((uint8_t *)&cv[4]);
  rows[2] = set4(IV[0], IV[1], IV[2], IV[3]);
  rows[3] = set4(counter_low(counter), counter_high(counter),
                 (uint32_t)block_len, (uint32_t)flags);
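
  // rows[0..1] now hold the eight chaining-value words, rows[2] the first
  // four IV constants, and rows[3] the 64-bit counter (split low/high), the
  // block length, and the domain flags: the standard BLAKE3 4x4 state layout.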

  __m128i m0 = loadu(&block[sizeof(__m128i) * 0]);
  __m128i m1 = loadu(&block[sizeof(__m128i) * 1]);
  __m128i m2 = loadu(&block[sizeof(__m128i) * 2]);
  __m128i m3 = loadu(&block[sizeof(__m128i) * 3]);

  __m128i t0, t1, t2, t3, tt;

  // Round 1. The first round permutes the message words from the original
  // input order, into the groups that get mixed in parallel.
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); //  6  4  2  0
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); //  7  5  3  1
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10  8
  t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));   // 12 10  8 14
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11  9
  t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3));   // 13 11  9 15
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 2. This round and all following rounds apply a fixed permutation
  // to the message words from the round before.
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 3
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 4
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 5
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 6
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 7
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
}

void blake3_compress_in_place_sse2(uint32_t cv[8],
                                   const uint8_t block[BLAKE3_BLOCK_LEN],
                                   uint8_t block_len, uint64_t counter,
                                   uint8_t flags) {
  __m128i rows[4];
  compress_pre(rows, cv, block, block_len, counter, flags);
  storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]);
  storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]);
}
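
// The XOF variant writes all 64 output bytes: the first 32 equal the updated
// chaining value computed above, and the last 32 xor the bottom half of the
// final state with the input chaining value, per the BLAKE3 spec.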
void blake3_compress_xof_sse2(const uint32_t cv[8],
                              const uint8_t block[BLAKE3_BLOCK_LEN],
                              uint8_t block_len, uint64_t counter,
                              uint8_t flags, uint8_t out[64]) {
  __m128i rows[4];
  compress_pre(rows, cv, block, block_len, counter, flags);
  storeu(xorv(rows[0], rows[2]), &out[0]);
  storeu(xorv(rows[1], rows[3]), &out[16]);
  storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]);
  storeu(xorv(rows[3], loadu((uint8_t *)&cv[4])), &out[48]);
}
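
// The functions below implement the 4-way multi-input code path. round_fn
// runs one BLAKE3 round over a transposed state: v[i] holds word i of the
// state for four independent inputs, so each intrinsic advances all four
// compressions at once. The column half-round comes first; the diagonal
// half-round follows by permuting vector indices instead of shuffling lanes.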
INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) {
  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
  v[0] = addv(v[0], v[4]);
  v[1] = addv(v[1], v[5]);
  v[2] = addv(v[2], v[6]);
  v[3] = addv(v[3], v[7]);
  v[12] = xorv(v[12], v[0]);
  v[13] = xorv(v[13], v[1]);
  v[14] = xorv(v[14], v[2]);
  v[15] = xorv(v[15], v[3]);
  v[12] = rot16(v[12]);
  v[13] = rot16(v[13]);
  v[14] = rot16(v[14]);
  v[15] = rot16(v[15]);
  v[8] = addv(v[8], v[12]);
  v[9] = addv(v[9], v[13]);
  v[10] = addv(v[10], v[14]);
  v[11] = addv(v[11], v[15]);
  v[4] = xorv(v[4], v[8]);
  v[5] = xorv(v[5], v[9]);
  v[6] = xorv(v[6], v[10]);
  v[7] = xorv(v[7], v[11]);
  v[4] = rot12(v[4]);
  v[5] = rot12(v[5]);
  v[6] = rot12(v[6]);
  v[7] = rot12(v[7]);
  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
  v[0] = addv(v[0], v[4]);
  v[1] = addv(v[1], v[5]);
  v[2] = addv(v[2], v[6]);
  v[3] = addv(v[3], v[7]);
  v[12] = xorv(v[12], v[0]);
  v[13] = xorv(v[13], v[1]);
  v[14] = xorv(v[14], v[2]);
  v[15] = xorv(v[15], v[3]);
  v[12] = rot8(v[12]);
  v[13] = rot8(v[13]);
  v[14] = rot8(v[14]);
  v[15] = rot8(v[15]);
  v[8] = addv(v[8], v[12]);
  v[9] = addv(v[9], v[13]);
  v[10] = addv(v[10], v[14]);
  v[11] = addv(v[11], v[15]);
  v[4] = xorv(v[4], v[8]);
  v[5] = xorv(v[5], v[9]);
  v[6] = xorv(v[6], v[10]);
  v[7] = xorv(v[7], v[11]);
  v[4] = rot7(v[4]);
  v[5] = rot7(v[5]);
  v[6] = rot7(v[6]);
  v[7] = rot7(v[7]);

  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
  v[0] = addv(v[0], v[5]);
  v[1] = addv(v[1], v[6]);
  v[2] = addv(v[2], v[7]);
  v[3] = addv(v[3], v[4]);
  v[15] = xorv(v[15], v[0]);
  v[12] = xorv(v[12], v[1]);
  v[13] = xorv(v[13], v[2]);
  v[14] = xorv(v[14], v[3]);
  v[15] = rot16(v[15]);
  v[12] = rot16(v[12]);
  v[13] = rot16(v[13]);
  v[14] = rot16(v[14]);
  v[10] = addv(v[10], v[15]);
  v[11] = addv(v[11], v[12]);
  v[8] = addv(v[8], v[13]);
  v[9] = addv(v[9], v[14]);
  v[5] = xorv(v[5], v[10]);
  v[6] = xorv(v[6], v[11]);
  v[7] = xorv(v[7], v[8]);
  v[4] = xorv(v[4], v[9]);
  v[5] = rot12(v[5]);
  v[6] = rot12(v[6]);
  v[7] = rot12(v[7]);
  v[4] = rot12(v[4]);
  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
  v[0] = addv(v[0], v[5]);
  v[1] = addv(v[1], v[6]);
  v[2] = addv(v[2], v[7]);
  v[3] = addv(v[3], v[4]);
  v[15] = xorv(v[15], v[0]);
  v[12] = xorv(v[12], v[1]);
  v[13] = xorv(v[13], v[2]);
  v[14] = xorv(v[14], v[3]);
  v[15] = rot8(v[15]);
  v[12] = rot8(v[12]);
  v[13] = rot8(v[13]);
  v[14] = rot8(v[14]);
  v[10] = addv(v[10], v[15]);
  v[11] = addv(v[11], v[12]);
  v[8] = addv(v[8], v[13]);
  v[9] = addv(v[9], v[14]);
  v[5] = xorv(v[5], v[10]);
  v[6] = xorv(v[6], v[11]);
  v[7] = xorv(v[7], v[8]);
  v[4] = xorv(v[4], v[9]);
  v[5] = rot7(v[5]);
  v[6] = rot7(v[6]);
  v[7] = rot7(v[7]);
  v[4] = rot7(v[4]);
}

INLINE void transpose_vecs(__m128i vecs[DEGREE]) {
  // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
  // 22/33. Note that this doesn't split the vector into two lanes, as the
  // AVX2 counterparts do.
  __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
  __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
  __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
  __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);

  // Interleave 64-bit lanes.
  __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
  __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
  __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
  __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);

  vecs[0] = abcd_0;
  vecs[1] = abcd_1;
  vecs[2] = abcd_2;
  vecs[3] = abcd_3;
}
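
// transpose_vecs is a 4x4 matrix transpose of 32-bit words. If the four
// input vectors are the rows
//   a0 a1 a2 a3
//   b0 b1 b2 b3
//   c0 c1 c2 c3
//   d0 d1 d2 d3
// then on return they hold the columns: a0 b0 c0 d0, a1 b1 c1 d1, and so on.
// This converts between word-major and input-major layouts.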

INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
                               size_t block_offset, __m128i out[16]) {
  out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]);
  out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]);
  out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]);
  out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]);
  out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]);
  out[5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]);
  out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]);
  out[7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]);
  out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]);
  out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]);
  out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]);
  out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]);
  out[12] = loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]);
  out[13] = loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]);
  out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
  out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
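
  // Prefetch 256 bytes (four blocks) ahead in each input, pulling upcoming
  // message data toward the cache while the current block is compressed.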
  for (size_t i = 0; i < 4; ++i) {
    _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
  }
  transpose_vecs(&out[0]);
  transpose_vecs(&out[4]);
  transpose_vecs(&out[8]);
  transpose_vecs(&out[12]);
}

INLINE void load_counters(uint64_t counter, bool increment_counter,
                          __m128i *out_lo, __m128i *out_hi) {
  const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter);
  const __m128i add0 = _mm_set_epi32(3, 2, 1, 0);
  const __m128i add1 = _mm_and_si128(mask, add0);
  __m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1);
  __m128i carry =
      _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)),
                      _mm_xor_si128(l, _mm_set1_epi32(0x80000000)));
  __m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry);
  *out_lo = l;
  *out_hi = h;
}
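
// SSE2 only provides signed comparisons, so the carry out of the low-word
// addition is detected by xoring both sides with 0x80000000 (flipping the
// sign bit turns a signed compare into an unsigned one) and checking
// add1 > l. The compare yields all-ones lanes where a carry occurred, so
// subtracting the result from the high words adds exactly 1 there.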

static void blake3_hash4_sse2(const uint8_t *const *inputs, size_t blocks,
                              const uint32_t key[8], uint64_t counter,
                              bool increment_counter, uint8_t flags,
                              uint8_t flags_start, uint8_t flags_end,
                              uint8_t *out) {
  __m128i h_vecs[8] = {
      set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]),
      set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]),
  };
  __m128i counter_low_vec, counter_high_vec;
  load_counters(counter, increment_counter, &counter_low_vec,
                &counter_high_vec);
  uint8_t block_flags = flags | flags_start;

  for (size_t block = 0; block < blocks; block++) {
    if (block + 1 == blocks) {
      block_flags |= flags_end;
    }
    __m128i block_len_vec = set1(BLAKE3_BLOCK_LEN);
    __m128i block_flags_vec = set1(block_flags);
    __m128i msg_vecs[16];
    transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);

    __m128i v[16] = {
        h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
        h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
        set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]),
        counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
    };
    round_fn(v, msg_vecs, 0);
    round_fn(v, msg_vecs, 1);
    round_fn(v, msg_vecs, 2);
    round_fn(v, msg_vecs, 3);
    round_fn(v, msg_vecs, 4);
    round_fn(v, msg_vecs, 5);
    round_fn(v, msg_vecs, 6);
    h_vecs[0] = xorv(v[0], v[8]);
    h_vecs[1] = xorv(v[1], v[9]);
    h_vecs[2] = xorv(v[2], v[10]);
    h_vecs[3] = xorv(v[3], v[11]);
    h_vecs[4] = xorv(v[4], v[12]);
    h_vecs[5] = xorv(v[5], v[13]);
    h_vecs[6] = xorv(v[6], v[14]);
    h_vecs[7] = xorv(v[7], v[15]);

    block_flags = flags;
  }

  transpose_vecs(&h_vecs[0]);
  transpose_vecs(&h_vecs[4]);
  // The first four vecs now contain the first half of each output, and the
  // second four vecs contain the second half of each output.
  storeu(h_vecs[0], &out[0 * sizeof(__m128i)]);
  storeu(h_vecs[4], &out[1 * sizeof(__m128i)]);
  storeu(h_vecs[1], &out[2 * sizeof(__m128i)]);
  storeu(h_vecs[5], &out[3 * sizeof(__m128i)]);
  storeu(h_vecs[2], &out[4 * sizeof(__m128i)]);
  storeu(h_vecs[6], &out[5 * sizeof(__m128i)]);
  storeu(h_vecs[3], &out[6 * sizeof(__m128i)]);
  storeu(h_vecs[7], &out[7 * sizeof(__m128i)]);
}

INLINE void hash_one_sse2(const uint8_t *input, size_t blocks,
                          const uint32_t key[8], uint64_t counter,
                          uint8_t flags, uint8_t flags_start,
                          uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
  uint32_t cv[8];
  memcpy(cv, key, BLAKE3_KEY_LEN);
  uint8_t block_flags = flags | flags_start;
  while (blocks > 0) {
    if (blocks == 1) {
      block_flags |= flags_end;
    }
    blake3_compress_in_place_sse2(cv, input, BLAKE3_BLOCK_LEN, counter,
                                  block_flags);
    input = &input[BLAKE3_BLOCK_LEN];
    blocks -= 1;
    block_flags = flags;
  }
  memcpy(out, cv, BLAKE3_OUT_LEN);
}

void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs,
                           size_t blocks, const uint32_t key[8],
                           uint64_t counter, bool increment_counter,
                           uint8_t flags, uint8_t flags_start,
                           uint8_t flags_end, uint8_t *out) {
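  // Hash full groups of DEGREE inputs with the 4-way kernel, then finish any
  // remainder one input at a time. When increment_counter is set, consecutive
  // inputs take consecutive counter values (the chunk case); otherwise all
  // inputs share the same counter (the parent-node case).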
  while (num_inputs >= DEGREE) {
    blake3_hash4_sse2(inputs, blocks, key, counter, increment_counter, flags,
                      flags_start, flags_end, out);
    if (increment_counter) {
      counter += DEGREE;
    }
    inputs += DEGREE;
    num_inputs -= DEGREE;
    out = &out[DEGREE * BLAKE3_OUT_LEN];
  }
  while (num_inputs > 0) {
    hash_one_sse2(inputs[0], blocks, key, counter, flags, flags_start,
                  flags_end, out);
    if (increment_counter) {
      counter += 1;
    }
    inputs += 1;
    num_inputs -= 1;
    out = &out[BLAKE3_OUT_LEN];
  }
}