1 | #include "blake3_impl.h" |
2 | |
3 | #include <immintrin.h> |
4 | |
// Number of inputs processed side by side by the 4-way hashing routine in
// this file.
#define DEGREE 4

// Two-source 32-bit word shuffle on integer vectors: reinterprets both
// operands as float vectors, applies _mm_shuffle_ps with selector `c`, and
// reinterprets the result back as an integer vector.
#define _mm_shuffle_ps2(a, b, c)                                               \
  (_mm_castps_si128(                                                           \
      _mm_shuffle_ps(_mm_castsi128_ps((a)), _mm_castsi128_ps((b)), (c))))
10 | |
11 | INLINE __m128i loadu(const uint8_t src[16]) { |
12 | return _mm_loadu_si128(p: (const __m128i *)src); |
13 | } |
14 | |
15 | INLINE void storeu(__m128i src, uint8_t dest[16]) { |
16 | _mm_storeu_si128(p: (__m128i *)dest, b: src); |
17 | } |
18 | |
19 | INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a: a, b: b); } |
20 | |
21 | // Note that clang-format doesn't like the name "xor" for some reason. |
22 | INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a: a, b: b); } |
23 | |
24 | INLINE __m128i set1(uint32_t x) { return _mm_set1_epi32(i: (int32_t)x); } |
25 | |
26 | INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { |
27 | return _mm_setr_epi32(i0: (int32_t)a, i1: (int32_t)b, i2: (int32_t)c, i3: (int32_t)d); |
28 | } |
29 | |
30 | INLINE __m128i rot16(__m128i x) { |
31 | return _mm_shuffle_epi8( |
32 | a: x, b: _mm_set_epi8(b15: 13, b14: 12, b13: 15, b12: 14, b11: 9, b10: 8, b9: 11, b8: 10, b7: 5, b6: 4, b5: 7, b4: 6, b3: 1, b2: 0, b1: 3, b0: 2)); |
33 | } |
34 | |
35 | INLINE __m128i rot12(__m128i x) { |
36 | return xorv(a: _mm_srli_epi32(a: x, count: 12), b: _mm_slli_epi32(a: x, count: 32 - 12)); |
37 | } |
38 | |
39 | INLINE __m128i rot8(__m128i x) { |
40 | return _mm_shuffle_epi8( |
41 | a: x, b: _mm_set_epi8(b15: 12, b14: 15, b13: 14, b12: 13, b11: 8, b10: 11, b9: 10, b8: 9, b7: 4, b6: 7, b5: 6, b4: 5, b3: 0, b2: 3, b1: 2, b0: 1)); |
42 | } |
43 | |
44 | INLINE __m128i rot7(__m128i x) { |
45 | return xorv(a: _mm_srli_epi32(a: x, count: 7), b: _mm_slli_epi32(a: x, count: 32 - 7)); |
46 | } |
47 | |
48 | INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, |
49 | __m128i m) { |
50 | *row0 = addv(a: addv(a: *row0, b: m), b: *row1); |
51 | *row3 = xorv(a: *row3, b: *row0); |
52 | *row3 = rot16(x: *row3); |
53 | *row2 = addv(a: *row2, b: *row3); |
54 | *row1 = xorv(a: *row1, b: *row2); |
55 | *row1 = rot12(x: *row1); |
56 | } |
57 | |
58 | INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, |
59 | __m128i m) { |
60 | *row0 = addv(a: addv(a: *row0, b: m), b: *row1); |
61 | *row3 = xorv(a: *row3, b: *row0); |
62 | *row3 = rot8(x: *row3); |
63 | *row2 = addv(a: *row2, b: *row3); |
64 | *row1 = xorv(a: *row1, b: *row2); |
65 | *row1 = rot7(x: *row1); |
66 | } |
67 | |
68 | // Note the optimization here of leaving row1 as the unrotated row, rather than |
69 | // row0. All the message loads below are adjusted to compensate for this. See |
70 | // discussion at https://github.com/sneves/blake2-avx2/pull/4 |
71 | INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { |
72 | *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); |
73 | *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); |
74 | *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); |
75 | } |
76 | |
77 | INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { |
78 | *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); |
79 | *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); |
80 | *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); |
81 | } |
82 | |
83 | INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], |
84 | const uint8_t block[BLAKE3_BLOCK_LEN], |
85 | uint8_t block_len, uint64_t counter, uint8_t flags) { |
86 | rows[0] = loadu(src: (uint8_t *)&cv[0]); |
87 | rows[1] = loadu(src: (uint8_t *)&cv[4]); |
88 | rows[2] = set4(a: IV[0], b: IV[1], c: IV[2], d: IV[3]); |
89 | rows[3] = set4(a: counter_low(counter), b: counter_high(counter), |
90 | c: (uint32_t)block_len, d: (uint32_t)flags); |
91 | |
92 | __m128i m0 = loadu(src: &block[sizeof(__m128i) * 0]); |
93 | __m128i m1 = loadu(src: &block[sizeof(__m128i) * 1]); |
94 | __m128i m2 = loadu(src: &block[sizeof(__m128i) * 2]); |
95 | __m128i m3 = loadu(src: &block[sizeof(__m128i) * 3]); |
96 | |
97 | __m128i t0, t1, t2, t3, tt; |
98 | |
99 | // Round 1. The first round permutes the message words from the original |
100 | // input order, into the groups that get mixed in parallel. |
101 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 |
102 | g1(row0: &rows[0], row1: &rows[1], row2: &rows[2], row3: &rows[3], m: t0); |
103 | t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1 |
104 | g2(row0: &rows[0], row1: &rows[1], row2: &rows[2], row3: &rows[3], m: t1); |
105 | diagonalize(row0: &rows[0], row2: &rows[2], row3: &rows[3]); |
106 | t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 |
107 | t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 |
108 | g1(row0: &rows[0], row1: &rows[1], row2: &rows[2], row3: &rows[3], m: t2); |
109 | t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9 |
110 | t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 |
111 | g2(row0: &rows[0], row1: &rows[1], row2: &rows[2], row3: &rows[3], m: t3); |
112 | undiagonalize(row0: &rows[0], row2: &rows[2], row3: &rows[3]); |
113 | m0 = t0; |
114 | m1 = t1; |
115 | m2 = t2; |
116 | m3 = t3; |
117 | |
118 | // Round 2. This round and all following rounds apply a fixed permutation |
119 | // to the message words from the round before. |
120 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); |
121 | t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); |
122 | g1(row0: &rows[0], row1: &rows[1], row2: &rows[2], row3: &rows[3], m: t0); |
123 | t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); |
124 | tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); |
125 | t1 = _mm_blend_epi16(tt, t1, 0xCC); |
126 | g2(row0: &rows[0], row1: &rows[1], row2: &rows[2], row3: &rows[3], m: t1); |
127 | diagonalize(row0: &rows[0], row2: &rows[2], row3: &rows[3]); |
128 | t2 = _mm_unpacklo_epi64(a: m3, b: m1); |
129 | tt = _mm_blend_epi16(t2, m2, 0xC0); |
130 | t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); |
131 | g1(row0: &rows[0], row1: &rows[1], row2: &rows[2], row3: &rows[3], m: t2); |
132 | t3 = _mm_unpackhi_epi32(a: m1, b: m3); |
133 | tt = _mm_unpacklo_epi32(a: m2, b: t3); |
134 | t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); |
135 | g2(row0: &rows[0], row1: &rows[1], row2: &rows[2], row3: &rows[3], m: t3); |
136 | undiagonalize(row0: &rows[0], row2: &rows[2], row3: &rows[3]); |
137 | m0 = t0; |
138 | m1 = t1; |
139 | m2 = t2; |
140 | m3 = t3; |
141 | |
142 | // Round 3 |
143 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); |
144 | t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); |
145 | g1(row0: &rows[0], row1: &rows[1], row2: &rows[2], row3: &rows[3], m: t0); |
146 | t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); |
147 | tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); |
148 | t1 = _mm_blend_epi16(tt, t1, 0xCC); |
149 | g2(row0: &rows[0], row1: &rows[1], row2: &rows[2], row3: &rows[3], m: t1); |
150 | diagonalize(row0: &rows[0], row2: &rows[2], row3: &rows[3]); |
151 | t2 = _mm_unpacklo_epi64(a: m3, b: m1); |
152 | tt = _mm_blend_epi16(t2, m2, 0xC0); |
153 | t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); |
154 | g1(row0: &rows[0], row1: &rows[1], row2: &rows[2], row3: &rows[3], m: t2); |
155 | t3 = _mm_unpackhi_epi32(a: m1, b: m3); |
156 | tt = _mm_unpacklo_epi32(a: m2, b: t3); |
157 | t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); |
158 | g2(row0: &rows[0], row1: &rows[1], row2: &rows[2], row3: &rows[3], m: t3); |
159 | undiagonalize(row0: &rows[0], row2: &rows[2], row3: &rows[3]); |
160 | m0 = t0; |
161 | m1 = t1; |
162 | m2 = t2; |
163 | m3 = t3; |
164 | |
165 | // Round 4 |
166 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); |
167 | t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); |
168 | g1(row0: &rows[0], row1: &rows[1], row2: &rows[2], row3: &rows[3], m: t0); |
169 | t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); |
170 | tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); |
171 | t1 = _mm_blend_epi16(tt, t1, 0xCC); |
172 | g2(row0: &rows[0], row1: &rows[1], row2: &rows[2], row3: &rows[3], m: t1); |
173 | diagonalize(row0: &rows[0], row2: &rows[2], row3: &rows[3]); |
174 | t2 = _mm_unpacklo_epi64(a: m3, b: m1); |
175 | tt = _mm_blend_epi16(t2, m2, 0xC0); |
176 | t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); |
177 | g1(row0: &rows[0], row1: &rows[1], row2: &rows[2], row3: &rows[3], m: t2); |
178 | t3 = _mm_unpackhi_epi32(a: m1, b: m3); |
179 | tt = _mm_unpacklo_epi32(a: m2, b: t3); |
180 | t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); |
181 | g2(row0: &rows[0], row1: &rows[1], row2: &rows[2], row3: &rows[3], m: t3); |
182 | undiagonalize(row0: &rows[0], row2: &rows[2], row3: &rows[3]); |
183 | m0 = t0; |
184 | m1 = t1; |
185 | m2 = t2; |
186 | m3 = t3; |
187 | |
188 | // Round 5 |
189 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); |
190 | t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); |
191 | g1(row0: &rows[0], row1: &rows[1], row2: &rows[2], row3: &rows[3], m: t0); |
192 | t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); |
193 | tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); |
194 | t1 = _mm_blend_epi16(tt, t1, 0xCC); |
195 | g2(row0: &rows[0], row1: &rows[1], row2: &rows[2], row3: &rows[3], m: t1); |
196 | diagonalize(row0: &rows[0], row2: &rows[2], row3: &rows[3]); |
197 | t2 = _mm_unpacklo_epi64(a: m3, b: m1); |
198 | tt = _mm_blend_epi16(t2, m2, 0xC0); |
199 | t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); |
200 | g1(row0: &rows[0], row1: &rows[1], row2: &rows[2], row3: &rows[3], m: t2); |
201 | t3 = _mm_unpackhi_epi32(a: m1, b: m3); |
202 | tt = _mm_unpacklo_epi32(a: m2, b: t3); |
203 | t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); |
204 | g2(row0: &rows[0], row1: &rows[1], row2: &rows[2], row3: &rows[3], m: t3); |
205 | undiagonalize(row0: &rows[0], row2: &rows[2], row3: &rows[3]); |
206 | m0 = t0; |
207 | m1 = t1; |
208 | m2 = t2; |
209 | m3 = t3; |
210 | |
211 | // Round 6 |
212 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); |
213 | t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); |
214 | g1(row0: &rows[0], row1: &rows[1], row2: &rows[2], row3: &rows[3], m: t0); |
215 | t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); |
216 | tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); |
217 | t1 = _mm_blend_epi16(tt, t1, 0xCC); |
218 | g2(row0: &rows[0], row1: &rows[1], row2: &rows[2], row3: &rows[3], m: t1); |
219 | diagonalize(row0: &rows[0], row2: &rows[2], row3: &rows[3]); |
220 | t2 = _mm_unpacklo_epi64(a: m3, b: m1); |
221 | tt = _mm_blend_epi16(t2, m2, 0xC0); |
222 | t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); |
223 | g1(row0: &rows[0], row1: &rows[1], row2: &rows[2], row3: &rows[3], m: t2); |
224 | t3 = _mm_unpackhi_epi32(a: m1, b: m3); |
225 | tt = _mm_unpacklo_epi32(a: m2, b: t3); |
226 | t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); |
227 | g2(row0: &rows[0], row1: &rows[1], row2: &rows[2], row3: &rows[3], m: t3); |
228 | undiagonalize(row0: &rows[0], row2: &rows[2], row3: &rows[3]); |
229 | m0 = t0; |
230 | m1 = t1; |
231 | m2 = t2; |
232 | m3 = t3; |
233 | |
234 | // Round 7 |
235 | t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); |
236 | t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); |
237 | g1(row0: &rows[0], row1: &rows[1], row2: &rows[2], row3: &rows[3], m: t0); |
238 | t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); |
239 | tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); |
240 | t1 = _mm_blend_epi16(tt, t1, 0xCC); |
241 | g2(row0: &rows[0], row1: &rows[1], row2: &rows[2], row3: &rows[3], m: t1); |
242 | diagonalize(row0: &rows[0], row2: &rows[2], row3: &rows[3]); |
243 | t2 = _mm_unpacklo_epi64(a: m3, b: m1); |
244 | tt = _mm_blend_epi16(t2, m2, 0xC0); |
245 | t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); |
246 | g1(row0: &rows[0], row1: &rows[1], row2: &rows[2], row3: &rows[3], m: t2); |
247 | t3 = _mm_unpackhi_epi32(a: m1, b: m3); |
248 | tt = _mm_unpacklo_epi32(a: m2, b: t3); |
249 | t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); |
250 | g2(row0: &rows[0], row1: &rows[1], row2: &rows[2], row3: &rows[3], m: t3); |
251 | undiagonalize(row0: &rows[0], row2: &rows[2], row3: &rows[3]); |
252 | } |
253 | |
254 | void blake3_compress_in_place_sse41(uint32_t cv[8], |
255 | const uint8_t block[BLAKE3_BLOCK_LEN], |
256 | uint8_t block_len, uint64_t counter, |
257 | uint8_t flags) { |
258 | __m128i rows[4]; |
259 | compress_pre(rows, cv, block, block_len, counter, flags); |
260 | storeu(src: xorv(a: rows[0], b: rows[2]), dest: (uint8_t *)&cv[0]); |
261 | storeu(src: xorv(a: rows[1], b: rows[3]), dest: (uint8_t *)&cv[4]); |
262 | } |
263 | |
264 | void blake3_compress_xof_sse41(const uint32_t cv[8], |
265 | const uint8_t block[BLAKE3_BLOCK_LEN], |
266 | uint8_t block_len, uint64_t counter, |
267 | uint8_t flags, uint8_t out[64]) { |
268 | __m128i rows[4]; |
269 | compress_pre(rows, cv, block, block_len, counter, flags); |
270 | storeu(src: xorv(a: rows[0], b: rows[2]), dest: &out[0]); |
271 | storeu(src: xorv(a: rows[1], b: rows[3]), dest: &out[16]); |
272 | storeu(src: xorv(a: rows[2], b: loadu(src: (uint8_t *)&cv[0])), dest: &out[32]); |
273 | storeu(src: xorv(a: rows[3], b: loadu(src: (uint8_t *)&cv[4])), dest: &out[48]); |
274 | } |
275 | |
276 | INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) { |
277 | v[0] = addv(a: v[0], b: m[(size_t)MSG_SCHEDULE[r][0]]); |
278 | v[1] = addv(a: v[1], b: m[(size_t)MSG_SCHEDULE[r][2]]); |
279 | v[2] = addv(a: v[2], b: m[(size_t)MSG_SCHEDULE[r][4]]); |
280 | v[3] = addv(a: v[3], b: m[(size_t)MSG_SCHEDULE[r][6]]); |
281 | v[0] = addv(a: v[0], b: v[4]); |
282 | v[1] = addv(a: v[1], b: v[5]); |
283 | v[2] = addv(a: v[2], b: v[6]); |
284 | v[3] = addv(a: v[3], b: v[7]); |
285 | v[12] = xorv(a: v[12], b: v[0]); |
286 | v[13] = xorv(a: v[13], b: v[1]); |
287 | v[14] = xorv(a: v[14], b: v[2]); |
288 | v[15] = xorv(a: v[15], b: v[3]); |
289 | v[12] = rot16(x: v[12]); |
290 | v[13] = rot16(x: v[13]); |
291 | v[14] = rot16(x: v[14]); |
292 | v[15] = rot16(x: v[15]); |
293 | v[8] = addv(a: v[8], b: v[12]); |
294 | v[9] = addv(a: v[9], b: v[13]); |
295 | v[10] = addv(a: v[10], b: v[14]); |
296 | v[11] = addv(a: v[11], b: v[15]); |
297 | v[4] = xorv(a: v[4], b: v[8]); |
298 | v[5] = xorv(a: v[5], b: v[9]); |
299 | v[6] = xorv(a: v[6], b: v[10]); |
300 | v[7] = xorv(a: v[7], b: v[11]); |
301 | v[4] = rot12(x: v[4]); |
302 | v[5] = rot12(x: v[5]); |
303 | v[6] = rot12(x: v[6]); |
304 | v[7] = rot12(x: v[7]); |
305 | v[0] = addv(a: v[0], b: m[(size_t)MSG_SCHEDULE[r][1]]); |
306 | v[1] = addv(a: v[1], b: m[(size_t)MSG_SCHEDULE[r][3]]); |
307 | v[2] = addv(a: v[2], b: m[(size_t)MSG_SCHEDULE[r][5]]); |
308 | v[3] = addv(a: v[3], b: m[(size_t)MSG_SCHEDULE[r][7]]); |
309 | v[0] = addv(a: v[0], b: v[4]); |
310 | v[1] = addv(a: v[1], b: v[5]); |
311 | v[2] = addv(a: v[2], b: v[6]); |
312 | v[3] = addv(a: v[3], b: v[7]); |
313 | v[12] = xorv(a: v[12], b: v[0]); |
314 | v[13] = xorv(a: v[13], b: v[1]); |
315 | v[14] = xorv(a: v[14], b: v[2]); |
316 | v[15] = xorv(a: v[15], b: v[3]); |
317 | v[12] = rot8(x: v[12]); |
318 | v[13] = rot8(x: v[13]); |
319 | v[14] = rot8(x: v[14]); |
320 | v[15] = rot8(x: v[15]); |
321 | v[8] = addv(a: v[8], b: v[12]); |
322 | v[9] = addv(a: v[9], b: v[13]); |
323 | v[10] = addv(a: v[10], b: v[14]); |
324 | v[11] = addv(a: v[11], b: v[15]); |
325 | v[4] = xorv(a: v[4], b: v[8]); |
326 | v[5] = xorv(a: v[5], b: v[9]); |
327 | v[6] = xorv(a: v[6], b: v[10]); |
328 | v[7] = xorv(a: v[7], b: v[11]); |
329 | v[4] = rot7(x: v[4]); |
330 | v[5] = rot7(x: v[5]); |
331 | v[6] = rot7(x: v[6]); |
332 | v[7] = rot7(x: v[7]); |
333 | |
334 | v[0] = addv(a: v[0], b: m[(size_t)MSG_SCHEDULE[r][8]]); |
335 | v[1] = addv(a: v[1], b: m[(size_t)MSG_SCHEDULE[r][10]]); |
336 | v[2] = addv(a: v[2], b: m[(size_t)MSG_SCHEDULE[r][12]]); |
337 | v[3] = addv(a: v[3], b: m[(size_t)MSG_SCHEDULE[r][14]]); |
338 | v[0] = addv(a: v[0], b: v[5]); |
339 | v[1] = addv(a: v[1], b: v[6]); |
340 | v[2] = addv(a: v[2], b: v[7]); |
341 | v[3] = addv(a: v[3], b: v[4]); |
342 | v[15] = xorv(a: v[15], b: v[0]); |
343 | v[12] = xorv(a: v[12], b: v[1]); |
344 | v[13] = xorv(a: v[13], b: v[2]); |
345 | v[14] = xorv(a: v[14], b: v[3]); |
346 | v[15] = rot16(x: v[15]); |
347 | v[12] = rot16(x: v[12]); |
348 | v[13] = rot16(x: v[13]); |
349 | v[14] = rot16(x: v[14]); |
350 | v[10] = addv(a: v[10], b: v[15]); |
351 | v[11] = addv(a: v[11], b: v[12]); |
352 | v[8] = addv(a: v[8], b: v[13]); |
353 | v[9] = addv(a: v[9], b: v[14]); |
354 | v[5] = xorv(a: v[5], b: v[10]); |
355 | v[6] = xorv(a: v[6], b: v[11]); |
356 | v[7] = xorv(a: v[7], b: v[8]); |
357 | v[4] = xorv(a: v[4], b: v[9]); |
358 | v[5] = rot12(x: v[5]); |
359 | v[6] = rot12(x: v[6]); |
360 | v[7] = rot12(x: v[7]); |
361 | v[4] = rot12(x: v[4]); |
362 | v[0] = addv(a: v[0], b: m[(size_t)MSG_SCHEDULE[r][9]]); |
363 | v[1] = addv(a: v[1], b: m[(size_t)MSG_SCHEDULE[r][11]]); |
364 | v[2] = addv(a: v[2], b: m[(size_t)MSG_SCHEDULE[r][13]]); |
365 | v[3] = addv(a: v[3], b: m[(size_t)MSG_SCHEDULE[r][15]]); |
366 | v[0] = addv(a: v[0], b: v[5]); |
367 | v[1] = addv(a: v[1], b: v[6]); |
368 | v[2] = addv(a: v[2], b: v[7]); |
369 | v[3] = addv(a: v[3], b: v[4]); |
370 | v[15] = xorv(a: v[15], b: v[0]); |
371 | v[12] = xorv(a: v[12], b: v[1]); |
372 | v[13] = xorv(a: v[13], b: v[2]); |
373 | v[14] = xorv(a: v[14], b: v[3]); |
374 | v[15] = rot8(x: v[15]); |
375 | v[12] = rot8(x: v[12]); |
376 | v[13] = rot8(x: v[13]); |
377 | v[14] = rot8(x: v[14]); |
378 | v[10] = addv(a: v[10], b: v[15]); |
379 | v[11] = addv(a: v[11], b: v[12]); |
380 | v[8] = addv(a: v[8], b: v[13]); |
381 | v[9] = addv(a: v[9], b: v[14]); |
382 | v[5] = xorv(a: v[5], b: v[10]); |
383 | v[6] = xorv(a: v[6], b: v[11]); |
384 | v[7] = xorv(a: v[7], b: v[8]); |
385 | v[4] = xorv(a: v[4], b: v[9]); |
386 | v[5] = rot7(x: v[5]); |
387 | v[6] = rot7(x: v[6]); |
388 | v[7] = rot7(x: v[7]); |
389 | v[4] = rot7(x: v[4]); |
390 | } |
391 | |
392 | INLINE void transpose_vecs(__m128i vecs[DEGREE]) { |
393 | // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is |
394 | // 22/33. Note that this doesn't split the vector into two lanes, as the |
395 | // AVX2 counterparts do. |
396 | __m128i ab_01 = _mm_unpacklo_epi32(a: vecs[0], b: vecs[1]); |
397 | __m128i ab_23 = _mm_unpackhi_epi32(a: vecs[0], b: vecs[1]); |
398 | __m128i cd_01 = _mm_unpacklo_epi32(a: vecs[2], b: vecs[3]); |
399 | __m128i cd_23 = _mm_unpackhi_epi32(a: vecs[2], b: vecs[3]); |
400 | |
401 | // Interleave 64-bit lanes. |
402 | __m128i abcd_0 = _mm_unpacklo_epi64(a: ab_01, b: cd_01); |
403 | __m128i abcd_1 = _mm_unpackhi_epi64(a: ab_01, b: cd_01); |
404 | __m128i abcd_2 = _mm_unpacklo_epi64(a: ab_23, b: cd_23); |
405 | __m128i abcd_3 = _mm_unpackhi_epi64(a: ab_23, b: cd_23); |
406 | |
407 | vecs[0] = abcd_0; |
408 | vecs[1] = abcd_1; |
409 | vecs[2] = abcd_2; |
410 | vecs[3] = abcd_3; |
411 | } |
412 | |
413 | INLINE void transpose_msg_vecs(const uint8_t *const *inputs, |
414 | size_t block_offset, __m128i out[16]) { |
415 | out[0] = loadu(src: &inputs[0][block_offset + 0 * sizeof(__m128i)]); |
416 | out[1] = loadu(src: &inputs[1][block_offset + 0 * sizeof(__m128i)]); |
417 | out[2] = loadu(src: &inputs[2][block_offset + 0 * sizeof(__m128i)]); |
418 | out[3] = loadu(src: &inputs[3][block_offset + 0 * sizeof(__m128i)]); |
419 | out[4] = loadu(src: &inputs[0][block_offset + 1 * sizeof(__m128i)]); |
420 | out[5] = loadu(src: &inputs[1][block_offset + 1 * sizeof(__m128i)]); |
421 | out[6] = loadu(src: &inputs[2][block_offset + 1 * sizeof(__m128i)]); |
422 | out[7] = loadu(src: &inputs[3][block_offset + 1 * sizeof(__m128i)]); |
423 | out[8] = loadu(src: &inputs[0][block_offset + 2 * sizeof(__m128i)]); |
424 | out[9] = loadu(src: &inputs[1][block_offset + 2 * sizeof(__m128i)]); |
425 | out[10] = loadu(src: &inputs[2][block_offset + 2 * sizeof(__m128i)]); |
426 | out[11] = loadu(src: &inputs[3][block_offset + 2 * sizeof(__m128i)]); |
427 | out[12] = loadu(src: &inputs[0][block_offset + 3 * sizeof(__m128i)]); |
428 | out[13] = loadu(src: &inputs[1][block_offset + 3 * sizeof(__m128i)]); |
429 | out[14] = loadu(src: &inputs[2][block_offset + 3 * sizeof(__m128i)]); |
430 | out[15] = loadu(src: &inputs[3][block_offset + 3 * sizeof(__m128i)]); |
431 | for (size_t i = 0; i < 4; ++i) { |
432 | _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0); |
433 | } |
434 | transpose_vecs(vecs: &out[0]); |
435 | transpose_vecs(vecs: &out[4]); |
436 | transpose_vecs(vecs: &out[8]); |
437 | transpose_vecs(vecs: &out[12]); |
438 | } |
439 | |
440 | INLINE void load_counters(uint64_t counter, bool increment_counter, |
441 | __m128i *out_lo, __m128i *out_hi) { |
442 | const __m128i mask = _mm_set1_epi32(i: -(int32_t)increment_counter); |
443 | const __m128i add0 = _mm_set_epi32(i3: 3, i2: 2, i1: 1, i0: 0); |
444 | const __m128i add1 = _mm_and_si128(a: mask, b: add0); |
445 | __m128i l = _mm_add_epi32(a: _mm_set1_epi32(i: (int32_t)counter), b: add1); |
446 | __m128i carry = _mm_cmpgt_epi32(a: _mm_xor_si128(a: add1, b: _mm_set1_epi32(i: 0x80000000)), |
447 | b: _mm_xor_si128( a: l, b: _mm_set1_epi32(i: 0x80000000))); |
448 | __m128i h = _mm_sub_epi32(a: _mm_set1_epi32(i: (int32_t)(counter >> 32)), b: carry); |
449 | *out_lo = l; |
450 | *out_hi = h; |
451 | } |
452 | |
453 | static |
454 | void blake3_hash4_sse41(const uint8_t *const *inputs, size_t blocks, |
455 | const uint32_t key[8], uint64_t counter, |
456 | bool increment_counter, uint8_t flags, |
457 | uint8_t flags_start, uint8_t flags_end, uint8_t *out) { |
458 | __m128i h_vecs[8] = { |
459 | set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), |
460 | set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), |
461 | }; |
462 | __m128i counter_low_vec, counter_high_vec; |
463 | load_counters(counter, increment_counter, out_lo: &counter_low_vec, |
464 | out_hi: &counter_high_vec); |
465 | uint8_t block_flags = flags | flags_start; |
466 | |
467 | for (size_t block = 0; block < blocks; block++) { |
468 | if (block + 1 == blocks) { |
469 | block_flags |= flags_end; |
470 | } |
471 | __m128i block_len_vec = set1(BLAKE3_BLOCK_LEN); |
472 | __m128i block_flags_vec = set1(block_flags); |
473 | __m128i msg_vecs[16]; |
474 | transpose_msg_vecs(inputs, block_offset: block * BLAKE3_BLOCK_LEN, out: msg_vecs); |
475 | |
476 | __m128i v[16] = { |
477 | h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], |
478 | h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], |
479 | set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), |
480 | counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, |
481 | }; |
482 | round_fn(v, m: msg_vecs, r: 0); |
483 | round_fn(v, m: msg_vecs, r: 1); |
484 | round_fn(v, m: msg_vecs, r: 2); |
485 | round_fn(v, m: msg_vecs, r: 3); |
486 | round_fn(v, m: msg_vecs, r: 4); |
487 | round_fn(v, m: msg_vecs, r: 5); |
488 | round_fn(v, m: msg_vecs, r: 6); |
489 | h_vecs[0] = xorv(a: v[0], b: v[8]); |
490 | h_vecs[1] = xorv(a: v[1], b: v[9]); |
491 | h_vecs[2] = xorv(a: v[2], b: v[10]); |
492 | h_vecs[3] = xorv(a: v[3], b: v[11]); |
493 | h_vecs[4] = xorv(a: v[4], b: v[12]); |
494 | h_vecs[5] = xorv(a: v[5], b: v[13]); |
495 | h_vecs[6] = xorv(a: v[6], b: v[14]); |
496 | h_vecs[7] = xorv(a: v[7], b: v[15]); |
497 | |
498 | block_flags = flags; |
499 | } |
500 | |
501 | transpose_vecs(vecs: &h_vecs[0]); |
502 | transpose_vecs(vecs: &h_vecs[4]); |
503 | // The first four vecs now contain the first half of each output, and the |
504 | // second four vecs contain the second half of each output. |
505 | storeu(src: h_vecs[0], dest: &out[0 * sizeof(__m128i)]); |
506 | storeu(src: h_vecs[4], dest: &out[1 * sizeof(__m128i)]); |
507 | storeu(src: h_vecs[1], dest: &out[2 * sizeof(__m128i)]); |
508 | storeu(src: h_vecs[5], dest: &out[3 * sizeof(__m128i)]); |
509 | storeu(src: h_vecs[2], dest: &out[4 * sizeof(__m128i)]); |
510 | storeu(src: h_vecs[6], dest: &out[5 * sizeof(__m128i)]); |
511 | storeu(src: h_vecs[3], dest: &out[6 * sizeof(__m128i)]); |
512 | storeu(src: h_vecs[7], dest: &out[7 * sizeof(__m128i)]); |
513 | } |
514 | |
515 | INLINE void hash_one_sse41(const uint8_t *input, size_t blocks, |
516 | const uint32_t key[8], uint64_t counter, |
517 | uint8_t flags, uint8_t flags_start, |
518 | uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { |
519 | uint32_t cv[8]; |
520 | memcpy(dest: cv, src: key, BLAKE3_KEY_LEN); |
521 | uint8_t block_flags = flags | flags_start; |
522 | while (blocks > 0) { |
523 | if (blocks == 1) { |
524 | block_flags |= flags_end; |
525 | } |
526 | blake3_compress_in_place_sse41(cv, block: input, BLAKE3_BLOCK_LEN, counter, |
527 | flags: block_flags); |
528 | input = &input[BLAKE3_BLOCK_LEN]; |
529 | blocks -= 1; |
530 | block_flags = flags; |
531 | } |
532 | memcpy(dest: out, src: cv, BLAKE3_OUT_LEN); |
533 | } |
534 | |
535 | void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, |
536 | size_t blocks, const uint32_t key[8], |
537 | uint64_t counter, bool increment_counter, |
538 | uint8_t flags, uint8_t flags_start, |
539 | uint8_t flags_end, uint8_t *out) { |
540 | while (num_inputs >= DEGREE) { |
541 | blake3_hash4_sse41(inputs, blocks, key, counter, increment_counter, flags, |
542 | flags_start, flags_end, out); |
543 | if (increment_counter) { |
544 | counter += DEGREE; |
545 | } |
546 | inputs += DEGREE; |
547 | num_inputs -= DEGREE; |
548 | out = &out[DEGREE * BLAKE3_OUT_LEN]; |
549 | } |
550 | while (num_inputs > 0) { |
551 | hash_one_sse41(input: inputs[0], blocks, key, counter, flags, flags_start, |
552 | flags_end, out); |
553 | if (increment_counter) { |
554 | counter += 1; |
555 | } |
556 | inputs += 1; |
557 | num_inputs -= 1; |
558 | out = &out[BLAKE3_OUT_LEN]; |
559 | } |
560 | } |
561 | |