/*
 * Preamble for the SSE2 BLAKE3 entry points: target guard, ELF notes,
 * CET marker setup, symbol visibility/export, and text-section selection.
 */

/* The rest of the file is only assembled when targeting x86-64. */
1#if defined(__x86_64__)
2
3#include "llvm_blake3_prefix.h"
4
/*
 * On ELF targets (Solaris/SVR4 excluded) emit an empty .note.GNU-stack
 * section; by GNU toolchain convention this tells the linker the object
 * does not need an executable stack.
 */
5#if defined(__ELF__) && !(defined(__sun__) && defined(__svr4__))
6.section .note.GNU-stack,"",%progbits
7#endif
8
/*
 * When CET is enabled on ELF and the toolchain provides <cet.h>, include it.
 * NOTE(review): <cet.h> is presumed to be what defines _CET_ENDBR - confirm.
 */
9#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
10#if __has_include(<cet.h>)
11#include <cet.h>
12#endif
13#endif
14
/*
 * Fallback: if nothing defined _CET_ENDBR, make it expand to nothing so the
 * use at the function entry below still assembles.
 */
15#if !defined(_CET_ENDBR)
16#define _CET_ENDBR
17#endif
18
/* HIDDEN selects the platform's hidden-visibility directive. */
19#ifdef __APPLE__
20#define HIDDEN .private_extern
21#else
22#define HIDDEN .hidden
23#endif
24
/* Intel operand order (dst, src) with unprefixed register names. */
25.intel_syntax noprefix
/*
 * Each routine is declared under two names, plain and with a leading
 * underscore; all of them are given hidden visibility here.
 */
26HIDDEN blake3_hash_many_sse2
27HIDDEN _blake3_hash_many_sse2
28HIDDEN blake3_compress_in_place_sse2
29HIDDEN _blake3_compress_in_place_sse2
30HIDDEN blake3_compress_xof_sse2
31HIDDEN _blake3_compress_xof_sse2
/* Export the same six names as global symbols. */
32.global blake3_hash_many_sse2
33.global _blake3_hash_many_sse2
34.global blake3_compress_in_place_sse2
35.global _blake3_compress_in_place_sse2
36.global blake3_compress_xof_sse2
37.global _blake3_compress_xof_sse2
/* Switch to the code section; the directive spelling differs on Apple. */
38#ifdef __APPLE__
39.text
40#else
41.section .text
42#endif
/* Align the following entry point to a 2^6 = 64-byte boundary. */
43 .p2align 6
44_blake3_hash_many_sse2:
45blake3_hash_many_sse2:
46 _CET_ENDBR
47 push r15
48 push r14
49 push r13
50 push r12
51 push rbx
52 push rbp
53 mov rbp, rsp
54 sub rsp, 360
55 and rsp, 0xFFFFFFFFFFFFFFC0
56 neg r9d
57 movd xmm0, r9d
58 pshufd xmm0, xmm0, 0x00
59 movdqa xmmword ptr [rsp+0x130], xmm0
60 movdqa xmm1, xmm0
61 pand xmm1, xmmword ptr [ADD0+rip]
62 pand xmm0, xmmword ptr [ADD1+rip]
63 movdqa xmmword ptr [rsp+0x150], xmm0
64 movd xmm0, r8d
65 pshufd xmm0, xmm0, 0x00
66 paddd xmm0, xmm1
67 movdqa xmmword ptr [rsp+0x110], xmm0
68 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
69 pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
70 pcmpgtd xmm1, xmm0
71 shr r8, 32
72 movd xmm2, r8d
73 pshufd xmm2, xmm2, 0x00
74 psubd xmm2, xmm1
75 movdqa xmmword ptr [rsp+0x120], xmm2
76 mov rbx, qword ptr [rbp+0x50]
77 mov r15, rdx
78 shl r15, 6
79 movzx r13d, byte ptr [rbp+0x38]
80 movzx r12d, byte ptr [rbp+0x48]
81 cmp rsi, 4
82 jc 3f
832:
84 movdqu xmm3, xmmword ptr [rcx]
85 pshufd xmm0, xmm3, 0x00
86 pshufd xmm1, xmm3, 0x55
87 pshufd xmm2, xmm3, 0xAA
88 pshufd xmm3, xmm3, 0xFF
89 movdqu xmm7, xmmword ptr [rcx+0x10]
90 pshufd xmm4, xmm7, 0x00
91 pshufd xmm5, xmm7, 0x55
92 pshufd xmm6, xmm7, 0xAA
93 pshufd xmm7, xmm7, 0xFF
94 mov r8, qword ptr [rdi]
95 mov r9, qword ptr [rdi+0x8]
96 mov r10, qword ptr [rdi+0x10]
97 mov r11, qword ptr [rdi+0x18]
98 movzx eax, byte ptr [rbp+0x40]
99 or eax, r13d
100 xor edx, edx
1019:
102 mov r14d, eax
103 or eax, r12d
104 add rdx, 64
105 cmp rdx, r15
106 cmovne eax, r14d
107 movdqu xmm8, xmmword ptr [r8+rdx-0x40]
108 movdqu xmm9, xmmword ptr [r9+rdx-0x40]
109 movdqu xmm10, xmmword ptr [r10+rdx-0x40]
110 movdqu xmm11, xmmword ptr [r11+rdx-0x40]
111 movdqa xmm12, xmm8
112 punpckldq xmm8, xmm9
113 punpckhdq xmm12, xmm9
114 movdqa xmm14, xmm10
115 punpckldq xmm10, xmm11
116 punpckhdq xmm14, xmm11
117 movdqa xmm9, xmm8
118 punpcklqdq xmm8, xmm10
119 punpckhqdq xmm9, xmm10
120 movdqa xmm13, xmm12
121 punpcklqdq xmm12, xmm14
122 punpckhqdq xmm13, xmm14
123 movdqa xmmword ptr [rsp], xmm8
124 movdqa xmmword ptr [rsp+0x10], xmm9
125 movdqa xmmword ptr [rsp+0x20], xmm12
126 movdqa xmmword ptr [rsp+0x30], xmm13
127 movdqu xmm8, xmmword ptr [r8+rdx-0x30]
128 movdqu xmm9, xmmword ptr [r9+rdx-0x30]
129 movdqu xmm10, xmmword ptr [r10+rdx-0x30]
130 movdqu xmm11, xmmword ptr [r11+rdx-0x30]
131 movdqa xmm12, xmm8
132 punpckldq xmm8, xmm9
133 punpckhdq xmm12, xmm9
134 movdqa xmm14, xmm10
135 punpckldq xmm10, xmm11
136 punpckhdq xmm14, xmm11
137 movdqa xmm9, xmm8
138 punpcklqdq xmm8, xmm10
139 punpckhqdq xmm9, xmm10
140 movdqa xmm13, xmm12
141 punpcklqdq xmm12, xmm14
142 punpckhqdq xmm13, xmm14
143 movdqa xmmword ptr [rsp+0x40], xmm8
144 movdqa xmmword ptr [rsp+0x50], xmm9
145 movdqa xmmword ptr [rsp+0x60], xmm12
146 movdqa xmmword ptr [rsp+0x70], xmm13
147 movdqu xmm8, xmmword ptr [r8+rdx-0x20]
148 movdqu xmm9, xmmword ptr [r9+rdx-0x20]
149 movdqu xmm10, xmmword ptr [r10+rdx-0x20]
150 movdqu xmm11, xmmword ptr [r11+rdx-0x20]
151 movdqa xmm12, xmm8
152 punpckldq xmm8, xmm9
153 punpckhdq xmm12, xmm9
154 movdqa xmm14, xmm10
155 punpckldq xmm10, xmm11
156 punpckhdq xmm14, xmm11
157 movdqa xmm9, xmm8
158 punpcklqdq xmm8, xmm10
159 punpckhqdq xmm9, xmm10
160 movdqa xmm13, xmm12
161 punpcklqdq xmm12, xmm14
162 punpckhqdq xmm13, xmm14
163 movdqa xmmword ptr [rsp+0x80], xmm8
164 movdqa xmmword ptr [rsp+0x90], xmm9
165 movdqa xmmword ptr [rsp+0xA0], xmm12
166 movdqa xmmword ptr [rsp+0xB0], xmm13
167 movdqu xmm8, xmmword ptr [r8+rdx-0x10]
168 movdqu xmm9, xmmword ptr [r9+rdx-0x10]
169 movdqu xmm10, xmmword ptr [r10+rdx-0x10]
170 movdqu xmm11, xmmword ptr [r11+rdx-0x10]
171 movdqa xmm12, xmm8
172 punpckldq xmm8, xmm9
173 punpckhdq xmm12, xmm9
174 movdqa xmm14, xmm10
175 punpckldq xmm10, xmm11
176 punpckhdq xmm14, xmm11
177 movdqa xmm9, xmm8
178 punpcklqdq xmm8, xmm10
179 punpckhqdq xmm9, xmm10
180 movdqa xmm13, xmm12
181 punpcklqdq xmm12, xmm14
182 punpckhqdq xmm13, xmm14
183 movdqa xmmword ptr [rsp+0xC0], xmm8
184 movdqa xmmword ptr [rsp+0xD0], xmm9
185 movdqa xmmword ptr [rsp+0xE0], xmm12
186 movdqa xmmword ptr [rsp+0xF0], xmm13
187 movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip]
188 movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip]
189 movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip]
190 movdqa xmm12, xmmword ptr [rsp+0x110]
191 movdqa xmm13, xmmword ptr [rsp+0x120]
192 movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
193 movd xmm15, eax
194 pshufd xmm15, xmm15, 0x00
195 prefetcht0 [r8+rdx+0x80]
196 prefetcht0 [r9+rdx+0x80]
197 prefetcht0 [r10+rdx+0x80]
198 prefetcht0 [r11+rdx+0x80]
199 paddd xmm0, xmmword ptr [rsp]
200 paddd xmm1, xmmword ptr [rsp+0x20]
201 paddd xmm2, xmmword ptr [rsp+0x40]
202 paddd xmm3, xmmword ptr [rsp+0x60]
203 paddd xmm0, xmm4
204 paddd xmm1, xmm5
205 paddd xmm2, xmm6
206 paddd xmm3, xmm7
207 pxor xmm12, xmm0
208 pxor xmm13, xmm1
209 pxor xmm14, xmm2
210 pxor xmm15, xmm3
211 pshuflw xmm12, xmm12, 0xB1
212 pshufhw xmm12, xmm12, 0xB1
213 pshuflw xmm13, xmm13, 0xB1
214 pshufhw xmm13, xmm13, 0xB1
215 pshuflw xmm14, xmm14, 0xB1
216 pshufhw xmm14, xmm14, 0xB1
217 pshuflw xmm15, xmm15, 0xB1
218 pshufhw xmm15, xmm15, 0xB1
219 movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
220 paddd xmm8, xmm12
221 paddd xmm9, xmm13
222 paddd xmm10, xmm14
223 paddd xmm11, xmm15
224 pxor xmm4, xmm8
225 pxor xmm5, xmm9
226 pxor xmm6, xmm10
227 pxor xmm7, xmm11
228 movdqa xmmword ptr [rsp+0x100], xmm8
229 movdqa xmm8, xmm4
230 psrld xmm8, 12
231 pslld xmm4, 20
232 por xmm4, xmm8
233 movdqa xmm8, xmm5
234 psrld xmm8, 12
235 pslld xmm5, 20
236 por xmm5, xmm8
237 movdqa xmm8, xmm6
238 psrld xmm8, 12
239 pslld xmm6, 20
240 por xmm6, xmm8
241 movdqa xmm8, xmm7
242 psrld xmm8, 12
243 pslld xmm7, 20
244 por xmm7, xmm8
245 paddd xmm0, xmmword ptr [rsp+0x10]
246 paddd xmm1, xmmword ptr [rsp+0x30]
247 paddd xmm2, xmmword ptr [rsp+0x50]
248 paddd xmm3, xmmword ptr [rsp+0x70]
249 paddd xmm0, xmm4
250 paddd xmm1, xmm5
251 paddd xmm2, xmm6
252 paddd xmm3, xmm7
253 pxor xmm12, xmm0
254 pxor xmm13, xmm1
255 pxor xmm14, xmm2
256 pxor xmm15, xmm3
257 movdqa xmm8, xmm12
258 psrld xmm12, 8
259 pslld xmm8, 24
260 pxor xmm12, xmm8
261 movdqa xmm8, xmm13
262 psrld xmm13, 8
263 pslld xmm8, 24
264 pxor xmm13, xmm8
265 movdqa xmm8, xmm14
266 psrld xmm14, 8
267 pslld xmm8, 24
268 pxor xmm14, xmm8
269 movdqa xmm8, xmm15
270 psrld xmm15, 8
271 pslld xmm8, 24
272 pxor xmm15, xmm8
273 movdqa xmm8, xmmword ptr [rsp+0x100]
274 paddd xmm8, xmm12
275 paddd xmm9, xmm13
276 paddd xmm10, xmm14
277 paddd xmm11, xmm15
278 pxor xmm4, xmm8
279 pxor xmm5, xmm9
280 pxor xmm6, xmm10
281 pxor xmm7, xmm11
282 movdqa xmmword ptr [rsp+0x100], xmm8
283 movdqa xmm8, xmm4
284 psrld xmm8, 7
285 pslld xmm4, 25
286 por xmm4, xmm8
287 movdqa xmm8, xmm5
288 psrld xmm8, 7
289 pslld xmm5, 25
290 por xmm5, xmm8
291 movdqa xmm8, xmm6
292 psrld xmm8, 7
293 pslld xmm6, 25
294 por xmm6, xmm8
295 movdqa xmm8, xmm7
296 psrld xmm8, 7
297 pslld xmm7, 25
298 por xmm7, xmm8
299 paddd xmm0, xmmword ptr [rsp+0x80]
300 paddd xmm1, xmmword ptr [rsp+0xA0]
301 paddd xmm2, xmmword ptr [rsp+0xC0]
302 paddd xmm3, xmmword ptr [rsp+0xE0]
303 paddd xmm0, xmm5
304 paddd xmm1, xmm6
305 paddd xmm2, xmm7
306 paddd xmm3, xmm4
307 pxor xmm15, xmm0
308 pxor xmm12, xmm1
309 pxor xmm13, xmm2
310 pxor xmm14, xmm3
311 pshuflw xmm15, xmm15, 0xB1
312 pshufhw xmm15, xmm15, 0xB1
313 pshuflw xmm12, xmm12, 0xB1
314 pshufhw xmm12, xmm12, 0xB1
315 pshuflw xmm13, xmm13, 0xB1
316 pshufhw xmm13, xmm13, 0xB1
317 pshuflw xmm14, xmm14, 0xB1
318 pshufhw xmm14, xmm14, 0xB1
319 paddd xmm10, xmm15
320 paddd xmm11, xmm12
321 movdqa xmm8, xmmword ptr [rsp+0x100]
322 paddd xmm8, xmm13
323 paddd xmm9, xmm14
324 pxor xmm5, xmm10
325 pxor xmm6, xmm11
326 pxor xmm7, xmm8
327 pxor xmm4, xmm9
328 movdqa xmmword ptr [rsp+0x100], xmm8
329 movdqa xmm8, xmm5
330 psrld xmm8, 12
331 pslld xmm5, 20
332 por xmm5, xmm8
333 movdqa xmm8, xmm6
334 psrld xmm8, 12
335 pslld xmm6, 20
336 por xmm6, xmm8
337 movdqa xmm8, xmm7
338 psrld xmm8, 12
339 pslld xmm7, 20
340 por xmm7, xmm8
341 movdqa xmm8, xmm4
342 psrld xmm8, 12
343 pslld xmm4, 20
344 por xmm4, xmm8
345 paddd xmm0, xmmword ptr [rsp+0x90]
346 paddd xmm1, xmmword ptr [rsp+0xB0]
347 paddd xmm2, xmmword ptr [rsp+0xD0]
348 paddd xmm3, xmmword ptr [rsp+0xF0]
349 paddd xmm0, xmm5
350 paddd xmm1, xmm6
351 paddd xmm2, xmm7
352 paddd xmm3, xmm4
353 pxor xmm15, xmm0
354 pxor xmm12, xmm1
355 pxor xmm13, xmm2
356 pxor xmm14, xmm3
357 movdqa xmm8, xmm15
358 psrld xmm15, 8
359 pslld xmm8, 24
360 pxor xmm15, xmm8
361 movdqa xmm8, xmm12
362 psrld xmm12, 8
363 pslld xmm8, 24
364 pxor xmm12, xmm8
365 movdqa xmm8, xmm13
366 psrld xmm13, 8
367 pslld xmm8, 24
368 pxor xmm13, xmm8
369 movdqa xmm8, xmm14
370 psrld xmm14, 8
371 pslld xmm8, 24
372 pxor xmm14, xmm8
373 paddd xmm10, xmm15
374 paddd xmm11, xmm12
375 movdqa xmm8, xmmword ptr [rsp+0x100]
376 paddd xmm8, xmm13
377 paddd xmm9, xmm14
378 pxor xmm5, xmm10
379 pxor xmm6, xmm11
380 pxor xmm7, xmm8
381 pxor xmm4, xmm9
382 movdqa xmmword ptr [rsp+0x100], xmm8
383 movdqa xmm8, xmm5
384 psrld xmm8, 7
385 pslld xmm5, 25
386 por xmm5, xmm8
387 movdqa xmm8, xmm6
388 psrld xmm8, 7
389 pslld xmm6, 25
390 por xmm6, xmm8
391 movdqa xmm8, xmm7
392 psrld xmm8, 7
393 pslld xmm7, 25
394 por xmm7, xmm8
395 movdqa xmm8, xmm4
396 psrld xmm8, 7
397 pslld xmm4, 25
398 por xmm4, xmm8
399 paddd xmm0, xmmword ptr [rsp+0x20]
400 paddd xmm1, xmmword ptr [rsp+0x30]
401 paddd xmm2, xmmword ptr [rsp+0x70]
402 paddd xmm3, xmmword ptr [rsp+0x40]
403 paddd xmm0, xmm4
404 paddd xmm1, xmm5
405 paddd xmm2, xmm6
406 paddd xmm3, xmm7
407 pxor xmm12, xmm0
408 pxor xmm13, xmm1
409 pxor xmm14, xmm2
410 pxor xmm15, xmm3
411 pshuflw xmm12, xmm12, 0xB1
412 pshufhw xmm12, xmm12, 0xB1
413 pshuflw xmm13, xmm13, 0xB1
414 pshufhw xmm13, xmm13, 0xB1
415 pshuflw xmm14, xmm14, 0xB1
416 pshufhw xmm14, xmm14, 0xB1
417 pshuflw xmm15, xmm15, 0xB1
418 pshufhw xmm15, xmm15, 0xB1
419 movdqa xmm8, xmmword ptr [rsp+0x100]
420 paddd xmm8, xmm12
421 paddd xmm9, xmm13
422 paddd xmm10, xmm14
423 paddd xmm11, xmm15
424 pxor xmm4, xmm8
425 pxor xmm5, xmm9
426 pxor xmm6, xmm10
427 pxor xmm7, xmm11
428 movdqa xmmword ptr [rsp+0x100], xmm8
429 movdqa xmm8, xmm4
430 psrld xmm8, 12
431 pslld xmm4, 20
432 por xmm4, xmm8
433 movdqa xmm8, xmm5
434 psrld xmm8, 12
435 pslld xmm5, 20
436 por xmm5, xmm8
437 movdqa xmm8, xmm6
438 psrld xmm8, 12
439 pslld xmm6, 20
440 por xmm6, xmm8
441 movdqa xmm8, xmm7
442 psrld xmm8, 12
443 pslld xmm7, 20
444 por xmm7, xmm8
445 paddd xmm0, xmmword ptr [rsp+0x60]
446 paddd xmm1, xmmword ptr [rsp+0xA0]
447 paddd xmm2, xmmword ptr [rsp]
448 paddd xmm3, xmmword ptr [rsp+0xD0]
449 paddd xmm0, xmm4
450 paddd xmm1, xmm5
451 paddd xmm2, xmm6
452 paddd xmm3, xmm7
453 pxor xmm12, xmm0
454 pxor xmm13, xmm1
455 pxor xmm14, xmm2
456 pxor xmm15, xmm3
457 movdqa xmm8, xmm12
458 psrld xmm12, 8
459 pslld xmm8, 24
460 pxor xmm12, xmm8
461 movdqa xmm8, xmm13
462 psrld xmm13, 8
463 pslld xmm8, 24
464 pxor xmm13, xmm8
465 movdqa xmm8, xmm14
466 psrld xmm14, 8
467 pslld xmm8, 24
468 pxor xmm14, xmm8
469 movdqa xmm8, xmm15
470 psrld xmm15, 8
471 pslld xmm8, 24
472 pxor xmm15, xmm8
473 movdqa xmm8, xmmword ptr [rsp+0x100]
474 paddd xmm8, xmm12
475 paddd xmm9, xmm13
476 paddd xmm10, xmm14
477 paddd xmm11, xmm15
478 pxor xmm4, xmm8
479 pxor xmm5, xmm9
480 pxor xmm6, xmm10
481 pxor xmm7, xmm11
482 movdqa xmmword ptr [rsp+0x100], xmm8
483 movdqa xmm8, xmm4
484 psrld xmm8, 7
485 pslld xmm4, 25
486 por xmm4, xmm8
487 movdqa xmm8, xmm5
488 psrld xmm8, 7
489 pslld xmm5, 25
490 por xmm5, xmm8
491 movdqa xmm8, xmm6
492 psrld xmm8, 7
493 pslld xmm6, 25
494 por xmm6, xmm8
495 movdqa xmm8, xmm7
496 psrld xmm8, 7
497 pslld xmm7, 25
498 por xmm7, xmm8
499 paddd xmm0, xmmword ptr [rsp+0x10]
500 paddd xmm1, xmmword ptr [rsp+0xC0]
501 paddd xmm2, xmmword ptr [rsp+0x90]
502 paddd xmm3, xmmword ptr [rsp+0xF0]
503 paddd xmm0, xmm5
504 paddd xmm1, xmm6
505 paddd xmm2, xmm7
506 paddd xmm3, xmm4
507 pxor xmm15, xmm0
508 pxor xmm12, xmm1
509 pxor xmm13, xmm2
510 pxor xmm14, xmm3
511 pshuflw xmm15, xmm15, 0xB1
512 pshufhw xmm15, xmm15, 0xB1
513 pshuflw xmm12, xmm12, 0xB1
514 pshufhw xmm12, xmm12, 0xB1
515 pshuflw xmm13, xmm13, 0xB1
516 pshufhw xmm13, xmm13, 0xB1
517 pshuflw xmm14, xmm14, 0xB1
518 pshufhw xmm14, xmm14, 0xB1
519 paddd xmm10, xmm15
520 paddd xmm11, xmm12
521 movdqa xmm8, xmmword ptr [rsp+0x100]
522 paddd xmm8, xmm13
523 paddd xmm9, xmm14
524 pxor xmm5, xmm10
525 pxor xmm6, xmm11
526 pxor xmm7, xmm8
527 pxor xmm4, xmm9
528 movdqa xmmword ptr [rsp+0x100], xmm8
529 movdqa xmm8, xmm5
530 psrld xmm8, 12
531 pslld xmm5, 20
532 por xmm5, xmm8
533 movdqa xmm8, xmm6
534 psrld xmm8, 12
535 pslld xmm6, 20
536 por xmm6, xmm8
537 movdqa xmm8, xmm7
538 psrld xmm8, 12
539 pslld xmm7, 20
540 por xmm7, xmm8
541 movdqa xmm8, xmm4
542 psrld xmm8, 12
543 pslld xmm4, 20
544 por xmm4, xmm8
545 paddd xmm0, xmmword ptr [rsp+0xB0]
546 paddd xmm1, xmmword ptr [rsp+0x50]
547 paddd xmm2, xmmword ptr [rsp+0xE0]
548 paddd xmm3, xmmword ptr [rsp+0x80]
549 paddd xmm0, xmm5
550 paddd xmm1, xmm6
551 paddd xmm2, xmm7
552 paddd xmm3, xmm4
553 pxor xmm15, xmm0
554 pxor xmm12, xmm1
555 pxor xmm13, xmm2
556 pxor xmm14, xmm3
557 movdqa xmm8, xmm15
558 psrld xmm15, 8
559 pslld xmm8, 24
560 pxor xmm15, xmm8
561 movdqa xmm8, xmm12
562 psrld xmm12, 8
563 pslld xmm8, 24
564 pxor xmm12, xmm8
565 movdqa xmm8, xmm13
566 psrld xmm13, 8
567 pslld xmm8, 24
568 pxor xmm13, xmm8
569 movdqa xmm8, xmm14
570 psrld xmm14, 8
571 pslld xmm8, 24
572 pxor xmm14, xmm8
573 paddd xmm10, xmm15
574 paddd xmm11, xmm12
575 movdqa xmm8, xmmword ptr [rsp+0x100]
576 paddd xmm8, xmm13
577 paddd xmm9, xmm14
578 pxor xmm5, xmm10
579 pxor xmm6, xmm11
580 pxor xmm7, xmm8
581 pxor xmm4, xmm9
582 movdqa xmmword ptr [rsp+0x100], xmm8
583 movdqa xmm8, xmm5
584 psrld xmm8, 7
585 pslld xmm5, 25
586 por xmm5, xmm8
587 movdqa xmm8, xmm6
588 psrld xmm8, 7
589 pslld xmm6, 25
590 por xmm6, xmm8
591 movdqa xmm8, xmm7
592 psrld xmm8, 7
593 pslld xmm7, 25
594 por xmm7, xmm8
595 movdqa xmm8, xmm4
596 psrld xmm8, 7
597 pslld xmm4, 25
598 por xmm4, xmm8
599 paddd xmm0, xmmword ptr [rsp+0x30]
600 paddd xmm1, xmmword ptr [rsp+0xA0]
601 paddd xmm2, xmmword ptr [rsp+0xD0]
602 paddd xmm3, xmmword ptr [rsp+0x70]
603 paddd xmm0, xmm4
604 paddd xmm1, xmm5
605 paddd xmm2, xmm6
606 paddd xmm3, xmm7
607 pxor xmm12, xmm0
608 pxor xmm13, xmm1
609 pxor xmm14, xmm2
610 pxor xmm15, xmm3
611 pshuflw xmm12, xmm12, 0xB1
612 pshufhw xmm12, xmm12, 0xB1
613 pshuflw xmm13, xmm13, 0xB1
614 pshufhw xmm13, xmm13, 0xB1
615 pshuflw xmm14, xmm14, 0xB1
616 pshufhw xmm14, xmm14, 0xB1
617 pshuflw xmm15, xmm15, 0xB1
618 pshufhw xmm15, xmm15, 0xB1
619 movdqa xmm8, xmmword ptr [rsp+0x100]
620 paddd xmm8, xmm12
621 paddd xmm9, xmm13
622 paddd xmm10, xmm14
623 paddd xmm11, xmm15
624 pxor xmm4, xmm8
625 pxor xmm5, xmm9
626 pxor xmm6, xmm10
627 pxor xmm7, xmm11
628 movdqa xmmword ptr [rsp+0x100], xmm8
629 movdqa xmm8, xmm4
630 psrld xmm8, 12
631 pslld xmm4, 20
632 por xmm4, xmm8
633 movdqa xmm8, xmm5
634 psrld xmm8, 12
635 pslld xmm5, 20
636 por xmm5, xmm8
637 movdqa xmm8, xmm6
638 psrld xmm8, 12
639 pslld xmm6, 20
640 por xmm6, xmm8
641 movdqa xmm8, xmm7
642 psrld xmm8, 12
643 pslld xmm7, 20
644 por xmm7, xmm8
645 paddd xmm0, xmmword ptr [rsp+0x40]
646 paddd xmm1, xmmword ptr [rsp+0xC0]
647 paddd xmm2, xmmword ptr [rsp+0x20]
648 paddd xmm3, xmmword ptr [rsp+0xE0]
649 paddd xmm0, xmm4
650 paddd xmm1, xmm5
651 paddd xmm2, xmm6
652 paddd xmm3, xmm7
653 pxor xmm12, xmm0
654 pxor xmm13, xmm1
655 pxor xmm14, xmm2
656 pxor xmm15, xmm3
657 movdqa xmm8, xmm12
658 psrld xmm12, 8
659 pslld xmm8, 24
660 pxor xmm12, xmm8
661 movdqa xmm8, xmm13
662 psrld xmm13, 8
663 pslld xmm8, 24
664 pxor xmm13, xmm8
665 movdqa xmm8, xmm14
666 psrld xmm14, 8
667 pslld xmm8, 24
668 pxor xmm14, xmm8
669 movdqa xmm8, xmm15
670 psrld xmm15, 8
671 pslld xmm8, 24
672 pxor xmm15, xmm8
673 movdqa xmm8, xmmword ptr [rsp+0x100]
674 paddd xmm8, xmm12
675 paddd xmm9, xmm13
676 paddd xmm10, xmm14
677 paddd xmm11, xmm15
678 pxor xmm4, xmm8
679 pxor xmm5, xmm9
680 pxor xmm6, xmm10
681 pxor xmm7, xmm11
682 movdqa xmmword ptr [rsp+0x100], xmm8
683 movdqa xmm8, xmm4
684 psrld xmm8, 7
685 pslld xmm4, 25
686 por xmm4, xmm8
687 movdqa xmm8, xmm5
688 psrld xmm8, 7
689 pslld xmm5, 25
690 por xmm5, xmm8
691 movdqa xmm8, xmm6
692 psrld xmm8, 7
693 pslld xmm6, 25
694 por xmm6, xmm8
695 movdqa xmm8, xmm7
696 psrld xmm8, 7
697 pslld xmm7, 25
698 por xmm7, xmm8
699 paddd xmm0, xmmword ptr [rsp+0x60]
700 paddd xmm1, xmmword ptr [rsp+0x90]
701 paddd xmm2, xmmword ptr [rsp+0xB0]
702 paddd xmm3, xmmword ptr [rsp+0x80]
703 paddd xmm0, xmm5
704 paddd xmm1, xmm6
705 paddd xmm2, xmm7
706 paddd xmm3, xmm4
707 pxor xmm15, xmm0
708 pxor xmm12, xmm1
709 pxor xmm13, xmm2
710 pxor xmm14, xmm3
711 pshuflw xmm15, xmm15, 0xB1
712 pshufhw xmm15, xmm15, 0xB1
713 pshuflw xmm12, xmm12, 0xB1
714 pshufhw xmm12, xmm12, 0xB1
715 pshuflw xmm13, xmm13, 0xB1
716 pshufhw xmm13, xmm13, 0xB1
717 pshuflw xmm14, xmm14, 0xB1
718 pshufhw xmm14, xmm14, 0xB1
719 paddd xmm10, xmm15
720 paddd xmm11, xmm12
721 movdqa xmm8, xmmword ptr [rsp+0x100]
722 paddd xmm8, xmm13
723 paddd xmm9, xmm14
724 pxor xmm5, xmm10
725 pxor xmm6, xmm11
726 pxor xmm7, xmm8
727 pxor xmm4, xmm9
728 movdqa xmmword ptr [rsp+0x100], xmm8
729 movdqa xmm8, xmm5
730 psrld xmm8, 12
731 pslld xmm5, 20
732 por xmm5, xmm8
733 movdqa xmm8, xmm6
734 psrld xmm8, 12
735 pslld xmm6, 20
736 por xmm6, xmm8
737 movdqa xmm8, xmm7
738 psrld xmm8, 12
739 pslld xmm7, 20
740 por xmm7, xmm8
741 movdqa xmm8, xmm4
742 psrld xmm8, 12
743 pslld xmm4, 20
744 por xmm4, xmm8
745 paddd xmm0, xmmword ptr [rsp+0x50]
746 paddd xmm1, xmmword ptr [rsp]
747 paddd xmm2, xmmword ptr [rsp+0xF0]
748 paddd xmm3, xmmword ptr [rsp+0x10]
749 paddd xmm0, xmm5
750 paddd xmm1, xmm6
751 paddd xmm2, xmm7
752 paddd xmm3, xmm4
753 pxor xmm15, xmm0
754 pxor xmm12, xmm1
755 pxor xmm13, xmm2
756 pxor xmm14, xmm3
757 movdqa xmm8, xmm15
758 psrld xmm15, 8
759 pslld xmm8, 24
760 pxor xmm15, xmm8
761 movdqa xmm8, xmm12
762 psrld xmm12, 8
763 pslld xmm8, 24
764 pxor xmm12, xmm8
765 movdqa xmm8, xmm13
766 psrld xmm13, 8
767 pslld xmm8, 24
768 pxor xmm13, xmm8
769 movdqa xmm8, xmm14
770 psrld xmm14, 8
771 pslld xmm8, 24
772 pxor xmm14, xmm8
773 paddd xmm10, xmm15
774 paddd xmm11, xmm12
775 movdqa xmm8, xmmword ptr [rsp+0x100]
776 paddd xmm8, xmm13
777 paddd xmm9, xmm14
778 pxor xmm5, xmm10
779 pxor xmm6, xmm11
780 pxor xmm7, xmm8
781 pxor xmm4, xmm9
782 movdqa xmmword ptr [rsp+0x100], xmm8
783 movdqa xmm8, xmm5
784 psrld xmm8, 7
785 pslld xmm5, 25
786 por xmm5, xmm8
787 movdqa xmm8, xmm6
788 psrld xmm8, 7
789 pslld xmm6, 25
790 por xmm6, xmm8
791 movdqa xmm8, xmm7
792 psrld xmm8, 7
793 pslld xmm7, 25
794 por xmm7, xmm8
795 movdqa xmm8, xmm4
796 psrld xmm8, 7
797 pslld xmm4, 25
798 por xmm4, xmm8
799 paddd xmm0, xmmword ptr [rsp+0xA0]
800 paddd xmm1, xmmword ptr [rsp+0xC0]
801 paddd xmm2, xmmword ptr [rsp+0xE0]
802 paddd xmm3, xmmword ptr [rsp+0xD0]
803 paddd xmm0, xmm4
804 paddd xmm1, xmm5
805 paddd xmm2, xmm6
806 paddd xmm3, xmm7
807 pxor xmm12, xmm0
808 pxor xmm13, xmm1
809 pxor xmm14, xmm2
810 pxor xmm15, xmm3
811 pshuflw xmm12, xmm12, 0xB1
812 pshufhw xmm12, xmm12, 0xB1
813 pshuflw xmm13, xmm13, 0xB1
814 pshufhw xmm13, xmm13, 0xB1
815 pshuflw xmm14, xmm14, 0xB1
816 pshufhw xmm14, xmm14, 0xB1
817 pshuflw xmm15, xmm15, 0xB1
818 pshufhw xmm15, xmm15, 0xB1
819 movdqa xmm8, xmmword ptr [rsp+0x100]
820 paddd xmm8, xmm12
821 paddd xmm9, xmm13
822 paddd xmm10, xmm14
823 paddd xmm11, xmm15
824 pxor xmm4, xmm8
825 pxor xmm5, xmm9
826 pxor xmm6, xmm10
827 pxor xmm7, xmm11
828 movdqa xmmword ptr [rsp+0x100], xmm8
829 movdqa xmm8, xmm4
830 psrld xmm8, 12
831 pslld xmm4, 20
832 por xmm4, xmm8
833 movdqa xmm8, xmm5
834 psrld xmm8, 12
835 pslld xmm5, 20
836 por xmm5, xmm8
837 movdqa xmm8, xmm6
838 psrld xmm8, 12
839 pslld xmm6, 20
840 por xmm6, xmm8
841 movdqa xmm8, xmm7
842 psrld xmm8, 12
843 pslld xmm7, 20
844 por xmm7, xmm8
845 paddd xmm0, xmmword ptr [rsp+0x70]
846 paddd xmm1, xmmword ptr [rsp+0x90]
847 paddd xmm2, xmmword ptr [rsp+0x30]
848 paddd xmm3, xmmword ptr [rsp+0xF0]
849 paddd xmm0, xmm4
850 paddd xmm1, xmm5
851 paddd xmm2, xmm6
852 paddd xmm3, xmm7
853 pxor xmm12, xmm0
854 pxor xmm13, xmm1
855 pxor xmm14, xmm2
856 pxor xmm15, xmm3
857 movdqa xmm8, xmm12
858 psrld xmm12, 8
859 pslld xmm8, 24
860 pxor xmm12, xmm8
861 movdqa xmm8, xmm13
862 psrld xmm13, 8
863 pslld xmm8, 24
864 pxor xmm13, xmm8
865 movdqa xmm8, xmm14
866 psrld xmm14, 8
867 pslld xmm8, 24
868 pxor xmm14, xmm8
869 movdqa xmm8, xmm15
870 psrld xmm15, 8
871 pslld xmm8, 24
872 pxor xmm15, xmm8
873 movdqa xmm8, xmmword ptr [rsp+0x100]
874 paddd xmm8, xmm12
875 paddd xmm9, xmm13
876 paddd xmm10, xmm14
877 paddd xmm11, xmm15
878 pxor xmm4, xmm8
879 pxor xmm5, xmm9
880 pxor xmm6, xmm10
881 pxor xmm7, xmm11
882 movdqa xmmword ptr [rsp+0x100], xmm8
883 movdqa xmm8, xmm4
884 psrld xmm8, 7
885 pslld xmm4, 25
886 por xmm4, xmm8
887 movdqa xmm8, xmm5
888 psrld xmm8, 7
889 pslld xmm5, 25
890 por xmm5, xmm8
891 movdqa xmm8, xmm6
892 psrld xmm8, 7
893 pslld xmm6, 25
894 por xmm6, xmm8
895 movdqa xmm8, xmm7
896 psrld xmm8, 7
897 pslld xmm7, 25
898 por xmm7, xmm8
899 paddd xmm0, xmmword ptr [rsp+0x40]
900 paddd xmm1, xmmword ptr [rsp+0xB0]
901 paddd xmm2, xmmword ptr [rsp+0x50]
902 paddd xmm3, xmmword ptr [rsp+0x10]
903 paddd xmm0, xmm5
904 paddd xmm1, xmm6
905 paddd xmm2, xmm7
906 paddd xmm3, xmm4
907 pxor xmm15, xmm0
908 pxor xmm12, xmm1
909 pxor xmm13, xmm2
910 pxor xmm14, xmm3
911 pshuflw xmm15, xmm15, 0xB1
912 pshufhw xmm15, xmm15, 0xB1
913 pshuflw xmm12, xmm12, 0xB1
914 pshufhw xmm12, xmm12, 0xB1
915 pshuflw xmm13, xmm13, 0xB1
916 pshufhw xmm13, xmm13, 0xB1
917 pshuflw xmm14, xmm14, 0xB1
918 pshufhw xmm14, xmm14, 0xB1
919 paddd xmm10, xmm15
920 paddd xmm11, xmm12
921 movdqa xmm8, xmmword ptr [rsp+0x100]
922 paddd xmm8, xmm13
923 paddd xmm9, xmm14
924 pxor xmm5, xmm10
925 pxor xmm6, xmm11
926 pxor xmm7, xmm8
927 pxor xmm4, xmm9
928 movdqa xmmword ptr [rsp+0x100], xmm8
929 movdqa xmm8, xmm5
930 psrld xmm8, 12
931 pslld xmm5, 20
932 por xmm5, xmm8
933 movdqa xmm8, xmm6
934 psrld xmm8, 12
935 pslld xmm6, 20
936 por xmm6, xmm8
937 movdqa xmm8, xmm7
938 psrld xmm8, 12
939 pslld xmm7, 20
940 por xmm7, xmm8
941 movdqa xmm8, xmm4
942 psrld xmm8, 12
943 pslld xmm4, 20
944 por xmm4, xmm8
945 paddd xmm0, xmmword ptr [rsp]
946 paddd xmm1, xmmword ptr [rsp+0x20]
947 paddd xmm2, xmmword ptr [rsp+0x80]
948 paddd xmm3, xmmword ptr [rsp+0x60]
949 paddd xmm0, xmm5
950 paddd xmm1, xmm6
951 paddd xmm2, xmm7
952 paddd xmm3, xmm4
953 pxor xmm15, xmm0
954 pxor xmm12, xmm1
955 pxor xmm13, xmm2
956 pxor xmm14, xmm3
957 movdqa xmm8, xmm15
958 psrld xmm15, 8
959 pslld xmm8, 24
960 pxor xmm15, xmm8
961 movdqa xmm8, xmm12
962 psrld xmm12, 8
963 pslld xmm8, 24
964 pxor xmm12, xmm8
965 movdqa xmm8, xmm13
966 psrld xmm13, 8
967 pslld xmm8, 24
968 pxor xmm13, xmm8
969 movdqa xmm8, xmm14
970 psrld xmm14, 8
971 pslld xmm8, 24
972 pxor xmm14, xmm8
973 paddd xmm10, xmm15
974 paddd xmm11, xmm12
975 movdqa xmm8, xmmword ptr [rsp+0x100]
976 paddd xmm8, xmm13
977 paddd xmm9, xmm14
978 pxor xmm5, xmm10
979 pxor xmm6, xmm11
980 pxor xmm7, xmm8
981 pxor xmm4, xmm9
982 movdqa xmmword ptr [rsp+0x100], xmm8
983 movdqa xmm8, xmm5
984 psrld xmm8, 7
985 pslld xmm5, 25
986 por xmm5, xmm8
987 movdqa xmm8, xmm6
988 psrld xmm8, 7
989 pslld xmm6, 25
990 por xmm6, xmm8
991 movdqa xmm8, xmm7
992 psrld xmm8, 7
993 pslld xmm7, 25
994 por xmm7, xmm8
995 movdqa xmm8, xmm4
996 psrld xmm8, 7
997 pslld xmm4, 25
998 por xmm4, xmm8
999 paddd xmm0, xmmword ptr [rsp+0xC0]
1000 paddd xmm1, xmmword ptr [rsp+0x90]
1001 paddd xmm2, xmmword ptr [rsp+0xF0]
1002 paddd xmm3, xmmword ptr [rsp+0xE0]
1003 paddd xmm0, xmm4
1004 paddd xmm1, xmm5
1005 paddd xmm2, xmm6
1006 paddd xmm3, xmm7
1007 pxor xmm12, xmm0
1008 pxor xmm13, xmm1
1009 pxor xmm14, xmm2
1010 pxor xmm15, xmm3
1011 pshuflw xmm12, xmm12, 0xB1
1012 pshufhw xmm12, xmm12, 0xB1
1013 pshuflw xmm13, xmm13, 0xB1
1014 pshufhw xmm13, xmm13, 0xB1
1015 pshuflw xmm14, xmm14, 0xB1
1016 pshufhw xmm14, xmm14, 0xB1
1017 pshuflw xmm15, xmm15, 0xB1
1018 pshufhw xmm15, xmm15, 0xB1
1019 movdqa xmm8, xmmword ptr [rsp+0x100]
1020 paddd xmm8, xmm12
1021 paddd xmm9, xmm13
1022 paddd xmm10, xmm14
1023 paddd xmm11, xmm15
1024 pxor xmm4, xmm8
1025 pxor xmm5, xmm9
1026 pxor xmm6, xmm10
1027 pxor xmm7, xmm11
1028 movdqa xmmword ptr [rsp+0x100], xmm8
1029 movdqa xmm8, xmm4
1030 psrld xmm8, 12
1031 pslld xmm4, 20
1032 por xmm4, xmm8
1033 movdqa xmm8, xmm5
1034 psrld xmm8, 12
1035 pslld xmm5, 20
1036 por xmm5, xmm8
1037 movdqa xmm8, xmm6
1038 psrld xmm8, 12
1039 pslld xmm6, 20
1040 por xmm6, xmm8
1041 movdqa xmm8, xmm7
1042 psrld xmm8, 12
1043 pslld xmm7, 20
1044 por xmm7, xmm8
1045 paddd xmm0, xmmword ptr [rsp+0xD0]
1046 paddd xmm1, xmmword ptr [rsp+0xB0]
1047 paddd xmm2, xmmword ptr [rsp+0xA0]
1048 paddd xmm3, xmmword ptr [rsp+0x80]
1049 paddd xmm0, xmm4
1050 paddd xmm1, xmm5
1051 paddd xmm2, xmm6
1052 paddd xmm3, xmm7
1053 pxor xmm12, xmm0
1054 pxor xmm13, xmm1
1055 pxor xmm14, xmm2
1056 pxor xmm15, xmm3
1057 movdqa xmm8, xmm12
1058 psrld xmm12, 8
1059 pslld xmm8, 24
1060 pxor xmm12, xmm8
1061 movdqa xmm8, xmm13
1062 psrld xmm13, 8
1063 pslld xmm8, 24
1064 pxor xmm13, xmm8
1065 movdqa xmm8, xmm14
1066 psrld xmm14, 8
1067 pslld xmm8, 24
1068 pxor xmm14, xmm8
1069 movdqa xmm8, xmm15
1070 psrld xmm15, 8
1071 pslld xmm8, 24
1072 pxor xmm15, xmm8
1073 movdqa xmm8, xmmword ptr [rsp+0x100]
1074 paddd xmm8, xmm12
1075 paddd xmm9, xmm13
1076 paddd xmm10, xmm14
1077 paddd xmm11, xmm15
1078 pxor xmm4, xmm8
1079 pxor xmm5, xmm9
1080 pxor xmm6, xmm10
1081 pxor xmm7, xmm11
1082 movdqa xmmword ptr [rsp+0x100], xmm8
1083 movdqa xmm8, xmm4
1084 psrld xmm8, 7
1085 pslld xmm4, 25
1086 por xmm4, xmm8
1087 movdqa xmm8, xmm5
1088 psrld xmm8, 7
1089 pslld xmm5, 25
1090 por xmm5, xmm8
1091 movdqa xmm8, xmm6
1092 psrld xmm8, 7
1093 pslld xmm6, 25
1094 por xmm6, xmm8
1095 movdqa xmm8, xmm7
1096 psrld xmm8, 7
1097 pslld xmm7, 25
1098 por xmm7, xmm8
1099 paddd xmm0, xmmword ptr [rsp+0x70]
1100 paddd xmm1, xmmword ptr [rsp+0x50]
1101 paddd xmm2, xmmword ptr [rsp]
1102 paddd xmm3, xmmword ptr [rsp+0x60]
1103 paddd xmm0, xmm5
1104 paddd xmm1, xmm6
1105 paddd xmm2, xmm7
1106 paddd xmm3, xmm4
1107 pxor xmm15, xmm0
1108 pxor xmm12, xmm1
1109 pxor xmm13, xmm2
1110 pxor xmm14, xmm3
1111 pshuflw xmm15, xmm15, 0xB1
1112 pshufhw xmm15, xmm15, 0xB1
1113 pshuflw xmm12, xmm12, 0xB1
1114 pshufhw xmm12, xmm12, 0xB1
1115 pshuflw xmm13, xmm13, 0xB1
1116 pshufhw xmm13, xmm13, 0xB1
1117 pshuflw xmm14, xmm14, 0xB1
1118 pshufhw xmm14, xmm14, 0xB1
1119 paddd xmm10, xmm15
1120 paddd xmm11, xmm12
1121 movdqa xmm8, xmmword ptr [rsp+0x100]
1122 paddd xmm8, xmm13
1123 paddd xmm9, xmm14
1124 pxor xmm5, xmm10
1125 pxor xmm6, xmm11
1126 pxor xmm7, xmm8
1127 pxor xmm4, xmm9
1128 movdqa xmmword ptr [rsp+0x100], xmm8
1129 movdqa xmm8, xmm5
1130 psrld xmm8, 12
1131 pslld xmm5, 20
1132 por xmm5, xmm8
1133 movdqa xmm8, xmm6
1134 psrld xmm8, 12
1135 pslld xmm6, 20
1136 por xmm6, xmm8
1137 movdqa xmm8, xmm7
1138 psrld xmm8, 12
1139 pslld xmm7, 20
1140 por xmm7, xmm8
1141 movdqa xmm8, xmm4
1142 psrld xmm8, 12
1143 pslld xmm4, 20
1144 por xmm4, xmm8
1145 paddd xmm0, xmmword ptr [rsp+0x20]
1146 paddd xmm1, xmmword ptr [rsp+0x30]
1147 paddd xmm2, xmmword ptr [rsp+0x10]
1148 paddd xmm3, xmmword ptr [rsp+0x40]
1149 paddd xmm0, xmm5
1150 paddd xmm1, xmm6
1151 paddd xmm2, xmm7
1152 paddd xmm3, xmm4
1153 pxor xmm15, xmm0
1154 pxor xmm12, xmm1
1155 pxor xmm13, xmm2
1156 pxor xmm14, xmm3
1157 movdqa xmm8, xmm15
1158 psrld xmm15, 8
1159 pslld xmm8, 24
1160 pxor xmm15, xmm8
1161 movdqa xmm8, xmm12
1162 psrld xmm12, 8
1163 pslld xmm8, 24
1164 pxor xmm12, xmm8
1165 movdqa xmm8, xmm13
1166 psrld xmm13, 8
1167 pslld xmm8, 24
1168 pxor xmm13, xmm8
1169 movdqa xmm8, xmm14
1170 psrld xmm14, 8
1171 pslld xmm8, 24
1172 pxor xmm14, xmm8
1173 paddd xmm10, xmm15
1174 paddd xmm11, xmm12
1175 movdqa xmm8, xmmword ptr [rsp+0x100]
1176 paddd xmm8, xmm13
1177 paddd xmm9, xmm14
1178 pxor xmm5, xmm10
1179 pxor xmm6, xmm11
1180 pxor xmm7, xmm8
1181 pxor xmm4, xmm9
1182 movdqa xmmword ptr [rsp+0x100], xmm8
1183 movdqa xmm8, xmm5
1184 psrld xmm8, 7
1185 pslld xmm5, 25
1186 por xmm5, xmm8
1187 movdqa xmm8, xmm6
1188 psrld xmm8, 7
1189 pslld xmm6, 25
1190 por xmm6, xmm8
1191 movdqa xmm8, xmm7
1192 psrld xmm8, 7
1193 pslld xmm7, 25
1194 por xmm7, xmm8
1195 movdqa xmm8, xmm4
1196 psrld xmm8, 7
1197 pslld xmm4, 25
1198 por xmm4, xmm8
1199 paddd xmm0, xmmword ptr [rsp+0x90]
1200 paddd xmm1, xmmword ptr [rsp+0xB0]
1201 paddd xmm2, xmmword ptr [rsp+0x80]
1202 paddd xmm3, xmmword ptr [rsp+0xF0]
1203 paddd xmm0, xmm4
1204 paddd xmm1, xmm5
1205 paddd xmm2, xmm6
1206 paddd xmm3, xmm7
1207 pxor xmm12, xmm0
1208 pxor xmm13, xmm1
1209 pxor xmm14, xmm2
1210 pxor xmm15, xmm3
1211 pshuflw xmm12, xmm12, 0xB1
1212 pshufhw xmm12, xmm12, 0xB1
1213 pshuflw xmm13, xmm13, 0xB1
1214 pshufhw xmm13, xmm13, 0xB1
1215 pshuflw xmm14, xmm14, 0xB1
1216 pshufhw xmm14, xmm14, 0xB1
1217 pshuflw xmm15, xmm15, 0xB1
1218 pshufhw xmm15, xmm15, 0xB1
1219 movdqa xmm8, xmmword ptr [rsp+0x100]
1220 paddd xmm8, xmm12
1221 paddd xmm9, xmm13
1222 paddd xmm10, xmm14
1223 paddd xmm11, xmm15
1224 pxor xmm4, xmm8
1225 pxor xmm5, xmm9
1226 pxor xmm6, xmm10
1227 pxor xmm7, xmm11
1228 movdqa xmmword ptr [rsp+0x100], xmm8
1229 movdqa xmm8, xmm4
1230 psrld xmm8, 12
1231 pslld xmm4, 20
1232 por xmm4, xmm8
1233 movdqa xmm8, xmm5
1234 psrld xmm8, 12
1235 pslld xmm5, 20
1236 por xmm5, xmm8
1237 movdqa xmm8, xmm6
1238 psrld xmm8, 12
1239 pslld xmm6, 20
1240 por xmm6, xmm8
1241 movdqa xmm8, xmm7
1242 psrld xmm8, 12
1243 pslld xmm7, 20
1244 por xmm7, xmm8
1245 paddd xmm0, xmmword ptr [rsp+0xE0]
1246 paddd xmm1, xmmword ptr [rsp+0x50]
1247 paddd xmm2, xmmword ptr [rsp+0xC0]
1248 paddd xmm3, xmmword ptr [rsp+0x10]
1249 paddd xmm0, xmm4
1250 paddd xmm1, xmm5
1251 paddd xmm2, xmm6
1252 paddd xmm3, xmm7
1253 pxor xmm12, xmm0
1254 pxor xmm13, xmm1
1255 pxor xmm14, xmm2
1256 pxor xmm15, xmm3
1257 movdqa xmm8, xmm12
1258 psrld xmm12, 8
1259 pslld xmm8, 24
1260 pxor xmm12, xmm8
1261 movdqa xmm8, xmm13
1262 psrld xmm13, 8
1263 pslld xmm8, 24
1264 pxor xmm13, xmm8
1265 movdqa xmm8, xmm14
1266 psrld xmm14, 8
1267 pslld xmm8, 24
1268 pxor xmm14, xmm8
1269 movdqa xmm8, xmm15
1270 psrld xmm15, 8
1271 pslld xmm8, 24
1272 pxor xmm15, xmm8
1273 movdqa xmm8, xmmword ptr [rsp+0x100]
1274 paddd xmm8, xmm12
1275 paddd xmm9, xmm13
1276 paddd xmm10, xmm14
1277 paddd xmm11, xmm15
1278 pxor xmm4, xmm8
1279 pxor xmm5, xmm9
1280 pxor xmm6, xmm10
1281 pxor xmm7, xmm11
1282 movdqa xmmword ptr [rsp+0x100], xmm8
1283 movdqa xmm8, xmm4
1284 psrld xmm8, 7
1285 pslld xmm4, 25
1286 por xmm4, xmm8
1287 movdqa xmm8, xmm5
1288 psrld xmm8, 7
1289 pslld xmm5, 25
1290 por xmm5, xmm8
1291 movdqa xmm8, xmm6
1292 psrld xmm8, 7
1293 pslld xmm6, 25
1294 por xmm6, xmm8
1295 movdqa xmm8, xmm7
1296 psrld xmm8, 7
1297 pslld xmm7, 25
1298 por xmm7, xmm8
1299 paddd xmm0, xmmword ptr [rsp+0xD0]
1300 paddd xmm1, xmmword ptr [rsp]
1301 paddd xmm2, xmmword ptr [rsp+0x20]
1302 paddd xmm3, xmmword ptr [rsp+0x40]
1303 paddd xmm0, xmm5
1304 paddd xmm1, xmm6
1305 paddd xmm2, xmm7
1306 paddd xmm3, xmm4
1307 pxor xmm15, xmm0
1308 pxor xmm12, xmm1
1309 pxor xmm13, xmm2
1310 pxor xmm14, xmm3
1311 pshuflw xmm15, xmm15, 0xB1
1312 pshufhw xmm15, xmm15, 0xB1
1313 pshuflw xmm12, xmm12, 0xB1
1314 pshufhw xmm12, xmm12, 0xB1
1315 pshuflw xmm13, xmm13, 0xB1
1316 pshufhw xmm13, xmm13, 0xB1
1317 pshuflw xmm14, xmm14, 0xB1
1318 pshufhw xmm14, xmm14, 0xB1
1319 paddd xmm10, xmm15
1320 paddd xmm11, xmm12
1321 movdqa xmm8, xmmword ptr [rsp+0x100]
1322 paddd xmm8, xmm13
1323 paddd xmm9, xmm14
1324 pxor xmm5, xmm10
1325 pxor xmm6, xmm11
1326 pxor xmm7, xmm8
1327 pxor xmm4, xmm9
1328 movdqa xmmword ptr [rsp+0x100], xmm8
1329 movdqa xmm8, xmm5
1330 psrld xmm8, 12
1331 pslld xmm5, 20
1332 por xmm5, xmm8
1333 movdqa xmm8, xmm6
1334 psrld xmm8, 12
1335 pslld xmm6, 20
1336 por xmm6, xmm8
1337 movdqa xmm8, xmm7
1338 psrld xmm8, 12
1339 pslld xmm7, 20
1340 por xmm7, xmm8
1341 movdqa xmm8, xmm4
1342 psrld xmm8, 12
1343 pslld xmm4, 20
1344 por xmm4, xmm8
1345 paddd xmm0, xmmword ptr [rsp+0x30]
1346 paddd xmm1, xmmword ptr [rsp+0xA0]
1347 paddd xmm2, xmmword ptr [rsp+0x60]
1348 paddd xmm3, xmmword ptr [rsp+0x70]
1349 paddd xmm0, xmm5
1350 paddd xmm1, xmm6
1351 paddd xmm2, xmm7
1352 paddd xmm3, xmm4
1353 pxor xmm15, xmm0
1354 pxor xmm12, xmm1
1355 pxor xmm13, xmm2
1356 pxor xmm14, xmm3
1357 movdqa xmm8, xmm15
1358 psrld xmm15, 8
1359 pslld xmm8, 24
1360 pxor xmm15, xmm8
1361 movdqa xmm8, xmm12
1362 psrld xmm12, 8
1363 pslld xmm8, 24
1364 pxor xmm12, xmm8
1365 movdqa xmm8, xmm13
1366 psrld xmm13, 8
1367 pslld xmm8, 24
1368 pxor xmm13, xmm8
1369 movdqa xmm8, xmm14
1370 psrld xmm14, 8
1371 pslld xmm8, 24
1372 pxor xmm14, xmm8
1373 paddd xmm10, xmm15
1374 paddd xmm11, xmm12
1375 movdqa xmm8, xmmword ptr [rsp+0x100]
1376 paddd xmm8, xmm13
1377 paddd xmm9, xmm14
1378 pxor xmm5, xmm10
1379 pxor xmm6, xmm11
1380 pxor xmm7, xmm8
1381 pxor xmm4, xmm9
1382 movdqa xmmword ptr [rsp+0x100], xmm8
1383 movdqa xmm8, xmm5
1384 psrld xmm8, 7
1385 pslld xmm5, 25
1386 por xmm5, xmm8
1387 movdqa xmm8, xmm6
1388 psrld xmm8, 7
1389 pslld xmm6, 25
1390 por xmm6, xmm8
1391 movdqa xmm8, xmm7
1392 psrld xmm8, 7
1393 pslld xmm7, 25
1394 por xmm7, xmm8
1395 movdqa xmm8, xmm4
1396 psrld xmm8, 7
1397 pslld xmm4, 25
1398 por xmm4, xmm8
1399 paddd xmm0, xmmword ptr [rsp+0xB0]
1400 paddd xmm1, xmmword ptr [rsp+0x50]
1401 paddd xmm2, xmmword ptr [rsp+0x10]
1402 paddd xmm3, xmmword ptr [rsp+0x80]
1403 paddd xmm0, xmm4
1404 paddd xmm1, xmm5
1405 paddd xmm2, xmm6
1406 paddd xmm3, xmm7
1407 pxor xmm12, xmm0
1408 pxor xmm13, xmm1
1409 pxor xmm14, xmm2
1410 pxor xmm15, xmm3
1411 pshuflw xmm12, xmm12, 0xB1
1412 pshufhw xmm12, xmm12, 0xB1
1413 pshuflw xmm13, xmm13, 0xB1
1414 pshufhw xmm13, xmm13, 0xB1
1415 pshuflw xmm14, xmm14, 0xB1
1416 pshufhw xmm14, xmm14, 0xB1
1417 pshuflw xmm15, xmm15, 0xB1
1418 pshufhw xmm15, xmm15, 0xB1
1419 movdqa xmm8, xmmword ptr [rsp+0x100]
1420 paddd xmm8, xmm12
1421 paddd xmm9, xmm13
1422 paddd xmm10, xmm14
1423 paddd xmm11, xmm15
1424 pxor xmm4, xmm8
1425 pxor xmm5, xmm9
1426 pxor xmm6, xmm10
1427 pxor xmm7, xmm11
1428 movdqa xmmword ptr [rsp+0x100], xmm8
1429 movdqa xmm8, xmm4
1430 psrld xmm8, 12
1431 pslld xmm4, 20
1432 por xmm4, xmm8
1433 movdqa xmm8, xmm5
1434 psrld xmm8, 12
1435 pslld xmm5, 20
1436 por xmm5, xmm8
1437 movdqa xmm8, xmm6
1438 psrld xmm8, 12
1439 pslld xmm6, 20
1440 por xmm6, xmm8
1441 movdqa xmm8, xmm7
1442 psrld xmm8, 12
1443 pslld xmm7, 20
1444 por xmm7, xmm8
1445 paddd xmm0, xmmword ptr [rsp+0xF0]
1446 paddd xmm1, xmmword ptr [rsp]
1447 paddd xmm2, xmmword ptr [rsp+0x90]
1448 paddd xmm3, xmmword ptr [rsp+0x60]
1449 paddd xmm0, xmm4
1450 paddd xmm1, xmm5
1451 paddd xmm2, xmm6
1452 paddd xmm3, xmm7
1453 pxor xmm12, xmm0
1454 pxor xmm13, xmm1
1455 pxor xmm14, xmm2
1456 pxor xmm15, xmm3
1457 movdqa xmm8, xmm12
1458 psrld xmm12, 8
1459 pslld xmm8, 24
1460 pxor xmm12, xmm8
1461 movdqa xmm8, xmm13
1462 psrld xmm13, 8
1463 pslld xmm8, 24
1464 pxor xmm13, xmm8
1465 movdqa xmm8, xmm14
1466 psrld xmm14, 8
1467 pslld xmm8, 24
1468 pxor xmm14, xmm8
1469 movdqa xmm8, xmm15
1470 psrld xmm15, 8
1471 pslld xmm8, 24
1472 pxor xmm15, xmm8
1473 movdqa xmm8, xmmword ptr [rsp+0x100]
1474 paddd xmm8, xmm12
1475 paddd xmm9, xmm13
1476 paddd xmm10, xmm14
1477 paddd xmm11, xmm15
1478 pxor xmm4, xmm8
1479 pxor xmm5, xmm9
1480 pxor xmm6, xmm10
1481 pxor xmm7, xmm11
1482 movdqa xmmword ptr [rsp+0x100], xmm8
1483 movdqa xmm8, xmm4
1484 psrld xmm8, 7
1485 pslld xmm4, 25
1486 por xmm4, xmm8
1487 movdqa xmm8, xmm5
1488 psrld xmm8, 7
1489 pslld xmm5, 25
1490 por xmm5, xmm8
1491 movdqa xmm8, xmm6
1492 psrld xmm8, 7
1493 pslld xmm6, 25
1494 por xmm6, xmm8
1495 movdqa xmm8, xmm7
1496 psrld xmm8, 7
1497 pslld xmm7, 25
1498 por xmm7, xmm8
1499 paddd xmm0, xmmword ptr [rsp+0xE0]
1500 paddd xmm1, xmmword ptr [rsp+0x20]
1501 paddd xmm2, xmmword ptr [rsp+0x30]
1502 paddd xmm3, xmmword ptr [rsp+0x70]
1503 paddd xmm0, xmm5
1504 paddd xmm1, xmm6
1505 paddd xmm2, xmm7
1506 paddd xmm3, xmm4
1507 pxor xmm15, xmm0
1508 pxor xmm12, xmm1
1509 pxor xmm13, xmm2
1510 pxor xmm14, xmm3
1511 pshuflw xmm15, xmm15, 0xB1
1512 pshufhw xmm15, xmm15, 0xB1
1513 pshuflw xmm12, xmm12, 0xB1
1514 pshufhw xmm12, xmm12, 0xB1
1515 pshuflw xmm13, xmm13, 0xB1
1516 pshufhw xmm13, xmm13, 0xB1
1517 pshuflw xmm14, xmm14, 0xB1
1518 pshufhw xmm14, xmm14, 0xB1
1519 paddd xmm10, xmm15
1520 paddd xmm11, xmm12
1521 movdqa xmm8, xmmword ptr [rsp+0x100]
1522 paddd xmm8, xmm13
1523 paddd xmm9, xmm14
1524 pxor xmm5, xmm10
1525 pxor xmm6, xmm11
1526 pxor xmm7, xmm8
1527 pxor xmm4, xmm9
1528 movdqa xmmword ptr [rsp+0x100], xmm8
1529 movdqa xmm8, xmm5
1530 psrld xmm8, 12
1531 pslld xmm5, 20
1532 por xmm5, xmm8
1533 movdqa xmm8, xmm6
1534 psrld xmm8, 12
1535 pslld xmm6, 20
1536 por xmm6, xmm8
1537 movdqa xmm8, xmm7
1538 psrld xmm8, 12
1539 pslld xmm7, 20
1540 por xmm7, xmm8
1541 movdqa xmm8, xmm4
1542 psrld xmm8, 12
1543 pslld xmm4, 20
1544 por xmm4, xmm8
1545 paddd xmm0, xmmword ptr [rsp+0xA0]
1546 paddd xmm1, xmmword ptr [rsp+0xC0]
1547 paddd xmm2, xmmword ptr [rsp+0x40]
1548 paddd xmm3, xmmword ptr [rsp+0xD0]
1549 paddd xmm0, xmm5
1550 paddd xmm1, xmm6
1551 paddd xmm2, xmm7
1552 paddd xmm3, xmm4
1553 pxor xmm15, xmm0
1554 pxor xmm12, xmm1
1555 pxor xmm13, xmm2
1556 pxor xmm14, xmm3
1557 movdqa xmm8, xmm15
1558 psrld xmm15, 8
1559 pslld xmm8, 24
1560 pxor xmm15, xmm8
1561 movdqa xmm8, xmm12
1562 psrld xmm12, 8
1563 pslld xmm8, 24
1564 pxor xmm12, xmm8
1565 movdqa xmm8, xmm13
1566 psrld xmm13, 8
1567 pslld xmm8, 24
1568 pxor xmm13, xmm8
1569 movdqa xmm8, xmm14
1570 psrld xmm14, 8
1571 pslld xmm8, 24
1572 pxor xmm14, xmm8
1573 paddd xmm10, xmm15
1574 paddd xmm11, xmm12
1575 movdqa xmm8, xmmword ptr [rsp+0x100]
1576 paddd xmm8, xmm13
1577 paddd xmm9, xmm14
1578 pxor xmm5, xmm10
1579 pxor xmm6, xmm11
1580 pxor xmm7, xmm8
1581 pxor xmm4, xmm9
1582 pxor xmm0, xmm8
1583 pxor xmm1, xmm9
1584 pxor xmm2, xmm10
1585 pxor xmm3, xmm11
1586 movdqa xmm8, xmm5
1587 psrld xmm8, 7
1588 pslld xmm5, 25
1589 por xmm5, xmm8
1590 movdqa xmm8, xmm6
1591 psrld xmm8, 7
1592 pslld xmm6, 25
1593 por xmm6, xmm8
1594 movdqa xmm8, xmm7
1595 psrld xmm8, 7
1596 pslld xmm7, 25
1597 por xmm7, xmm8
1598 movdqa xmm8, xmm4
1599 psrld xmm8, 7
1600 pslld xmm4, 25
1601 por xmm4, xmm8
1602 pxor xmm4, xmm12
1603 pxor xmm5, xmm13
1604 pxor xmm6, xmm14
1605 pxor xmm7, xmm15
1606 mov eax, r13d
1607 jne 9b
1608 movdqa xmm9, xmm0
1609 punpckldq xmm0, xmm1
1610 punpckhdq xmm9, xmm1
1611 movdqa xmm11, xmm2
1612 punpckldq xmm2, xmm3
1613 punpckhdq xmm11, xmm3
1614 movdqa xmm1, xmm0
1615 punpcklqdq xmm0, xmm2
1616 punpckhqdq xmm1, xmm2
1617 movdqa xmm3, xmm9
1618 punpcklqdq xmm9, xmm11
1619 punpckhqdq xmm3, xmm11
1620 movdqu xmmword ptr [rbx], xmm0
1621 movdqu xmmword ptr [rbx+0x20], xmm1
1622 movdqu xmmword ptr [rbx+0x40], xmm9
1623 movdqu xmmword ptr [rbx+0x60], xmm3
1624 movdqa xmm9, xmm4
1625 punpckldq xmm4, xmm5
1626 punpckhdq xmm9, xmm5
1627 movdqa xmm11, xmm6
1628 punpckldq xmm6, xmm7
1629 punpckhdq xmm11, xmm7
1630 movdqa xmm5, xmm4
1631 punpcklqdq xmm4, xmm6
1632 punpckhqdq xmm5, xmm6
1633 movdqa xmm7, xmm9
1634 punpcklqdq xmm9, xmm11
1635 punpckhqdq xmm7, xmm11
1636 movdqu xmmword ptr [rbx+0x10], xmm4
1637 movdqu xmmword ptr [rbx+0x30], xmm5
1638 movdqu xmmword ptr [rbx+0x50], xmm9
1639 movdqu xmmword ptr [rbx+0x70], xmm7
1640 movdqa xmm1, xmmword ptr [rsp+0x110]
1641 movdqa xmm0, xmm1
1642 paddd xmm1, xmmword ptr [rsp+0x150]
1643 movdqa xmmword ptr [rsp+0x110], xmm1
1644 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
1645 pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
1646 pcmpgtd xmm0, xmm1
1647 movdqa xmm1, xmmword ptr [rsp+0x120]
1648 psubd xmm1, xmm0
1649 movdqa xmmword ptr [rsp+0x120], xmm1
1650 add rbx, 128
1651 add rdi, 32
1652 sub rsi, 4
1653 cmp rsi, 4
1654 jnc 2b
1655 test rsi, rsi
1656 jnz 3f
16574:
1658 mov rsp, rbp
1659 pop rbp
1660 pop rbx
1661 pop r12
1662 pop r13
1663 pop r14
1664 pop r15
1665 ret
1666.p2align 5
16673:
1668 test esi, 0x2
1669 je 3f
1670 movups xmm0, xmmword ptr [rcx]
1671 movups xmm1, xmmword ptr [rcx+0x10]
1672 movaps xmm8, xmm0
1673 movaps xmm9, xmm1
1674 movd xmm13, dword ptr [rsp+0x110]
1675 movd xmm14, dword ptr [rsp+0x120]
1676 punpckldq xmm13, xmm14
1677 movaps xmmword ptr [rsp], xmm13
1678 movd xmm14, dword ptr [rsp+0x114]
1679 movd xmm13, dword ptr [rsp+0x124]
1680 punpckldq xmm14, xmm13
1681 movaps xmmword ptr [rsp+0x10], xmm14
1682 mov r8, qword ptr [rdi]
1683 mov r9, qword ptr [rdi+0x8]
1684 movzx eax, byte ptr [rbp+0x40]
1685 or eax, r13d
1686 xor edx, edx
16872:
1688 mov r14d, eax
1689 or eax, r12d
1690 add rdx, 64
1691 cmp rdx, r15
1692 cmovne eax, r14d
1693 movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
1694 movaps xmm10, xmm2
1695 movups xmm4, xmmword ptr [r8+rdx-0x40]
1696 movups xmm5, xmmword ptr [r8+rdx-0x30]
1697 movaps xmm3, xmm4
1698 shufps xmm4, xmm5, 136
1699 shufps xmm3, xmm5, 221
1700 movaps xmm5, xmm3
1701 movups xmm6, xmmword ptr [r8+rdx-0x20]
1702 movups xmm7, xmmword ptr [r8+rdx-0x10]
1703 movaps xmm3, xmm6
1704 shufps xmm6, xmm7, 136
1705 pshufd xmm6, xmm6, 0x93
1706 shufps xmm3, xmm7, 221
1707 pshufd xmm7, xmm3, 0x93
1708 movups xmm12, xmmword ptr [r9+rdx-0x40]
1709 movups xmm13, xmmword ptr [r9+rdx-0x30]
1710 movaps xmm11, xmm12
1711 shufps xmm12, xmm13, 136
1712 shufps xmm11, xmm13, 221
1713 movaps xmm13, xmm11
1714 movups xmm14, xmmword ptr [r9+rdx-0x20]
1715 movups xmm15, xmmword ptr [r9+rdx-0x10]
1716 movaps xmm11, xmm14
1717 shufps xmm14, xmm15, 136
1718 pshufd xmm14, xmm14, 0x93
1719 shufps xmm11, xmm15, 221
1720 pshufd xmm15, xmm11, 0x93
1721 shl rax, 0x20
1722 or rax, 0x40
1723 movq xmm3, rax
1724 movdqa xmmword ptr [rsp+0x20], xmm3
1725 movaps xmm3, xmmword ptr [rsp]
1726 movaps xmm11, xmmword ptr [rsp+0x10]
1727 punpcklqdq xmm3, xmmword ptr [rsp+0x20]
1728 punpcklqdq xmm11, xmmword ptr [rsp+0x20]
1729 mov al, 7
17309:
1731 paddd xmm0, xmm4
1732 paddd xmm8, xmm12
1733 movaps xmmword ptr [rsp+0x20], xmm4
1734 movaps xmmword ptr [rsp+0x30], xmm12
1735 paddd xmm0, xmm1
1736 paddd xmm8, xmm9
1737 pxor xmm3, xmm0
1738 pxor xmm11, xmm8
1739 pshuflw xmm3, xmm3, 0xB1
1740 pshufhw xmm3, xmm3, 0xB1
1741 pshuflw xmm11, xmm11, 0xB1
1742 pshufhw xmm11, xmm11, 0xB1
1743 paddd xmm2, xmm3
1744 paddd xmm10, xmm11
1745 pxor xmm1, xmm2
1746 pxor xmm9, xmm10
1747 movdqa xmm4, xmm1
1748 pslld xmm1, 20
1749 psrld xmm4, 12
1750 por xmm1, xmm4
1751 movdqa xmm4, xmm9
1752 pslld xmm9, 20
1753 psrld xmm4, 12
1754 por xmm9, xmm4
1755 paddd xmm0, xmm5
1756 paddd xmm8, xmm13
1757 movaps xmmword ptr [rsp+0x40], xmm5
1758 movaps xmmword ptr [rsp+0x50], xmm13
1759 paddd xmm0, xmm1
1760 paddd xmm8, xmm9
1761 pxor xmm3, xmm0
1762 pxor xmm11, xmm8
1763 movdqa xmm13, xmm3
1764 psrld xmm3, 8
1765 pslld xmm13, 24
1766 pxor xmm3, xmm13
1767 movdqa xmm13, xmm11
1768 psrld xmm11, 8
1769 pslld xmm13, 24
1770 pxor xmm11, xmm13
1771 paddd xmm2, xmm3
1772 paddd xmm10, xmm11
1773 pxor xmm1, xmm2
1774 pxor xmm9, xmm10
1775 movdqa xmm4, xmm1
1776 pslld xmm1, 25
1777 psrld xmm4, 7
1778 por xmm1, xmm4
1779 movdqa xmm4, xmm9
1780 pslld xmm9, 25
1781 psrld xmm4, 7
1782 por xmm9, xmm4
1783 pshufd xmm0, xmm0, 0x93
1784 pshufd xmm8, xmm8, 0x93
1785 pshufd xmm3, xmm3, 0x4E
1786 pshufd xmm11, xmm11, 0x4E
1787 pshufd xmm2, xmm2, 0x39
1788 pshufd xmm10, xmm10, 0x39
1789 paddd xmm0, xmm6
1790 paddd xmm8, xmm14
1791 paddd xmm0, xmm1
1792 paddd xmm8, xmm9
1793 pxor xmm3, xmm0
1794 pxor xmm11, xmm8
1795 pshuflw xmm3, xmm3, 0xB1
1796 pshufhw xmm3, xmm3, 0xB1
1797 pshuflw xmm11, xmm11, 0xB1
1798 pshufhw xmm11, xmm11, 0xB1
1799 paddd xmm2, xmm3
1800 paddd xmm10, xmm11
1801 pxor xmm1, xmm2
1802 pxor xmm9, xmm10
1803 movdqa xmm4, xmm1
1804 pslld xmm1, 20
1805 psrld xmm4, 12
1806 por xmm1, xmm4
1807 movdqa xmm4, xmm9
1808 pslld xmm9, 20
1809 psrld xmm4, 12
1810 por xmm9, xmm4
1811 paddd xmm0, xmm7
1812 paddd xmm8, xmm15
1813 paddd xmm0, xmm1
1814 paddd xmm8, xmm9
1815 pxor xmm3, xmm0
1816 pxor xmm11, xmm8
1817 movdqa xmm13, xmm3
1818 psrld xmm3, 8
1819 pslld xmm13, 24
1820 pxor xmm3, xmm13
1821 movdqa xmm13, xmm11
1822 psrld xmm11, 8
1823 pslld xmm13, 24
1824 pxor xmm11, xmm13
1825 paddd xmm2, xmm3
1826 paddd xmm10, xmm11
1827 pxor xmm1, xmm2
1828 pxor xmm9, xmm10
1829 movdqa xmm4, xmm1
1830 pslld xmm1, 25
1831 psrld xmm4, 7
1832 por xmm1, xmm4
1833 movdqa xmm4, xmm9
1834 pslld xmm9, 25
1835 psrld xmm4, 7
1836 por xmm9, xmm4
1837 pshufd xmm0, xmm0, 0x39
1838 pshufd xmm8, xmm8, 0x39
1839 pshufd xmm3, xmm3, 0x4E
1840 pshufd xmm11, xmm11, 0x4E
1841 pshufd xmm2, xmm2, 0x93
1842 pshufd xmm10, xmm10, 0x93
1843 dec al
1844 je 9f
1845 movdqa xmm12, xmmword ptr [rsp+0x20]
1846 movdqa xmm5, xmmword ptr [rsp+0x40]
1847 pshufd xmm13, xmm12, 0x0F
1848 shufps xmm12, xmm5, 214
1849 pshufd xmm4, xmm12, 0x39
1850 movdqa xmm12, xmm6
1851 shufps xmm12, xmm7, 250
1852 pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip]
1853 pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip]
1854 por xmm13, xmm12
1855 movdqa xmmword ptr [rsp+0x20], xmm13
1856 movdqa xmm12, xmm7
1857 punpcklqdq xmm12, xmm5
1858 movdqa xmm13, xmm6
1859 pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip]
1860 pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip]
1861 por xmm12, xmm13
1862 pshufd xmm12, xmm12, 0x78
1863 punpckhdq xmm5, xmm7
1864 punpckldq xmm6, xmm5
1865 pshufd xmm7, xmm6, 0x1E
1866 movdqa xmmword ptr [rsp+0x40], xmm12
1867 movdqa xmm5, xmmword ptr [rsp+0x30]
1868 movdqa xmm13, xmmword ptr [rsp+0x50]
1869 pshufd xmm6, xmm5, 0x0F
1870 shufps xmm5, xmm13, 214
1871 pshufd xmm12, xmm5, 0x39
1872 movdqa xmm5, xmm14
1873 shufps xmm5, xmm15, 250
1874 pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip]
1875 pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip]
1876 por xmm6, xmm5
1877 movdqa xmm5, xmm15
1878 punpcklqdq xmm5, xmm13
1879 movdqa xmmword ptr [rsp+0x30], xmm2
1880 movdqa xmm2, xmm14
1881 pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip]
1882 pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
1883 por xmm5, xmm2
1884 movdqa xmm2, xmmword ptr [rsp+0x30]
1885 pshufd xmm5, xmm5, 0x78
1886 punpckhdq xmm13, xmm15
1887 punpckldq xmm14, xmm13
1888 pshufd xmm15, xmm14, 0x1E
1889 movdqa xmm13, xmm6
1890 movdqa xmm14, xmm5
1891 movdqa xmm5, xmmword ptr [rsp+0x20]
1892 movdqa xmm6, xmmword ptr [rsp+0x40]
1893 jmp 9b
18949:
1895 pxor xmm0, xmm2
1896 pxor xmm1, xmm3
1897 pxor xmm8, xmm10
1898 pxor xmm9, xmm11
1899 mov eax, r13d
1900 cmp rdx, r15
1901 jne 2b
1902 movups xmmword ptr [rbx], xmm0
1903 movups xmmword ptr [rbx+0x10], xmm1
1904 movups xmmword ptr [rbx+0x20], xmm8
1905 movups xmmword ptr [rbx+0x30], xmm9
1906 mov eax, dword ptr [rsp+0x130]
1907 neg eax
1908 mov r10d, dword ptr [rsp+0x110+8*rax]
1909 mov r11d, dword ptr [rsp+0x120+8*rax]
1910 mov dword ptr [rsp+0x110], r10d
1911 mov dword ptr [rsp+0x120], r11d
1912 add rdi, 16
1913 add rbx, 64
1914 sub rsi, 2
19153:
1916 test esi, 0x1
1917 je 4b
1918 movups xmm0, xmmword ptr [rcx]
1919 movups xmm1, xmmword ptr [rcx+0x10]
1920 movd xmm13, dword ptr [rsp+0x110]
1921 movd xmm14, dword ptr [rsp+0x120]
1922 punpckldq xmm13, xmm14
1923 mov r8, qword ptr [rdi]
1924 movzx eax, byte ptr [rbp+0x40]
1925 or eax, r13d
1926 xor edx, edx
19272:
1928 mov r14d, eax
1929 or eax, r12d
1930 add rdx, 64
1931 cmp rdx, r15
1932 cmovne eax, r14d
1933 movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
1934 shl rax, 32
1935 or rax, 64
1936 movq xmm12, rax
1937 movdqa xmm3, xmm13
1938 punpcklqdq xmm3, xmm12
1939 movups xmm4, xmmword ptr [r8+rdx-0x40]
1940 movups xmm5, xmmword ptr [r8+rdx-0x30]
1941 movaps xmm8, xmm4
1942 shufps xmm4, xmm5, 136
1943 shufps xmm8, xmm5, 221
1944 movaps xmm5, xmm8
1945 movups xmm6, xmmword ptr [r8+rdx-0x20]
1946 movups xmm7, xmmword ptr [r8+rdx-0x10]
1947 movaps xmm8, xmm6
1948 shufps xmm6, xmm7, 136
1949 pshufd xmm6, xmm6, 0x93
1950 shufps xmm8, xmm7, 221
1951 pshufd xmm7, xmm8, 0x93
1952 mov al, 7
19539:
1954 paddd xmm0, xmm4
1955 paddd xmm0, xmm1
1956 pxor xmm3, xmm0
1957 pshuflw xmm3, xmm3, 0xB1
1958 pshufhw xmm3, xmm3, 0xB1
1959 paddd xmm2, xmm3
1960 pxor xmm1, xmm2
1961 movdqa xmm11, xmm1
1962 pslld xmm1, 20
1963 psrld xmm11, 12
1964 por xmm1, xmm11
1965 paddd xmm0, xmm5
1966 paddd xmm0, xmm1
1967 pxor xmm3, xmm0
1968 movdqa xmm14, xmm3
1969 psrld xmm3, 8
1970 pslld xmm14, 24
1971 pxor xmm3, xmm14
1972 paddd xmm2, xmm3
1973 pxor xmm1, xmm2
1974 movdqa xmm11, xmm1
1975 pslld xmm1, 25
1976 psrld xmm11, 7
1977 por xmm1, xmm11
1978 pshufd xmm0, xmm0, 0x93
1979 pshufd xmm3, xmm3, 0x4E
1980 pshufd xmm2, xmm2, 0x39
1981 paddd xmm0, xmm6
1982 paddd xmm0, xmm1
1983 pxor xmm3, xmm0
1984 pshuflw xmm3, xmm3, 0xB1
1985 pshufhw xmm3, xmm3, 0xB1
1986 paddd xmm2, xmm3
1987 pxor xmm1, xmm2
1988 movdqa xmm11, xmm1
1989 pslld xmm1, 20
1990 psrld xmm11, 12
1991 por xmm1, xmm11
1992 paddd xmm0, xmm7
1993 paddd xmm0, xmm1
1994 pxor xmm3, xmm0
1995 movdqa xmm14, xmm3
1996 psrld xmm3, 8
1997 pslld xmm14, 24
1998 pxor xmm3, xmm14
1999 paddd xmm2, xmm3
2000 pxor xmm1, xmm2
2001 movdqa xmm11, xmm1
2002 pslld xmm1, 25
2003 psrld xmm11, 7
2004 por xmm1, xmm11
2005 pshufd xmm0, xmm0, 0x39
2006 pshufd xmm3, xmm3, 0x4E
2007 pshufd xmm2, xmm2, 0x93
2008 dec al
2009 jz 9f
2010 movdqa xmm8, xmm4
2011 shufps xmm8, xmm5, 214
2012 pshufd xmm9, xmm4, 0x0F
2013 pshufd xmm4, xmm8, 0x39
2014 movdqa xmm8, xmm6
2015 shufps xmm8, xmm7, 250
2016 pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
2017 pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
2018 por xmm9, xmm8
2019 movdqa xmm8, xmm7
2020 punpcklqdq xmm8, xmm5
2021 movdqa xmm10, xmm6
2022 pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
2023 pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
2024 por xmm8, xmm10
2025 pshufd xmm8, xmm8, 0x78
2026 punpckhdq xmm5, xmm7
2027 punpckldq xmm6, xmm5
2028 pshufd xmm7, xmm6, 0x1E
2029 movdqa xmm5, xmm9
2030 movdqa xmm6, xmm8
2031 jmp 9b
20329:
2033 pxor xmm0, xmm2
2034 pxor xmm1, xmm3
2035 mov eax, r13d
2036 cmp rdx, r15
2037 jne 2b
2038 movups xmmword ptr [rbx], xmm0
2039 movups xmmword ptr [rbx+0x10], xmm1
2040 jmp 4b
2041
.p2align 6
/*
 * blake3_compress_in_place_sse2 / _blake3_compress_in_place_sse2
 *
 * Runs the 7-round BLAKE3 compression on one 64-byte block and writes the
 * 32-byte result back over the chaining value.
 *
 * ABI: System V AMD64, leaf function, no stack usage.
 * Parameter names below follow the upstream C prototype, which is not
 * visible in this file -- confirm against the C header.
 *   rdi  cv         32 bytes, read on entry and overwritten on exit
 *   rsi  block      64 bytes of message, read only (unaligned loads)
 *   dl   block_len  8-bit value, becomes state word 14
 *   rcx  counter    64-bit value, becomes state words 12 and 13
 *   r8b  flags      8-bit value, becomes state word 15
 * Clobbers: rax, rdx, xmm0-xmm11, xmm14, flags.
 *
 * Register roles inside the round loop:
 *   xmm0..xmm3  state rows 0..3 (four 32-bit words each)
 *   xmm4, xmm5  message words added in the column step
 *   xmm6, xmm7  message words added in the diagonal step
 *   xmm8..xmm11, xmm14  scratch
 *   al          rounds remaining
 */
blake3_compress_in_place_sse2:
_blake3_compress_in_place_sse2:
        _CET_ENDBR
        movups  xmm0, xmmword ptr [rdi]           /* row 0 = cv[0..3] */
        movups  xmm1, xmmword ptr [rdi+0x10]      /* row 1 = cv[4..7] */
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip] /* row 2 = IV words */
        /*
         * Only the low byte of rdx and r8 is defined for an 8-bit argument,
         * so zero-extend both before packing them into one qword. This
         * matches what blake3_compress_xof_sse2 does.
         */
        movzx   eax, r8b
        movzx   edx, dl
        shl     rax, 32
        add     rdx, rax                          /* rdx = block_len | flags << 32 */
        movq    xmm3, rcx
        movq    xmm4, rdx
        punpcklqdq xmm3, xmm4                     /* row 3 = counter, block_len, flags */
        /*
         * Load the 16 message words and split them into even and odd words.
         * shufps 136 (0x88) keeps lanes 0,2 of each source, 221 (0xDD) keeps
         * lanes 1,3. After this:
         *   xmm4 = m0,  m2,  m4,  m6
         *   xmm5 = m1,  m3,  m5,  m7
         *   xmm6 = m14, m8,  m10, m12   (pshufd 0x93 rotates lanes up by one)
         *   xmm7 = m15, m9,  m11, m13
         */
        movups  xmm4, xmmword ptr [rsi]
        movups  xmm5, xmmword ptr [rsi+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136
        shufps  xmm8, xmm5, 221
        movaps  xmm5, xmm8
        movups  xmm6, xmmword ptr [rsi+0x20]
        movups  xmm7, xmmword ptr [rsi+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136
        pshufd  xmm6, xmm6, 0x93
        shufps  xmm8, xmm7, 221
        pshufd  xmm7, xmm8, 0x93
        mov     al, 7                             /* 7 rounds */
9:
        /* Column step, first half: add message, rotate row 3 by 16 and row 1 by 12. */
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0xB1                  /* swap 16-bit halves = rotate by 16 */
        pshufhw xmm3, xmm3, 0xB1
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11                       /* rotate right by 12 */
        /* Column step, second half: rotate row 3 by 8 and row 1 by 7. */
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3
        psrld   xmm3, 8
        pslld   xmm14, 24
        pxor    xmm3, xmm14                       /* rotate right by 8 */
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11                       /* rotate right by 7 */
        /* Rotate rows 0, 3 and 2 so the diagonals line up as columns; row 1 stays put. */
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x39
        /* Diagonal step, first half. */
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0xB1
        pshufhw xmm3, xmm3, 0xB1
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        /* Diagonal step, second half. */
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3
        psrld   xmm3, 8
        pslld   xmm14, 24
        pxor    xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        /* Undo the row rotation. */
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x93
        dec     al
        jz      9f                                /* last round done: skip the permutation */
        /*
         * Rearrange xmm4..xmm7 into the message order for the next round.
         * The pand/pand/por triples are two-source blends against the
         * PBLENDW_* lane masks; pblendw itself is SSE4.1, not SSE2.
         */
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 214
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 250
        pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
        pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
        por     xmm9, xmm8
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        movdqa  xmm10, xmm6
        pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
        pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
        por     xmm8, xmm10
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     9b
9:
        /* Fold the lower state rows into the upper ones and store the new cv. */
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3
        movups  xmmword ptr [rdi], xmm0
        movups  xmmword ptr [rdi+0x10], xmm1
        ret
2153
.p2align 6
/*
 * blake3_compress_xof_sse2 / _blake3_compress_xof_sse2
 *
 * Runs the 7-round BLAKE3 compression on one 64-byte block and writes the
 * full 64-byte output to a separate buffer. The chaining value is only read.
 *
 * ABI: System V AMD64, leaf function, no stack usage.
 * Parameter names below follow the upstream C prototype, which is not
 * visible in this file -- confirm against the C header.
 *   rdi  cv         32 bytes, read only (read at entry and again at the end)
 *   rsi  block      64 bytes of message, read only (unaligned loads)
 *   dl   block_len  8-bit value, becomes state word 14
 *   rcx  counter    64-bit value, becomes state words 12 and 13
 *   r8b  flags      8-bit value, becomes state word 15
 *   r9   out        64 bytes, written with unaligned stores
 * Clobbers: rax, rdx, xmm0-xmm11, xmm14, flags.
 *
 * Register roles inside the round loop:
 *   xmm0..xmm3  state rows 0..3 (four 32-bit words each)
 *   xmm4, xmm5  message words added in the column step
 *   xmm6, xmm7  message words added in the diagonal step
 *   xmm8..xmm11, xmm14  scratch
 *   al          rounds remaining
 */
blake3_compress_xof_sse2:
_blake3_compress_xof_sse2:
        _CET_ENDBR
        movups  xmm0, xmmword ptr [rdi]           /* row 0 = cv[0..3] */
        movups  xmm1, xmmword ptr [rdi+0x10]      /* row 1 = cv[4..7] */
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip] /* row 2 = IV words */
        /* Zero-extend the two 8-bit arguments before packing them into one qword. */
        movzx   eax, r8b
        movzx   edx, dl
        shl     rax, 32
        add     rdx, rax                          /* rdx = block_len | flags << 32 */
        movq    xmm3, rcx
        movq    xmm4, rdx
        punpcklqdq xmm3, xmm4                     /* row 3 = counter, block_len, flags */
        /*
         * Load the 16 message words and split them into even and odd words.
         * shufps 136 (0x88) keeps lanes 0,2 of each source, 221 (0xDD) keeps
         * lanes 1,3. After this:
         *   xmm4 = m0,  m2,  m4,  m6
         *   xmm5 = m1,  m3,  m5,  m7
         *   xmm6 = m14, m8,  m10, m12   (pshufd 0x93 rotates lanes up by one)
         *   xmm7 = m15, m9,  m11, m13
         */
        movups  xmm4, xmmword ptr [rsi]
        movups  xmm5, xmmword ptr [rsi+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136
        shufps  xmm8, xmm5, 221
        movaps  xmm5, xmm8
        movups  xmm6, xmmword ptr [rsi+0x20]
        movups  xmm7, xmmword ptr [rsi+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136
        pshufd  xmm6, xmm6, 0x93
        shufps  xmm8, xmm7, 221
        pshufd  xmm7, xmm8, 0x93
        mov     al, 7                             /* 7 rounds */
9:
        /* Column step, first half: add message, rotate row 3 by 16 and row 1 by 12. */
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0xB1                  /* swap 16-bit halves = rotate by 16 */
        pshufhw xmm3, xmm3, 0xB1
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11                       /* rotate right by 12 */
        /* Column step, second half: rotate row 3 by 8 and row 1 by 7. */
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3
        psrld   xmm3, 8
        pslld   xmm14, 24
        pxor    xmm3, xmm14                       /* rotate right by 8 */
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11                       /* rotate right by 7 */
        /* Rotate rows 0, 3 and 2 so the diagonals line up as columns; row 1 stays put. */
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x39
        /* Diagonal step, first half. */
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0xB1
        pshufhw xmm3, xmm3, 0xB1
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        /* Diagonal step, second half. */
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3
        psrld   xmm3, 8
        pslld   xmm14, 24
        pxor    xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        /* Undo the row rotation. */
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x93
        dec     al
        jz      9f                                /* last round done: skip the permutation */
        /*
         * Rearrange xmm4..xmm7 into the message order for the next round.
         * The pand/pand/por triples are two-source blends against the
         * PBLENDW_* lane masks; pblendw itself is SSE4.1, not SSE2.
         */
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 214
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 250
        pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
        pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
        por     xmm9, xmm8
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        movdqa  xmm10, xmm6
        pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
        pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
        por     xmm8, xmm10
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     9b
9:
        /*
         * Output bytes 0..31  = rows 0,1 XOR rows 2,3.
         * Output bytes 32..63 = rows 2,3 XOR the input cv, which is reloaded
         * here because xmm0/xmm1 no longer hold it.
         * Rows 0,1 must be updated before rows 2,3 are overwritten.
         */
        movdqu  xmm4, xmmword ptr [rdi]
        movdqu  xmm5, xmmword ptr [rdi+0x10]
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3
        pxor    xmm2, xmm4
        pxor    xmm3, xmm5
        movups  xmmword ptr [r9], xmm0
        movups  xmmword ptr [r9+0x10], xmm1
        movups  xmmword ptr [r9+0x20], xmm2
        movups  xmmword ptr [r9+0x30], xmm3
        ret
2273
2274
#ifdef __APPLE__
.static_data
#else
.section .rodata
#endif
.p2align 6
/*
 * Read-only constant tables. Each table is exactly 16 bytes and the group
 * starts on a 64-byte boundary, so every label is 16-byte aligned. That
 * alignment is required: the code reads these with movaps/movdqa or as
 * direct memory operands of SSE instructions.
 */

/* First four BLAKE3 IV words, loaded as one vector into state row 2. */
BLAKE3_IV:
        .long  0x6A09E667, 0xBB67AE85
        .long  0x3C6EF372, 0xA54FF53A
/* Per-lane counter offsets 0..3 for the four-way path of blake3_hash_many_sse2. */
ADD0:
        .long  0, 1, 2, 3
/* Per-lane counter stride added after each group of four inputs. */
ADD1:
        .long  4, 4, 4, 4
/*
 * Each IV word broadcast to all four lanes.
 * NOTE(review): no use is visible in this part of the file; presumably
 * read by the four-way path of blake3_hash_many_sse2 -- confirm.
 */
BLAKE3_IV_0:
        .long  0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
BLAKE3_IV_1:
        .long  0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
BLAKE3_IV_2:
        .long  0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
BLAKE3_IV_3:
        .long  0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
/*
 * The value 64 broadcast to all four lanes.
 * NOTE(review): no use is visible in this part of the file; presumably the
 * block-length state word of the four-way path -- confirm.
 */
BLAKE3_BLOCK_LEN:
        .long  64, 64, 64, 64
/*
 * Sign-bit mask. XORing both operands with it lets the signed pcmpgtd act
 * as an unsigned compare, which the counter code uses to detect a carry out
 * of the low 32-bit word.
 */
CMP_MSB_MASK:
        .long  0x80000000, 0x80000000, 0x80000000, 0x80000000
/*
 * Lane-select masks used in pand/pand/por triples to blend two vectors.
 * The names give the pblendw immediate each pair stands in for (one bit
 * per 16-bit word): 0x33 keeps dwords 0 and 2, 0xCC dwords 1 and 3,
 * 0x3F dwords 0..2, 0xC0 dword 3.
 */
PBLENDW_0x33_MASK:
        .long  0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000
PBLENDW_0xCC_MASK:
        .long  0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
PBLENDW_0x3F_MASK:
        .long  0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000
PBLENDW_0xC0_MASK:
        .long  0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF
2308
2309#endif
2310

/* Source: llvm/lib/Support/BLAKE3/blake3_sse2_x86-64_unix.S */