1#include "llvm_blake3_prefix.h"
2
3.intel_syntax noprefix
4.global blake3_hash_many_sse41
5.global _blake3_hash_many_sse41
6.global blake3_compress_in_place_sse41
7.global _blake3_compress_in_place_sse41
8.global blake3_compress_xof_sse41
9.global _blake3_compress_xof_sse41
10.section .text
11 .p2align 6
12_blake3_hash_many_sse41:
13blake3_hash_many_sse41:
14 push r15
15 push r14
16 push r13
17 push r12
18 push rsi
19 push rdi
20 push rbx
21 push rbp
22 mov rbp, rsp
23 sub rsp, 528
24 and rsp, 0xFFFFFFFFFFFFFFC0
25 movdqa xmmword ptr [rsp+0x170], xmm6
26 movdqa xmmword ptr [rsp+0x180], xmm7
27 movdqa xmmword ptr [rsp+0x190], xmm8
28 movdqa xmmword ptr [rsp+0x1A0], xmm9
29 movdqa xmmword ptr [rsp+0x1B0], xmm10
30 movdqa xmmword ptr [rsp+0x1C0], xmm11
31 movdqa xmmword ptr [rsp+0x1D0], xmm12
32 movdqa xmmword ptr [rsp+0x1E0], xmm13
33 movdqa xmmword ptr [rsp+0x1F0], xmm14
34 movdqa xmmword ptr [rsp+0x200], xmm15
35 mov rdi, rcx
36 mov rsi, rdx
37 mov rdx, r8
38 mov rcx, r9
39 mov r8, qword ptr [rbp+0x68]
40 movzx r9, byte ptr [rbp+0x70]
41 neg r9d
42 movd xmm0, r9d
43 pshufd xmm0, xmm0, 0x00
44 movdqa xmmword ptr [rsp+0x130], xmm0
45 movdqa xmm1, xmm0
46 pand xmm1, xmmword ptr [ADD0+rip]
47 pand xmm0, xmmword ptr [ADD1+rip]
48 movdqa xmmword ptr [rsp+0x150], xmm0
49 movd xmm0, r8d
50 pshufd xmm0, xmm0, 0x00
51 paddd xmm0, xmm1
52 movdqa xmmword ptr [rsp+0x110], xmm0
53 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
54 pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
55 pcmpgtd xmm1, xmm0
56 shr r8, 32
57 movd xmm2, r8d
58 pshufd xmm2, xmm2, 0x00
59 psubd xmm2, xmm1
60 movdqa xmmword ptr [rsp+0x120], xmm2
61 mov rbx, qword ptr [rbp+0x90]
62 mov r15, rdx
63 shl r15, 6
64 movzx r13d, byte ptr [rbp+0x78]
65 movzx r12d, byte ptr [rbp+0x88]
66 cmp rsi, 4
67 jc 3f
682:
69 movdqu xmm3, xmmword ptr [rcx]
70 pshufd xmm0, xmm3, 0x00
71 pshufd xmm1, xmm3, 0x55
72 pshufd xmm2, xmm3, 0xAA
73 pshufd xmm3, xmm3, 0xFF
74 movdqu xmm7, xmmword ptr [rcx+0x10]
75 pshufd xmm4, xmm7, 0x00
76 pshufd xmm5, xmm7, 0x55
77 pshufd xmm6, xmm7, 0xAA
78 pshufd xmm7, xmm7, 0xFF
79 mov r8, qword ptr [rdi]
80 mov r9, qword ptr [rdi+0x8]
81 mov r10, qword ptr [rdi+0x10]
82 mov r11, qword ptr [rdi+0x18]
83 movzx eax, byte ptr [rbp+0x80]
84 or eax, r13d
85 xor edx, edx
869:
87 mov r14d, eax
88 or eax, r12d
89 add rdx, 64
90 cmp rdx, r15
91 cmovne eax, r14d
92 movdqu xmm8, xmmword ptr [r8+rdx-0x40]
93 movdqu xmm9, xmmword ptr [r9+rdx-0x40]
94 movdqu xmm10, xmmword ptr [r10+rdx-0x40]
95 movdqu xmm11, xmmword ptr [r11+rdx-0x40]
96 movdqa xmm12, xmm8
97 punpckldq xmm8, xmm9
98 punpckhdq xmm12, xmm9
99 movdqa xmm14, xmm10
100 punpckldq xmm10, xmm11
101 punpckhdq xmm14, xmm11
102 movdqa xmm9, xmm8
103 punpcklqdq xmm8, xmm10
104 punpckhqdq xmm9, xmm10
105 movdqa xmm13, xmm12
106 punpcklqdq xmm12, xmm14
107 punpckhqdq xmm13, xmm14
108 movdqa xmmword ptr [rsp], xmm8
109 movdqa xmmword ptr [rsp+0x10], xmm9
110 movdqa xmmword ptr [rsp+0x20], xmm12
111 movdqa xmmword ptr [rsp+0x30], xmm13
112 movdqu xmm8, xmmword ptr [r8+rdx-0x30]
113 movdqu xmm9, xmmword ptr [r9+rdx-0x30]
114 movdqu xmm10, xmmword ptr [r10+rdx-0x30]
115 movdqu xmm11, xmmword ptr [r11+rdx-0x30]
116 movdqa xmm12, xmm8
117 punpckldq xmm8, xmm9
118 punpckhdq xmm12, xmm9
119 movdqa xmm14, xmm10
120 punpckldq xmm10, xmm11
121 punpckhdq xmm14, xmm11
122 movdqa xmm9, xmm8
123 punpcklqdq xmm8, xmm10
124 punpckhqdq xmm9, xmm10
125 movdqa xmm13, xmm12
126 punpcklqdq xmm12, xmm14
127 punpckhqdq xmm13, xmm14
128 movdqa xmmword ptr [rsp+0x40], xmm8
129 movdqa xmmword ptr [rsp+0x50], xmm9
130 movdqa xmmword ptr [rsp+0x60], xmm12
131 movdqa xmmword ptr [rsp+0x70], xmm13
132 movdqu xmm8, xmmword ptr [r8+rdx-0x20]
133 movdqu xmm9, xmmword ptr [r9+rdx-0x20]
134 movdqu xmm10, xmmword ptr [r10+rdx-0x20]
135 movdqu xmm11, xmmword ptr [r11+rdx-0x20]
136 movdqa xmm12, xmm8
137 punpckldq xmm8, xmm9
138 punpckhdq xmm12, xmm9
139 movdqa xmm14, xmm10
140 punpckldq xmm10, xmm11
141 punpckhdq xmm14, xmm11
142 movdqa xmm9, xmm8
143 punpcklqdq xmm8, xmm10
144 punpckhqdq xmm9, xmm10
145 movdqa xmm13, xmm12
146 punpcklqdq xmm12, xmm14
147 punpckhqdq xmm13, xmm14
148 movdqa xmmword ptr [rsp+0x80], xmm8
149 movdqa xmmword ptr [rsp+0x90], xmm9
150 movdqa xmmword ptr [rsp+0xA0], xmm12
151 movdqa xmmword ptr [rsp+0xB0], xmm13
152 movdqu xmm8, xmmword ptr [r8+rdx-0x10]
153 movdqu xmm9, xmmword ptr [r9+rdx-0x10]
154 movdqu xmm10, xmmword ptr [r10+rdx-0x10]
155 movdqu xmm11, xmmword ptr [r11+rdx-0x10]
156 movdqa xmm12, xmm8
157 punpckldq xmm8, xmm9
158 punpckhdq xmm12, xmm9
159 movdqa xmm14, xmm10
160 punpckldq xmm10, xmm11
161 punpckhdq xmm14, xmm11
162 movdqa xmm9, xmm8
163 punpcklqdq xmm8, xmm10
164 punpckhqdq xmm9, xmm10
165 movdqa xmm13, xmm12
166 punpcklqdq xmm12, xmm14
167 punpckhqdq xmm13, xmm14
168 movdqa xmmword ptr [rsp+0xC0], xmm8
169 movdqa xmmword ptr [rsp+0xD0], xmm9
170 movdqa xmmword ptr [rsp+0xE0], xmm12
171 movdqa xmmword ptr [rsp+0xF0], xmm13
172 movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip]
173 movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip]
174 movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip]
175 movdqa xmm12, xmmword ptr [rsp+0x110]
176 movdqa xmm13, xmmword ptr [rsp+0x120]
177 movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
178 movd xmm15, eax
179 pshufd xmm15, xmm15, 0x00
180 prefetcht0 [r8+rdx+0x80]
181 prefetcht0 [r9+rdx+0x80]
182 prefetcht0 [r10+rdx+0x80]
183 prefetcht0 [r11+rdx+0x80]
184 paddd xmm0, xmmword ptr [rsp]
185 paddd xmm1, xmmword ptr [rsp+0x20]
186 paddd xmm2, xmmword ptr [rsp+0x40]
187 paddd xmm3, xmmword ptr [rsp+0x60]
188 paddd xmm0, xmm4
189 paddd xmm1, xmm5
190 paddd xmm2, xmm6
191 paddd xmm3, xmm7
192 pxor xmm12, xmm0
193 pxor xmm13, xmm1
194 pxor xmm14, xmm2
195 pxor xmm15, xmm3
196 movdqa xmm8, xmmword ptr [ROT16+rip]
197 pshufb xmm12, xmm8
198 pshufb xmm13, xmm8
199 pshufb xmm14, xmm8
200 pshufb xmm15, xmm8
201 movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
202 paddd xmm8, xmm12
203 paddd xmm9, xmm13
204 paddd xmm10, xmm14
205 paddd xmm11, xmm15
206 pxor xmm4, xmm8
207 pxor xmm5, xmm9
208 pxor xmm6, xmm10
209 pxor xmm7, xmm11
210 movdqa xmmword ptr [rsp+0x100], xmm8
211 movdqa xmm8, xmm4
212 psrld xmm8, 12
213 pslld xmm4, 20
214 por xmm4, xmm8
215 movdqa xmm8, xmm5
216 psrld xmm8, 12
217 pslld xmm5, 20
218 por xmm5, xmm8
219 movdqa xmm8, xmm6
220 psrld xmm8, 12
221 pslld xmm6, 20
222 por xmm6, xmm8
223 movdqa xmm8, xmm7
224 psrld xmm8, 12
225 pslld xmm7, 20
226 por xmm7, xmm8
227 paddd xmm0, xmmword ptr [rsp+0x10]
228 paddd xmm1, xmmword ptr [rsp+0x30]
229 paddd xmm2, xmmword ptr [rsp+0x50]
230 paddd xmm3, xmmword ptr [rsp+0x70]
231 paddd xmm0, xmm4
232 paddd xmm1, xmm5
233 paddd xmm2, xmm6
234 paddd xmm3, xmm7
235 pxor xmm12, xmm0
236 pxor xmm13, xmm1
237 pxor xmm14, xmm2
238 pxor xmm15, xmm3
239 movdqa xmm8, xmmword ptr [ROT8+rip]
240 pshufb xmm12, xmm8
241 pshufb xmm13, xmm8
242 pshufb xmm14, xmm8
243 pshufb xmm15, xmm8
244 movdqa xmm8, xmmword ptr [rsp+0x100]
245 paddd xmm8, xmm12
246 paddd xmm9, xmm13
247 paddd xmm10, xmm14
248 paddd xmm11, xmm15
249 pxor xmm4, xmm8
250 pxor xmm5, xmm9
251 pxor xmm6, xmm10
252 pxor xmm7, xmm11
253 movdqa xmmword ptr [rsp+0x100], xmm8
254 movdqa xmm8, xmm4
255 psrld xmm8, 7
256 pslld xmm4, 25
257 por xmm4, xmm8
258 movdqa xmm8, xmm5
259 psrld xmm8, 7
260 pslld xmm5, 25
261 por xmm5, xmm8
262 movdqa xmm8, xmm6
263 psrld xmm8, 7
264 pslld xmm6, 25
265 por xmm6, xmm8
266 movdqa xmm8, xmm7
267 psrld xmm8, 7
268 pslld xmm7, 25
269 por xmm7, xmm8
270 paddd xmm0, xmmword ptr [rsp+0x80]
271 paddd xmm1, xmmword ptr [rsp+0xA0]
272 paddd xmm2, xmmword ptr [rsp+0xC0]
273 paddd xmm3, xmmword ptr [rsp+0xE0]
274 paddd xmm0, xmm5
275 paddd xmm1, xmm6
276 paddd xmm2, xmm7
277 paddd xmm3, xmm4
278 pxor xmm15, xmm0
279 pxor xmm12, xmm1
280 pxor xmm13, xmm2
281 pxor xmm14, xmm3
282 movdqa xmm8, xmmword ptr [ROT16+rip]
283 pshufb xmm15, xmm8
284 pshufb xmm12, xmm8
285 pshufb xmm13, xmm8
286 pshufb xmm14, xmm8
287 paddd xmm10, xmm15
288 paddd xmm11, xmm12
289 movdqa xmm8, xmmword ptr [rsp+0x100]
290 paddd xmm8, xmm13
291 paddd xmm9, xmm14
292 pxor xmm5, xmm10
293 pxor xmm6, xmm11
294 pxor xmm7, xmm8
295 pxor xmm4, xmm9
296 movdqa xmmword ptr [rsp+0x100], xmm8
297 movdqa xmm8, xmm5
298 psrld xmm8, 12
299 pslld xmm5, 20
300 por xmm5, xmm8
301 movdqa xmm8, xmm6
302 psrld xmm8, 12
303 pslld xmm6, 20
304 por xmm6, xmm8
305 movdqa xmm8, xmm7
306 psrld xmm8, 12
307 pslld xmm7, 20
308 por xmm7, xmm8
309 movdqa xmm8, xmm4
310 psrld xmm8, 12
311 pslld xmm4, 20
312 por xmm4, xmm8
313 paddd xmm0, xmmword ptr [rsp+0x90]
314 paddd xmm1, xmmword ptr [rsp+0xB0]
315 paddd xmm2, xmmword ptr [rsp+0xD0]
316 paddd xmm3, xmmword ptr [rsp+0xF0]
317 paddd xmm0, xmm5
318 paddd xmm1, xmm6
319 paddd xmm2, xmm7
320 paddd xmm3, xmm4
321 pxor xmm15, xmm0
322 pxor xmm12, xmm1
323 pxor xmm13, xmm2
324 pxor xmm14, xmm3
325 movdqa xmm8, xmmword ptr [ROT8+rip]
326 pshufb xmm15, xmm8
327 pshufb xmm12, xmm8
328 pshufb xmm13, xmm8
329 pshufb xmm14, xmm8
330 paddd xmm10, xmm15
331 paddd xmm11, xmm12
332 movdqa xmm8, xmmword ptr [rsp+0x100]
333 paddd xmm8, xmm13
334 paddd xmm9, xmm14
335 pxor xmm5, xmm10
336 pxor xmm6, xmm11
337 pxor xmm7, xmm8
338 pxor xmm4, xmm9
339 movdqa xmmword ptr [rsp+0x100], xmm8
340 movdqa xmm8, xmm5
341 psrld xmm8, 7
342 pslld xmm5, 25
343 por xmm5, xmm8
344 movdqa xmm8, xmm6
345 psrld xmm8, 7
346 pslld xmm6, 25
347 por xmm6, xmm8
348 movdqa xmm8, xmm7
349 psrld xmm8, 7
350 pslld xmm7, 25
351 por xmm7, xmm8
352 movdqa xmm8, xmm4
353 psrld xmm8, 7
354 pslld xmm4, 25
355 por xmm4, xmm8
356 paddd xmm0, xmmword ptr [rsp+0x20]
357 paddd xmm1, xmmword ptr [rsp+0x30]
358 paddd xmm2, xmmword ptr [rsp+0x70]
359 paddd xmm3, xmmword ptr [rsp+0x40]
360 paddd xmm0, xmm4
361 paddd xmm1, xmm5
362 paddd xmm2, xmm6
363 paddd xmm3, xmm7
364 pxor xmm12, xmm0
365 pxor xmm13, xmm1
366 pxor xmm14, xmm2
367 pxor xmm15, xmm3
368 movdqa xmm8, xmmword ptr [ROT16+rip]
369 pshufb xmm12, xmm8
370 pshufb xmm13, xmm8
371 pshufb xmm14, xmm8
372 pshufb xmm15, xmm8
373 movdqa xmm8, xmmword ptr [rsp+0x100]
374 paddd xmm8, xmm12
375 paddd xmm9, xmm13
376 paddd xmm10, xmm14
377 paddd xmm11, xmm15
378 pxor xmm4, xmm8
379 pxor xmm5, xmm9
380 pxor xmm6, xmm10
381 pxor xmm7, xmm11
382 movdqa xmmword ptr [rsp+0x100], xmm8
383 movdqa xmm8, xmm4
384 psrld xmm8, 12
385 pslld xmm4, 20
386 por xmm4, xmm8
387 movdqa xmm8, xmm5
388 psrld xmm8, 12
389 pslld xmm5, 20
390 por xmm5, xmm8
391 movdqa xmm8, xmm6
392 psrld xmm8, 12
393 pslld xmm6, 20
394 por xmm6, xmm8
395 movdqa xmm8, xmm7
396 psrld xmm8, 12
397 pslld xmm7, 20
398 por xmm7, xmm8
399 paddd xmm0, xmmword ptr [rsp+0x60]
400 paddd xmm1, xmmword ptr [rsp+0xA0]
401 paddd xmm2, xmmword ptr [rsp]
402 paddd xmm3, xmmword ptr [rsp+0xD0]
403 paddd xmm0, xmm4
404 paddd xmm1, xmm5
405 paddd xmm2, xmm6
406 paddd xmm3, xmm7
407 pxor xmm12, xmm0
408 pxor xmm13, xmm1
409 pxor xmm14, xmm2
410 pxor xmm15, xmm3
411 movdqa xmm8, xmmword ptr [ROT8+rip]
412 pshufb xmm12, xmm8
413 pshufb xmm13, xmm8
414 pshufb xmm14, xmm8
415 pshufb xmm15, xmm8
416 movdqa xmm8, xmmword ptr [rsp+0x100]
417 paddd xmm8, xmm12
418 paddd xmm9, xmm13
419 paddd xmm10, xmm14
420 paddd xmm11, xmm15
421 pxor xmm4, xmm8
422 pxor xmm5, xmm9
423 pxor xmm6, xmm10
424 pxor xmm7, xmm11
425 movdqa xmmword ptr [rsp+0x100], xmm8
426 movdqa xmm8, xmm4
427 psrld xmm8, 7
428 pslld xmm4, 25
429 por xmm4, xmm8
430 movdqa xmm8, xmm5
431 psrld xmm8, 7
432 pslld xmm5, 25
433 por xmm5, xmm8
434 movdqa xmm8, xmm6
435 psrld xmm8, 7
436 pslld xmm6, 25
437 por xmm6, xmm8
438 movdqa xmm8, xmm7
439 psrld xmm8, 7
440 pslld xmm7, 25
441 por xmm7, xmm8
442 paddd xmm0, xmmword ptr [rsp+0x10]
443 paddd xmm1, xmmword ptr [rsp+0xC0]
444 paddd xmm2, xmmword ptr [rsp+0x90]
445 paddd xmm3, xmmword ptr [rsp+0xF0]
446 paddd xmm0, xmm5
447 paddd xmm1, xmm6
448 paddd xmm2, xmm7
449 paddd xmm3, xmm4
450 pxor xmm15, xmm0
451 pxor xmm12, xmm1
452 pxor xmm13, xmm2
453 pxor xmm14, xmm3
454 movdqa xmm8, xmmword ptr [ROT16+rip]
455 pshufb xmm15, xmm8
456 pshufb xmm12, xmm8
457 pshufb xmm13, xmm8
458 pshufb xmm14, xmm8
459 paddd xmm10, xmm15
460 paddd xmm11, xmm12
461 movdqa xmm8, xmmword ptr [rsp+0x100]
462 paddd xmm8, xmm13
463 paddd xmm9, xmm14
464 pxor xmm5, xmm10
465 pxor xmm6, xmm11
466 pxor xmm7, xmm8
467 pxor xmm4, xmm9
468 movdqa xmmword ptr [rsp+0x100], xmm8
469 movdqa xmm8, xmm5
470 psrld xmm8, 12
471 pslld xmm5, 20
472 por xmm5, xmm8
473 movdqa xmm8, xmm6
474 psrld xmm8, 12
475 pslld xmm6, 20
476 por xmm6, xmm8
477 movdqa xmm8, xmm7
478 psrld xmm8, 12
479 pslld xmm7, 20
480 por xmm7, xmm8
481 movdqa xmm8, xmm4
482 psrld xmm8, 12
483 pslld xmm4, 20
484 por xmm4, xmm8
485 paddd xmm0, xmmword ptr [rsp+0xB0]
486 paddd xmm1, xmmword ptr [rsp+0x50]
487 paddd xmm2, xmmword ptr [rsp+0xE0]
488 paddd xmm3, xmmword ptr [rsp+0x80]
489 paddd xmm0, xmm5
490 paddd xmm1, xmm6
491 paddd xmm2, xmm7
492 paddd xmm3, xmm4
493 pxor xmm15, xmm0
494 pxor xmm12, xmm1
495 pxor xmm13, xmm2
496 pxor xmm14, xmm3
497 movdqa xmm8, xmmword ptr [ROT8+rip]
498 pshufb xmm15, xmm8
499 pshufb xmm12, xmm8
500 pshufb xmm13, xmm8
501 pshufb xmm14, xmm8
502 paddd xmm10, xmm15
503 paddd xmm11, xmm12
504 movdqa xmm8, xmmword ptr [rsp+0x100]
505 paddd xmm8, xmm13
506 paddd xmm9, xmm14
507 pxor xmm5, xmm10
508 pxor xmm6, xmm11
509 pxor xmm7, xmm8
510 pxor xmm4, xmm9
511 movdqa xmmword ptr [rsp+0x100], xmm8
512 movdqa xmm8, xmm5
513 psrld xmm8, 7
514 pslld xmm5, 25
515 por xmm5, xmm8
516 movdqa xmm8, xmm6
517 psrld xmm8, 7
518 pslld xmm6, 25
519 por xmm6, xmm8
520 movdqa xmm8, xmm7
521 psrld xmm8, 7
522 pslld xmm7, 25
523 por xmm7, xmm8
524 movdqa xmm8, xmm4
525 psrld xmm8, 7
526 pslld xmm4, 25
527 por xmm4, xmm8
528 paddd xmm0, xmmword ptr [rsp+0x30]
529 paddd xmm1, xmmword ptr [rsp+0xA0]
530 paddd xmm2, xmmword ptr [rsp+0xD0]
531 paddd xmm3, xmmword ptr [rsp+0x70]
532 paddd xmm0, xmm4
533 paddd xmm1, xmm5
534 paddd xmm2, xmm6
535 paddd xmm3, xmm7
536 pxor xmm12, xmm0
537 pxor xmm13, xmm1
538 pxor xmm14, xmm2
539 pxor xmm15, xmm3
540 movdqa xmm8, xmmword ptr [ROT16+rip]
541 pshufb xmm12, xmm8
542 pshufb xmm13, xmm8
543 pshufb xmm14, xmm8
544 pshufb xmm15, xmm8
545 movdqa xmm8, xmmword ptr [rsp+0x100]
546 paddd xmm8, xmm12
547 paddd xmm9, xmm13
548 paddd xmm10, xmm14
549 paddd xmm11, xmm15
550 pxor xmm4, xmm8
551 pxor xmm5, xmm9
552 pxor xmm6, xmm10
553 pxor xmm7, xmm11
554 movdqa xmmword ptr [rsp+0x100], xmm8
555 movdqa xmm8, xmm4
556 psrld xmm8, 12
557 pslld xmm4, 20
558 por xmm4, xmm8
559 movdqa xmm8, xmm5
560 psrld xmm8, 12
561 pslld xmm5, 20
562 por xmm5, xmm8
563 movdqa xmm8, xmm6
564 psrld xmm8, 12
565 pslld xmm6, 20
566 por xmm6, xmm8
567 movdqa xmm8, xmm7
568 psrld xmm8, 12
569 pslld xmm7, 20
570 por xmm7, xmm8
571 paddd xmm0, xmmword ptr [rsp+0x40]
572 paddd xmm1, xmmword ptr [rsp+0xC0]
573 paddd xmm2, xmmword ptr [rsp+0x20]
574 paddd xmm3, xmmword ptr [rsp+0xE0]
575 paddd xmm0, xmm4
576 paddd xmm1, xmm5
577 paddd xmm2, xmm6
578 paddd xmm3, xmm7
579 pxor xmm12, xmm0
580 pxor xmm13, xmm1
581 pxor xmm14, xmm2
582 pxor xmm15, xmm3
583 movdqa xmm8, xmmword ptr [ROT8+rip]
584 pshufb xmm12, xmm8
585 pshufb xmm13, xmm8
586 pshufb xmm14, xmm8
587 pshufb xmm15, xmm8
588 movdqa xmm8, xmmword ptr [rsp+0x100]
589 paddd xmm8, xmm12
590 paddd xmm9, xmm13
591 paddd xmm10, xmm14
592 paddd xmm11, xmm15
593 pxor xmm4, xmm8
594 pxor xmm5, xmm9
595 pxor xmm6, xmm10
596 pxor xmm7, xmm11
597 movdqa xmmword ptr [rsp+0x100], xmm8
598 movdqa xmm8, xmm4
599 psrld xmm8, 7
600 pslld xmm4, 25
601 por xmm4, xmm8
602 movdqa xmm8, xmm5
603 psrld xmm8, 7
604 pslld xmm5, 25
605 por xmm5, xmm8
606 movdqa xmm8, xmm6
607 psrld xmm8, 7
608 pslld xmm6, 25
609 por xmm6, xmm8
610 movdqa xmm8, xmm7
611 psrld xmm8, 7
612 pslld xmm7, 25
613 por xmm7, xmm8
614 paddd xmm0, xmmword ptr [rsp+0x60]
615 paddd xmm1, xmmword ptr [rsp+0x90]
616 paddd xmm2, xmmword ptr [rsp+0xB0]
617 paddd xmm3, xmmword ptr [rsp+0x80]
618 paddd xmm0, xmm5
619 paddd xmm1, xmm6
620 paddd xmm2, xmm7
621 paddd xmm3, xmm4
622 pxor xmm15, xmm0
623 pxor xmm12, xmm1
624 pxor xmm13, xmm2
625 pxor xmm14, xmm3
626 movdqa xmm8, xmmword ptr [ROT16+rip]
627 pshufb xmm15, xmm8
628 pshufb xmm12, xmm8
629 pshufb xmm13, xmm8
630 pshufb xmm14, xmm8
631 paddd xmm10, xmm15
632 paddd xmm11, xmm12
633 movdqa xmm8, xmmword ptr [rsp+0x100]
634 paddd xmm8, xmm13
635 paddd xmm9, xmm14
636 pxor xmm5, xmm10
637 pxor xmm6, xmm11
638 pxor xmm7, xmm8
639 pxor xmm4, xmm9
640 movdqa xmmword ptr [rsp+0x100], xmm8
641 movdqa xmm8, xmm5
642 psrld xmm8, 12
643 pslld xmm5, 20
644 por xmm5, xmm8
645 movdqa xmm8, xmm6
646 psrld xmm8, 12
647 pslld xmm6, 20
648 por xmm6, xmm8
649 movdqa xmm8, xmm7
650 psrld xmm8, 12
651 pslld xmm7, 20
652 por xmm7, xmm8
653 movdqa xmm8, xmm4
654 psrld xmm8, 12
655 pslld xmm4, 20
656 por xmm4, xmm8
657 paddd xmm0, xmmword ptr [rsp+0x50]
658 paddd xmm1, xmmword ptr [rsp]
659 paddd xmm2, xmmword ptr [rsp+0xF0]
660 paddd xmm3, xmmword ptr [rsp+0x10]
661 paddd xmm0, xmm5
662 paddd xmm1, xmm6
663 paddd xmm2, xmm7
664 paddd xmm3, xmm4
665 pxor xmm15, xmm0
666 pxor xmm12, xmm1
667 pxor xmm13, xmm2
668 pxor xmm14, xmm3
669 movdqa xmm8, xmmword ptr [ROT8+rip]
670 pshufb xmm15, xmm8
671 pshufb xmm12, xmm8
672 pshufb xmm13, xmm8
673 pshufb xmm14, xmm8
674 paddd xmm10, xmm15
675 paddd xmm11, xmm12
676 movdqa xmm8, xmmword ptr [rsp+0x100]
677 paddd xmm8, xmm13
678 paddd xmm9, xmm14
679 pxor xmm5, xmm10
680 pxor xmm6, xmm11
681 pxor xmm7, xmm8
682 pxor xmm4, xmm9
683 movdqa xmmword ptr [rsp+0x100], xmm8
684 movdqa xmm8, xmm5
685 psrld xmm8, 7
686 pslld xmm5, 25
687 por xmm5, xmm8
688 movdqa xmm8, xmm6
689 psrld xmm8, 7
690 pslld xmm6, 25
691 por xmm6, xmm8
692 movdqa xmm8, xmm7
693 psrld xmm8, 7
694 pslld xmm7, 25
695 por xmm7, xmm8
696 movdqa xmm8, xmm4
697 psrld xmm8, 7
698 pslld xmm4, 25
699 por xmm4, xmm8
700 paddd xmm0, xmmword ptr [rsp+0xA0]
701 paddd xmm1, xmmword ptr [rsp+0xC0]
702 paddd xmm2, xmmword ptr [rsp+0xE0]
703 paddd xmm3, xmmword ptr [rsp+0xD0]
704 paddd xmm0, xmm4
705 paddd xmm1, xmm5
706 paddd xmm2, xmm6
707 paddd xmm3, xmm7
708 pxor xmm12, xmm0
709 pxor xmm13, xmm1
710 pxor xmm14, xmm2
711 pxor xmm15, xmm3
712 movdqa xmm8, xmmword ptr [ROT16+rip]
713 pshufb xmm12, xmm8
714 pshufb xmm13, xmm8
715 pshufb xmm14, xmm8
716 pshufb xmm15, xmm8
717 movdqa xmm8, xmmword ptr [rsp+0x100]
718 paddd xmm8, xmm12
719 paddd xmm9, xmm13
720 paddd xmm10, xmm14
721 paddd xmm11, xmm15
722 pxor xmm4, xmm8
723 pxor xmm5, xmm9
724 pxor xmm6, xmm10
725 pxor xmm7, xmm11
726 movdqa xmmword ptr [rsp+0x100], xmm8
727 movdqa xmm8, xmm4
728 psrld xmm8, 12
729 pslld xmm4, 20
730 por xmm4, xmm8
731 movdqa xmm8, xmm5
732 psrld xmm8, 12
733 pslld xmm5, 20
734 por xmm5, xmm8
735 movdqa xmm8, xmm6
736 psrld xmm8, 12
737 pslld xmm6, 20
738 por xmm6, xmm8
739 movdqa xmm8, xmm7
740 psrld xmm8, 12
741 pslld xmm7, 20
742 por xmm7, xmm8
743 paddd xmm0, xmmword ptr [rsp+0x70]
744 paddd xmm1, xmmword ptr [rsp+0x90]
745 paddd xmm2, xmmword ptr [rsp+0x30]
746 paddd xmm3, xmmword ptr [rsp+0xF0]
747 paddd xmm0, xmm4
748 paddd xmm1, xmm5
749 paddd xmm2, xmm6
750 paddd xmm3, xmm7
751 pxor xmm12, xmm0
752 pxor xmm13, xmm1
753 pxor xmm14, xmm2
754 pxor xmm15, xmm3
755 movdqa xmm8, xmmword ptr [ROT8+rip]
756 pshufb xmm12, xmm8
757 pshufb xmm13, xmm8
758 pshufb xmm14, xmm8
759 pshufb xmm15, xmm8
760 movdqa xmm8, xmmword ptr [rsp+0x100]
761 paddd xmm8, xmm12
762 paddd xmm9, xmm13
763 paddd xmm10, xmm14
764 paddd xmm11, xmm15
765 pxor xmm4, xmm8
766 pxor xmm5, xmm9
767 pxor xmm6, xmm10
768 pxor xmm7, xmm11
769 movdqa xmmword ptr [rsp+0x100], xmm8
770 movdqa xmm8, xmm4
771 psrld xmm8, 7
772 pslld xmm4, 25
773 por xmm4, xmm8
774 movdqa xmm8, xmm5
775 psrld xmm8, 7
776 pslld xmm5, 25
777 por xmm5, xmm8
778 movdqa xmm8, xmm6
779 psrld xmm8, 7
780 pslld xmm6, 25
781 por xmm6, xmm8
782 movdqa xmm8, xmm7
783 psrld xmm8, 7
784 pslld xmm7, 25
785 por xmm7, xmm8
786 paddd xmm0, xmmword ptr [rsp+0x40]
787 paddd xmm1, xmmword ptr [rsp+0xB0]
788 paddd xmm2, xmmword ptr [rsp+0x50]
789 paddd xmm3, xmmword ptr [rsp+0x10]
790 paddd xmm0, xmm5
791 paddd xmm1, xmm6
792 paddd xmm2, xmm7
793 paddd xmm3, xmm4
794 pxor xmm15, xmm0
795 pxor xmm12, xmm1
796 pxor xmm13, xmm2
797 pxor xmm14, xmm3
798 movdqa xmm8, xmmword ptr [ROT16+rip]
799 pshufb xmm15, xmm8
800 pshufb xmm12, xmm8
801 pshufb xmm13, xmm8
802 pshufb xmm14, xmm8
803 paddd xmm10, xmm15
804 paddd xmm11, xmm12
805 movdqa xmm8, xmmword ptr [rsp+0x100]
806 paddd xmm8, xmm13
807 paddd xmm9, xmm14
808 pxor xmm5, xmm10
809 pxor xmm6, xmm11
810 pxor xmm7, xmm8
811 pxor xmm4, xmm9
812 movdqa xmmword ptr [rsp+0x100], xmm8
813 movdqa xmm8, xmm5
814 psrld xmm8, 12
815 pslld xmm5, 20
816 por xmm5, xmm8
817 movdqa xmm8, xmm6
818 psrld xmm8, 12
819 pslld xmm6, 20
820 por xmm6, xmm8
821 movdqa xmm8, xmm7
822 psrld xmm8, 12
823 pslld xmm7, 20
824 por xmm7, xmm8
825 movdqa xmm8, xmm4
826 psrld xmm8, 12
827 pslld xmm4, 20
828 por xmm4, xmm8
829 paddd xmm0, xmmword ptr [rsp]
830 paddd xmm1, xmmword ptr [rsp+0x20]
831 paddd xmm2, xmmword ptr [rsp+0x80]
832 paddd xmm3, xmmword ptr [rsp+0x60]
833 paddd xmm0, xmm5
834 paddd xmm1, xmm6
835 paddd xmm2, xmm7
836 paddd xmm3, xmm4
837 pxor xmm15, xmm0
838 pxor xmm12, xmm1
839 pxor xmm13, xmm2
840 pxor xmm14, xmm3
841 movdqa xmm8, xmmword ptr [ROT8+rip]
842 pshufb xmm15, xmm8
843 pshufb xmm12, xmm8
844 pshufb xmm13, xmm8
845 pshufb xmm14, xmm8
846 paddd xmm10, xmm15
847 paddd xmm11, xmm12
848 movdqa xmm8, xmmword ptr [rsp+0x100]
849 paddd xmm8, xmm13
850 paddd xmm9, xmm14
851 pxor xmm5, xmm10
852 pxor xmm6, xmm11
853 pxor xmm7, xmm8
854 pxor xmm4, xmm9
855 movdqa xmmword ptr [rsp+0x100], xmm8
856 movdqa xmm8, xmm5
857 psrld xmm8, 7
858 pslld xmm5, 25
859 por xmm5, xmm8
860 movdqa xmm8, xmm6
861 psrld xmm8, 7
862 pslld xmm6, 25
863 por xmm6, xmm8
864 movdqa xmm8, xmm7
865 psrld xmm8, 7
866 pslld xmm7, 25
867 por xmm7, xmm8
868 movdqa xmm8, xmm4
869 psrld xmm8, 7
870 pslld xmm4, 25
871 por xmm4, xmm8
872 paddd xmm0, xmmword ptr [rsp+0xC0]
873 paddd xmm1, xmmword ptr [rsp+0x90]
874 paddd xmm2, xmmword ptr [rsp+0xF0]
875 paddd xmm3, xmmword ptr [rsp+0xE0]
876 paddd xmm0, xmm4
877 paddd xmm1, xmm5
878 paddd xmm2, xmm6
879 paddd xmm3, xmm7
880 pxor xmm12, xmm0
881 pxor xmm13, xmm1
882 pxor xmm14, xmm2
883 pxor xmm15, xmm3
884 movdqa xmm8, xmmword ptr [ROT16+rip]
885 pshufb xmm12, xmm8
886 pshufb xmm13, xmm8
887 pshufb xmm14, xmm8
888 pshufb xmm15, xmm8
889 movdqa xmm8, xmmword ptr [rsp+0x100]
890 paddd xmm8, xmm12
891 paddd xmm9, xmm13
892 paddd xmm10, xmm14
893 paddd xmm11, xmm15
894 pxor xmm4, xmm8
895 pxor xmm5, xmm9
896 pxor xmm6, xmm10
897 pxor xmm7, xmm11
898 movdqa xmmword ptr [rsp+0x100], xmm8
899 movdqa xmm8, xmm4
900 psrld xmm8, 12
901 pslld xmm4, 20
902 por xmm4, xmm8
903 movdqa xmm8, xmm5
904 psrld xmm8, 12
905 pslld xmm5, 20
906 por xmm5, xmm8
907 movdqa xmm8, xmm6
908 psrld xmm8, 12
909 pslld xmm6, 20
910 por xmm6, xmm8
911 movdqa xmm8, xmm7
912 psrld xmm8, 12
913 pslld xmm7, 20
914 por xmm7, xmm8
915 paddd xmm0, xmmword ptr [rsp+0xD0]
916 paddd xmm1, xmmword ptr [rsp+0xB0]
917 paddd xmm2, xmmword ptr [rsp+0xA0]
918 paddd xmm3, xmmword ptr [rsp+0x80]
919 paddd xmm0, xmm4
920 paddd xmm1, xmm5
921 paddd xmm2, xmm6
922 paddd xmm3, xmm7
923 pxor xmm12, xmm0
924 pxor xmm13, xmm1
925 pxor xmm14, xmm2
926 pxor xmm15, xmm3
927 movdqa xmm8, xmmword ptr [ROT8+rip]
928 pshufb xmm12, xmm8
929 pshufb xmm13, xmm8
930 pshufb xmm14, xmm8
931 pshufb xmm15, xmm8
932 movdqa xmm8, xmmword ptr [rsp+0x100]
933 paddd xmm8, xmm12
934 paddd xmm9, xmm13
935 paddd xmm10, xmm14
936 paddd xmm11, xmm15
937 pxor xmm4, xmm8
938 pxor xmm5, xmm9
939 pxor xmm6, xmm10
940 pxor xmm7, xmm11
941 movdqa xmmword ptr [rsp+0x100], xmm8
942 movdqa xmm8, xmm4
943 psrld xmm8, 7
944 pslld xmm4, 25
945 por xmm4, xmm8
946 movdqa xmm8, xmm5
947 psrld xmm8, 7
948 pslld xmm5, 25
949 por xmm5, xmm8
950 movdqa xmm8, xmm6
951 psrld xmm8, 7
952 pslld xmm6, 25
953 por xmm6, xmm8
954 movdqa xmm8, xmm7
955 psrld xmm8, 7
956 pslld xmm7, 25
957 por xmm7, xmm8
958 paddd xmm0, xmmword ptr [rsp+0x70]
959 paddd xmm1, xmmword ptr [rsp+0x50]
960 paddd xmm2, xmmword ptr [rsp]
961 paddd xmm3, xmmword ptr [rsp+0x60]
962 paddd xmm0, xmm5
963 paddd xmm1, xmm6
964 paddd xmm2, xmm7
965 paddd xmm3, xmm4
966 pxor xmm15, xmm0
967 pxor xmm12, xmm1
968 pxor xmm13, xmm2
969 pxor xmm14, xmm3
970 movdqa xmm8, xmmword ptr [ROT16+rip]
971 pshufb xmm15, xmm8
972 pshufb xmm12, xmm8
973 pshufb xmm13, xmm8
974 pshufb xmm14, xmm8
975 paddd xmm10, xmm15
976 paddd xmm11, xmm12
977 movdqa xmm8, xmmword ptr [rsp+0x100]
978 paddd xmm8, xmm13
979 paddd xmm9, xmm14
980 pxor xmm5, xmm10
981 pxor xmm6, xmm11
982 pxor xmm7, xmm8
983 pxor xmm4, xmm9
984 movdqa xmmword ptr [rsp+0x100], xmm8
985 movdqa xmm8, xmm5
986 psrld xmm8, 12
987 pslld xmm5, 20
988 por xmm5, xmm8
989 movdqa xmm8, xmm6
990 psrld xmm8, 12
991 pslld xmm6, 20
992 por xmm6, xmm8
993 movdqa xmm8, xmm7
994 psrld xmm8, 12
995 pslld xmm7, 20
996 por xmm7, xmm8
997 movdqa xmm8, xmm4
998 psrld xmm8, 12
999 pslld xmm4, 20
1000 por xmm4, xmm8
1001 paddd xmm0, xmmword ptr [rsp+0x20]
1002 paddd xmm1, xmmword ptr [rsp+0x30]
1003 paddd xmm2, xmmword ptr [rsp+0x10]
1004 paddd xmm3, xmmword ptr [rsp+0x40]
1005 paddd xmm0, xmm5
1006 paddd xmm1, xmm6
1007 paddd xmm2, xmm7
1008 paddd xmm3, xmm4
1009 pxor xmm15, xmm0
1010 pxor xmm12, xmm1
1011 pxor xmm13, xmm2
1012 pxor xmm14, xmm3
1013 movdqa xmm8, xmmword ptr [ROT8+rip]
1014 pshufb xmm15, xmm8
1015 pshufb xmm12, xmm8
1016 pshufb xmm13, xmm8
1017 pshufb xmm14, xmm8
1018 paddd xmm10, xmm15
1019 paddd xmm11, xmm12
1020 movdqa xmm8, xmmword ptr [rsp+0x100]
1021 paddd xmm8, xmm13
1022 paddd xmm9, xmm14
1023 pxor xmm5, xmm10
1024 pxor xmm6, xmm11
1025 pxor xmm7, xmm8
1026 pxor xmm4, xmm9
1027 movdqa xmmword ptr [rsp+0x100], xmm8
1028 movdqa xmm8, xmm5
1029 psrld xmm8, 7
1030 pslld xmm5, 25
1031 por xmm5, xmm8
1032 movdqa xmm8, xmm6
1033 psrld xmm8, 7
1034 pslld xmm6, 25
1035 por xmm6, xmm8
1036 movdqa xmm8, xmm7
1037 psrld xmm8, 7
1038 pslld xmm7, 25
1039 por xmm7, xmm8
1040 movdqa xmm8, xmm4
1041 psrld xmm8, 7
1042 pslld xmm4, 25
1043 por xmm4, xmm8
1044 paddd xmm0, xmmword ptr [rsp+0x90]
1045 paddd xmm1, xmmword ptr [rsp+0xB0]
1046 paddd xmm2, xmmword ptr [rsp+0x80]
1047 paddd xmm3, xmmword ptr [rsp+0xF0]
1048 paddd xmm0, xmm4
1049 paddd xmm1, xmm5
1050 paddd xmm2, xmm6
1051 paddd xmm3, xmm7
1052 pxor xmm12, xmm0
1053 pxor xmm13, xmm1
1054 pxor xmm14, xmm2
1055 pxor xmm15, xmm3
1056 movdqa xmm8, xmmword ptr [ROT16+rip]
1057 pshufb xmm12, xmm8
1058 pshufb xmm13, xmm8
1059 pshufb xmm14, xmm8
1060 pshufb xmm15, xmm8
1061 movdqa xmm8, xmmword ptr [rsp+0x100]
1062 paddd xmm8, xmm12
1063 paddd xmm9, xmm13
1064 paddd xmm10, xmm14
1065 paddd xmm11, xmm15
1066 pxor xmm4, xmm8
1067 pxor xmm5, xmm9
1068 pxor xmm6, xmm10
1069 pxor xmm7, xmm11
1070 movdqa xmmword ptr [rsp+0x100], xmm8
1071 movdqa xmm8, xmm4
1072 psrld xmm8, 12
1073 pslld xmm4, 20
1074 por xmm4, xmm8
1075 movdqa xmm8, xmm5
1076 psrld xmm8, 12
1077 pslld xmm5, 20
1078 por xmm5, xmm8
1079 movdqa xmm8, xmm6
1080 psrld xmm8, 12
1081 pslld xmm6, 20
1082 por xmm6, xmm8
1083 movdqa xmm8, xmm7
1084 psrld xmm8, 12
1085 pslld xmm7, 20
1086 por xmm7, xmm8
1087 paddd xmm0, xmmword ptr [rsp+0xE0]
1088 paddd xmm1, xmmword ptr [rsp+0x50]
1089 paddd xmm2, xmmword ptr [rsp+0xC0]
1090 paddd xmm3, xmmword ptr [rsp+0x10]
1091 paddd xmm0, xmm4
1092 paddd xmm1, xmm5
1093 paddd xmm2, xmm6
1094 paddd xmm3, xmm7
1095 pxor xmm12, xmm0
1096 pxor xmm13, xmm1
1097 pxor xmm14, xmm2
1098 pxor xmm15, xmm3
1099 movdqa xmm8, xmmword ptr [ROT8+rip]
1100 pshufb xmm12, xmm8
1101 pshufb xmm13, xmm8
1102 pshufb xmm14, xmm8
1103 pshufb xmm15, xmm8
1104 movdqa xmm8, xmmword ptr [rsp+0x100]
1105 paddd xmm8, xmm12
1106 paddd xmm9, xmm13
1107 paddd xmm10, xmm14
1108 paddd xmm11, xmm15
1109 pxor xmm4, xmm8
1110 pxor xmm5, xmm9
1111 pxor xmm6, xmm10
1112 pxor xmm7, xmm11
1113 movdqa xmmword ptr [rsp+0x100], xmm8
1114 movdqa xmm8, xmm4
1115 psrld xmm8, 7
1116 pslld xmm4, 25
1117 por xmm4, xmm8
1118 movdqa xmm8, xmm5
1119 psrld xmm8, 7
1120 pslld xmm5, 25
1121 por xmm5, xmm8
1122 movdqa xmm8, xmm6
1123 psrld xmm8, 7
1124 pslld xmm6, 25
1125 por xmm6, xmm8
1126 movdqa xmm8, xmm7
1127 psrld xmm8, 7
1128 pslld xmm7, 25
1129 por xmm7, xmm8
1130 paddd xmm0, xmmword ptr [rsp+0xD0]
1131 paddd xmm1, xmmword ptr [rsp]
1132 paddd xmm2, xmmword ptr [rsp+0x20]
1133 paddd xmm3, xmmword ptr [rsp+0x40]
1134 paddd xmm0, xmm5
1135 paddd xmm1, xmm6
1136 paddd xmm2, xmm7
1137 paddd xmm3, xmm4
1138 pxor xmm15, xmm0
1139 pxor xmm12, xmm1
1140 pxor xmm13, xmm2
1141 pxor xmm14, xmm3
1142 movdqa xmm8, xmmword ptr [ROT16+rip]
1143 pshufb xmm15, xmm8
1144 pshufb xmm12, xmm8
1145 pshufb xmm13, xmm8
1146 pshufb xmm14, xmm8
1147 paddd xmm10, xmm15
1148 paddd xmm11, xmm12
1149 movdqa xmm8, xmmword ptr [rsp+0x100]
1150 paddd xmm8, xmm13
1151 paddd xmm9, xmm14
1152 pxor xmm5, xmm10
1153 pxor xmm6, xmm11
1154 pxor xmm7, xmm8
1155 pxor xmm4, xmm9
1156 movdqa xmmword ptr [rsp+0x100], xmm8
1157 movdqa xmm8, xmm5
1158 psrld xmm8, 12
1159 pslld xmm5, 20
1160 por xmm5, xmm8
1161 movdqa xmm8, xmm6
1162 psrld xmm8, 12
1163 pslld xmm6, 20
1164 por xmm6, xmm8
1165 movdqa xmm8, xmm7
1166 psrld xmm8, 12
1167 pslld xmm7, 20
1168 por xmm7, xmm8
1169 movdqa xmm8, xmm4
1170 psrld xmm8, 12
1171 pslld xmm4, 20
1172 por xmm4, xmm8
1173 paddd xmm0, xmmword ptr [rsp+0x30]
1174 paddd xmm1, xmmword ptr [rsp+0xA0]
1175 paddd xmm2, xmmword ptr [rsp+0x60]
1176 paddd xmm3, xmmword ptr [rsp+0x70]
1177 paddd xmm0, xmm5
1178 paddd xmm1, xmm6
1179 paddd xmm2, xmm7
1180 paddd xmm3, xmm4
1181 pxor xmm15, xmm0
1182 pxor xmm12, xmm1
1183 pxor xmm13, xmm2
1184 pxor xmm14, xmm3
1185 movdqa xmm8, xmmword ptr [ROT8+rip]
1186 pshufb xmm15, xmm8
1187 pshufb xmm12, xmm8
1188 pshufb xmm13, xmm8
1189 pshufb xmm14, xmm8
1190 paddd xmm10, xmm15
1191 paddd xmm11, xmm12
1192 movdqa xmm8, xmmword ptr [rsp+0x100]
1193 paddd xmm8, xmm13
1194 paddd xmm9, xmm14
1195 pxor xmm5, xmm10
1196 pxor xmm6, xmm11
1197 pxor xmm7, xmm8
1198 pxor xmm4, xmm9
1199 movdqa xmmword ptr [rsp+0x100], xmm8
1200 movdqa xmm8, xmm5
1201 psrld xmm8, 7
1202 pslld xmm5, 25
1203 por xmm5, xmm8
1204 movdqa xmm8, xmm6
1205 psrld xmm8, 7
1206 pslld xmm6, 25
1207 por xmm6, xmm8
1208 movdqa xmm8, xmm7
1209 psrld xmm8, 7
1210 pslld xmm7, 25
1211 por xmm7, xmm8
1212 movdqa xmm8, xmm4
1213 psrld xmm8, 7
1214 pslld xmm4, 25
1215 por xmm4, xmm8
1216 paddd xmm0, xmmword ptr [rsp+0xB0]
1217 paddd xmm1, xmmword ptr [rsp+0x50]
1218 paddd xmm2, xmmword ptr [rsp+0x10]
1219 paddd xmm3, xmmword ptr [rsp+0x80]
1220 paddd xmm0, xmm4
1221 paddd xmm1, xmm5
1222 paddd xmm2, xmm6
1223 paddd xmm3, xmm7
1224 pxor xmm12, xmm0
1225 pxor xmm13, xmm1
1226 pxor xmm14, xmm2
1227 pxor xmm15, xmm3
1228 movdqa xmm8, xmmword ptr [ROT16+rip]
1229 pshufb xmm12, xmm8
1230 pshufb xmm13, xmm8
1231 pshufb xmm14, xmm8
1232 pshufb xmm15, xmm8
1233 movdqa xmm8, xmmword ptr [rsp+0x100]
1234 paddd xmm8, xmm12
1235 paddd xmm9, xmm13
1236 paddd xmm10, xmm14
1237 paddd xmm11, xmm15
1238 pxor xmm4, xmm8
1239 pxor xmm5, xmm9
1240 pxor xmm6, xmm10
1241 pxor xmm7, xmm11
1242 movdqa xmmword ptr [rsp+0x100], xmm8
1243 movdqa xmm8, xmm4
1244 psrld xmm8, 12
1245 pslld xmm4, 20
1246 por xmm4, xmm8
1247 movdqa xmm8, xmm5
1248 psrld xmm8, 12
1249 pslld xmm5, 20
1250 por xmm5, xmm8
1251 movdqa xmm8, xmm6
1252 psrld xmm8, 12
1253 pslld xmm6, 20
1254 por xmm6, xmm8
1255 movdqa xmm8, xmm7
1256 psrld xmm8, 12
1257 pslld xmm7, 20
1258 por xmm7, xmm8
1259 paddd xmm0, xmmword ptr [rsp+0xF0]
1260 paddd xmm1, xmmword ptr [rsp]
1261 paddd xmm2, xmmword ptr [rsp+0x90]
1262 paddd xmm3, xmmword ptr [rsp+0x60]
1263 paddd xmm0, xmm4
1264 paddd xmm1, xmm5
1265 paddd xmm2, xmm6
1266 paddd xmm3, xmm7
1267 pxor xmm12, xmm0
1268 pxor xmm13, xmm1
1269 pxor xmm14, xmm2
1270 pxor xmm15, xmm3
1271 movdqa xmm8, xmmword ptr [ROT8+rip]
1272 pshufb xmm12, xmm8
1273 pshufb xmm13, xmm8
1274 pshufb xmm14, xmm8
1275 pshufb xmm15, xmm8
1276 movdqa xmm8, xmmword ptr [rsp+0x100]
1277 paddd xmm8, xmm12
1278 paddd xmm9, xmm13
1279 paddd xmm10, xmm14
1280 paddd xmm11, xmm15
1281 pxor xmm4, xmm8
1282 pxor xmm5, xmm9
1283 pxor xmm6, xmm10
1284 pxor xmm7, xmm11
1285 movdqa xmmword ptr [rsp+0x100], xmm8
1286 movdqa xmm8, xmm4
1287 psrld xmm8, 7
1288 pslld xmm4, 25
1289 por xmm4, xmm8
1290 movdqa xmm8, xmm5
1291 psrld xmm8, 7
1292 pslld xmm5, 25
1293 por xmm5, xmm8
1294 movdqa xmm8, xmm6
1295 psrld xmm8, 7
1296 pslld xmm6, 25
1297 por xmm6, xmm8
1298 movdqa xmm8, xmm7
1299 psrld xmm8, 7
1300 pslld xmm7, 25
1301 por xmm7, xmm8
1302 paddd xmm0, xmmword ptr [rsp+0xE0]
1303 paddd xmm1, xmmword ptr [rsp+0x20]
1304 paddd xmm2, xmmword ptr [rsp+0x30]
1305 paddd xmm3, xmmword ptr [rsp+0x70]
1306 paddd xmm0, xmm5
1307 paddd xmm1, xmm6
1308 paddd xmm2, xmm7
1309 paddd xmm3, xmm4
1310 pxor xmm15, xmm0
1311 pxor xmm12, xmm1
1312 pxor xmm13, xmm2
1313 pxor xmm14, xmm3
1314 movdqa xmm8, xmmword ptr [ROT16+rip]
1315 pshufb xmm15, xmm8
1316 pshufb xmm12, xmm8
1317 pshufb xmm13, xmm8
1318 pshufb xmm14, xmm8
1319 paddd xmm10, xmm15
1320 paddd xmm11, xmm12
1321 movdqa xmm8, xmmword ptr [rsp+0x100]
1322 paddd xmm8, xmm13
1323 paddd xmm9, xmm14
1324 pxor xmm5, xmm10
1325 pxor xmm6, xmm11
1326 pxor xmm7, xmm8
1327 pxor xmm4, xmm9
1328 movdqa xmmword ptr [rsp+0x100], xmm8
1329 movdqa xmm8, xmm5
1330 psrld xmm8, 12
1331 pslld xmm5, 20
1332 por xmm5, xmm8
1333 movdqa xmm8, xmm6
1334 psrld xmm8, 12
1335 pslld xmm6, 20
1336 por xmm6, xmm8
1337 movdqa xmm8, xmm7
1338 psrld xmm8, 12
1339 pslld xmm7, 20
1340 por xmm7, xmm8
1341 movdqa xmm8, xmm4
1342 psrld xmm8, 12
1343 pslld xmm4, 20
1344 por xmm4, xmm8
1345 paddd xmm0, xmmword ptr [rsp+0xA0]
1346 paddd xmm1, xmmword ptr [rsp+0xC0]
1347 paddd xmm2, xmmword ptr [rsp+0x40]
1348 paddd xmm3, xmmword ptr [rsp+0xD0]
1349 paddd xmm0, xmm5
1350 paddd xmm1, xmm6
1351 paddd xmm2, xmm7
1352 paddd xmm3, xmm4
1353 pxor xmm15, xmm0
1354 pxor xmm12, xmm1
1355 pxor xmm13, xmm2
1356 pxor xmm14, xmm3
1357 movdqa xmm8, xmmword ptr [ROT8+rip]
1358 pshufb xmm15, xmm8
1359 pshufb xmm12, xmm8
1360 pshufb xmm13, xmm8
1361 pshufb xmm14, xmm8
1362 paddd xmm10, xmm15
1363 paddd xmm11, xmm12
1364 movdqa xmm8, xmmword ptr [rsp+0x100]
1365 paddd xmm8, xmm13
1366 paddd xmm9, xmm14
1367 pxor xmm5, xmm10
1368 pxor xmm6, xmm11
1369 pxor xmm7, xmm8
1370 pxor xmm4, xmm9
1371 pxor xmm0, xmm8
1372 pxor xmm1, xmm9
1373 pxor xmm2, xmm10
1374 pxor xmm3, xmm11
1375 movdqa xmm8, xmm5
1376 psrld xmm8, 7
1377 pslld xmm5, 25
1378 por xmm5, xmm8
1379 movdqa xmm8, xmm6
1380 psrld xmm8, 7
1381 pslld xmm6, 25
1382 por xmm6, xmm8
1383 movdqa xmm8, xmm7
1384 psrld xmm8, 7
1385 pslld xmm7, 25
1386 por xmm7, xmm8
1387 movdqa xmm8, xmm4
1388 psrld xmm8, 7
1389 pslld xmm4, 25
1390 por xmm4, xmm8
1391 pxor xmm4, xmm12
1392 pxor xmm5, xmm13
1393 pxor xmm6, xmm14
1394 pxor xmm7, xmm15
1395 mov eax, r13d
1396 jne 9b
1397 movdqa xmm9, xmm0
1398 punpckldq xmm0, xmm1
1399 punpckhdq xmm9, xmm1
1400 movdqa xmm11, xmm2
1401 punpckldq xmm2, xmm3
1402 punpckhdq xmm11, xmm3
1403 movdqa xmm1, xmm0
1404 punpcklqdq xmm0, xmm2
1405 punpckhqdq xmm1, xmm2
1406 movdqa xmm3, xmm9
1407 punpcklqdq xmm9, xmm11
1408 punpckhqdq xmm3, xmm11
1409 movdqu xmmword ptr [rbx], xmm0
1410 movdqu xmmword ptr [rbx+0x20], xmm1
1411 movdqu xmmword ptr [rbx+0x40], xmm9
1412 movdqu xmmword ptr [rbx+0x60], xmm3
1413 movdqa xmm9, xmm4
1414 punpckldq xmm4, xmm5
1415 punpckhdq xmm9, xmm5
1416 movdqa xmm11, xmm6
1417 punpckldq xmm6, xmm7
1418 punpckhdq xmm11, xmm7
1419 movdqa xmm5, xmm4
1420 punpcklqdq xmm4, xmm6
1421 punpckhqdq xmm5, xmm6
1422 movdqa xmm7, xmm9
1423 punpcklqdq xmm9, xmm11
1424 punpckhqdq xmm7, xmm11
1425 movdqu xmmword ptr [rbx+0x10], xmm4
1426 movdqu xmmword ptr [rbx+0x30], xmm5
1427 movdqu xmmword ptr [rbx+0x50], xmm9
1428 movdqu xmmword ptr [rbx+0x70], xmm7
1429 movdqa xmm1, xmmword ptr [rsp+0x110]
1430 movdqa xmm0, xmm1
1431 paddd xmm1, xmmword ptr [rsp+0x150]
1432 movdqa xmmword ptr [rsp+0x110], xmm1
1433 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
1434 pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
1435 pcmpgtd xmm0, xmm1
1436 movdqa xmm1, xmmword ptr [rsp+0x120]
1437 psubd xmm1, xmm0
1438 movdqa xmmword ptr [rsp+0x120], xmm1
1439 add rbx, 128
1440 add rdi, 32
1441 sub rsi, 4
1442 cmp rsi, 4
1443 jnc 2b
1444 test rsi, rsi
1445 jne 3f
14464:
1447 movdqa xmm6, xmmword ptr [rsp+0x170]
1448 movdqa xmm7, xmmword ptr [rsp+0x180]
1449 movdqa xmm8, xmmword ptr [rsp+0x190]
1450 movdqa xmm9, xmmword ptr [rsp+0x1A0]
1451 movdqa xmm10, xmmword ptr [rsp+0x1B0]
1452 movdqa xmm11, xmmword ptr [rsp+0x1C0]
1453 movdqa xmm12, xmmword ptr [rsp+0x1D0]
1454 movdqa xmm13, xmmword ptr [rsp+0x1E0]
1455 movdqa xmm14, xmmword ptr [rsp+0x1F0]
1456 movdqa xmm15, xmmword ptr [rsp+0x200]
1457 mov rsp, rbp
1458 pop rbp
1459 pop rbx
1460 pop rdi
1461 pop rsi
1462 pop r12
1463 pop r13
1464 pop r14
1465 pop r15
1466 ret
1467.p2align 5
14683:
1469 test esi, 0x2
1470 je 3f
1471 movups xmm0, xmmword ptr [rcx]
1472 movups xmm1, xmmword ptr [rcx+0x10]
1473 movaps xmm8, xmm0
1474 movaps xmm9, xmm1
1475 movd xmm13, dword ptr [rsp+0x110]
1476 pinsrd xmm13, dword ptr [rsp+0x120], 1
1477 pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1478 movaps xmmword ptr [rsp], xmm13
1479 movd xmm14, dword ptr [rsp+0x114]
1480 pinsrd xmm14, dword ptr [rsp+0x124], 1
1481 pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1482 movaps xmmword ptr [rsp+0x10], xmm14
1483 mov r8, qword ptr [rdi]
1484 mov r9, qword ptr [rdi+0x8]
1485 movzx eax, byte ptr [rbp+0x80]
1486 or eax, r13d
1487 xor edx, edx
14882:
1489 mov r14d, eax
1490 or eax, r12d
1491 add rdx, 64
1492 cmp rdx, r15
1493 cmovne eax, r14d
1494 movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
1495 movaps xmm10, xmm2
1496 movups xmm4, xmmword ptr [r8+rdx-0x40]
1497 movups xmm5, xmmword ptr [r8+rdx-0x30]
1498 movaps xmm3, xmm4
1499 shufps xmm4, xmm5, 136
1500 shufps xmm3, xmm5, 221
1501 movaps xmm5, xmm3
1502 movups xmm6, xmmword ptr [r8+rdx-0x20]
1503 movups xmm7, xmmword ptr [r8+rdx-0x10]
1504 movaps xmm3, xmm6
1505 shufps xmm6, xmm7, 136
1506 pshufd xmm6, xmm6, 0x93
1507 shufps xmm3, xmm7, 221
1508 pshufd xmm7, xmm3, 0x93
1509 movups xmm12, xmmword ptr [r9+rdx-0x40]
1510 movups xmm13, xmmword ptr [r9+rdx-0x30]
1511 movaps xmm11, xmm12
1512 shufps xmm12, xmm13, 136
1513 shufps xmm11, xmm13, 221
1514 movaps xmm13, xmm11
1515 movups xmm14, xmmword ptr [r9+rdx-0x20]
1516 movups xmm15, xmmword ptr [r9+rdx-0x10]
1517 movaps xmm11, xmm14
1518 shufps xmm14, xmm15, 136
1519 pshufd xmm14, xmm14, 0x93
1520 shufps xmm11, xmm15, 221
1521 pshufd xmm15, xmm11, 0x93
1522 movaps xmm3, xmmword ptr [rsp]
1523 movaps xmm11, xmmword ptr [rsp+0x10]
1524 pinsrd xmm3, eax, 3
1525 pinsrd xmm11, eax, 3
1526 mov al, 7
15279:
1528 paddd xmm0, xmm4
1529 paddd xmm8, xmm12
1530 movaps xmmword ptr [rsp+0x20], xmm4
1531 movaps xmmword ptr [rsp+0x30], xmm12
1532 paddd xmm0, xmm1
1533 paddd xmm8, xmm9
1534 pxor xmm3, xmm0
1535 pxor xmm11, xmm8
1536 movaps xmm12, xmmword ptr [ROT16+rip]
1537 pshufb xmm3, xmm12
1538 pshufb xmm11, xmm12
1539 paddd xmm2, xmm3
1540 paddd xmm10, xmm11
1541 pxor xmm1, xmm2
1542 pxor xmm9, xmm10
1543 movdqa xmm4, xmm1
1544 pslld xmm1, 20
1545 psrld xmm4, 12
1546 por xmm1, xmm4
1547 movdqa xmm4, xmm9
1548 pslld xmm9, 20
1549 psrld xmm4, 12
1550 por xmm9, xmm4
1551 paddd xmm0, xmm5
1552 paddd xmm8, xmm13
1553 movaps xmmword ptr [rsp+0x40], xmm5
1554 movaps xmmword ptr [rsp+0x50], xmm13
1555 paddd xmm0, xmm1
1556 paddd xmm8, xmm9
1557 pxor xmm3, xmm0
1558 pxor xmm11, xmm8
1559 movaps xmm13, xmmword ptr [ROT8+rip]
1560 pshufb xmm3, xmm13
1561 pshufb xmm11, xmm13
1562 paddd xmm2, xmm3
1563 paddd xmm10, xmm11
1564 pxor xmm1, xmm2
1565 pxor xmm9, xmm10
1566 movdqa xmm4, xmm1
1567 pslld xmm1, 25
1568 psrld xmm4, 7
1569 por xmm1, xmm4
1570 movdqa xmm4, xmm9
1571 pslld xmm9, 25
1572 psrld xmm4, 7
1573 por xmm9, xmm4
1574 pshufd xmm0, xmm0, 0x93
1575 pshufd xmm8, xmm8, 0x93
1576 pshufd xmm3, xmm3, 0x4E
1577 pshufd xmm11, xmm11, 0x4E
1578 pshufd xmm2, xmm2, 0x39
1579 pshufd xmm10, xmm10, 0x39
1580 paddd xmm0, xmm6
1581 paddd xmm8, xmm14
1582 paddd xmm0, xmm1
1583 paddd xmm8, xmm9
1584 pxor xmm3, xmm0
1585 pxor xmm11, xmm8
1586 pshufb xmm3, xmm12
1587 pshufb xmm11, xmm12
1588 paddd xmm2, xmm3
1589 paddd xmm10, xmm11
1590 pxor xmm1, xmm2
1591 pxor xmm9, xmm10
1592 movdqa xmm4, xmm1
1593 pslld xmm1, 20
1594 psrld xmm4, 12
1595 por xmm1, xmm4
1596 movdqa xmm4, xmm9
1597 pslld xmm9, 20
1598 psrld xmm4, 12
1599 por xmm9, xmm4
1600 paddd xmm0, xmm7
1601 paddd xmm8, xmm15
1602 paddd xmm0, xmm1
1603 paddd xmm8, xmm9
1604 pxor xmm3, xmm0
1605 pxor xmm11, xmm8
1606 pshufb xmm3, xmm13
1607 pshufb xmm11, xmm13
1608 paddd xmm2, xmm3
1609 paddd xmm10, xmm11
1610 pxor xmm1, xmm2
1611 pxor xmm9, xmm10
1612 movdqa xmm4, xmm1
1613 pslld xmm1, 25
1614 psrld xmm4, 7
1615 por xmm1, xmm4
1616 movdqa xmm4, xmm9
1617 pslld xmm9, 25
1618 psrld xmm4, 7
1619 por xmm9, xmm4
1620 pshufd xmm0, xmm0, 0x39
1621 pshufd xmm8, xmm8, 0x39
1622 pshufd xmm3, xmm3, 0x4E
1623 pshufd xmm11, xmm11, 0x4E
1624 pshufd xmm2, xmm2, 0x93
1625 pshufd xmm10, xmm10, 0x93
1626 dec al
1627 je 9f
1628 movdqa xmm12, xmmword ptr [rsp+0x20]
1629 movdqa xmm5, xmmword ptr [rsp+0x40]
1630 pshufd xmm13, xmm12, 0x0F
1631 shufps xmm12, xmm5, 214
1632 pshufd xmm4, xmm12, 0x39
1633 movdqa xmm12, xmm6
1634 shufps xmm12, xmm7, 250
1635 pblendw xmm13, xmm12, 0xCC
1636 movdqa xmm12, xmm7
1637 punpcklqdq xmm12, xmm5
1638 pblendw xmm12, xmm6, 0xC0
1639 pshufd xmm12, xmm12, 0x78
1640 punpckhdq xmm5, xmm7
1641 punpckldq xmm6, xmm5
1642 pshufd xmm7, xmm6, 0x1E
1643 movdqa xmmword ptr [rsp+0x20], xmm13
1644 movdqa xmmword ptr [rsp+0x40], xmm12
1645 movdqa xmm5, xmmword ptr [rsp+0x30]
1646 movdqa xmm13, xmmword ptr [rsp+0x50]
1647 pshufd xmm6, xmm5, 0x0F
1648 shufps xmm5, xmm13, 214
1649 pshufd xmm12, xmm5, 0x39
1650 movdqa xmm5, xmm14
1651 shufps xmm5, xmm15, 250
1652 pblendw xmm6, xmm5, 0xCC
1653 movdqa xmm5, xmm15
1654 punpcklqdq xmm5, xmm13
1655 pblendw xmm5, xmm14, 0xC0
1656 pshufd xmm5, xmm5, 0x78
1657 punpckhdq xmm13, xmm15
1658 punpckldq xmm14, xmm13
1659 pshufd xmm15, xmm14, 0x1E
1660 movdqa xmm13, xmm6
1661 movdqa xmm14, xmm5
1662 movdqa xmm5, xmmword ptr [rsp+0x20]
1663 movdqa xmm6, xmmword ptr [rsp+0x40]
1664 jmp 9b
16659:
1666 pxor xmm0, xmm2
1667 pxor xmm1, xmm3
1668 pxor xmm8, xmm10
1669 pxor xmm9, xmm11
1670 mov eax, r13d
1671 cmp rdx, r15
1672 jne 2b
1673 movups xmmword ptr [rbx], xmm0
1674 movups xmmword ptr [rbx+0x10], xmm1
1675 movups xmmword ptr [rbx+0x20], xmm8
1676 movups xmmword ptr [rbx+0x30], xmm9
1677 movdqa xmm0, xmmword ptr [rsp+0x130]
1678 movdqa xmm1, xmmword ptr [rsp+0x110]
1679 movdqa xmm2, xmmword ptr [rsp+0x120]
1680 movdqu xmm3, xmmword ptr [rsp+0x118]
1681 movdqu xmm4, xmmword ptr [rsp+0x128]
1682 blendvps xmm1, xmm3, xmm0
1683 blendvps xmm2, xmm4, xmm0
1684 movdqa xmmword ptr [rsp+0x110], xmm1
1685 movdqa xmmword ptr [rsp+0x120], xmm2
1686 add rdi, 16
1687 add rbx, 64
1688 sub rsi, 2
16893:
1690 test esi, 0x1
1691 je 4b
1692 movups xmm0, xmmword ptr [rcx]
1693 movups xmm1, xmmword ptr [rcx+0x10]
1694 movd xmm13, dword ptr [rsp+0x110]
1695 pinsrd xmm13, dword ptr [rsp+0x120], 1
1696 pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1697 movaps xmm14, xmmword ptr [ROT8+rip]
1698 movaps xmm15, xmmword ptr [ROT16+rip]
1699 mov r8, qword ptr [rdi]
1700 movzx eax, byte ptr [rbp+0x80]
1701 or eax, r13d
1702 xor edx, edx
17032:
1704 mov r14d, eax
1705 or eax, r12d
1706 add rdx, 64
1707 cmp rdx, r15
1708 cmovne eax, r14d
1709 movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
1710 movaps xmm3, xmm13
1711 pinsrd xmm3, eax, 3
1712 movups xmm4, xmmword ptr [r8+rdx-0x40]
1713 movups xmm5, xmmword ptr [r8+rdx-0x30]
1714 movaps xmm8, xmm4
1715 shufps xmm4, xmm5, 136
1716 shufps xmm8, xmm5, 221
1717 movaps xmm5, xmm8
1718 movups xmm6, xmmword ptr [r8+rdx-0x20]
1719 movups xmm7, xmmword ptr [r8+rdx-0x10]
1720 movaps xmm8, xmm6
1721 shufps xmm6, xmm7, 136
1722 pshufd xmm6, xmm6, 0x93
1723 shufps xmm8, xmm7, 221
1724 pshufd xmm7, xmm8, 0x93
1725 mov al, 7
17269:
1727 paddd xmm0, xmm4
1728 paddd xmm0, xmm1
1729 pxor xmm3, xmm0
1730 pshufb xmm3, xmm15
1731 paddd xmm2, xmm3
1732 pxor xmm1, xmm2
1733 movdqa xmm11, xmm1
1734 pslld xmm1, 20
1735 psrld xmm11, 12
1736 por xmm1, xmm11
1737 paddd xmm0, xmm5
1738 paddd xmm0, xmm1
1739 pxor xmm3, xmm0
1740 pshufb xmm3, xmm14
1741 paddd xmm2, xmm3
1742 pxor xmm1, xmm2
1743 movdqa xmm11, xmm1
1744 pslld xmm1, 25
1745 psrld xmm11, 7
1746 por xmm1, xmm11
1747 pshufd xmm0, xmm0, 0x93
1748 pshufd xmm3, xmm3, 0x4E
1749 pshufd xmm2, xmm2, 0x39
1750 paddd xmm0, xmm6
1751 paddd xmm0, xmm1
1752 pxor xmm3, xmm0
1753 pshufb xmm3, xmm15
1754 paddd xmm2, xmm3
1755 pxor xmm1, xmm2
1756 movdqa xmm11, xmm1
1757 pslld xmm1, 20
1758 psrld xmm11, 12
1759 por xmm1, xmm11
1760 paddd xmm0, xmm7
1761 paddd xmm0, xmm1
1762 pxor xmm3, xmm0
1763 pshufb xmm3, xmm14
1764 paddd xmm2, xmm3
1765 pxor xmm1, xmm2
1766 movdqa xmm11, xmm1
1767 pslld xmm1, 25
1768 psrld xmm11, 7
1769 por xmm1, xmm11
1770 pshufd xmm0, xmm0, 0x39
1771 pshufd xmm3, xmm3, 0x4E
1772 pshufd xmm2, xmm2, 0x93
1773 dec al
1774 jz 9f
1775 movdqa xmm8, xmm4
1776 shufps xmm8, xmm5, 214
1777 pshufd xmm9, xmm4, 0x0F
1778 pshufd xmm4, xmm8, 0x39
1779 movdqa xmm8, xmm6
1780 shufps xmm8, xmm7, 250
1781 pblendw xmm9, xmm8, 0xCC
1782 movdqa xmm8, xmm7
1783 punpcklqdq xmm8, xmm5
1784 pblendw xmm8, xmm6, 0xC0
1785 pshufd xmm8, xmm8, 0x78
1786 punpckhdq xmm5, xmm7
1787 punpckldq xmm6, xmm5
1788 pshufd xmm7, xmm6, 0x1E
1789 movdqa xmm5, xmm9
1790 movdqa xmm6, xmm8
1791 jmp 9b
17929:
1793 pxor xmm0, xmm2
1794 pxor xmm1, xmm3
1795 mov eax, r13d
1796 cmp rdx, r15
1797 jne 2b
1798 movups xmmword ptr [rbx], xmm0
1799 movups xmmword ptr [rbx+0x10], xmm1
1800 jmp 4b
1801
/*
 * blake3_compress_in_place_sse41
 *
 * Runs one 7-round compression over a 4x4 state of 32-bit words held as four
 * row vectors, and writes the 32-byte result back over the input chaining
 * value.  Arguments are taken the Windows x64 way (rcx, rdx, r8, r9, stack).
 *
 * In:
 *   rcx          -> 32 bytes (8 dwords): state rows 0-1; overwritten on exit
 *   rdx          -> 64 bytes (16 dwords) of message; read only
 *   r8b          byte that becomes state word 14
 *                (NOTE(review): presumably block_len - confirm against the C prototype)
 *   r9           64-bit value that becomes state words 12-13
 *                (NOTE(review): presumably the chunk counter - confirm)
 *   [rsp+40]     fifth argument at entry (ret addr + 32-byte shadow space),
 *                one byte that becomes state word 15
 *                (NOTE(review): presumably the flags - confirm)
 * Out:
 *   [rcx..rcx+31] = (row0 ^ row2, row1 ^ row3) of the final state
 * Clobbers:
 *   rax, r8, xmm0-xmm5, flags
 * Preserves:
 *   xmm6-xmm9, xmm11, xmm14, xmm15 (the callee-saved XMM registers used here)
 *   are spilled to a local frame and reloaded before returning.
 *
 * Register roles inside the round loop:
 *   xmm0-xmm3    state rows 0-3
 *   xmm4-xmm7    message words, pre-arranged for the four G applications
 *   xmm8, xmm9   scratch for the message permutation
 *   xmm11        scratch for the shift-based rotations
 *   xmm14, xmm15 pshufb masks ROT8 / ROT16
 *   al           rounds remaining
 *
 * NOTE(review): rsp is adjusted without any .seh_* unwind directives.
 */
.p2align 6
blake3_compress_in_place_sse41:
_blake3_compress_in_place_sse41:
        /* Frame: 7 x 16 bytes of XMM save area + 8 bytes of padding.  rsp is
           8 mod 16 at entry, so subtracting 120 makes the movdqa slots
           16-byte aligned. */
        sub     rsp, 120
        movdqa  xmmword ptr [rsp], xmm6
        movdqa  xmmword ptr [rsp+0x10], xmm7
        movdqa  xmmword ptr [rsp+0x20], xmm8
        movdqa  xmmword ptr [rsp+0x30], xmm9
        movdqa  xmmword ptr [rsp+0x40], xmm11
        movdqa  xmmword ptr [rsp+0x50], xmm14
        movdqa  xmmword ptr [rsp+0x60], xmm15

        /* Rows 0-1 come from [rcx], row 2 is the IV constant. */
        movups  xmm0, xmmword ptr [rcx]
        movups  xmm1, xmmword ptr [rcx+0x10]
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]

        /* Row 3 = { r9 low dword, r9 high dword, r8b, fifth-argument byte }.
           0xA0 = 120 (frame) + 8 (return address) + 32 (shadow space). */
        movzx   eax, byte ptr [rsp+0xA0]
        movzx   r8d, r8b
        shl     rax, 32
        add     r8, rax
        movq    xmm3, r9
        movq    xmm4, r8
        punpcklqdq xmm3, xmm4

        /* Load the message and split it into even/odd words:
           shufps 136 (0x88) keeps dwords 0,2 of each source, 221 (0xDD)
           keeps dwords 1,3.  The second half is additionally rotated by one
           lane (pshufd 0x93) to match the diagonalised state. */
        movups  xmm4, xmmword ptr [rdx]
        movups  xmm5, xmmword ptr [rdx+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136
        shufps  xmm8, xmm5, 221
        movaps  xmm5, xmm8
        movups  xmm6, xmmword ptr [rdx+0x20]
        movups  xmm7, xmmword ptr [rdx+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136
        pshufd  xmm6, xmm6, 0x93
        shufps  xmm8, xmm7, 221
        pshufd  xmm7, xmm8, 0x93

        movaps  xmm14, xmmword ptr [ROT8+rip]
        movaps  xmm15, xmmword ptr [ROT16+rip]
        mov     al, 7                   /* round counter */
9:
        /* Column step, first half: add message + row1, xor, rotate by 16
           (pshufb), add, xor, rotate right by 12 (shift pair). */
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        /* Column step, second half: rotate right by 8 (pshufb) and by 7. */
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        /* Diagonalise: rotate the lanes of rows 0, 3 and 2 (row 1 stays put)
           so the diagonals line up as columns. */
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x39
        /* Diagonal step, first half. */
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        /* Diagonal step, second half. */
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        /* Undo the diagonalisation (inverse lane rotations). */
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x93
        dec     al
        jz      9f                      /* no permutation after the last round */
        /* Permute the message words for the next round; the new xmm4-xmm7
           are assembled from the old ones with xmm8/xmm9 as scratch. */
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 214
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 250
        pblendw xmm9, xmm8, 0xCC
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        pblendw xmm8, xmm6, 0xC0
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     9b
9:
        /* Fold the lower half of the state into the upper half and store it
           back over the input at [rcx]. */
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3
        movups  xmmword ptr [rcx], xmm0
        movups  xmmword ptr [rcx+0x10], xmm1
        /* Reload the callee-saved XMM registers and drop the frame. */
        movdqa  xmm6, xmmword ptr [rsp]
        movdqa  xmm7, xmmword ptr [rsp+0x10]
        movdqa  xmm8, xmmword ptr [rsp+0x20]
        movdqa  xmm9, xmmword ptr [rsp+0x30]
        movdqa  xmm11, xmmword ptr [rsp+0x40]
        movdqa  xmm14, xmmword ptr [rsp+0x50]
        movdqa  xmm15, xmmword ptr [rsp+0x60]
        add     rsp, 120
        ret
1919
1920
/*
 * blake3_compress_xof_sse41
 *
 * Runs the same 7-round compression as the in-place variant, but leaves the
 * input chaining value untouched and writes all 64 bytes of output to a
 * separate buffer.  Arguments are taken the Windows x64 way.
 *
 * In:
 *   rcx          -> 32 bytes (8 dwords): state rows 0-1; read only
 *   rdx          -> 64 bytes (16 dwords) of message; read only
 *   r8b          byte that becomes state word 14
 *                (NOTE(review): presumably block_len - confirm against the C prototype)
 *   r9           64-bit value that becomes state words 12-13
 *                (NOTE(review): presumably the output/chunk counter - confirm)
 *   [rsp+40]     fifth argument at entry, one byte that becomes state word 15
 *                (NOTE(review): presumably the flags - confirm)
 *   [rsp+48]     sixth argument at entry: pointer to the 64-byte output buffer
 * Out:
 *   [out+0 ..31] = (row0 ^ row2, row1 ^ row3) of the final state
 *   [out+32..63] = (row2 ^ input[0..3], row3 ^ input[4..7]), where input is
 *                  the 32 bytes at [rcx], re-read after the rounds
 * Clobbers:
 *   rax, r8, r10, xmm0-xmm5, flags
 * Preserves:
 *   xmm6-xmm9, xmm11, xmm14, xmm15 are spilled to a local frame and reloaded
 *   before returning.
 *
 * Register roles inside the round loop:
 *   xmm0-xmm3    state rows 0-3
 *   xmm4-xmm7    message words, pre-arranged for the four G applications
 *   xmm8, xmm9   scratch for the message permutation
 *   xmm11        scratch for the shift-based rotations
 *   xmm14, xmm15 pshufb masks ROT8 / ROT16
 *   r10          output pointer
 *   al           rounds remaining
 *
 * NOTE(review): rsp is adjusted without any .seh_* unwind directives.
 */
.p2align 6
_blake3_compress_xof_sse41:
blake3_compress_xof_sse41:
        /* Frame: 7 x 16 bytes of XMM save area + 8 bytes of padding.  rsp is
           8 mod 16 at entry, so subtracting 120 makes the movdqa slots
           16-byte aligned. */
        sub     rsp, 120
        movdqa  xmmword ptr [rsp], xmm6
        movdqa  xmmword ptr [rsp+0x10], xmm7
        movdqa  xmmword ptr [rsp+0x20], xmm8
        movdqa  xmmword ptr [rsp+0x30], xmm9
        movdqa  xmmword ptr [rsp+0x40], xmm11
        movdqa  xmmword ptr [rsp+0x50], xmm14
        movdqa  xmmword ptr [rsp+0x60], xmm15

        /* Rows 0-1 come from [rcx], row 2 is the IV constant. */
        movups  xmm0, xmmword ptr [rcx]
        movups  xmm1, xmmword ptr [rcx+0x10]
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]

        /* Stack arguments, relative to the adjusted rsp:
           0xA0 = 120 + 8 (return address) + 32 (shadow space) -> fifth arg,
           0xA8 = the next 8-byte slot                         -> sixth arg. */
        movzx   eax, byte ptr [rsp+0xA0]
        movzx   r8d, r8b
        mov     r10, qword ptr [rsp+0xA8]
        /* Row 3 = { r9 low dword, r9 high dword, r8b, fifth-argument byte }. */
        shl     rax, 32
        add     r8, rax
        movq    xmm3, r9
        movq    xmm4, r8
        punpcklqdq xmm3, xmm4

        /* Load the message and split it into even/odd words:
           shufps 136 (0x88) keeps dwords 0,2 of each source, 221 (0xDD)
           keeps dwords 1,3.  The second half is additionally rotated by one
           lane (pshufd 0x93) to match the diagonalised state. */
        movups  xmm4, xmmword ptr [rdx]
        movups  xmm5, xmmword ptr [rdx+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136
        shufps  xmm8, xmm5, 221
        movaps  xmm5, xmm8
        movups  xmm6, xmmword ptr [rdx+0x20]
        movups  xmm7, xmmword ptr [rdx+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136
        pshufd  xmm6, xmm6, 0x93
        shufps  xmm8, xmm7, 221
        pshufd  xmm7, xmm8, 0x93

        movaps  xmm14, xmmword ptr [ROT8+rip]
        movaps  xmm15, xmmword ptr [ROT16+rip]
        mov     al, 7                   /* round counter */
9:
        /* Column step, first half: add message + row1, xor, rotate by 16
           (pshufb), add, xor, rotate right by 12 (shift pair). */
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        /* Column step, second half: rotate right by 8 (pshufb) and by 7. */
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        /* Diagonalise: rotate the lanes of rows 0, 3 and 2 (row 1 stays put)
           so the diagonals line up as columns. */
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x39
        /* Diagonal step, first half. */
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        /* Diagonal step, second half. */
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        /* Undo the diagonalisation (inverse lane rotations). */
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x93
        dec     al
        jz      9f                      /* no permutation after the last round */
        /* Permute the message words for the next round; the new xmm4-xmm7
           are assembled from the old ones with xmm8/xmm9 as scratch. */
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 214
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 250
        pblendw xmm9, xmm8, 0xCC
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        pblendw xmm8, xmm6, 0xC0
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     9b
9:
        /* The message registers are dead now; reuse xmm4/xmm5 for the
           original 32 input bytes, which are needed for the second half. */
        movdqu  xmm4, xmmword ptr [rcx]
        movdqu  xmm5, xmmword ptr [rcx+0x10]
        pxor    xmm0, xmm2              /* out[0..3]   = row0 ^ row2 */
        pxor    xmm1, xmm3              /* out[4..7]   = row1 ^ row3 */
        pxor    xmm2, xmm4              /* out[8..11]  = row2 ^ input[0..3] */
        pxor    xmm3, xmm5              /* out[12..15] = row3 ^ input[4..7] */
        movups  xmmword ptr [r10], xmm0
        movups  xmmword ptr [r10+0x10], xmm1
        movups  xmmword ptr [r10+0x20], xmm2
        movups  xmmword ptr [r10+0x30], xmm3
        /* Reload the callee-saved XMM registers and drop the frame. */
        movdqa  xmm6, xmmword ptr [rsp]
        movdqa  xmm7, xmmword ptr [rsp+0x10]
        movdqa  xmm8, xmmword ptr [rsp+0x20]
        movdqa  xmm9, xmmword ptr [rsp+0x30]
        movdqa  xmm11, xmmword ptr [rsp+0x40]
        movdqa  xmm14, xmmword ptr [rsp+0x50]
        movdqa  xmm15, xmmword ptr [rsp+0x60]
        add     rsp, 120
        ret
2045
2046
/*
 * Read-only constant tables shared by the routines in this file.
 *
 * The block starts on a 64-byte boundary and every table is exactly 16 bytes
 * long, so each label is 16-byte aligned.  The code depends on that: the
 * tables are read with movaps/movdqa and as memory operands of legacy SSE
 * instructions (e.g. pand, pxor), which require aligned addresses.  Keep
 * every entry at 16 bytes when editing.
 *
 * NOTE(review): PE/COFF toolchains conventionally name the read-only data
 * section .rdata; confirm that .rodata is the intended section name for this
 * windows-gnu build.
 */
.section .rodata
.p2align 6
/* First four IV words; loaded as state row 2 before each compression. */
BLAKE3_IV:
        .long  0x6A09E667, 0xBB67AE85
        .long  0x3C6EF372, 0xA54FF53A
/* pshufb mask: swaps the 16-bit halves of every dword (rotate by 16). */
ROT16:
        .byte  2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
/* pshufb mask: rotates every dword right by 8 bits. */
ROT8:
        .byte  1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
/* Per-lane counter offsets 0..3 for the 4-way path (masked by the
   increment_counter flag in blake3_hash_many_sse41). */
ADD0:
        .long  0, 1, 2, 3
/* Per-iteration counter step for the 4-way path: 4 in every lane. */
ADD1:
        .long  4, 4, 4, 4
/* IV words 0..3, each broadcast to all four lanes.
   NOTE(review): presumably used by the 4-way path of blake3_hash_many_sse41;
   confirm against the part of the file that references them. */
BLAKE3_IV_0:
        .long  0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
BLAKE3_IV_1:
        .long  0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
BLAKE3_IV_2:
        .long  0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
BLAKE3_IV_3:
        .long  0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
/* The value 64 in every lane; also read one dword at a time via pinsrd. */
BLAKE3_BLOCK_LEN:
        .long  64, 64, 64, 64
/* Sign-bit mask: XORing both operands with it lets the signed pcmpgtd
   perform an unsigned comparison (used for counter carry detection). */
CMP_MSB_MASK:
        .long  0x80000000, 0x80000000, 0x80000000, 0x80000000
2072

/* source code of llvm/lib/Support/BLAKE3/blake3_sse41_x86-64_windows_gnu.S */