1 | #include "llvm_blake3_prefix.h" |
2 | |
3 | .intel_syntax noprefix |
4 | .global blake3_hash_many_sse2 |
5 | .global _blake3_hash_many_sse2 |
6 | .global blake3_compress_in_place_sse2 |
7 | .global _blake3_compress_in_place_sse2 |
8 | .global blake3_compress_xof_sse2 |
9 | .global _blake3_compress_xof_sse2 |
10 | .section .text |
11 | .p2align 6 |
12 | _blake3_hash_many_sse2: |
13 | blake3_hash_many_sse2: |
14 | push r15 |
15 | push r14 |
16 | push r13 |
17 | push r12 |
18 | push rsi |
19 | push rdi |
20 | push rbx |
21 | push rbp |
22 | mov rbp, rsp |
23 | sub rsp, 528 |
24 | and rsp, 0xFFFFFFFFFFFFFFC0 |
25 | movdqa xmmword ptr [rsp+0x170], xmm6 |
26 | movdqa xmmword ptr [rsp+0x180], xmm7 |
27 | movdqa xmmword ptr [rsp+0x190], xmm8 |
28 | movdqa xmmword ptr [rsp+0x1A0], xmm9 |
29 | movdqa xmmword ptr [rsp+0x1B0], xmm10 |
30 | movdqa xmmword ptr [rsp+0x1C0], xmm11 |
31 | movdqa xmmword ptr [rsp+0x1D0], xmm12 |
32 | movdqa xmmword ptr [rsp+0x1E0], xmm13 |
33 | movdqa xmmword ptr [rsp+0x1F0], xmm14 |
34 | movdqa xmmword ptr [rsp+0x200], xmm15 |
35 | mov rdi, rcx |
36 | mov rsi, rdx |
37 | mov rdx, r8 |
38 | mov rcx, r9 |
39 | mov r8, qword ptr [rbp+0x68] |
40 | movzx r9, byte ptr [rbp+0x70] |
41 | neg r9d |
42 | movd xmm0, r9d |
43 | pshufd xmm0, xmm0, 0x00 |
44 | movdqa xmmword ptr [rsp+0x130], xmm0 |
45 | movdqa xmm1, xmm0 |
46 | pand xmm1, xmmword ptr [ADD0+rip] |
47 | pand xmm0, xmmword ptr [ADD1+rip] |
48 | movdqa xmmword ptr [rsp+0x150], xmm0 |
49 | movd xmm0, r8d |
50 | pshufd xmm0, xmm0, 0x00 |
51 | paddd xmm0, xmm1 |
52 | movdqa xmmword ptr [rsp+0x110], xmm0 |
53 | pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] |
54 | pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] |
55 | pcmpgtd xmm1, xmm0 |
56 | shr r8, 32 |
57 | movd xmm2, r8d |
58 | pshufd xmm2, xmm2, 0x00 |
59 | psubd xmm2, xmm1 |
60 | movdqa xmmword ptr [rsp+0x120], xmm2 |
61 | mov rbx, qword ptr [rbp+0x90] |
62 | mov r15, rdx |
63 | shl r15, 6 |
64 | movzx r13d, byte ptr [rbp+0x78] |
65 | movzx r12d, byte ptr [rbp+0x88] |
66 | cmp rsi, 4 |
67 | jc 3f |
68 | 2: |
69 | movdqu xmm3, xmmword ptr [rcx] |
70 | pshufd xmm0, xmm3, 0x00 |
71 | pshufd xmm1, xmm3, 0x55 |
72 | pshufd xmm2, xmm3, 0xAA |
73 | pshufd xmm3, xmm3, 0xFF |
74 | movdqu xmm7, xmmword ptr [rcx+0x10] |
75 | pshufd xmm4, xmm7, 0x00 |
76 | pshufd xmm5, xmm7, 0x55 |
77 | pshufd xmm6, xmm7, 0xAA |
78 | pshufd xmm7, xmm7, 0xFF |
79 | mov r8, qword ptr [rdi] |
80 | mov r9, qword ptr [rdi+0x8] |
81 | mov r10, qword ptr [rdi+0x10] |
82 | mov r11, qword ptr [rdi+0x18] |
83 | movzx eax, byte ptr [rbp+0x80] |
84 | or eax, r13d |
85 | xor edx, edx |
86 | 9: |
87 | mov r14d, eax |
88 | or eax, r12d |
89 | add rdx, 64 |
90 | cmp rdx, r15 |
91 | cmovne eax, r14d |
92 | movdqu xmm8, xmmword ptr [r8+rdx-0x40] |
93 | movdqu xmm9, xmmword ptr [r9+rdx-0x40] |
94 | movdqu xmm10, xmmword ptr [r10+rdx-0x40] |
95 | movdqu xmm11, xmmword ptr [r11+rdx-0x40] |
96 | movdqa xmm12, xmm8 |
97 | punpckldq xmm8, xmm9 |
98 | punpckhdq xmm12, xmm9 |
99 | movdqa xmm14, xmm10 |
100 | punpckldq xmm10, xmm11 |
101 | punpckhdq xmm14, xmm11 |
102 | movdqa xmm9, xmm8 |
103 | punpcklqdq xmm8, xmm10 |
104 | punpckhqdq xmm9, xmm10 |
105 | movdqa xmm13, xmm12 |
106 | punpcklqdq xmm12, xmm14 |
107 | punpckhqdq xmm13, xmm14 |
108 | movdqa xmmword ptr [rsp], xmm8 |
109 | movdqa xmmword ptr [rsp+0x10], xmm9 |
110 | movdqa xmmword ptr [rsp+0x20], xmm12 |
111 | movdqa xmmword ptr [rsp+0x30], xmm13 |
112 | movdqu xmm8, xmmword ptr [r8+rdx-0x30] |
113 | movdqu xmm9, xmmword ptr [r9+rdx-0x30] |
114 | movdqu xmm10, xmmword ptr [r10+rdx-0x30] |
115 | movdqu xmm11, xmmword ptr [r11+rdx-0x30] |
116 | movdqa xmm12, xmm8 |
117 | punpckldq xmm8, xmm9 |
118 | punpckhdq xmm12, xmm9 |
119 | movdqa xmm14, xmm10 |
120 | punpckldq xmm10, xmm11 |
121 | punpckhdq xmm14, xmm11 |
122 | movdqa xmm9, xmm8 |
123 | punpcklqdq xmm8, xmm10 |
124 | punpckhqdq xmm9, xmm10 |
125 | movdqa xmm13, xmm12 |
126 | punpcklqdq xmm12, xmm14 |
127 | punpckhqdq xmm13, xmm14 |
128 | movdqa xmmword ptr [rsp+0x40], xmm8 |
129 | movdqa xmmword ptr [rsp+0x50], xmm9 |
130 | movdqa xmmword ptr [rsp+0x60], xmm12 |
131 | movdqa xmmword ptr [rsp+0x70], xmm13 |
132 | movdqu xmm8, xmmword ptr [r8+rdx-0x20] |
133 | movdqu xmm9, xmmword ptr [r9+rdx-0x20] |
134 | movdqu xmm10, xmmword ptr [r10+rdx-0x20] |
135 | movdqu xmm11, xmmword ptr [r11+rdx-0x20] |
136 | movdqa xmm12, xmm8 |
137 | punpckldq xmm8, xmm9 |
138 | punpckhdq xmm12, xmm9 |
139 | movdqa xmm14, xmm10 |
140 | punpckldq xmm10, xmm11 |
141 | punpckhdq xmm14, xmm11 |
142 | movdqa xmm9, xmm8 |
143 | punpcklqdq xmm8, xmm10 |
144 | punpckhqdq xmm9, xmm10 |
145 | movdqa xmm13, xmm12 |
146 | punpcklqdq xmm12, xmm14 |
147 | punpckhqdq xmm13, xmm14 |
148 | movdqa xmmword ptr [rsp+0x80], xmm8 |
149 | movdqa xmmword ptr [rsp+0x90], xmm9 |
150 | movdqa xmmword ptr [rsp+0xA0], xmm12 |
151 | movdqa xmmword ptr [rsp+0xB0], xmm13 |
152 | movdqu xmm8, xmmword ptr [r8+rdx-0x10] |
153 | movdqu xmm9, xmmword ptr [r9+rdx-0x10] |
154 | movdqu xmm10, xmmword ptr [r10+rdx-0x10] |
155 | movdqu xmm11, xmmword ptr [r11+rdx-0x10] |
156 | movdqa xmm12, xmm8 |
157 | punpckldq xmm8, xmm9 |
158 | punpckhdq xmm12, xmm9 |
159 | movdqa xmm14, xmm10 |
160 | punpckldq xmm10, xmm11 |
161 | punpckhdq xmm14, xmm11 |
162 | movdqa xmm9, xmm8 |
163 | punpcklqdq xmm8, xmm10 |
164 | punpckhqdq xmm9, xmm10 |
165 | movdqa xmm13, xmm12 |
166 | punpcklqdq xmm12, xmm14 |
167 | punpckhqdq xmm13, xmm14 |
168 | movdqa xmmword ptr [rsp+0xC0], xmm8 |
169 | movdqa xmmword ptr [rsp+0xD0], xmm9 |
170 | movdqa xmmword ptr [rsp+0xE0], xmm12 |
171 | movdqa xmmword ptr [rsp+0xF0], xmm13 |
172 | movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] |
173 | movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] |
174 | movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] |
175 | movdqa xmm12, xmmword ptr [rsp+0x110] |
176 | movdqa xmm13, xmmword ptr [rsp+0x120] |
177 | movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] |
178 | movd xmm15, eax |
179 | pshufd xmm15, xmm15, 0x00 |
180 | prefetcht0 [r8+rdx+0x80] |
181 | prefetcht0 [r9+rdx+0x80] |
182 | prefetcht0 [r10+rdx+0x80] |
183 | prefetcht0 [r11+rdx+0x80] |
184 | paddd xmm0, xmmword ptr [rsp] |
185 | paddd xmm1, xmmword ptr [rsp+0x20] |
186 | paddd xmm2, xmmword ptr [rsp+0x40] |
187 | paddd xmm3, xmmword ptr [rsp+0x60] |
188 | paddd xmm0, xmm4 |
189 | paddd xmm1, xmm5 |
190 | paddd xmm2, xmm6 |
191 | paddd xmm3, xmm7 |
192 | pxor xmm12, xmm0 |
193 | pxor xmm13, xmm1 |
194 | pxor xmm14, xmm2 |
195 | pxor xmm15, xmm3 |
196 | pshuflw xmm12, xmm12, 0xB1 |
197 | pshufhw xmm12, xmm12, 0xB1 |
198 | pshuflw xmm13, xmm13, 0xB1 |
199 | pshufhw xmm13, xmm13, 0xB1 |
200 | pshuflw xmm14, xmm14, 0xB1 |
201 | pshufhw xmm14, xmm14, 0xB1 |
202 | pshuflw xmm15, xmm15, 0xB1 |
203 | pshufhw xmm15, xmm15, 0xB1 |
204 | movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] |
205 | paddd xmm8, xmm12 |
206 | paddd xmm9, xmm13 |
207 | paddd xmm10, xmm14 |
208 | paddd xmm11, xmm15 |
209 | pxor xmm4, xmm8 |
210 | pxor xmm5, xmm9 |
211 | pxor xmm6, xmm10 |
212 | pxor xmm7, xmm11 |
213 | movdqa xmmword ptr [rsp+0x100], xmm8 |
214 | movdqa xmm8, xmm4 |
215 | psrld xmm8, 12 |
216 | pslld xmm4, 20 |
217 | por xmm4, xmm8 |
218 | movdqa xmm8, xmm5 |
219 | psrld xmm8, 12 |
220 | pslld xmm5, 20 |
221 | por xmm5, xmm8 |
222 | movdqa xmm8, xmm6 |
223 | psrld xmm8, 12 |
224 | pslld xmm6, 20 |
225 | por xmm6, xmm8 |
226 | movdqa xmm8, xmm7 |
227 | psrld xmm8, 12 |
228 | pslld xmm7, 20 |
229 | por xmm7, xmm8 |
230 | paddd xmm0, xmmword ptr [rsp+0x10] |
231 | paddd xmm1, xmmword ptr [rsp+0x30] |
232 | paddd xmm2, xmmword ptr [rsp+0x50] |
233 | paddd xmm3, xmmword ptr [rsp+0x70] |
234 | paddd xmm0, xmm4 |
235 | paddd xmm1, xmm5 |
236 | paddd xmm2, xmm6 |
237 | paddd xmm3, xmm7 |
238 | pxor xmm12, xmm0 |
239 | pxor xmm13, xmm1 |
240 | pxor xmm14, xmm2 |
241 | pxor xmm15, xmm3 |
242 | movdqa xmm8, xmm12 |
243 | psrld xmm12, 8 |
244 | pslld xmm8, 24 |
245 | pxor xmm12, xmm8 |
246 | movdqa xmm8, xmm13 |
247 | psrld xmm13, 8 |
248 | pslld xmm8, 24 |
249 | pxor xmm13, xmm8 |
250 | movdqa xmm8, xmm14 |
251 | psrld xmm14, 8 |
252 | pslld xmm8, 24 |
253 | pxor xmm14, xmm8 |
254 | movdqa xmm8, xmm15 |
255 | psrld xmm15, 8 |
256 | pslld xmm8, 24 |
257 | pxor xmm15, xmm8 |
258 | movdqa xmm8, xmmword ptr [rsp+0x100] |
259 | paddd xmm8, xmm12 |
260 | paddd xmm9, xmm13 |
261 | paddd xmm10, xmm14 |
262 | paddd xmm11, xmm15 |
263 | pxor xmm4, xmm8 |
264 | pxor xmm5, xmm9 |
265 | pxor xmm6, xmm10 |
266 | pxor xmm7, xmm11 |
267 | movdqa xmmword ptr [rsp+0x100], xmm8 |
268 | movdqa xmm8, xmm4 |
269 | psrld xmm8, 7 |
270 | pslld xmm4, 25 |
271 | por xmm4, xmm8 |
272 | movdqa xmm8, xmm5 |
273 | psrld xmm8, 7 |
274 | pslld xmm5, 25 |
275 | por xmm5, xmm8 |
276 | movdqa xmm8, xmm6 |
277 | psrld xmm8, 7 |
278 | pslld xmm6, 25 |
279 | por xmm6, xmm8 |
280 | movdqa xmm8, xmm7 |
281 | psrld xmm8, 7 |
282 | pslld xmm7, 25 |
283 | por xmm7, xmm8 |
284 | paddd xmm0, xmmword ptr [rsp+0x80] |
285 | paddd xmm1, xmmword ptr [rsp+0xA0] |
286 | paddd xmm2, xmmword ptr [rsp+0xC0] |
287 | paddd xmm3, xmmword ptr [rsp+0xE0] |
288 | paddd xmm0, xmm5 |
289 | paddd xmm1, xmm6 |
290 | paddd xmm2, xmm7 |
291 | paddd xmm3, xmm4 |
292 | pxor xmm15, xmm0 |
293 | pxor xmm12, xmm1 |
294 | pxor xmm13, xmm2 |
295 | pxor xmm14, xmm3 |
296 | pshuflw xmm15, xmm15, 0xB1 |
297 | pshufhw xmm15, xmm15, 0xB1 |
298 | pshuflw xmm12, xmm12, 0xB1 |
299 | pshufhw xmm12, xmm12, 0xB1 |
300 | pshuflw xmm13, xmm13, 0xB1 |
301 | pshufhw xmm13, xmm13, 0xB1 |
302 | pshuflw xmm14, xmm14, 0xB1 |
303 | pshufhw xmm14, xmm14, 0xB1 |
304 | paddd xmm10, xmm15 |
305 | paddd xmm11, xmm12 |
306 | movdqa xmm8, xmmword ptr [rsp+0x100] |
307 | paddd xmm8, xmm13 |
308 | paddd xmm9, xmm14 |
309 | pxor xmm5, xmm10 |
310 | pxor xmm6, xmm11 |
311 | pxor xmm7, xmm8 |
312 | pxor xmm4, xmm9 |
313 | movdqa xmmword ptr [rsp+0x100], xmm8 |
314 | movdqa xmm8, xmm5 |
315 | psrld xmm8, 12 |
316 | pslld xmm5, 20 |
317 | por xmm5, xmm8 |
318 | movdqa xmm8, xmm6 |
319 | psrld xmm8, 12 |
320 | pslld xmm6, 20 |
321 | por xmm6, xmm8 |
322 | movdqa xmm8, xmm7 |
323 | psrld xmm8, 12 |
324 | pslld xmm7, 20 |
325 | por xmm7, xmm8 |
326 | movdqa xmm8, xmm4 |
327 | psrld xmm8, 12 |
328 | pslld xmm4, 20 |
329 | por xmm4, xmm8 |
330 | paddd xmm0, xmmword ptr [rsp+0x90] |
331 | paddd xmm1, xmmword ptr [rsp+0xB0] |
332 | paddd xmm2, xmmword ptr [rsp+0xD0] |
333 | paddd xmm3, xmmword ptr [rsp+0xF0] |
334 | paddd xmm0, xmm5 |
335 | paddd xmm1, xmm6 |
336 | paddd xmm2, xmm7 |
337 | paddd xmm3, xmm4 |
338 | pxor xmm15, xmm0 |
339 | pxor xmm12, xmm1 |
340 | pxor xmm13, xmm2 |
341 | pxor xmm14, xmm3 |
342 | movdqa xmm8, xmm15 |
343 | psrld xmm15, 8 |
344 | pslld xmm8, 24 |
345 | pxor xmm15, xmm8 |
346 | movdqa xmm8, xmm12 |
347 | psrld xmm12, 8 |
348 | pslld xmm8, 24 |
349 | pxor xmm12, xmm8 |
350 | movdqa xmm8, xmm13 |
351 | psrld xmm13, 8 |
352 | pslld xmm8, 24 |
353 | pxor xmm13, xmm8 |
354 | movdqa xmm8, xmm14 |
355 | psrld xmm14, 8 |
356 | pslld xmm8, 24 |
357 | pxor xmm14, xmm8 |
358 | paddd xmm10, xmm15 |
359 | paddd xmm11, xmm12 |
360 | movdqa xmm8, xmmword ptr [rsp+0x100] |
361 | paddd xmm8, xmm13 |
362 | paddd xmm9, xmm14 |
363 | pxor xmm5, xmm10 |
364 | pxor xmm6, xmm11 |
365 | pxor xmm7, xmm8 |
366 | pxor xmm4, xmm9 |
367 | movdqa xmmword ptr [rsp+0x100], xmm8 |
368 | movdqa xmm8, xmm5 |
369 | psrld xmm8, 7 |
370 | pslld xmm5, 25 |
371 | por xmm5, xmm8 |
372 | movdqa xmm8, xmm6 |
373 | psrld xmm8, 7 |
374 | pslld xmm6, 25 |
375 | por xmm6, xmm8 |
376 | movdqa xmm8, xmm7 |
377 | psrld xmm8, 7 |
378 | pslld xmm7, 25 |
379 | por xmm7, xmm8 |
380 | movdqa xmm8, xmm4 |
381 | psrld xmm8, 7 |
382 | pslld xmm4, 25 |
383 | por xmm4, xmm8 |
384 | paddd xmm0, xmmword ptr [rsp+0x20] |
385 | paddd xmm1, xmmword ptr [rsp+0x30] |
386 | paddd xmm2, xmmword ptr [rsp+0x70] |
387 | paddd xmm3, xmmword ptr [rsp+0x40] |
388 | paddd xmm0, xmm4 |
389 | paddd xmm1, xmm5 |
390 | paddd xmm2, xmm6 |
391 | paddd xmm3, xmm7 |
392 | pxor xmm12, xmm0 |
393 | pxor xmm13, xmm1 |
394 | pxor xmm14, xmm2 |
395 | pxor xmm15, xmm3 |
396 | pshuflw xmm12, xmm12, 0xB1 |
397 | pshufhw xmm12, xmm12, 0xB1 |
398 | pshuflw xmm13, xmm13, 0xB1 |
399 | pshufhw xmm13, xmm13, 0xB1 |
400 | pshuflw xmm14, xmm14, 0xB1 |
401 | pshufhw xmm14, xmm14, 0xB1 |
402 | pshuflw xmm15, xmm15, 0xB1 |
403 | pshufhw xmm15, xmm15, 0xB1 |
404 | movdqa xmm8, xmmword ptr [rsp+0x100] |
405 | paddd xmm8, xmm12 |
406 | paddd xmm9, xmm13 |
407 | paddd xmm10, xmm14 |
408 | paddd xmm11, xmm15 |
409 | pxor xmm4, xmm8 |
410 | pxor xmm5, xmm9 |
411 | pxor xmm6, xmm10 |
412 | pxor xmm7, xmm11 |
413 | movdqa xmmword ptr [rsp+0x100], xmm8 |
414 | movdqa xmm8, xmm4 |
415 | psrld xmm8, 12 |
416 | pslld xmm4, 20 |
417 | por xmm4, xmm8 |
418 | movdqa xmm8, xmm5 |
419 | psrld xmm8, 12 |
420 | pslld xmm5, 20 |
421 | por xmm5, xmm8 |
422 | movdqa xmm8, xmm6 |
423 | psrld xmm8, 12 |
424 | pslld xmm6, 20 |
425 | por xmm6, xmm8 |
426 | movdqa xmm8, xmm7 |
427 | psrld xmm8, 12 |
428 | pslld xmm7, 20 |
429 | por xmm7, xmm8 |
430 | paddd xmm0, xmmword ptr [rsp+0x60] |
431 | paddd xmm1, xmmword ptr [rsp+0xA0] |
432 | paddd xmm2, xmmword ptr [rsp] |
433 | paddd xmm3, xmmword ptr [rsp+0xD0] |
434 | paddd xmm0, xmm4 |
435 | paddd xmm1, xmm5 |
436 | paddd xmm2, xmm6 |
437 | paddd xmm3, xmm7 |
438 | pxor xmm12, xmm0 |
439 | pxor xmm13, xmm1 |
440 | pxor xmm14, xmm2 |
441 | pxor xmm15, xmm3 |
442 | movdqa xmm8, xmm12 |
443 | psrld xmm12, 8 |
444 | pslld xmm8, 24 |
445 | pxor xmm12, xmm8 |
446 | movdqa xmm8, xmm13 |
447 | psrld xmm13, 8 |
448 | pslld xmm8, 24 |
449 | pxor xmm13, xmm8 |
450 | movdqa xmm8, xmm14 |
451 | psrld xmm14, 8 |
452 | pslld xmm8, 24 |
453 | pxor xmm14, xmm8 |
454 | movdqa xmm8, xmm15 |
455 | psrld xmm15, 8 |
456 | pslld xmm8, 24 |
457 | pxor xmm15, xmm8 |
458 | movdqa xmm8, xmmword ptr [rsp+0x100] |
459 | paddd xmm8, xmm12 |
460 | paddd xmm9, xmm13 |
461 | paddd xmm10, xmm14 |
462 | paddd xmm11, xmm15 |
463 | pxor xmm4, xmm8 |
464 | pxor xmm5, xmm9 |
465 | pxor xmm6, xmm10 |
466 | pxor xmm7, xmm11 |
467 | movdqa xmmword ptr [rsp+0x100], xmm8 |
468 | movdqa xmm8, xmm4 |
469 | psrld xmm8, 7 |
470 | pslld xmm4, 25 |
471 | por xmm4, xmm8 |
472 | movdqa xmm8, xmm5 |
473 | psrld xmm8, 7 |
474 | pslld xmm5, 25 |
475 | por xmm5, xmm8 |
476 | movdqa xmm8, xmm6 |
477 | psrld xmm8, 7 |
478 | pslld xmm6, 25 |
479 | por xmm6, xmm8 |
480 | movdqa xmm8, xmm7 |
481 | psrld xmm8, 7 |
482 | pslld xmm7, 25 |
483 | por xmm7, xmm8 |
484 | paddd xmm0, xmmword ptr [rsp+0x10] |
485 | paddd xmm1, xmmword ptr [rsp+0xC0] |
486 | paddd xmm2, xmmword ptr [rsp+0x90] |
487 | paddd xmm3, xmmword ptr [rsp+0xF0] |
488 | paddd xmm0, xmm5 |
489 | paddd xmm1, xmm6 |
490 | paddd xmm2, xmm7 |
491 | paddd xmm3, xmm4 |
492 | pxor xmm15, xmm0 |
493 | pxor xmm12, xmm1 |
494 | pxor xmm13, xmm2 |
495 | pxor xmm14, xmm3 |
496 | pshuflw xmm15, xmm15, 0xB1 |
497 | pshufhw xmm15, xmm15, 0xB1 |
498 | pshuflw xmm12, xmm12, 0xB1 |
499 | pshufhw xmm12, xmm12, 0xB1 |
500 | pshuflw xmm13, xmm13, 0xB1 |
501 | pshufhw xmm13, xmm13, 0xB1 |
502 | pshuflw xmm14, xmm14, 0xB1 |
503 | pshufhw xmm14, xmm14, 0xB1 |
504 | paddd xmm10, xmm15 |
505 | paddd xmm11, xmm12 |
506 | movdqa xmm8, xmmword ptr [rsp+0x100] |
507 | paddd xmm8, xmm13 |
508 | paddd xmm9, xmm14 |
509 | pxor xmm5, xmm10 |
510 | pxor xmm6, xmm11 |
511 | pxor xmm7, xmm8 |
512 | pxor xmm4, xmm9 |
513 | movdqa xmmword ptr [rsp+0x100], xmm8 |
514 | movdqa xmm8, xmm5 |
515 | psrld xmm8, 12 |
516 | pslld xmm5, 20 |
517 | por xmm5, xmm8 |
518 | movdqa xmm8, xmm6 |
519 | psrld xmm8, 12 |
520 | pslld xmm6, 20 |
521 | por xmm6, xmm8 |
522 | movdqa xmm8, xmm7 |
523 | psrld xmm8, 12 |
524 | pslld xmm7, 20 |
525 | por xmm7, xmm8 |
526 | movdqa xmm8, xmm4 |
527 | psrld xmm8, 12 |
528 | pslld xmm4, 20 |
529 | por xmm4, xmm8 |
530 | paddd xmm0, xmmword ptr [rsp+0xB0] |
531 | paddd xmm1, xmmword ptr [rsp+0x50] |
532 | paddd xmm2, xmmword ptr [rsp+0xE0] |
533 | paddd xmm3, xmmword ptr [rsp+0x80] |
534 | paddd xmm0, xmm5 |
535 | paddd xmm1, xmm6 |
536 | paddd xmm2, xmm7 |
537 | paddd xmm3, xmm4 |
538 | pxor xmm15, xmm0 |
539 | pxor xmm12, xmm1 |
540 | pxor xmm13, xmm2 |
541 | pxor xmm14, xmm3 |
542 | movdqa xmm8, xmm15 |
543 | psrld xmm15, 8 |
544 | pslld xmm8, 24 |
545 | pxor xmm15, xmm8 |
546 | movdqa xmm8, xmm12 |
547 | psrld xmm12, 8 |
548 | pslld xmm8, 24 |
549 | pxor xmm12, xmm8 |
550 | movdqa xmm8, xmm13 |
551 | psrld xmm13, 8 |
552 | pslld xmm8, 24 |
553 | pxor xmm13, xmm8 |
554 | movdqa xmm8, xmm14 |
555 | psrld xmm14, 8 |
556 | pslld xmm8, 24 |
557 | pxor xmm14, xmm8 |
558 | paddd xmm10, xmm15 |
559 | paddd xmm11, xmm12 |
560 | movdqa xmm8, xmmword ptr [rsp+0x100] |
561 | paddd xmm8, xmm13 |
562 | paddd xmm9, xmm14 |
563 | pxor xmm5, xmm10 |
564 | pxor xmm6, xmm11 |
565 | pxor xmm7, xmm8 |
566 | pxor xmm4, xmm9 |
567 | movdqa xmmword ptr [rsp+0x100], xmm8 |
568 | movdqa xmm8, xmm5 |
569 | psrld xmm8, 7 |
570 | pslld xmm5, 25 |
571 | por xmm5, xmm8 |
572 | movdqa xmm8, xmm6 |
573 | psrld xmm8, 7 |
574 | pslld xmm6, 25 |
575 | por xmm6, xmm8 |
576 | movdqa xmm8, xmm7 |
577 | psrld xmm8, 7 |
578 | pslld xmm7, 25 |
579 | por xmm7, xmm8 |
580 | movdqa xmm8, xmm4 |
581 | psrld xmm8, 7 |
582 | pslld xmm4, 25 |
583 | por xmm4, xmm8 |
584 | paddd xmm0, xmmword ptr [rsp+0x30] |
585 | paddd xmm1, xmmword ptr [rsp+0xA0] |
586 | paddd xmm2, xmmword ptr [rsp+0xD0] |
587 | paddd xmm3, xmmword ptr [rsp+0x70] |
588 | paddd xmm0, xmm4 |
589 | paddd xmm1, xmm5 |
590 | paddd xmm2, xmm6 |
591 | paddd xmm3, xmm7 |
592 | pxor xmm12, xmm0 |
593 | pxor xmm13, xmm1 |
594 | pxor xmm14, xmm2 |
595 | pxor xmm15, xmm3 |
596 | pshuflw xmm12, xmm12, 0xB1 |
597 | pshufhw xmm12, xmm12, 0xB1 |
598 | pshuflw xmm13, xmm13, 0xB1 |
599 | pshufhw xmm13, xmm13, 0xB1 |
600 | pshuflw xmm14, xmm14, 0xB1 |
601 | pshufhw xmm14, xmm14, 0xB1 |
602 | pshuflw xmm15, xmm15, 0xB1 |
603 | pshufhw xmm15, xmm15, 0xB1 |
604 | movdqa xmm8, xmmword ptr [rsp+0x100] |
605 | paddd xmm8, xmm12 |
606 | paddd xmm9, xmm13 |
607 | paddd xmm10, xmm14 |
608 | paddd xmm11, xmm15 |
609 | pxor xmm4, xmm8 |
610 | pxor xmm5, xmm9 |
611 | pxor xmm6, xmm10 |
612 | pxor xmm7, xmm11 |
613 | movdqa xmmword ptr [rsp+0x100], xmm8 |
614 | movdqa xmm8, xmm4 |
615 | psrld xmm8, 12 |
616 | pslld xmm4, 20 |
617 | por xmm4, xmm8 |
618 | movdqa xmm8, xmm5 |
619 | psrld xmm8, 12 |
620 | pslld xmm5, 20 |
621 | por xmm5, xmm8 |
622 | movdqa xmm8, xmm6 |
623 | psrld xmm8, 12 |
624 | pslld xmm6, 20 |
625 | por xmm6, xmm8 |
626 | movdqa xmm8, xmm7 |
627 | psrld xmm8, 12 |
628 | pslld xmm7, 20 |
629 | por xmm7, xmm8 |
630 | paddd xmm0, xmmword ptr [rsp+0x40] |
631 | paddd xmm1, xmmword ptr [rsp+0xC0] |
632 | paddd xmm2, xmmword ptr [rsp+0x20] |
633 | paddd xmm3, xmmword ptr [rsp+0xE0] |
634 | paddd xmm0, xmm4 |
635 | paddd xmm1, xmm5 |
636 | paddd xmm2, xmm6 |
637 | paddd xmm3, xmm7 |
638 | pxor xmm12, xmm0 |
639 | pxor xmm13, xmm1 |
640 | pxor xmm14, xmm2 |
641 | pxor xmm15, xmm3 |
642 | movdqa xmm8, xmm12 |
643 | psrld xmm12, 8 |
644 | pslld xmm8, 24 |
645 | pxor xmm12, xmm8 |
646 | movdqa xmm8, xmm13 |
647 | psrld xmm13, 8 |
648 | pslld xmm8, 24 |
649 | pxor xmm13, xmm8 |
650 | movdqa xmm8, xmm14 |
651 | psrld xmm14, 8 |
652 | pslld xmm8, 24 |
653 | pxor xmm14, xmm8 |
654 | movdqa xmm8, xmm15 |
655 | psrld xmm15, 8 |
656 | pslld xmm8, 24 |
657 | pxor xmm15, xmm8 |
658 | movdqa xmm8, xmmword ptr [rsp+0x100] |
659 | paddd xmm8, xmm12 |
660 | paddd xmm9, xmm13 |
661 | paddd xmm10, xmm14 |
662 | paddd xmm11, xmm15 |
663 | pxor xmm4, xmm8 |
664 | pxor xmm5, xmm9 |
665 | pxor xmm6, xmm10 |
666 | pxor xmm7, xmm11 |
667 | movdqa xmmword ptr [rsp+0x100], xmm8 |
668 | movdqa xmm8, xmm4 |
669 | psrld xmm8, 7 |
670 | pslld xmm4, 25 |
671 | por xmm4, xmm8 |
672 | movdqa xmm8, xmm5 |
673 | psrld xmm8, 7 |
674 | pslld xmm5, 25 |
675 | por xmm5, xmm8 |
676 | movdqa xmm8, xmm6 |
677 | psrld xmm8, 7 |
678 | pslld xmm6, 25 |
679 | por xmm6, xmm8 |
680 | movdqa xmm8, xmm7 |
681 | psrld xmm8, 7 |
682 | pslld xmm7, 25 |
683 | por xmm7, xmm8 |
684 | paddd xmm0, xmmword ptr [rsp+0x60] |
685 | paddd xmm1, xmmword ptr [rsp+0x90] |
686 | paddd xmm2, xmmword ptr [rsp+0xB0] |
687 | paddd xmm3, xmmword ptr [rsp+0x80] |
688 | paddd xmm0, xmm5 |
689 | paddd xmm1, xmm6 |
690 | paddd xmm2, xmm7 |
691 | paddd xmm3, xmm4 |
692 | pxor xmm15, xmm0 |
693 | pxor xmm12, xmm1 |
694 | pxor xmm13, xmm2 |
695 | pxor xmm14, xmm3 |
696 | pshuflw xmm15, xmm15, 0xB1 |
697 | pshufhw xmm15, xmm15, 0xB1 |
698 | pshuflw xmm12, xmm12, 0xB1 |
699 | pshufhw xmm12, xmm12, 0xB1 |
700 | pshuflw xmm13, xmm13, 0xB1 |
701 | pshufhw xmm13, xmm13, 0xB1 |
702 | pshuflw xmm14, xmm14, 0xB1 |
703 | pshufhw xmm14, xmm14, 0xB1 |
704 | paddd xmm10, xmm15 |
705 | paddd xmm11, xmm12 |
706 | movdqa xmm8, xmmword ptr [rsp+0x100] |
707 | paddd xmm8, xmm13 |
708 | paddd xmm9, xmm14 |
709 | pxor xmm5, xmm10 |
710 | pxor xmm6, xmm11 |
711 | pxor xmm7, xmm8 |
712 | pxor xmm4, xmm9 |
713 | movdqa xmmword ptr [rsp+0x100], xmm8 |
714 | movdqa xmm8, xmm5 |
715 | psrld xmm8, 12 |
716 | pslld xmm5, 20 |
717 | por xmm5, xmm8 |
718 | movdqa xmm8, xmm6 |
719 | psrld xmm8, 12 |
720 | pslld xmm6, 20 |
721 | por xmm6, xmm8 |
722 | movdqa xmm8, xmm7 |
723 | psrld xmm8, 12 |
724 | pslld xmm7, 20 |
725 | por xmm7, xmm8 |
726 | movdqa xmm8, xmm4 |
727 | psrld xmm8, 12 |
728 | pslld xmm4, 20 |
729 | por xmm4, xmm8 |
730 | paddd xmm0, xmmword ptr [rsp+0x50] |
731 | paddd xmm1, xmmword ptr [rsp] |
732 | paddd xmm2, xmmword ptr [rsp+0xF0] |
733 | paddd xmm3, xmmword ptr [rsp+0x10] |
734 | paddd xmm0, xmm5 |
735 | paddd xmm1, xmm6 |
736 | paddd xmm2, xmm7 |
737 | paddd xmm3, xmm4 |
738 | pxor xmm15, xmm0 |
739 | pxor xmm12, xmm1 |
740 | pxor xmm13, xmm2 |
741 | pxor xmm14, xmm3 |
742 | movdqa xmm8, xmm15 |
743 | psrld xmm15, 8 |
744 | pslld xmm8, 24 |
745 | pxor xmm15, xmm8 |
746 | movdqa xmm8, xmm12 |
747 | psrld xmm12, 8 |
748 | pslld xmm8, 24 |
749 | pxor xmm12, xmm8 |
750 | movdqa xmm8, xmm13 |
751 | psrld xmm13, 8 |
752 | pslld xmm8, 24 |
753 | pxor xmm13, xmm8 |
754 | movdqa xmm8, xmm14 |
755 | psrld xmm14, 8 |
756 | pslld xmm8, 24 |
757 | pxor xmm14, xmm8 |
758 | paddd xmm10, xmm15 |
759 | paddd xmm11, xmm12 |
760 | movdqa xmm8, xmmword ptr [rsp+0x100] |
761 | paddd xmm8, xmm13 |
762 | paddd xmm9, xmm14 |
763 | pxor xmm5, xmm10 |
764 | pxor xmm6, xmm11 |
765 | pxor xmm7, xmm8 |
766 | pxor xmm4, xmm9 |
767 | movdqa xmmword ptr [rsp+0x100], xmm8 |
768 | movdqa xmm8, xmm5 |
769 | psrld xmm8, 7 |
770 | pslld xmm5, 25 |
771 | por xmm5, xmm8 |
772 | movdqa xmm8, xmm6 |
773 | psrld xmm8, 7 |
774 | pslld xmm6, 25 |
775 | por xmm6, xmm8 |
776 | movdqa xmm8, xmm7 |
777 | psrld xmm8, 7 |
778 | pslld xmm7, 25 |
779 | por xmm7, xmm8 |
780 | movdqa xmm8, xmm4 |
781 | psrld xmm8, 7 |
782 | pslld xmm4, 25 |
783 | por xmm4, xmm8 |
784 | paddd xmm0, xmmword ptr [rsp+0xA0] |
785 | paddd xmm1, xmmword ptr [rsp+0xC0] |
786 | paddd xmm2, xmmword ptr [rsp+0xE0] |
787 | paddd xmm3, xmmword ptr [rsp+0xD0] |
788 | paddd xmm0, xmm4 |
789 | paddd xmm1, xmm5 |
790 | paddd xmm2, xmm6 |
791 | paddd xmm3, xmm7 |
792 | pxor xmm12, xmm0 |
793 | pxor xmm13, xmm1 |
794 | pxor xmm14, xmm2 |
795 | pxor xmm15, xmm3 |
796 | pshuflw xmm12, xmm12, 0xB1 |
797 | pshufhw xmm12, xmm12, 0xB1 |
798 | pshuflw xmm13, xmm13, 0xB1 |
799 | pshufhw xmm13, xmm13, 0xB1 |
800 | pshuflw xmm14, xmm14, 0xB1 |
801 | pshufhw xmm14, xmm14, 0xB1 |
802 | pshuflw xmm15, xmm15, 0xB1 |
803 | pshufhw xmm15, xmm15, 0xB1 |
804 | movdqa xmm8, xmmword ptr [rsp+0x100] |
805 | paddd xmm8, xmm12 |
806 | paddd xmm9, xmm13 |
807 | paddd xmm10, xmm14 |
808 | paddd xmm11, xmm15 |
809 | pxor xmm4, xmm8 |
810 | pxor xmm5, xmm9 |
811 | pxor xmm6, xmm10 |
812 | pxor xmm7, xmm11 |
813 | movdqa xmmword ptr [rsp+0x100], xmm8 |
814 | movdqa xmm8, xmm4 |
815 | psrld xmm8, 12 |
816 | pslld xmm4, 20 |
817 | por xmm4, xmm8 |
818 | movdqa xmm8, xmm5 |
819 | psrld xmm8, 12 |
820 | pslld xmm5, 20 |
821 | por xmm5, xmm8 |
822 | movdqa xmm8, xmm6 |
823 | psrld xmm8, 12 |
824 | pslld xmm6, 20 |
825 | por xmm6, xmm8 |
826 | movdqa xmm8, xmm7 |
827 | psrld xmm8, 12 |
828 | pslld xmm7, 20 |
829 | por xmm7, xmm8 |
830 | paddd xmm0, xmmword ptr [rsp+0x70] |
831 | paddd xmm1, xmmword ptr [rsp+0x90] |
832 | paddd xmm2, xmmword ptr [rsp+0x30] |
833 | paddd xmm3, xmmword ptr [rsp+0xF0] |
834 | paddd xmm0, xmm4 |
835 | paddd xmm1, xmm5 |
836 | paddd xmm2, xmm6 |
837 | paddd xmm3, xmm7 |
838 | pxor xmm12, xmm0 |
839 | pxor xmm13, xmm1 |
840 | pxor xmm14, xmm2 |
841 | pxor xmm15, xmm3 |
842 | movdqa xmm8, xmm12 |
843 | psrld xmm12, 8 |
844 | pslld xmm8, 24 |
845 | pxor xmm12, xmm8 |
846 | movdqa xmm8, xmm13 |
847 | psrld xmm13, 8 |
848 | pslld xmm8, 24 |
849 | pxor xmm13, xmm8 |
850 | movdqa xmm8, xmm14 |
851 | psrld xmm14, 8 |
852 | pslld xmm8, 24 |
853 | pxor xmm14, xmm8 |
854 | movdqa xmm8, xmm15 |
855 | psrld xmm15, 8 |
856 | pslld xmm8, 24 |
857 | pxor xmm15, xmm8 |
858 | movdqa xmm8, xmmword ptr [rsp+0x100] |
859 | paddd xmm8, xmm12 |
860 | paddd xmm9, xmm13 |
861 | paddd xmm10, xmm14 |
862 | paddd xmm11, xmm15 |
863 | pxor xmm4, xmm8 |
864 | pxor xmm5, xmm9 |
865 | pxor xmm6, xmm10 |
866 | pxor xmm7, xmm11 |
867 | movdqa xmmword ptr [rsp+0x100], xmm8 |
868 | movdqa xmm8, xmm4 |
869 | psrld xmm8, 7 |
870 | pslld xmm4, 25 |
871 | por xmm4, xmm8 |
872 | movdqa xmm8, xmm5 |
873 | psrld xmm8, 7 |
874 | pslld xmm5, 25 |
875 | por xmm5, xmm8 |
876 | movdqa xmm8, xmm6 |
877 | psrld xmm8, 7 |
878 | pslld xmm6, 25 |
879 | por xmm6, xmm8 |
880 | movdqa xmm8, xmm7 |
881 | psrld xmm8, 7 |
882 | pslld xmm7, 25 |
883 | por xmm7, xmm8 |
884 | paddd xmm0, xmmword ptr [rsp+0x40] |
885 | paddd xmm1, xmmword ptr [rsp+0xB0] |
886 | paddd xmm2, xmmword ptr [rsp+0x50] |
887 | paddd xmm3, xmmword ptr [rsp+0x10] |
888 | paddd xmm0, xmm5 |
889 | paddd xmm1, xmm6 |
890 | paddd xmm2, xmm7 |
891 | paddd xmm3, xmm4 |
892 | pxor xmm15, xmm0 |
893 | pxor xmm12, xmm1 |
894 | pxor xmm13, xmm2 |
895 | pxor xmm14, xmm3 |
896 | pshuflw xmm15, xmm15, 0xB1 |
897 | pshufhw xmm15, xmm15, 0xB1 |
898 | pshuflw xmm12, xmm12, 0xB1 |
899 | pshufhw xmm12, xmm12, 0xB1 |
900 | pshuflw xmm13, xmm13, 0xB1 |
901 | pshufhw xmm13, xmm13, 0xB1 |
902 | pshuflw xmm14, xmm14, 0xB1 |
903 | pshufhw xmm14, xmm14, 0xB1 |
904 | paddd xmm10, xmm15 |
905 | paddd xmm11, xmm12 |
906 | movdqa xmm8, xmmword ptr [rsp+0x100] |
907 | paddd xmm8, xmm13 |
908 | paddd xmm9, xmm14 |
909 | pxor xmm5, xmm10 |
910 | pxor xmm6, xmm11 |
911 | pxor xmm7, xmm8 |
912 | pxor xmm4, xmm9 |
913 | movdqa xmmword ptr [rsp+0x100], xmm8 |
914 | movdqa xmm8, xmm5 |
915 | psrld xmm8, 12 |
916 | pslld xmm5, 20 |
917 | por xmm5, xmm8 |
918 | movdqa xmm8, xmm6 |
919 | psrld xmm8, 12 |
920 | pslld xmm6, 20 |
921 | por xmm6, xmm8 |
922 | movdqa xmm8, xmm7 |
923 | psrld xmm8, 12 |
924 | pslld xmm7, 20 |
925 | por xmm7, xmm8 |
926 | movdqa xmm8, xmm4 |
927 | psrld xmm8, 12 |
928 | pslld xmm4, 20 |
929 | por xmm4, xmm8 |
930 | paddd xmm0, xmmword ptr [rsp] |
931 | paddd xmm1, xmmword ptr [rsp+0x20] |
932 | paddd xmm2, xmmword ptr [rsp+0x80] |
933 | paddd xmm3, xmmword ptr [rsp+0x60] |
934 | paddd xmm0, xmm5 |
935 | paddd xmm1, xmm6 |
936 | paddd xmm2, xmm7 |
937 | paddd xmm3, xmm4 |
938 | pxor xmm15, xmm0 |
939 | pxor xmm12, xmm1 |
940 | pxor xmm13, xmm2 |
941 | pxor xmm14, xmm3 |
942 | movdqa xmm8, xmm15 |
943 | psrld xmm15, 8 |
944 | pslld xmm8, 24 |
945 | pxor xmm15, xmm8 |
946 | movdqa xmm8, xmm12 |
947 | psrld xmm12, 8 |
948 | pslld xmm8, 24 |
949 | pxor xmm12, xmm8 |
950 | movdqa xmm8, xmm13 |
951 | psrld xmm13, 8 |
952 | pslld xmm8, 24 |
953 | pxor xmm13, xmm8 |
954 | movdqa xmm8, xmm14 |
955 | psrld xmm14, 8 |
956 | pslld xmm8, 24 |
957 | pxor xmm14, xmm8 |
958 | paddd xmm10, xmm15 |
959 | paddd xmm11, xmm12 |
960 | movdqa xmm8, xmmword ptr [rsp+0x100] |
961 | paddd xmm8, xmm13 |
962 | paddd xmm9, xmm14 |
963 | pxor xmm5, xmm10 |
964 | pxor xmm6, xmm11 |
965 | pxor xmm7, xmm8 |
966 | pxor xmm4, xmm9 |
967 | movdqa xmmword ptr [rsp+0x100], xmm8 |
968 | movdqa xmm8, xmm5 |
969 | psrld xmm8, 7 |
970 | pslld xmm5, 25 |
971 | por xmm5, xmm8 |
972 | movdqa xmm8, xmm6 |
973 | psrld xmm8, 7 |
974 | pslld xmm6, 25 |
975 | por xmm6, xmm8 |
976 | movdqa xmm8, xmm7 |
977 | psrld xmm8, 7 |
978 | pslld xmm7, 25 |
979 | por xmm7, xmm8 |
980 | movdqa xmm8, xmm4 |
981 | psrld xmm8, 7 |
982 | pslld xmm4, 25 |
983 | por xmm4, xmm8 |
984 | paddd xmm0, xmmword ptr [rsp+0xC0] |
985 | paddd xmm1, xmmword ptr [rsp+0x90] |
986 | paddd xmm2, xmmword ptr [rsp+0xF0] |
987 | paddd xmm3, xmmword ptr [rsp+0xE0] |
988 | paddd xmm0, xmm4 |
989 | paddd xmm1, xmm5 |
990 | paddd xmm2, xmm6 |
991 | paddd xmm3, xmm7 |
992 | pxor xmm12, xmm0 |
993 | pxor xmm13, xmm1 |
994 | pxor xmm14, xmm2 |
995 | pxor xmm15, xmm3 |
996 | pshuflw xmm12, xmm12, 0xB1 |
997 | pshufhw xmm12, xmm12, 0xB1 |
998 | pshuflw xmm13, xmm13, 0xB1 |
999 | pshufhw xmm13, xmm13, 0xB1 |
1000 | pshuflw xmm14, xmm14, 0xB1 |
1001 | pshufhw xmm14, xmm14, 0xB1 |
1002 | pshuflw xmm15, xmm15, 0xB1 |
1003 | pshufhw xmm15, xmm15, 0xB1 |
1004 | movdqa xmm8, xmmword ptr [rsp+0x100] |
1005 | paddd xmm8, xmm12 |
1006 | paddd xmm9, xmm13 |
1007 | paddd xmm10, xmm14 |
1008 | paddd xmm11, xmm15 |
1009 | pxor xmm4, xmm8 |
1010 | pxor xmm5, xmm9 |
1011 | pxor xmm6, xmm10 |
1012 | pxor xmm7, xmm11 |
1013 | movdqa xmmword ptr [rsp+0x100], xmm8 |
1014 | movdqa xmm8, xmm4 |
1015 | psrld xmm8, 12 |
1016 | pslld xmm4, 20 |
1017 | por xmm4, xmm8 |
1018 | movdqa xmm8, xmm5 |
1019 | psrld xmm8, 12 |
1020 | pslld xmm5, 20 |
1021 | por xmm5, xmm8 |
1022 | movdqa xmm8, xmm6 |
1023 | psrld xmm8, 12 |
1024 | pslld xmm6, 20 |
1025 | por xmm6, xmm8 |
1026 | movdqa xmm8, xmm7 |
1027 | psrld xmm8, 12 |
1028 | pslld xmm7, 20 |
1029 | por xmm7, xmm8 |
1030 | paddd xmm0, xmmword ptr [rsp+0xD0] |
1031 | paddd xmm1, xmmword ptr [rsp+0xB0] |
1032 | paddd xmm2, xmmword ptr [rsp+0xA0] |
1033 | paddd xmm3, xmmword ptr [rsp+0x80] |
1034 | paddd xmm0, xmm4 |
1035 | paddd xmm1, xmm5 |
1036 | paddd xmm2, xmm6 |
1037 | paddd xmm3, xmm7 |
1038 | pxor xmm12, xmm0 |
1039 | pxor xmm13, xmm1 |
1040 | pxor xmm14, xmm2 |
1041 | pxor xmm15, xmm3 |
1042 | movdqa xmm8, xmm12 |
1043 | psrld xmm12, 8 |
1044 | pslld xmm8, 24 |
1045 | pxor xmm12, xmm8 |
1046 | movdqa xmm8, xmm13 |
1047 | psrld xmm13, 8 |
1048 | pslld xmm8, 24 |
1049 | pxor xmm13, xmm8 |
1050 | movdqa xmm8, xmm14 |
1051 | psrld xmm14, 8 |
1052 | pslld xmm8, 24 |
1053 | pxor xmm14, xmm8 |
1054 | movdqa xmm8, xmm15 |
1055 | psrld xmm15, 8 |
1056 | pslld xmm8, 24 |
1057 | pxor xmm15, xmm8 |
1058 | movdqa xmm8, xmmword ptr [rsp+0x100] |
1059 | paddd xmm8, xmm12 |
1060 | paddd xmm9, xmm13 |
1061 | paddd xmm10, xmm14 |
1062 | paddd xmm11, xmm15 |
1063 | pxor xmm4, xmm8 |
1064 | pxor xmm5, xmm9 |
1065 | pxor xmm6, xmm10 |
1066 | pxor xmm7, xmm11 |
1067 | movdqa xmmword ptr [rsp+0x100], xmm8 |
1068 | movdqa xmm8, xmm4 |
1069 | psrld xmm8, 7 |
1070 | pslld xmm4, 25 |
1071 | por xmm4, xmm8 |
1072 | movdqa xmm8, xmm5 |
1073 | psrld xmm8, 7 |
1074 | pslld xmm5, 25 |
1075 | por xmm5, xmm8 |
1076 | movdqa xmm8, xmm6 |
1077 | psrld xmm8, 7 |
1078 | pslld xmm6, 25 |
1079 | por xmm6, xmm8 |
1080 | movdqa xmm8, xmm7 |
1081 | psrld xmm8, 7 |
1082 | pslld xmm7, 25 |
1083 | por xmm7, xmm8 |
1084 | paddd xmm0, xmmword ptr [rsp+0x70] |
1085 | paddd xmm1, xmmword ptr [rsp+0x50] |
1086 | paddd xmm2, xmmword ptr [rsp] |
1087 | paddd xmm3, xmmword ptr [rsp+0x60] |
1088 | paddd xmm0, xmm5 |
1089 | paddd xmm1, xmm6 |
1090 | paddd xmm2, xmm7 |
1091 | paddd xmm3, xmm4 |
1092 | pxor xmm15, xmm0 |
1093 | pxor xmm12, xmm1 |
1094 | pxor xmm13, xmm2 |
1095 | pxor xmm14, xmm3 |
1096 | pshuflw xmm15, xmm15, 0xB1 |
1097 | pshufhw xmm15, xmm15, 0xB1 |
1098 | pshuflw xmm12, xmm12, 0xB1 |
1099 | pshufhw xmm12, xmm12, 0xB1 |
1100 | pshuflw xmm13, xmm13, 0xB1 |
1101 | pshufhw xmm13, xmm13, 0xB1 |
1102 | pshuflw xmm14, xmm14, 0xB1 |
1103 | pshufhw xmm14, xmm14, 0xB1 |
1104 | paddd xmm10, xmm15 |
1105 | paddd xmm11, xmm12 |
1106 | movdqa xmm8, xmmword ptr [rsp+0x100] |
1107 | paddd xmm8, xmm13 |
1108 | paddd xmm9, xmm14 |
1109 | pxor xmm5, xmm10 |
1110 | pxor xmm6, xmm11 |
1111 | pxor xmm7, xmm8 |
1112 | pxor xmm4, xmm9 |
1113 | movdqa xmmword ptr [rsp+0x100], xmm8 |
1114 | movdqa xmm8, xmm5 |
1115 | psrld xmm8, 12 |
1116 | pslld xmm5, 20 |
1117 | por xmm5, xmm8 |
1118 | movdqa xmm8, xmm6 |
1119 | psrld xmm8, 12 |
1120 | pslld xmm6, 20 |
1121 | por xmm6, xmm8 |
1122 | movdqa xmm8, xmm7 |
1123 | psrld xmm8, 12 |
1124 | pslld xmm7, 20 |
1125 | por xmm7, xmm8 |
1126 | movdqa xmm8, xmm4 |
1127 | psrld xmm8, 12 |
1128 | pslld xmm4, 20 |
1129 | por xmm4, xmm8 |
1130 | paddd xmm0, xmmword ptr [rsp+0x20] |
1131 | paddd xmm1, xmmword ptr [rsp+0x30] |
1132 | paddd xmm2, xmmword ptr [rsp+0x10] |
1133 | paddd xmm3, xmmword ptr [rsp+0x40] |
1134 | paddd xmm0, xmm5 |
1135 | paddd xmm1, xmm6 |
1136 | paddd xmm2, xmm7 |
1137 | paddd xmm3, xmm4 |
1138 | pxor xmm15, xmm0 |
1139 | pxor xmm12, xmm1 |
1140 | pxor xmm13, xmm2 |
1141 | pxor xmm14, xmm3 |
1142 | movdqa xmm8, xmm15 |
1143 | psrld xmm15, 8 |
1144 | pslld xmm8, 24 |
1145 | pxor xmm15, xmm8 |
1146 | movdqa xmm8, xmm12 |
1147 | psrld xmm12, 8 |
1148 | pslld xmm8, 24 |
1149 | pxor xmm12, xmm8 |
1150 | movdqa xmm8, xmm13 |
1151 | psrld xmm13, 8 |
1152 | pslld xmm8, 24 |
1153 | pxor xmm13, xmm8 |
1154 | movdqa xmm8, xmm14 |
1155 | psrld xmm14, 8 |
1156 | pslld xmm8, 24 |
1157 | pxor xmm14, xmm8 |
1158 | paddd xmm10, xmm15 |
1159 | paddd xmm11, xmm12 |
1160 | movdqa xmm8, xmmword ptr [rsp+0x100] |
1161 | paddd xmm8, xmm13 |
1162 | paddd xmm9, xmm14 |
1163 | pxor xmm5, xmm10 |
1164 | pxor xmm6, xmm11 |
1165 | pxor xmm7, xmm8 |
1166 | pxor xmm4, xmm9 |
1167 | movdqa xmmword ptr [rsp+0x100], xmm8 |
1168 | movdqa xmm8, xmm5 |
1169 | psrld xmm8, 7 |
1170 | pslld xmm5, 25 |
1171 | por xmm5, xmm8 |
1172 | movdqa xmm8, xmm6 |
1173 | psrld xmm8, 7 |
1174 | pslld xmm6, 25 |
1175 | por xmm6, xmm8 |
1176 | movdqa xmm8, xmm7 |
1177 | psrld xmm8, 7 |
1178 | pslld xmm7, 25 |
1179 | por xmm7, xmm8 |
1180 | movdqa xmm8, xmm4 |
1181 | psrld xmm8, 7 |
1182 | pslld xmm4, 25 |
1183 | por xmm4, xmm8 |
1184 | paddd xmm0, xmmword ptr [rsp+0x90] |
1185 | paddd xmm1, xmmword ptr [rsp+0xB0] |
1186 | paddd xmm2, xmmword ptr [rsp+0x80] |
1187 | paddd xmm3, xmmword ptr [rsp+0xF0] |
1188 | paddd xmm0, xmm4 |
1189 | paddd xmm1, xmm5 |
1190 | paddd xmm2, xmm6 |
1191 | paddd xmm3, xmm7 |
1192 | pxor xmm12, xmm0 |
1193 | pxor xmm13, xmm1 |
1194 | pxor xmm14, xmm2 |
1195 | pxor xmm15, xmm3 |
1196 | pshuflw xmm12, xmm12, 0xB1 |
1197 | pshufhw xmm12, xmm12, 0xB1 |
1198 | pshuflw xmm13, xmm13, 0xB1 |
1199 | pshufhw xmm13, xmm13, 0xB1 |
1200 | pshuflw xmm14, xmm14, 0xB1 |
1201 | pshufhw xmm14, xmm14, 0xB1 |
1202 | pshuflw xmm15, xmm15, 0xB1 |
1203 | pshufhw xmm15, xmm15, 0xB1 |
1204 | movdqa xmm8, xmmword ptr [rsp+0x100] |
1205 | paddd xmm8, xmm12 |
1206 | paddd xmm9, xmm13 |
1207 | paddd xmm10, xmm14 |
1208 | paddd xmm11, xmm15 |
1209 | pxor xmm4, xmm8 |
1210 | pxor xmm5, xmm9 |
1211 | pxor xmm6, xmm10 |
1212 | pxor xmm7, xmm11 |
1213 | movdqa xmmword ptr [rsp+0x100], xmm8 |
1214 | movdqa xmm8, xmm4 |
1215 | psrld xmm8, 12 |
1216 | pslld xmm4, 20 |
1217 | por xmm4, xmm8 |
1218 | movdqa xmm8, xmm5 |
1219 | psrld xmm8, 12 |
1220 | pslld xmm5, 20 |
1221 | por xmm5, xmm8 |
1222 | movdqa xmm8, xmm6 |
1223 | psrld xmm8, 12 |
1224 | pslld xmm6, 20 |
1225 | por xmm6, xmm8 |
1226 | movdqa xmm8, xmm7 |
1227 | psrld xmm8, 12 |
1228 | pslld xmm7, 20 |
1229 | por xmm7, xmm8 |
1230 | paddd xmm0, xmmword ptr [rsp+0xE0] |
1231 | paddd xmm1, xmmword ptr [rsp+0x50] |
1232 | paddd xmm2, xmmword ptr [rsp+0xC0] |
1233 | paddd xmm3, xmmword ptr [rsp+0x10] |
1234 | paddd xmm0, xmm4 |
1235 | paddd xmm1, xmm5 |
1236 | paddd xmm2, xmm6 |
1237 | paddd xmm3, xmm7 |
1238 | pxor xmm12, xmm0 |
1239 | pxor xmm13, xmm1 |
1240 | pxor xmm14, xmm2 |
1241 | pxor xmm15, xmm3 |
1242 | movdqa xmm8, xmm12 |
1243 | psrld xmm12, 8 |
1244 | pslld xmm8, 24 |
1245 | pxor xmm12, xmm8 |
1246 | movdqa xmm8, xmm13 |
1247 | psrld xmm13, 8 |
1248 | pslld xmm8, 24 |
1249 | pxor xmm13, xmm8 |
1250 | movdqa xmm8, xmm14 |
1251 | psrld xmm14, 8 |
1252 | pslld xmm8, 24 |
1253 | pxor xmm14, xmm8 |
1254 | movdqa xmm8, xmm15 |
1255 | psrld xmm15, 8 |
1256 | pslld xmm8, 24 |
1257 | pxor xmm15, xmm8 |
1258 | movdqa xmm8, xmmword ptr [rsp+0x100] |
1259 | paddd xmm8, xmm12 |
1260 | paddd xmm9, xmm13 |
1261 | paddd xmm10, xmm14 |
1262 | paddd xmm11, xmm15 |
1263 | pxor xmm4, xmm8 |
1264 | pxor xmm5, xmm9 |
1265 | pxor xmm6, xmm10 |
1266 | pxor xmm7, xmm11 |
1267 | movdqa xmmword ptr [rsp+0x100], xmm8 |
1268 | movdqa xmm8, xmm4 |
1269 | psrld xmm8, 7 |
1270 | pslld xmm4, 25 |
1271 | por xmm4, xmm8 |
1272 | movdqa xmm8, xmm5 |
1273 | psrld xmm8, 7 |
1274 | pslld xmm5, 25 |
1275 | por xmm5, xmm8 |
1276 | movdqa xmm8, xmm6 |
1277 | psrld xmm8, 7 |
1278 | pslld xmm6, 25 |
1279 | por xmm6, xmm8 |
1280 | movdqa xmm8, xmm7 |
1281 | psrld xmm8, 7 |
1282 | pslld xmm7, 25 |
1283 | por xmm7, xmm8 |
1284 | paddd xmm0, xmmword ptr [rsp+0xD0] |
1285 | paddd xmm1, xmmword ptr [rsp] |
1286 | paddd xmm2, xmmword ptr [rsp+0x20] |
1287 | paddd xmm3, xmmword ptr [rsp+0x40] |
1288 | paddd xmm0, xmm5 |
1289 | paddd xmm1, xmm6 |
1290 | paddd xmm2, xmm7 |
1291 | paddd xmm3, xmm4 |
1292 | pxor xmm15, xmm0 |
1293 | pxor xmm12, xmm1 |
1294 | pxor xmm13, xmm2 |
1295 | pxor xmm14, xmm3 |
1296 | pshuflw xmm15, xmm15, 0xB1 |
1297 | pshufhw xmm15, xmm15, 0xB1 |
1298 | pshuflw xmm12, xmm12, 0xB1 |
1299 | pshufhw xmm12, xmm12, 0xB1 |
1300 | pshuflw xmm13, xmm13, 0xB1 |
1301 | pshufhw xmm13, xmm13, 0xB1 |
1302 | pshuflw xmm14, xmm14, 0xB1 |
1303 | pshufhw xmm14, xmm14, 0xB1 |
1304 | paddd xmm10, xmm15 |
1305 | paddd xmm11, xmm12 |
1306 | movdqa xmm8, xmmword ptr [rsp+0x100] |
1307 | paddd xmm8, xmm13 |
1308 | paddd xmm9, xmm14 |
1309 | pxor xmm5, xmm10 |
1310 | pxor xmm6, xmm11 |
1311 | pxor xmm7, xmm8 |
1312 | pxor xmm4, xmm9 |
1313 | movdqa xmmword ptr [rsp+0x100], xmm8 |
1314 | movdqa xmm8, xmm5 |
1315 | psrld xmm8, 12 |
1316 | pslld xmm5, 20 |
1317 | por xmm5, xmm8 |
1318 | movdqa xmm8, xmm6 |
1319 | psrld xmm8, 12 |
1320 | pslld xmm6, 20 |
1321 | por xmm6, xmm8 |
1322 | movdqa xmm8, xmm7 |
1323 | psrld xmm8, 12 |
1324 | pslld xmm7, 20 |
1325 | por xmm7, xmm8 |
1326 | movdqa xmm8, xmm4 |
1327 | psrld xmm8, 12 |
1328 | pslld xmm4, 20 |
1329 | por xmm4, xmm8 |
1330 | paddd xmm0, xmmword ptr [rsp+0x30] |
1331 | paddd xmm1, xmmword ptr [rsp+0xA0] |
1332 | paddd xmm2, xmmword ptr [rsp+0x60] |
1333 | paddd xmm3, xmmword ptr [rsp+0x70] |
1334 | paddd xmm0, xmm5 |
1335 | paddd xmm1, xmm6 |
1336 | paddd xmm2, xmm7 |
1337 | paddd xmm3, xmm4 |
1338 | pxor xmm15, xmm0 |
1339 | pxor xmm12, xmm1 |
1340 | pxor xmm13, xmm2 |
1341 | pxor xmm14, xmm3 |
1342 | movdqa xmm8, xmm15 |
1343 | psrld xmm15, 8 |
1344 | pslld xmm8, 24 |
1345 | pxor xmm15, xmm8 |
1346 | movdqa xmm8, xmm12 |
1347 | psrld xmm12, 8 |
1348 | pslld xmm8, 24 |
1349 | pxor xmm12, xmm8 |
1350 | movdqa xmm8, xmm13 |
1351 | psrld xmm13, 8 |
1352 | pslld xmm8, 24 |
1353 | pxor xmm13, xmm8 |
1354 | movdqa xmm8, xmm14 |
1355 | psrld xmm14, 8 |
1356 | pslld xmm8, 24 |
1357 | pxor xmm14, xmm8 |
1358 | paddd xmm10, xmm15 |
1359 | paddd xmm11, xmm12 |
1360 | movdqa xmm8, xmmword ptr [rsp+0x100] |
1361 | paddd xmm8, xmm13 |
1362 | paddd xmm9, xmm14 |
1363 | pxor xmm5, xmm10 |
1364 | pxor xmm6, xmm11 |
1365 | pxor xmm7, xmm8 |
1366 | pxor xmm4, xmm9 |
1367 | movdqa xmmword ptr [rsp+0x100], xmm8 |
1368 | movdqa xmm8, xmm5 |
1369 | psrld xmm8, 7 |
1370 | pslld xmm5, 25 |
1371 | por xmm5, xmm8 |
1372 | movdqa xmm8, xmm6 |
1373 | psrld xmm8, 7 |
1374 | pslld xmm6, 25 |
1375 | por xmm6, xmm8 |
1376 | movdqa xmm8, xmm7 |
1377 | psrld xmm8, 7 |
1378 | pslld xmm7, 25 |
1379 | por xmm7, xmm8 |
1380 | movdqa xmm8, xmm4 |
1381 | psrld xmm8, 7 |
1382 | pslld xmm4, 25 |
1383 | por xmm4, xmm8 |
1384 | paddd xmm0, xmmword ptr [rsp+0xB0] |
1385 | paddd xmm1, xmmword ptr [rsp+0x50] |
1386 | paddd xmm2, xmmword ptr [rsp+0x10] |
1387 | paddd xmm3, xmmword ptr [rsp+0x80] |
1388 | paddd xmm0, xmm4 |
1389 | paddd xmm1, xmm5 |
1390 | paddd xmm2, xmm6 |
1391 | paddd xmm3, xmm7 |
1392 | pxor xmm12, xmm0 |
1393 | pxor xmm13, xmm1 |
1394 | pxor xmm14, xmm2 |
1395 | pxor xmm15, xmm3 |
1396 | pshuflw xmm12, xmm12, 0xB1 |
1397 | pshufhw xmm12, xmm12, 0xB1 |
1398 | pshuflw xmm13, xmm13, 0xB1 |
1399 | pshufhw xmm13, xmm13, 0xB1 |
1400 | pshuflw xmm14, xmm14, 0xB1 |
1401 | pshufhw xmm14, xmm14, 0xB1 |
1402 | pshuflw xmm15, xmm15, 0xB1 |
1403 | pshufhw xmm15, xmm15, 0xB1 |
1404 | movdqa xmm8, xmmword ptr [rsp+0x100] |
1405 | paddd xmm8, xmm12 |
1406 | paddd xmm9, xmm13 |
1407 | paddd xmm10, xmm14 |
1408 | paddd xmm11, xmm15 |
1409 | pxor xmm4, xmm8 |
1410 | pxor xmm5, xmm9 |
1411 | pxor xmm6, xmm10 |
1412 | pxor xmm7, xmm11 |
1413 | movdqa xmmword ptr [rsp+0x100], xmm8 |
1414 | movdqa xmm8, xmm4 |
1415 | psrld xmm8, 12 |
1416 | pslld xmm4, 20 |
1417 | por xmm4, xmm8 |
1418 | movdqa xmm8, xmm5 |
1419 | psrld xmm8, 12 |
1420 | pslld xmm5, 20 |
1421 | por xmm5, xmm8 |
1422 | movdqa xmm8, xmm6 |
1423 | psrld xmm8, 12 |
1424 | pslld xmm6, 20 |
1425 | por xmm6, xmm8 |
1426 | movdqa xmm8, xmm7 |
1427 | psrld xmm8, 12 |
1428 | pslld xmm7, 20 |
1429 | por xmm7, xmm8 |
1430 | paddd xmm0, xmmword ptr [rsp+0xF0] |
1431 | paddd xmm1, xmmword ptr [rsp] |
1432 | paddd xmm2, xmmword ptr [rsp+0x90] |
1433 | paddd xmm3, xmmword ptr [rsp+0x60] |
1434 | paddd xmm0, xmm4 |
1435 | paddd xmm1, xmm5 |
1436 | paddd xmm2, xmm6 |
1437 | paddd xmm3, xmm7 |
1438 | pxor xmm12, xmm0 |
1439 | pxor xmm13, xmm1 |
1440 | pxor xmm14, xmm2 |
1441 | pxor xmm15, xmm3 |
1442 | movdqa xmm8, xmm12 |
1443 | psrld xmm12, 8 |
1444 | pslld xmm8, 24 |
1445 | pxor xmm12, xmm8 |
1446 | movdqa xmm8, xmm13 |
1447 | psrld xmm13, 8 |
1448 | pslld xmm8, 24 |
1449 | pxor xmm13, xmm8 |
1450 | movdqa xmm8, xmm14 |
1451 | psrld xmm14, 8 |
1452 | pslld xmm8, 24 |
1453 | pxor xmm14, xmm8 |
1454 | movdqa xmm8, xmm15 |
1455 | psrld xmm15, 8 |
1456 | pslld xmm8, 24 |
1457 | pxor xmm15, xmm8 |
1458 | movdqa xmm8, xmmword ptr [rsp+0x100] |
1459 | paddd xmm8, xmm12 |
1460 | paddd xmm9, xmm13 |
1461 | paddd xmm10, xmm14 |
1462 | paddd xmm11, xmm15 |
1463 | pxor xmm4, xmm8 |
1464 | pxor xmm5, xmm9 |
1465 | pxor xmm6, xmm10 |
1466 | pxor xmm7, xmm11 |
1467 | movdqa xmmword ptr [rsp+0x100], xmm8 |
1468 | movdqa xmm8, xmm4 |
1469 | psrld xmm8, 7 |
1470 | pslld xmm4, 25 |
1471 | por xmm4, xmm8 |
1472 | movdqa xmm8, xmm5 |
1473 | psrld xmm8, 7 |
1474 | pslld xmm5, 25 |
1475 | por xmm5, xmm8 |
1476 | movdqa xmm8, xmm6 |
1477 | psrld xmm8, 7 |
1478 | pslld xmm6, 25 |
1479 | por xmm6, xmm8 |
1480 | movdqa xmm8, xmm7 |
1481 | psrld xmm8, 7 |
1482 | pslld xmm7, 25 |
1483 | por xmm7, xmm8 |
1484 | paddd xmm0, xmmword ptr [rsp+0xE0] |
1485 | paddd xmm1, xmmword ptr [rsp+0x20] |
1486 | paddd xmm2, xmmword ptr [rsp+0x30] |
1487 | paddd xmm3, xmmword ptr [rsp+0x70] |
1488 | paddd xmm0, xmm5 |
1489 | paddd xmm1, xmm6 |
1490 | paddd xmm2, xmm7 |
1491 | paddd xmm3, xmm4 |
1492 | pxor xmm15, xmm0 |
1493 | pxor xmm12, xmm1 |
1494 | pxor xmm13, xmm2 |
1495 | pxor xmm14, xmm3 |
1496 | pshuflw xmm15, xmm15, 0xB1 |
1497 | pshufhw xmm15, xmm15, 0xB1 |
1498 | pshuflw xmm12, xmm12, 0xB1 |
1499 | pshufhw xmm12, xmm12, 0xB1 |
1500 | pshuflw xmm13, xmm13, 0xB1 |
1501 | pshufhw xmm13, xmm13, 0xB1 |
1502 | pshuflw xmm14, xmm14, 0xB1 |
1503 | pshufhw xmm14, xmm14, 0xB1 |
1504 | paddd xmm10, xmm15 |
1505 | paddd xmm11, xmm12 |
1506 | movdqa xmm8, xmmword ptr [rsp+0x100] |
1507 | paddd xmm8, xmm13 |
1508 | paddd xmm9, xmm14 |
1509 | pxor xmm5, xmm10 |
1510 | pxor xmm6, xmm11 |
1511 | pxor xmm7, xmm8 |
1512 | pxor xmm4, xmm9 |
1513 | movdqa xmmword ptr [rsp+0x100], xmm8 |
1514 | movdqa xmm8, xmm5 |
1515 | psrld xmm8, 12 |
1516 | pslld xmm5, 20 |
1517 | por xmm5, xmm8 |
1518 | movdqa xmm8, xmm6 |
1519 | psrld xmm8, 12 |
1520 | pslld xmm6, 20 |
1521 | por xmm6, xmm8 |
1522 | movdqa xmm8, xmm7 |
1523 | psrld xmm8, 12 |
1524 | pslld xmm7, 20 |
1525 | por xmm7, xmm8 |
1526 | movdqa xmm8, xmm4 |
1527 | psrld xmm8, 12 |
1528 | pslld xmm4, 20 |
1529 | por xmm4, xmm8 |
1530 | paddd xmm0, xmmword ptr [rsp+0xA0] |
1531 | paddd xmm1, xmmword ptr [rsp+0xC0] |
1532 | paddd xmm2, xmmword ptr [rsp+0x40] |
1533 | paddd xmm3, xmmword ptr [rsp+0xD0] |
1534 | paddd xmm0, xmm5 |
1535 | paddd xmm1, xmm6 |
1536 | paddd xmm2, xmm7 |
1537 | paddd xmm3, xmm4 |
1538 | pxor xmm15, xmm0 |
1539 | pxor xmm12, xmm1 |
1540 | pxor xmm13, xmm2 |
1541 | pxor xmm14, xmm3 |
1542 | movdqa xmm8, xmm15 |
1543 | psrld xmm15, 8 |
1544 | pslld xmm8, 24 |
1545 | pxor xmm15, xmm8 |
1546 | movdqa xmm8, xmm12 |
1547 | psrld xmm12, 8 |
1548 | pslld xmm8, 24 |
1549 | pxor xmm12, xmm8 |
1550 | movdqa xmm8, xmm13 |
1551 | psrld xmm13, 8 |
1552 | pslld xmm8, 24 |
1553 | pxor xmm13, xmm8 |
1554 | movdqa xmm8, xmm14 |
1555 | psrld xmm14, 8 |
1556 | pslld xmm8, 24 |
1557 | pxor xmm14, xmm8 |
1558 | paddd xmm10, xmm15 |
1559 | paddd xmm11, xmm12 |
1560 | movdqa xmm8, xmmword ptr [rsp+0x100] |
1561 | paddd xmm8, xmm13 |
1562 | paddd xmm9, xmm14 |
1563 | pxor xmm5, xmm10 |
1564 | pxor xmm6, xmm11 |
1565 | pxor xmm7, xmm8 |
1566 | pxor xmm4, xmm9 |
1567 | pxor xmm0, xmm8 |
1568 | pxor xmm1, xmm9 |
1569 | pxor xmm2, xmm10 |
1570 | pxor xmm3, xmm11 |
1571 | movdqa xmm8, xmm5 |
1572 | psrld xmm8, 7 |
1573 | pslld xmm5, 25 |
1574 | por xmm5, xmm8 |
1575 | movdqa xmm8, xmm6 |
1576 | psrld xmm8, 7 |
1577 | pslld xmm6, 25 |
1578 | por xmm6, xmm8 |
1579 | movdqa xmm8, xmm7 |
1580 | psrld xmm8, 7 |
1581 | pslld xmm7, 25 |
1582 | por xmm7, xmm8 |
1583 | movdqa xmm8, xmm4 |
1584 | psrld xmm8, 7 |
1585 | pslld xmm4, 25 |
1586 | por xmm4, xmm8 |
1587 | pxor xmm4, xmm12 |
1588 | pxor xmm5, xmm13 |
1589 | pxor xmm6, xmm14 |
1590 | pxor xmm7, xmm15 |
1591 | mov eax, r13d |
1592 | jne 9b |
1593 | movdqa xmm9, xmm0 |
1594 | punpckldq xmm0, xmm1 |
1595 | punpckhdq xmm9, xmm1 |
1596 | movdqa xmm11, xmm2 |
1597 | punpckldq xmm2, xmm3 |
1598 | punpckhdq xmm11, xmm3 |
1599 | movdqa xmm1, xmm0 |
1600 | punpcklqdq xmm0, xmm2 |
1601 | punpckhqdq xmm1, xmm2 |
1602 | movdqa xmm3, xmm9 |
1603 | punpcklqdq xmm9, xmm11 |
1604 | punpckhqdq xmm3, xmm11 |
1605 | movdqu xmmword ptr [rbx], xmm0 |
1606 | movdqu xmmword ptr [rbx+0x20], xmm1 |
1607 | movdqu xmmword ptr [rbx+0x40], xmm9 |
1608 | movdqu xmmword ptr [rbx+0x60], xmm3 |
1609 | movdqa xmm9, xmm4 |
1610 | punpckldq xmm4, xmm5 |
1611 | punpckhdq xmm9, xmm5 |
1612 | movdqa xmm11, xmm6 |
1613 | punpckldq xmm6, xmm7 |
1614 | punpckhdq xmm11, xmm7 |
1615 | movdqa xmm5, xmm4 |
1616 | punpcklqdq xmm4, xmm6 |
1617 | punpckhqdq xmm5, xmm6 |
1618 | movdqa xmm7, xmm9 |
1619 | punpcklqdq xmm9, xmm11 |
1620 | punpckhqdq xmm7, xmm11 |
1621 | movdqu xmmword ptr [rbx+0x10], xmm4 |
1622 | movdqu xmmword ptr [rbx+0x30], xmm5 |
1623 | movdqu xmmword ptr [rbx+0x50], xmm9 |
1624 | movdqu xmmword ptr [rbx+0x70], xmm7 |
1625 | movdqa xmm1, xmmword ptr [rsp+0x110] |
1626 | movdqa xmm0, xmm1 |
1627 | paddd xmm1, xmmword ptr [rsp+0x150] |
1628 | movdqa xmmword ptr [rsp+0x110], xmm1 |
1629 | pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] |
1630 | pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] |
1631 | pcmpgtd xmm0, xmm1 |
1632 | movdqa xmm1, xmmword ptr [rsp+0x120] |
1633 | psubd xmm1, xmm0 |
1634 | movdqa xmmword ptr [rsp+0x120], xmm1 |
1635 | add rbx, 128 |
1636 | add rdi, 32 |
1637 | sub rsi, 4 |
1638 | cmp rsi, 4 |
1639 | jnc 2b |
1640 | test rsi, rsi |
1641 | jne 3f |
1642 | 4: |
1643 | movdqa xmm6, xmmword ptr [rsp+0x170] |
1644 | movdqa xmm7, xmmword ptr [rsp+0x180] |
1645 | movdqa xmm8, xmmword ptr [rsp+0x190] |
1646 | movdqa xmm9, xmmword ptr [rsp+0x1A0] |
1647 | movdqa xmm10, xmmword ptr [rsp+0x1B0] |
1648 | movdqa xmm11, xmmword ptr [rsp+0x1C0] |
1649 | movdqa xmm12, xmmword ptr [rsp+0x1D0] |
1650 | movdqa xmm13, xmmword ptr [rsp+0x1E0] |
1651 | movdqa xmm14, xmmword ptr [rsp+0x1F0] |
1652 | movdqa xmm15, xmmword ptr [rsp+0x200] |
1653 | mov rsp, rbp |
1654 | pop rbp |
1655 | pop rbx |
1656 | pop rdi |
1657 | pop rsi |
1658 | pop r12 |
1659 | pop r13 |
1660 | pop r14 |
1661 | pop r15 |
1662 | ret |
1663 | .p2align 5 |
1664 | 3: |
1665 | test esi, 0x2 |
1666 | je 3f |
1667 | movups xmm0, xmmword ptr [rcx] |
1668 | movups xmm1, xmmword ptr [rcx+0x10] |
1669 | movaps xmm8, xmm0 |
1670 | movaps xmm9, xmm1 |
1671 | movd xmm13, dword ptr [rsp+0x110] |
1672 | movd xmm14, dword ptr [rsp+0x120] |
1673 | punpckldq xmm13, xmm14 |
1674 | movaps xmmword ptr [rsp], xmm13 |
1675 | movd xmm14, dword ptr [rsp+0x114] |
1676 | movd xmm13, dword ptr [rsp+0x124] |
1677 | punpckldq xmm14, xmm13 |
1678 | movaps xmmword ptr [rsp+0x10], xmm14 |
1679 | mov r8, qword ptr [rdi] |
1680 | mov r9, qword ptr [rdi+0x8] |
1681 | movzx eax, byte ptr [rbp+0x80] |
1682 | or eax, r13d |
1683 | xor edx, edx |
1684 | 2: |
1685 | mov r14d, eax |
1686 | or eax, r12d |
1687 | add rdx, 64 |
1688 | cmp rdx, r15 |
1689 | cmovne eax, r14d |
1690 | movaps xmm2, xmmword ptr [BLAKE3_IV+rip] |
1691 | movaps xmm10, xmm2 |
1692 | movups xmm4, xmmword ptr [r8+rdx-0x40] |
1693 | movups xmm5, xmmword ptr [r8+rdx-0x30] |
1694 | movaps xmm3, xmm4 |
1695 | shufps xmm4, xmm5, 136 |
1696 | shufps xmm3, xmm5, 221 |
1697 | movaps xmm5, xmm3 |
1698 | movups xmm6, xmmword ptr [r8+rdx-0x20] |
1699 | movups xmm7, xmmword ptr [r8+rdx-0x10] |
1700 | movaps xmm3, xmm6 |
1701 | shufps xmm6, xmm7, 136 |
1702 | pshufd xmm6, xmm6, 0x93 |
1703 | shufps xmm3, xmm7, 221 |
1704 | pshufd xmm7, xmm3, 0x93 |
1705 | movups xmm12, xmmword ptr [r9+rdx-0x40] |
1706 | movups xmm13, xmmword ptr [r9+rdx-0x30] |
1707 | movaps xmm11, xmm12 |
1708 | shufps xmm12, xmm13, 136 |
1709 | shufps xmm11, xmm13, 221 |
1710 | movaps xmm13, xmm11 |
1711 | movups xmm14, xmmword ptr [r9+rdx-0x20] |
1712 | movups xmm15, xmmword ptr [r9+rdx-0x10] |
1713 | movaps xmm11, xmm14 |
1714 | shufps xmm14, xmm15, 136 |
1715 | pshufd xmm14, xmm14, 0x93 |
1716 | shufps xmm11, xmm15, 221 |
1717 | pshufd xmm15, xmm11, 0x93 |
1718 | shl rax, 0x20 |
1719 | or rax, 0x40 |
1720 | movq xmm3, rax |
1721 | movdqa xmmword ptr [rsp+0x20], xmm3 |
1722 | movaps xmm3, xmmword ptr [rsp] |
1723 | movaps xmm11, xmmword ptr [rsp+0x10] |
1724 | punpcklqdq xmm3, xmmword ptr [rsp+0x20] |
1725 | punpcklqdq xmm11, xmmword ptr [rsp+0x20] |
1726 | mov al, 7 |
1727 | 9: |
1728 | paddd xmm0, xmm4 |
1729 | paddd xmm8, xmm12 |
1730 | movaps xmmword ptr [rsp+0x20], xmm4 |
1731 | movaps xmmword ptr [rsp+0x30], xmm12 |
1732 | paddd xmm0, xmm1 |
1733 | paddd xmm8, xmm9 |
1734 | pxor xmm3, xmm0 |
1735 | pxor xmm11, xmm8 |
1736 | pshuflw xmm3, xmm3, 0xB1 |
1737 | pshufhw xmm3, xmm3, 0xB1 |
1738 | pshuflw xmm11, xmm11, 0xB1 |
1739 | pshufhw xmm11, xmm11, 0xB1 |
1740 | paddd xmm2, xmm3 |
1741 | paddd xmm10, xmm11 |
1742 | pxor xmm1, xmm2 |
1743 | pxor xmm9, xmm10 |
1744 | movdqa xmm4, xmm1 |
1745 | pslld xmm1, 20 |
1746 | psrld xmm4, 12 |
1747 | por xmm1, xmm4 |
1748 | movdqa xmm4, xmm9 |
1749 | pslld xmm9, 20 |
1750 | psrld xmm4, 12 |
1751 | por xmm9, xmm4 |
1752 | paddd xmm0, xmm5 |
1753 | paddd xmm8, xmm13 |
1754 | movaps xmmword ptr [rsp+0x40], xmm5 |
1755 | movaps xmmword ptr [rsp+0x50], xmm13 |
1756 | paddd xmm0, xmm1 |
1757 | paddd xmm8, xmm9 |
1758 | pxor xmm3, xmm0 |
1759 | pxor xmm11, xmm8 |
1760 | movdqa xmm13, xmm3 |
1761 | psrld xmm3, 8 |
1762 | pslld xmm13, 24 |
1763 | pxor xmm3, xmm13 |
1764 | movdqa xmm13, xmm11 |
1765 | psrld xmm11, 8 |
1766 | pslld xmm13, 24 |
1767 | pxor xmm11, xmm13 |
1768 | paddd xmm2, xmm3 |
1769 | paddd xmm10, xmm11 |
1770 | pxor xmm1, xmm2 |
1771 | pxor xmm9, xmm10 |
1772 | movdqa xmm4, xmm1 |
1773 | pslld xmm1, 25 |
1774 | psrld xmm4, 7 |
1775 | por xmm1, xmm4 |
1776 | movdqa xmm4, xmm9 |
1777 | pslld xmm9, 25 |
1778 | psrld xmm4, 7 |
1779 | por xmm9, xmm4 |
1780 | pshufd xmm0, xmm0, 0x93 |
1781 | pshufd xmm8, xmm8, 0x93 |
1782 | pshufd xmm3, xmm3, 0x4E |
1783 | pshufd xmm11, xmm11, 0x4E |
1784 | pshufd xmm2, xmm2, 0x39 |
1785 | pshufd xmm10, xmm10, 0x39 |
1786 | paddd xmm0, xmm6 |
1787 | paddd xmm8, xmm14 |
1788 | paddd xmm0, xmm1 |
1789 | paddd xmm8, xmm9 |
1790 | pxor xmm3, xmm0 |
1791 | pxor xmm11, xmm8 |
1792 | pshuflw xmm3, xmm3, 0xB1 |
1793 | pshufhw xmm3, xmm3, 0xB1 |
1794 | pshuflw xmm11, xmm11, 0xB1 |
1795 | pshufhw xmm11, xmm11, 0xB1 |
1796 | paddd xmm2, xmm3 |
1797 | paddd xmm10, xmm11 |
1798 | pxor xmm1, xmm2 |
1799 | pxor xmm9, xmm10 |
1800 | movdqa xmm4, xmm1 |
1801 | pslld xmm1, 20 |
1802 | psrld xmm4, 12 |
1803 | por xmm1, xmm4 |
1804 | movdqa xmm4, xmm9 |
1805 | pslld xmm9, 20 |
1806 | psrld xmm4, 12 |
1807 | por xmm9, xmm4 |
1808 | paddd xmm0, xmm7 |
1809 | paddd xmm8, xmm15 |
1810 | paddd xmm0, xmm1 |
1811 | paddd xmm8, xmm9 |
1812 | pxor xmm3, xmm0 |
1813 | pxor xmm11, xmm8 |
1814 | movdqa xmm13, xmm3 |
1815 | psrld xmm3, 8 |
1816 | pslld xmm13, 24 |
1817 | pxor xmm3, xmm13 |
1818 | movdqa xmm13, xmm11 |
1819 | psrld xmm11, 8 |
1820 | pslld xmm13, 24 |
1821 | pxor xmm11, xmm13 |
1822 | paddd xmm2, xmm3 |
1823 | paddd xmm10, xmm11 |
1824 | pxor xmm1, xmm2 |
1825 | pxor xmm9, xmm10 |
1826 | movdqa xmm4, xmm1 |
1827 | pslld xmm1, 25 |
1828 | psrld xmm4, 7 |
1829 | por xmm1, xmm4 |
1830 | movdqa xmm4, xmm9 |
1831 | pslld xmm9, 25 |
1832 | psrld xmm4, 7 |
1833 | por xmm9, xmm4 |
1834 | pshufd xmm0, xmm0, 0x39 |
1835 | pshufd xmm8, xmm8, 0x39 |
1836 | pshufd xmm3, xmm3, 0x4E |
1837 | pshufd xmm11, xmm11, 0x4E |
1838 | pshufd xmm2, xmm2, 0x93 |
1839 | pshufd xmm10, xmm10, 0x93 |
1840 | dec al |
1841 | je 9f |
1842 | movdqa xmm12, xmmword ptr [rsp+0x20] |
1843 | movdqa xmm5, xmmword ptr [rsp+0x40] |
1844 | pshufd xmm13, xmm12, 0x0F |
1845 | shufps xmm12, xmm5, 214 |
1846 | pshufd xmm4, xmm12, 0x39 |
1847 | movdqa xmm12, xmm6 |
1848 | shufps xmm12, xmm7, 250 |
1849 | pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip] |
1850 | pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip] |
1851 | por xmm13, xmm12 |
1852 | movdqa xmmword ptr [rsp+0x20], xmm13 |
1853 | movdqa xmm12, xmm7 |
1854 | punpcklqdq xmm12, xmm5 |
1855 | movdqa xmm13, xmm6 |
1856 | pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip] |
1857 | pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip] |
1858 | por xmm12, xmm13 |
1859 | pshufd xmm12, xmm12, 0x78 |
1860 | punpckhdq xmm5, xmm7 |
1861 | punpckldq xmm6, xmm5 |
1862 | pshufd xmm7, xmm6, 0x1E |
1863 | movdqa xmmword ptr [rsp+0x40], xmm12 |
1864 | movdqa xmm5, xmmword ptr [rsp+0x30] |
1865 | movdqa xmm13, xmmword ptr [rsp+0x50] |
1866 | pshufd xmm6, xmm5, 0x0F |
1867 | shufps xmm5, xmm13, 214 |
1868 | pshufd xmm12, xmm5, 0x39 |
1869 | movdqa xmm5, xmm14 |
1870 | shufps xmm5, xmm15, 250 |
1871 | pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip] |
1872 | pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip] |
1873 | por xmm6, xmm5 |
1874 | movdqa xmm5, xmm15 |
1875 | punpcklqdq xmm5, xmm13 |
1876 | movdqa xmmword ptr [rsp+0x30], xmm2 |
1877 | movdqa xmm2, xmm14 |
1878 | pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip] |
1879 | pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip] |
1880 | por xmm5, xmm2 |
1881 | movdqa xmm2, xmmword ptr [rsp+0x30] |
1882 | pshufd xmm5, xmm5, 0x78 |
1883 | punpckhdq xmm13, xmm15 |
1884 | punpckldq xmm14, xmm13 |
1885 | pshufd xmm15, xmm14, 0x1E |
1886 | movdqa xmm13, xmm6 |
1887 | movdqa xmm14, xmm5 |
1888 | movdqa xmm5, xmmword ptr [rsp+0x20] |
1889 | movdqa xmm6, xmmword ptr [rsp+0x40] |
1890 | jmp 9b |
1891 | 9: |
1892 | pxor xmm0, xmm2 |
1893 | pxor xmm1, xmm3 |
1894 | pxor xmm8, xmm10 |
1895 | pxor xmm9, xmm11 |
1896 | mov eax, r13d |
1897 | cmp rdx, r15 |
1898 | jne 2b |
1899 | movups xmmword ptr [rbx], xmm0 |
1900 | movups xmmword ptr [rbx+0x10], xmm1 |
1901 | movups xmmword ptr [rbx+0x20], xmm8 |
1902 | movups xmmword ptr [rbx+0x30], xmm9 |
1903 | mov eax, dword ptr [rsp+0x130] |
1904 | neg eax |
1905 | mov r10d, dword ptr [rsp+0x110+8*rax] |
1906 | mov r11d, dword ptr [rsp+0x120+8*rax] |
1907 | mov dword ptr [rsp+0x110], r10d |
1908 | mov dword ptr [rsp+0x120], r11d |
1909 | add rdi, 16 |
1910 | add rbx, 64 |
1911 | sub rsi, 2 |
1912 | 3: |
1913 | test esi, 0x1 |
1914 | je 4b |
1915 | movups xmm0, xmmword ptr [rcx] |
1916 | movups xmm1, xmmword ptr [rcx+0x10] |
1917 | movd xmm13, dword ptr [rsp+0x110] |
1918 | movd xmm14, dword ptr [rsp+0x120] |
1919 | punpckldq xmm13, xmm14 |
1920 | mov r8, qword ptr [rdi] |
1921 | movzx eax, byte ptr [rbp+0x80] |
1922 | or eax, r13d |
1923 | xor edx, edx |
1924 | 2: |
1925 | mov r14d, eax |
1926 | or eax, r12d |
1927 | add rdx, 64 |
1928 | cmp rdx, r15 |
1929 | cmovne eax, r14d |
1930 | movaps xmm2, xmmword ptr [BLAKE3_IV+rip] |
1931 | shl rax, 32 |
1932 | or rax, 64 |
1933 | movq xmm12, rax |
1934 | movdqa xmm3, xmm13 |
1935 | punpcklqdq xmm3, xmm12 |
1936 | movups xmm4, xmmword ptr [r8+rdx-0x40] |
1937 | movups xmm5, xmmword ptr [r8+rdx-0x30] |
1938 | movaps xmm8, xmm4 |
1939 | shufps xmm4, xmm5, 136 |
1940 | shufps xmm8, xmm5, 221 |
1941 | movaps xmm5, xmm8 |
1942 | movups xmm6, xmmword ptr [r8+rdx-0x20] |
1943 | movups xmm7, xmmword ptr [r8+rdx-0x10] |
1944 | movaps xmm8, xmm6 |
1945 | shufps xmm6, xmm7, 136 |
1946 | pshufd xmm6, xmm6, 0x93 |
1947 | shufps xmm8, xmm7, 221 |
1948 | pshufd xmm7, xmm8, 0x93 |
1949 | mov al, 7 |
1950 | 9: |
1951 | paddd xmm0, xmm4 |
1952 | paddd xmm0, xmm1 |
1953 | pxor xmm3, xmm0 |
1954 | pshuflw xmm3, xmm3, 0xB1 |
1955 | pshufhw xmm3, xmm3, 0xB1 |
1956 | paddd xmm2, xmm3 |
1957 | pxor xmm1, xmm2 |
1958 | movdqa xmm11, xmm1 |
1959 | pslld xmm1, 20 |
1960 | psrld xmm11, 12 |
1961 | por xmm1, xmm11 |
1962 | paddd xmm0, xmm5 |
1963 | paddd xmm0, xmm1 |
1964 | pxor xmm3, xmm0 |
1965 | movdqa xmm14, xmm3 |
1966 | psrld xmm3, 8 |
1967 | pslld xmm14, 24 |
1968 | pxor xmm3, xmm14 |
1969 | paddd xmm2, xmm3 |
1970 | pxor xmm1, xmm2 |
1971 | movdqa xmm11, xmm1 |
1972 | pslld xmm1, 25 |
1973 | psrld xmm11, 7 |
1974 | por xmm1, xmm11 |
1975 | pshufd xmm0, xmm0, 0x93 |
1976 | pshufd xmm3, xmm3, 0x4E |
1977 | pshufd xmm2, xmm2, 0x39 |
1978 | paddd xmm0, xmm6 |
1979 | paddd xmm0, xmm1 |
1980 | pxor xmm3, xmm0 |
1981 | pshuflw xmm3, xmm3, 0xB1 |
1982 | pshufhw xmm3, xmm3, 0xB1 |
1983 | paddd xmm2, xmm3 |
1984 | pxor xmm1, xmm2 |
1985 | movdqa xmm11, xmm1 |
1986 | pslld xmm1, 20 |
1987 | psrld xmm11, 12 |
1988 | por xmm1, xmm11 |
1989 | paddd xmm0, xmm7 |
1990 | paddd xmm0, xmm1 |
1991 | pxor xmm3, xmm0 |
1992 | movdqa xmm14, xmm3 |
1993 | psrld xmm3, 8 |
1994 | pslld xmm14, 24 |
1995 | pxor xmm3, xmm14 |
1996 | paddd xmm2, xmm3 |
1997 | pxor xmm1, xmm2 |
1998 | movdqa xmm11, xmm1 |
1999 | pslld xmm1, 25 |
2000 | psrld xmm11, 7 |
2001 | por xmm1, xmm11 |
2002 | pshufd xmm0, xmm0, 0x39 |
2003 | pshufd xmm3, xmm3, 0x4E |
2004 | pshufd xmm2, xmm2, 0x93 |
2005 | dec al |
2006 | jz 9f |
2007 | movdqa xmm8, xmm4 |
2008 | shufps xmm8, xmm5, 214 |
2009 | pshufd xmm9, xmm4, 0x0F |
2010 | pshufd xmm4, xmm8, 0x39 |
2011 | movdqa xmm8, xmm6 |
2012 | shufps xmm8, xmm7, 250 |
2013 | pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] |
2014 | pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] |
2015 | por xmm9, xmm8 |
2016 | movdqa xmm8, xmm7 |
2017 | punpcklqdq xmm8, xmm5 |
2018 | movdqa xmm10, xmm6 |
2019 | pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] |
2020 | pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] |
2021 | por xmm8, xmm10 |
2022 | pshufd xmm8, xmm8, 0x78 |
2023 | punpckhdq xmm5, xmm7 |
2024 | punpckldq xmm6, xmm5 |
2025 | pshufd xmm7, xmm6, 0x1E |
2026 | movdqa xmm5, xmm9 |
2027 | movdqa xmm6, xmm8 |
2028 | jmp 9b |
2029 | 9: |
2030 | pxor xmm0, xmm2 |
2031 | pxor xmm1, xmm3 |
2032 | mov eax, r13d |
2033 | cmp rdx, r15 |
2034 | jne 2b |
2035 | movups xmmword ptr [rbx], xmm0 |
2036 | movups xmmword ptr [rbx+0x10], xmm1 |
2037 | jmp 4b |
2038 | |
2039 | .p2align 6 |
/* blake3_compress_in_place_sse2 / _blake3_compress_in_place_sse2: two labels for the same entry point. */
/* Builds a 4x4 matrix of 32-bit words in xmm0..xmm3, runs 7 iterations of the mixing loop below over it, */
/* and stores 32 bytes of result back over the memory at [rcx]. */
/* Inputs, as read by the code: */
/*   rcx          pointer: 32 bytes are loaded from it (rows 0-1) and overwritten on exit (unaligned accesses). */
/*   rdx          pointer: 64 message bytes are loaded from it (unaligned accesses). */
/*   r8b          byte, zero-extended into lane 2 of row 3. */
/*   r9           64-bit value, becomes lanes 0-1 of row 3. */
/*   [rsp+0x28]   at entry (read as [rsp+0xA0] once the frame exists): byte, lane 3 of row 3. */
/* NOTE(review): presumably cv, block, block_len, counter and flags of the BLAKE3 C prototype - confirm against the C header. */
/* The register choice, the fifth argument above a 32-byte shadow area and the xmm6+ saves match the Microsoft x64 convention. */
/* Modified and not restored: rax, r8, flags, xmm0-xmm5. Saved and restored: xmm6-xmm9, xmm11, xmm14, xmm15. */
2040 | blake3_compress_in_place_sse2: |
2041 | _blake3_compress_in_place_sse2: |
/* Frame: 7 save slots of 16 bytes (112) plus 8 bytes. The movdqa saves need a 16-byte aligned rsp, */
/* so this relies on rsp being 8 mod 16 at entry (return address just pushed by an aligned caller). */
2042 | sub rsp, 120 |
2043 | movdqa xmmword ptr [rsp], xmm6 |
2044 | movdqa xmmword ptr [rsp+0x10], xmm7 |
2045 | movdqa xmmword ptr [rsp+0x20], xmm8 |
2046 | movdqa xmmword ptr [rsp+0x30], xmm9 |
2047 | movdqa xmmword ptr [rsp+0x40], xmm11 |
2048 | movdqa xmmword ptr [rsp+0x50], xmm14 |
/* xmm15 is saved and restored here although no other instruction in this function references it. */
2049 | movdqa xmmword ptr [rsp+0x60], xmm15 |
/* Rows 0 and 1 come from the caller's 32-byte block, row 2 from the BLAKE3_IV constant defined elsewhere in the file. */
2050 | movups xmm0, xmmword ptr [rcx] |
2051 | movups xmm1, xmmword ptr [rcx+0x10] |
2052 | movaps xmm2, xmmword ptr [BLAKE3_IV+rip] |
/* 0xA0 = 120 (frame) + 8 (return address) + 32, i.e. the first stack-passed argument. */
2053 | movzx eax, byte ptr [rsp+0xA0] |
2054 | movzx r8d, r8b |
/* r8 = r8b | (stack byte << 32); row 3 = { r9 low dword, r9 high dword, r8b, stack byte }. */
2055 | shl rax, 32 |
2056 | add r8, rax |
2057 | movq xmm3, r9 |
2058 | movq xmm4, r8 |
2059 | punpcklqdq xmm3, xmm4 |
/* Load the 16 message dwords and de-interleave them: */
/*   xmm4 = dwords 0,2,4,6 (shufps 136)   xmm5 = dwords 1,3,5,7 (shufps 221) */
2060 | movups xmm4, xmmword ptr [rdx] |
2061 | movups xmm5, xmmword ptr [rdx+0x10] |
2062 | movaps xmm8, xmm4 |
2063 | shufps xmm4, xmm5, 136 |
2064 | shufps xmm8, xmm5, 221 |
2065 | movaps xmm5, xmm8 |
/*   xmm6 = dwords 14,8,10,12 and xmm7 = dwords 15,9,11,13 (even/odd split, then lanes rotated by pshufd 0x93). */
2066 | movups xmm6, xmmword ptr [rdx+0x20] |
2067 | movups xmm7, xmmword ptr [rdx+0x30] |
2068 | movaps xmm8, xmm6 |
2069 | shufps xmm6, xmm7, 136 |
2070 | pshufd xmm6, xmm6, 0x93 |
2071 | shufps xmm8, xmm7, 221 |
2072 | pshufd xmm7, xmm8, 0x93 |
/* al = iteration counter (7). Only al is consulted from here on; the upper bits of rax are leftovers from above. */
2073 | mov al, 7 |
/* Numeric local label: top of the loop, target of the "jmp 9b" below. */
2074 | 9: |
/* First half: row0 += xmm4 + row1; row3 ^= row0; each dword of row3 rotated by 16 (pshuflw/pshufhw 0xB1 swap 16-bit halves). */
2075 | paddd xmm0, xmm4 |
2076 | paddd xmm0, xmm1 |
2077 | pxor xmm3, xmm0 |
2078 | pshuflw xmm3, xmm3, 0xB1 |
2079 | pshufhw xmm3, xmm3, 0xB1 |
/* row2 += row3; row1 ^= row2; row1 rotated right by 12 (shift left 20 | shift right 12), xmm11 is the temporary. */
2080 | paddd xmm2, xmm3 |
2081 | pxor xmm1, xmm2 |
2082 | movdqa xmm11, xmm1 |
2083 | pslld xmm1, 20 |
2084 | psrld xmm11, 12 |
2085 | por xmm1, xmm11 |
/* row0 += xmm5 + row1; row3 ^= row0; row3 rotated right by 8 (the two shifted halves do not overlap, so pxor acts as por). */
2086 | paddd xmm0, xmm5 |
2087 | paddd xmm0, xmm1 |
2088 | pxor xmm3, xmm0 |
2089 | movdqa xmm14, xmm3 |
2090 | psrld xmm3, 8 |
2091 | pslld xmm14, 24 |
2092 | pxor xmm3, xmm14 |
/* row2 += row3; row1 ^= row2; row1 rotated right by 7. */
2093 | paddd xmm2, xmm3 |
2094 | pxor xmm1, xmm2 |
2095 | movdqa xmm11, xmm1 |
2096 | pslld xmm1, 25 |
2097 | psrld xmm11, 7 |
2098 | por xmm1, xmm11 |
/* Rotate the lanes of rows 0, 3 and 2 (row 1 stays) so the same column-wise code below pairs different lanes. */
2099 | pshufd xmm0, xmm0, 0x93 |
2100 | pshufd xmm3, xmm3, 0x4E |
2101 | pshufd xmm2, xmm2, 0x39 |
/* Second half: same add/xor/rotate sequence (16, 12, 8, 7) using message vectors xmm6 and xmm7. */
2102 | paddd xmm0, xmm6 |
2103 | paddd xmm0, xmm1 |
2104 | pxor xmm3, xmm0 |
2105 | pshuflw xmm3, xmm3, 0xB1 |
2106 | pshufhw xmm3, xmm3, 0xB1 |
2107 | paddd xmm2, xmm3 |
2108 | pxor xmm1, xmm2 |
2109 | movdqa xmm11, xmm1 |
2110 | pslld xmm1, 20 |
2111 | psrld xmm11, 12 |
2112 | por xmm1, xmm11 |
2113 | paddd xmm0, xmm7 |
2114 | paddd xmm0, xmm1 |
2115 | pxor xmm3, xmm0 |
2116 | movdqa xmm14, xmm3 |
2117 | psrld xmm3, 8 |
2118 | pslld xmm14, 24 |
2119 | pxor xmm3, xmm14 |
2120 | paddd xmm2, xmm3 |
2121 | pxor xmm1, xmm2 |
2122 | movdqa xmm11, xmm1 |
2123 | pslld xmm1, 25 |
2124 | psrld xmm11, 7 |
2125 | por xmm1, xmm11 |
/* Undo the lane rotation applied before the second half (0x39 reverses 0x93 and vice versa, 0x4E is its own inverse). */
2126 | pshufd xmm0, xmm0, 0x39 |
2127 | pshufd xmm3, xmm3, 0x4E |
2128 | pshufd xmm2, xmm2, 0x93 |
/* Leave after the 7th iteration; the message re-shuffle below therefore runs 6 times. */
2129 | dec al |
2130 | jz 9f |
/* Rebuild xmm4..xmm7 for the next iteration from the current four message vectors. */
/* New xmm4: lanes picked from old xmm4/xmm5 (shufps 214, then pshufd 0x39). */
2131 | movdqa xmm8, xmm4 |
2132 | shufps xmm8, xmm5, 214 |
2133 | pshufd xmm9, xmm4, 0x0F |
2134 | pshufd xmm4, xmm8, 0x39 |
/* xmm9 (next xmm5): pand with the two mask constants, then por, merges lanes of the pshufd result with lanes of xmm6/xmm7. */
/* NOTE(review): the PBLENDW_* names suggest an SSE2 emulation of SSE4.1 pblendw; the masks are defined outside this view. */
2135 | movdqa xmm8, xmm6 |
2136 | shufps xmm8, xmm7, 250 |
2137 | pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] |
2138 | pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] |
2139 | por xmm9, xmm8 |
/* xmm8 (next xmm6): mask-merge of (xmm7 low qword : xmm5 low qword) with xmm6, then reordered by pshufd 0x78. */
2140 | movdqa xmm8, xmm7 |
2141 | punpcklqdq xmm8, xmm5 |
2142 | movdqa xmm14, xmm6 |
2143 | pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] |
2144 | pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK+rip] |
2145 | por xmm8, xmm14 |
2146 | pshufd xmm8, xmm8, 0x78 |
/* New xmm7: built from the old xmm5/xmm7/xmm6 before xmm5 and xmm6 are replaced just below. */
2147 | punpckhdq xmm5, xmm7 |
2148 | punpckldq xmm6, xmm5 |
2149 | pshufd xmm7, xmm6, 0x1E |
2150 | movdqa xmm5, xmm9 |
2151 | movdqa xmm6, xmm8 |
2152 | jmp 9b |
/* Numeric local label: loop exit, target of the "jz 9f" above. */
2153 | 9: |
/* Result = rows 0-1 XOR rows 2-3, written over the 32 bytes that were read from [rcx] at entry. */
2154 | pxor xmm0, xmm2 |
2155 | pxor xmm1, xmm3 |
2156 | movups xmmword ptr [rcx], xmm0 |
2157 | movups xmmword ptr [rcx+0x10], xmm1 |
/* Restore the saved vector registers from the same slots, release the frame and return. */
2158 | movdqa xmm6, xmmword ptr [rsp] |
2159 | movdqa xmm7, xmmword ptr [rsp+0x10] |
2160 | movdqa xmm8, xmmword ptr [rsp+0x20] |
2161 | movdqa xmm9, xmmword ptr [rsp+0x30] |
2162 | movdqa xmm11, xmmword ptr [rsp+0x40] |
2163 | movdqa xmm14, xmmword ptr [rsp+0x50] |
2164 | movdqa xmm15, xmmword ptr [rsp+0x60] |
2165 | add rsp, 120 |
2166 | ret |
2167 | |
2168 | |
# ----------------------------------------------------------------------
# blake3_compress_xof_sse2 (also exported with a leading underscore)
#
# Runs seven rounds of mixing over one 64-byte message block and stores a
# 64-byte result.
#
# Inputs as this code reads them (Windows x64 register/stack layout):
#   rcx         pointer to 32 bytes of input state words (unaligned loads)
#   rdx         pointer to the 64-byte message block (unaligned loads)
#   r8b         one byte, zero-extended below
#   r9          64-bit value, used as two dwords below
#   [rsp+0x28]  at entry: one byte (fifth argument)
#   [rsp+0x30]  at entry: pointer to the 64-byte output (sixth argument)
# NOTE(review): these presumably correspond to cv, block, block_len,
# counter, flags and out of the C prototype -- confirm against the header.
#
# Output:    64 bytes stored through the sixth-argument pointer.
# Clobbers:  rax, r8, r10, xmm0-xmm5, flags.
# Preserves: xmm6-xmm9, xmm11, xmm14, xmm15 through a 120-byte stack frame.
#
# Register roles inside the round loop (lane 0 listed first):
#   xmm0 = row a, xmm1 = row b, xmm2 = row c, xmm3 = row d
#   xmm4-xmm7 = message vectors, xmm8/xmm9/xmm11/xmm14 = scratch
# ----------------------------------------------------------------------
2169 | .p2align 6 |
2170 | _blake3_compress_xof_sse2: |
2171 | blake3_compress_xof_sse2: |
# Prologue: reserve 120 bytes. With rsp at 8 mod 16 on entry (return
# address pushed by the call), rsp is 16-byte aligned afterwards, which
# the movdqa spills below require.
2172 | sub rsp, 120 |
# Spill callee-saved xmm registers. xmm15 is saved and restored even
# though no other instruction in this function references it.
2173 | movdqa xmmword ptr [rsp], xmm6 |
2174 | movdqa xmmword ptr [rsp+0x10], xmm7 |
2175 | movdqa xmmword ptr [rsp+0x20], xmm8 |
2176 | movdqa xmmword ptr [rsp+0x30], xmm9 |
2177 | movdqa xmmword ptr [rsp+0x40], xmm11 |
2178 | movdqa xmmword ptr [rsp+0x50], xmm14 |
2179 | movdqa xmmword ptr [rsp+0x60], xmm15 |
# Rows a and b = input state words 0-3 and 4-7.
2180 | movups xmm0, xmmword ptr [rcx] |
2181 | movups xmm1, xmmword ptr [rcx+0x10] |
# Row c = the four BLAKE3_IV constants.
2182 | movaps xmm2, xmmword ptr [BLAKE3_IV+rip] |
# Build row d = { r9 low dword, r9 high dword, r8b, fifth-argument byte }.
# Stack arguments now sit at +0xA0 and +0xA8: 120 (frame) + 8 (return
# address) + 32 (shadow space) = 0xA0.
2183 | movzx eax, byte ptr [rsp+0xA0] |
2184 | movzx r8d, r8b |
# r10 = output pointer, held until the final stores.
2185 | mov r10, qword ptr [rsp+0xA8] |
2186 | shl rax, 32 |
2187 | add r8, rax |
2188 | movq xmm3, r9 |
2189 | movq xmm4, r8 |
2190 | punpcklqdq xmm3, xmm4 |
# Load message words m0-m7 and split them by parity:
#   xmm4 = { m0, m2, m4, m6 }   (shufps imm 136 = 0x88)
#   xmm5 = { m1, m3, m5, m7 }   (shufps imm 221 = 0xDD)
2191 | movups xmm4, xmmword ptr [rdx] |
2192 | movups xmm5, xmmword ptr [rdx+0x10] |
2193 | movaps xmm8, xmm4 |
2194 | shufps xmm4, xmm5, 136 |
2195 | shufps xmm8, xmm5, 221 |
2196 | movaps xmm5, xmm8 |
# Same parity split for m8-m15, then rotate lanes by one (pshufd 0x93):
#   xmm6 = { m14, m8, m10, m12 }
#   xmm7 = { m15, m9, m11, m13 }
2197 | movups xmm6, xmmword ptr [rdx+0x20] |
2198 | movups xmm7, xmmword ptr [rdx+0x30] |
2199 | movaps xmm8, xmm6 |
2200 | shufps xmm6, xmm7, 136 |
2201 | pshufd xmm6, xmm6, 0x93 |
2202 | shufps xmm8, xmm7, 221 |
2203 | pshufd xmm7, xmm8, 0x93 |
# al = rounds remaining.
2204 | mov al, 7 |
# Top of the round loop (numeric local label; the jmp 9b below lands here).
2205 | 9: |
# First half-round, message vector xmm4:
#   a += xmm4; a += b; d ^= a; d = rotr32(d, 16)
# The pshuflw/pshufhw 0xB1 pair swaps the 16-bit halves of every dword.
2206 | paddd xmm0, xmm4 |
2207 | paddd xmm0, xmm1 |
2208 | pxor xmm3, xmm0 |
2209 | pshuflw xmm3, xmm3, 0xB1 |
2210 | pshufhw xmm3, xmm3, 0xB1 |
#   c += d; b ^= c; b = rotr32(b, 12), built as (b << 20) | (b >> 12)
2211 | paddd xmm2, xmm3 |
2212 | pxor xmm1, xmm2 |
2213 | movdqa xmm11, xmm1 |
2214 | pslld xmm1, 20 |
2215 | psrld xmm11, 12 |
2216 | por xmm1, xmm11 |
# Message vector xmm5:
#   a += xmm5; a += b; d ^= a; d = rotr32(d, 8), built as (d >> 8) ^ (d << 24)
2217 | paddd xmm0, xmm5 |
2218 | paddd xmm0, xmm1 |
2219 | pxor xmm3, xmm0 |
2220 | movdqa xmm14, xmm3 |
2221 | psrld xmm3, 8 |
2222 | pslld xmm14, 24 |
2223 | pxor xmm3, xmm14 |
#   c += d; b ^= c; b = rotr32(b, 7), built as (b << 25) | (b >> 7)
2224 | paddd xmm2, xmm3 |
2225 | pxor xmm1, xmm2 |
2226 | movdqa xmm11, xmm1 |
2227 | pslld xmm1, 25 |
2228 | psrld xmm11, 7 |
2229 | por xmm1, xmm11 |
# Rotate the lanes of a (0x93), d (0x4E) and c (0x39) relative to b so the
# same vector sequence next mixes a different grouping of state words.
2230 | pshufd xmm0, xmm0, 0x93 |
2231 | pshufd xmm3, xmm3, 0x4E |
2232 | pshufd xmm2, xmm2, 0x39 |
# Second half-round, message vector xmm6:
#   a += xmm6; a += b; d ^= a; d = rotr32(d, 16)
2233 | paddd xmm0, xmm6 |
2234 | paddd xmm0, xmm1 |
2235 | pxor xmm3, xmm0 |
2236 | pshuflw xmm3, xmm3, 0xB1 |
2237 | pshufhw xmm3, xmm3, 0xB1 |
#   c += d; b ^= c; b = rotr32(b, 12)
2238 | paddd xmm2, xmm3 |
2239 | pxor xmm1, xmm2 |
2240 | movdqa xmm11, xmm1 |
2241 | pslld xmm1, 20 |
2242 | psrld xmm11, 12 |
2243 | por xmm1, xmm11 |
# Message vector xmm7:
#   a += xmm7; a += b; d ^= a; d = rotr32(d, 8)
2244 | paddd xmm0, xmm7 |
2245 | paddd xmm0, xmm1 |
2246 | pxor xmm3, xmm0 |
2247 | movdqa xmm14, xmm3 |
2248 | psrld xmm3, 8 |
2249 | pslld xmm14, 24 |
2250 | pxor xmm3, xmm14 |
#   c += d; b ^= c; b = rotr32(b, 7)
2251 | paddd xmm2, xmm3 |
2252 | pxor xmm1, xmm2 |
2253 | movdqa xmm11, xmm1 |
2254 | pslld xmm1, 25 |
2255 | psrld xmm11, 7 |
2256 | por xmm1, xmm11 |
# Undo the lane rotation applied before the second half-round
# (0x39 reverses 0x93, 0x4E reverses itself, 0x93 reverses 0x39).
2257 | pshufd xmm0, xmm0, 0x39 |
2258 | pshufd xmm3, xmm3, 0x4E |
2259 | pshufd xmm2, xmm2, 0x93 |
# One round done; leave the loop after the seventh.
2260 | dec al |
2261 | jz 9f |
# Otherwise permute the message vectors xmm4-xmm7 in place for the next
# round. Each pand/pand/por triple blends two vectors under a pair of
# complementary PBLENDW_* masks (SSE2 has no pblendw instruction).
# Scratch registers here: xmm8, xmm9, xmm14.
2262 | movdqa xmm8, xmm4 |
2263 | shufps xmm8, xmm5, 214 |
2264 | pshufd xmm9, xmm4, 0x0F |
2265 | pshufd xmm4, xmm8, 0x39 |
2266 | movdqa xmm8, xmm6 |
2267 | shufps xmm8, xmm7, 250 |
2268 | pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] |
2269 | pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] |
2270 | por xmm9, xmm8 |
2271 | movdqa xmm8, xmm7 |
2272 | punpcklqdq xmm8, xmm5 |
2273 | movdqa xmm14, xmm6 |
2274 | pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] |
2275 | pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK+rip] |
2276 | por xmm8, xmm14 |
2277 | pshufd xmm8, xmm8, 0x78 |
2278 | punpckhdq xmm5, xmm7 |
2279 | punpckldq xmm6, xmm5 |
2280 | pshufd xmm7, xmm6, 0x1E |
# Commit the new xmm5 and xmm6 only now; their old values were still being
# read by the instructions above.
2281 | movdqa xmm5, xmm9 |
2282 | movdqa xmm6, xmm8 |
2283 | jmp 9b |
# Loop exit (target of the jz 9f above).
2284 | 9: |
# Reload the 8 input state words, then form the 64 output bytes:
#   out[ 0..15] = a ^ c
#   out[16..31] = b ^ d
#   out[32..47] = c ^ input words 0-3
#   out[48..63] = d ^ input words 4-7
# The first two xors must run before c and d are overwritten.
2285 | movdqu xmm4, xmmword ptr [rcx] |
2286 | movdqu xmm5, xmmword ptr [rcx+0x10] |
2287 | pxor xmm0, xmm2 |
2288 | pxor xmm1, xmm3 |
2289 | pxor xmm2, xmm4 |
2290 | pxor xmm3, xmm5 |
2291 | movups xmmword ptr [r10], xmm0 |
2292 | movups xmmword ptr [r10+0x10], xmm1 |
2293 | movups xmmword ptr [r10+0x20], xmm2 |
2294 | movups xmmword ptr [r10+0x30], xmm3 |
# Epilogue: restore the spilled xmm registers and release the frame.
2295 | movdqa xmm6, xmmword ptr [rsp] |
2296 | movdqa xmm7, xmmword ptr [rsp+0x10] |
2297 | movdqa xmm8, xmmword ptr [rsp+0x20] |
2298 | movdqa xmm9, xmmword ptr [rsp+0x30] |
2299 | movdqa xmm11, xmmword ptr [rsp+0x40] |
2300 | movdqa xmm14, xmmword ptr [rsp+0x50] |
2301 | movdqa xmm15, xmmword ptr [rsp+0x60] |
2302 | add rsp, 120 |
2303 | ret |
2304 | |
2305 | |
# Read-only constant tables. The start of the section is 64-byte aligned
# and every table below is exactly 16 bytes, so each label stays 16-byte
# aligned. That alignment is needed because the tables are used directly
# as memory operands of movaps/pand/pxor. Lane 0 is listed first.
2306 | .section .rodata |
2307 | .p2align 6 |
# IV words 0-3 as one vector; blake3_compress_xof_sse2 loads it as row c.
2308 | BLAKE3_IV: |
2309 | .long 0x6A09E667, 0xBB67AE85 |
2310 | .long 0x3C6EF372, 0xA54FF53A |
# Per-lane offsets 0..3. The blake3_hash_many_sse2 prologue ANDs this with
# a broadcast mask and adds it to the broadcast low counter dword.
2311 | ADD0: |
2312 | .long 0, 1, 2, 3 |
# The value 4 in every lane. The same prologue ANDs it with that mask and
# keeps the result on the stack; later uses are outside this part of the file.
2313 | ADD1: |
2314 | .long 4, 4, 4, 4 |
# Each of the four IV words broadcast to all lanes. Their users are not in
# this part of the file -- presumably the 4-way hash_many path; confirm.
2315 | BLAKE3_IV_0: |
2316 | .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 |
2317 | BLAKE3_IV_1: |
2318 | .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 |
2319 | BLAKE3_IV_2: |
2320 | .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 |
2321 | BLAKE3_IV_3: |
2322 | .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A |
# The value 64 in every lane. Users are not in this part of the file.
2323 | BLAKE3_BLOCK_LEN: |
2324 | .long 64, 64, 64, 64 |
# Sign bit of every dword. The hash_many prologue xors it into both
# operands before pcmpgtd, which turns that signed compare into an
# unsigned one (used there to detect carry out of the low counter dword).
2325 | CMP_MSB_MASK: |
2326 | .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 |
# Dword-granular select masks for the pand/pand/por blends in the message
# permutation. The hex in each name is the 16-bit-word selector a pblendw
# immediate would use: 0x33 keeps dwords 0 and 2, 0xCC keeps dwords 1 and 3,
# 0x3F keeps dwords 0-2, 0xC0 keeps dword 3.
2327 | PBLENDW_0x33_MASK: |
2328 | .long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 |
2329 | PBLENDW_0xCC_MASK: |
2330 | .long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF |
2331 | PBLENDW_0x3F_MASK: |
2332 | .long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 |
2333 | PBLENDW_0xC0_MASK: |
2334 | .long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF |
2335 | |