1 | /* SPDX-License-Identifier: GPL-2.0 */ |
2 | /* |
3 | * Original implementation written by Andy Polyakov, @dot-asm. |
4 | * This is an adaptation of the original code for kernel use. |
5 | * |
6 | * Copyright (C) 2006-2019 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved. |
7 | */ |
8 | |
9 | #include <linux/linkage.h> |
10 | #include <asm/nospec-insn.h> |
11 | #include <asm/fpu-insn.h> |
12 | |
13 | #define SP %r15 |
14 | #define FRAME (16 * 8 + 4 * 8) |
15 | |
16 | .data |
17 | .balign 32 |
18 | |
# Constant pool shared by both ChaCha20 implementations below.
# Layout (byte offsets from `sigma`):
#   0x00  "expand 32-byte k" constant, one word per column (endian-neutral)
#   0x10  counter-increment rows +1, +2, +3 (low word only)
#   0x40  byte-swap permute mask for VPERM (state words -> little endian)
#   0x50  per-lane block numbers 0..3 for the 4-way "smashed" layout
#   0x60  sigma words replicated across all four lanes ("smashed" sigma)
SYM_DATA_START_LOCAL(sigma)
	.long	0x61707865,0x3320646e,0x79622d32,0x6b206574	# endian-neutral
	.long	1,0,0,0
	.long	2,0,0,0
	.long	3,0,0,0
	.long	0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c	# byte swap

	.long	0,1,2,3
	.long	0x61707865,0x61707865,0x61707865,0x61707865	# smashed sigma
	.long	0x3320646e,0x3320646e,0x3320646e,0x3320646e
	.long	0x79622d32,0x79622d32,0x79622d32,0x79622d32
	.long	0x6b206574,0x6b206574,0x6b206574,0x6b206574
SYM_DATA_END(sigma)
32 | |
33 | .previous |
34 | |
35 | GEN_BR_THUNK %r14 |
36 | |
37 | .text |
38 | |
39 | ############################################################################# |
# void chacha20_vx_4x(u8 *out, const u8 *inp, size_t len,
#                     const u32 *key, const u32 *counter)
42 | |
43 | #define OUT %r2 |
44 | #define INP %r3 |
45 | #define LEN %r4 |
46 | #define KEY %r5 |
47 | #define COUNTER %r6 |
48 | |
49 | #define BEPERM %v31 |
50 | #define CTR %v26 |
51 | |
52 | #define K0 %v16 |
53 | #define K1 %v17 |
54 | #define K2 %v18 |
55 | #define K3 %v19 |
56 | |
57 | #define XA0 %v0 |
58 | #define XA1 %v1 |
59 | #define XA2 %v2 |
60 | #define XA3 %v3 |
61 | |
62 | #define XB0 %v4 |
63 | #define XB1 %v5 |
64 | #define XB2 %v6 |
65 | #define XB3 %v7 |
66 | |
67 | #define XC0 %v8 |
68 | #define XC1 %v9 |
69 | #define XC2 %v10 |
70 | #define XC3 %v11 |
71 | |
72 | #define XD0 %v12 |
73 | #define XD1 %v13 |
74 | #define XD2 %v14 |
75 | #define XD3 %v15 |
76 | |
77 | #define XT0 %v27 |
78 | #define XT1 %v28 |
79 | #define XT2 %v29 |
80 | #define XT3 %v30 |
81 | |
# chacha20_vx_4x - 4-way vectorized ChaCha20 for inputs of at most 256 bytes.
#
# Generates one batch of four 64-byte keystream blocks, XORs them with the
# input and stores the result; a trailing partial block is handled byte by
# byte via stack scratch.  There is no outer loop: chacha20_vx below only
# branches here when len <= 256.
# NOTE(review): the first 64-byte block is loaded/stored with no length
# check, so callers apparently guarantee len > 64 here - confirm at the
# call sites.
#
# In:  OUT (%r2) = out, INP (%r3) = inp, LEN (%r4) = len,
#      KEY (%r5) = key, COUNTER (%r6) = counter
# State is kept "smashed": state word i of blocks 0..3 occupies the four
# word lanes of one register (XAi/XBi/XCi/XDi = rows a/b/c/d of the state).
# Clobbers %r0/%r1/%r5 and the vector registers; %r6/%r7 are saved/restored.
SYM_FUNC_START(chacha20_vx_4x)
	stmg	%r6,%r7,6*8(SP)		# save call-saved %r6/%r7 (COUNTER is %r6)

	larl	%r7,sigma		# %r7 = constant pool
	lhi	%r0,10			# 10 double rounds = 20 rounds
	lhi	%r1,0			# NOTE(review): looks redundant; %r1 is
					# re-zeroed before the tail loop

	VL	K0,0,,%r7		# load sigma
	VL	K1,0,,KEY		# load key
	VL	K2,16,,KEY
	VL	K3,0,,COUNTER		# load counter

	VL	BEPERM,0x40,,%r7	# byte-swap mask for little-endian output
	VL	CTR,0x50,,%r7		# per-lane block numbers 0,1,2,3

	VLM	XA0,XA3,0x60,%r7,4	# load [smashed] sigma

	VREPF	XB0,K1,0		# smash the key
	VREPF	XB1,K1,1
	VREPF	XB2,K1,2
	VREPF	XB3,K1,3

	VREPF	XD0,K3,0		# smash counter/nonce words
	VREPF	XD1,K3,1
	VREPF	XD2,K3,2
	VREPF	XD3,K3,3
	VAF	XD0,XD0,CTR		# lane i gets block counter + i

	VREPF	XC0,K2,0		# smash second half of the key
	VREPF	XC1,K2,1
	VREPF	XC2,K2,2
	VREPF	XC3,K2,3

.Loop_4x:
	# Column round, step 1: a += b; d ^= a; d <<<= 16
	VAF	XA0,XA0,XB0
	VX	XD0,XD0,XA0
	VERLLF	XD0,XD0,16

	VAF	XA1,XA1,XB1
	VX	XD1,XD1,XA1
	VERLLF	XD1,XD1,16

	VAF	XA2,XA2,XB2
	VX	XD2,XD2,XA2
	VERLLF	XD2,XD2,16

	VAF	XA3,XA3,XB3
	VX	XD3,XD3,XA3
	VERLLF	XD3,XD3,16

	# Column round, step 2: c += d; b ^= c; b <<<= 12
	VAF	XC0,XC0,XD0
	VX	XB0,XB0,XC0
	VERLLF	XB0,XB0,12

	VAF	XC1,XC1,XD1
	VX	XB1,XB1,XC1
	VERLLF	XB1,XB1,12

	VAF	XC2,XC2,XD2
	VX	XB2,XB2,XC2
	VERLLF	XB2,XB2,12

	VAF	XC3,XC3,XD3
	VX	XB3,XB3,XC3
	VERLLF	XB3,XB3,12

	# Column round, step 3: a += b; d ^= a; d <<<= 8
	VAF	XA0,XA0,XB0
	VX	XD0,XD0,XA0
	VERLLF	XD0,XD0,8

	VAF	XA1,XA1,XB1
	VX	XD1,XD1,XA1
	VERLLF	XD1,XD1,8

	VAF	XA2,XA2,XB2
	VX	XD2,XD2,XA2
	VERLLF	XD2,XD2,8

	VAF	XA3,XA3,XB3
	VX	XD3,XD3,XA3
	VERLLF	XD3,XD3,8

	# Column round, step 4: c += d; b ^= c; b <<<= 7
	VAF	XC0,XC0,XD0
	VX	XB0,XB0,XC0
	VERLLF	XB0,XB0,7

	VAF	XC1,XC1,XD1
	VX	XB1,XB1,XC1
	VERLLF	XB1,XB1,7

	VAF	XC2,XC2,XD2
	VX	XB2,XB2,XC2
	VERLLF	XB2,XB2,7

	VAF	XC3,XC3,XD3
	VX	XB3,XB3,XC3
	VERLLF	XB3,XB3,7

	# Diagonal round: same four steps, but with the b/c/d registers
	# rotated one position relative to a (in the smashed layout the
	# diagonals are reached by re-pairing registers, not by shuffling
	# lanes: a0 pairs with b1/c2/d3, a1 with b2/c3/d0, ...).
	VAF	XA0,XA0,XB1
	VX	XD3,XD3,XA0
	VERLLF	XD3,XD3,16

	VAF	XA1,XA1,XB2
	VX	XD0,XD0,XA1
	VERLLF	XD0,XD0,16

	VAF	XA2,XA2,XB3
	VX	XD1,XD1,XA2
	VERLLF	XD1,XD1,16

	VAF	XA3,XA3,XB0
	VX	XD2,XD2,XA3
	VERLLF	XD2,XD2,16

	VAF	XC2,XC2,XD3
	VX	XB1,XB1,XC2
	VERLLF	XB1,XB1,12

	VAF	XC3,XC3,XD0
	VX	XB2,XB2,XC3
	VERLLF	XB2,XB2,12

	VAF	XC0,XC0,XD1
	VX	XB3,XB3,XC0
	VERLLF	XB3,XB3,12

	VAF	XC1,XC1,XD2
	VX	XB0,XB0,XC1
	VERLLF	XB0,XB0,12

	VAF	XA0,XA0,XB1
	VX	XD3,XD3,XA0
	VERLLF	XD3,XD3,8

	VAF	XA1,XA1,XB2
	VX	XD0,XD0,XA1
	VERLLF	XD0,XD0,8

	VAF	XA2,XA2,XB3
	VX	XD1,XD1,XA2
	VERLLF	XD1,XD1,8

	VAF	XA3,XA3,XB0
	VX	XD2,XD2,XA3
	VERLLF	XD2,XD2,8

	VAF	XC2,XC2,XD3
	VX	XB1,XB1,XC2
	VERLLF	XB1,XB1,7

	VAF	XC3,XC3,XD0
	VX	XB2,XB2,XC3
	VERLLF	XB2,XB2,7

	VAF	XC0,XC0,XD1
	VX	XB3,XB3,XC0
	VERLLF	XB3,XB3,7

	VAF	XC1,XC1,XD2
	VX	XB0,XB0,XC1
	VERLLF	XB0,XB0,7
	brct	%r0,.Loop_4x

	VAF	XD0,XD0,CTR		# re-apply per-lane block numbers so the
					# final "+ input state" add is per block

	# Transpose each 4-register group so that each register ends up
	# holding one complete 16-byte state row of a single block.
	VMRHF	XT0,XA0,XA1		# transpose data
	VMRHF	XT1,XA2,XA3
	VMRLF	XT2,XA0,XA1
	VMRLF	XT3,XA2,XA3
	VPDI	XA0,XT0,XT1,0b0000
	VPDI	XA1,XT0,XT1,0b0101
	VPDI	XA2,XT2,XT3,0b0000
	VPDI	XA3,XT2,XT3,0b0101

	VMRHF	XT0,XB0,XB1
	VMRHF	XT1,XB2,XB3
	VMRLF	XT2,XB0,XB1
	VMRLF	XT3,XB2,XB3
	VPDI	XB0,XT0,XT1,0b0000
	VPDI	XB1,XT0,XT1,0b0101
	VPDI	XB2,XT2,XT3,0b0000
	VPDI	XB3,XT2,XT3,0b0101

	VMRHF	XT0,XC0,XC1
	VMRHF	XT1,XC2,XC3
	VMRLF	XT2,XC0,XC1
	VMRLF	XT3,XC2,XC3
	VPDI	XC0,XT0,XT1,0b0000
	VPDI	XC1,XT0,XT1,0b0101
	VPDI	XC2,XT2,XT3,0b0000
	VPDI	XC3,XT2,XT3,0b0101

	VMRHF	XT0,XD0,XD1
	VMRHF	XT1,XD2,XD3
	VMRLF	XT2,XD0,XD1
	VMRLF	XT3,XD2,XD3
	VPDI	XD0,XT0,XT1,0b0000
	VPDI	XD1,XT0,XT1,0b0101
	VPDI	XD2,XT2,XT3,0b0000
	VPDI	XD3,XT2,XT3,0b0101

	# Block 0: add the original input state, byte-swap to little endian,
	# XOR with the input and store (unguarded - see function header).
	VAF	XA0,XA0,K0
	VAF	XB0,XB0,K1
	VAF	XC0,XC0,K2
	VAF	XD0,XD0,K3

	VPERM	XA0,XA0,XA0,BEPERM
	VPERM	XB0,XB0,XB0,BEPERM
	VPERM	XC0,XC0,XC0,BEPERM
	VPERM	XD0,XD0,XD0,BEPERM

	VLM	XT0,XT3,0,INP,0

	VX	XT0,XT0,XA0
	VX	XT1,XT1,XB0
	VX	XT2,XT2,XC0
	VX	XT3,XT3,XD0

	VSTM	XT0,XT3,0,OUT,0

	la	INP,0x40(INP)
	la	OUT,0x40(OUT)
	aghi	LEN,-0x40

	# Block 1 (keystream assembled into the block-0 registers)
	VAF	XA0,XA1,K0
	VAF	XB0,XB1,K1
	VAF	XC0,XC1,K2
	VAF	XD0,XD1,K3

	VPERM	XA0,XA0,XA0,BEPERM
	VPERM	XB0,XB0,XB0,BEPERM
	VPERM	XC0,XC0,XC0,BEPERM
	VPERM	XD0,XD0,XD0,BEPERM

	clgfi	LEN,0x40		# at least one full block left?
	jl	.Ltail_4x

	VLM	XT0,XT3,0,INP,0

	VX	XT0,XT0,XA0
	VX	XT1,XT1,XB0
	VX	XT2,XT2,XC0
	VX	XT3,XT3,XD0

	VSTM	XT0,XT3,0,OUT,0

	la	INP,0x40(INP)
	la	OUT,0x40(OUT)
	aghi	LEN,-0x40
	je	.Ldone_4x		# CC from aghi: all bytes consumed

	# Block 2
	VAF	XA0,XA2,K0
	VAF	XB0,XB2,K1
	VAF	XC0,XC2,K2
	VAF	XD0,XD2,K3

	VPERM	XA0,XA0,XA0,BEPERM
	VPERM	XB0,XB0,XB0,BEPERM
	VPERM	XC0,XC0,XC0,BEPERM
	VPERM	XD0,XD0,XD0,BEPERM

	clgfi	LEN,0x40
	jl	.Ltail_4x

	VLM	XT0,XT3,0,INP,0

	VX	XT0,XT0,XA0
	VX	XT1,XT1,XB0
	VX	XT2,XT2,XC0
	VX	XT3,XT3,XD0

	VSTM	XT0,XT3,0,OUT,0

	la	INP,0x40(INP)
	la	OUT,0x40(OUT)
	aghi	LEN,-0x40
	je	.Ldone_4x

	# Block 3
	VAF	XA0,XA3,K0
	VAF	XB0,XB3,K1
	VAF	XC0,XC3,K2
	VAF	XD0,XD3,K3

	VPERM	XA0,XA0,XA0,BEPERM
	VPERM	XB0,XB0,XB0,BEPERM
	VPERM	XC0,XC0,XC0,BEPERM
	VPERM	XD0,XD0,XD0,BEPERM

	clgfi	LEN,0x40
	jl	.Ltail_4x

	VLM	XT0,XT3,0,INP,0

	VX	XT0,XT0,XA0
	VX	XT1,XT1,XB0
	VX	XT2,XT2,XC0
	VX	XT3,XT3,XD0

	VSTM	XT0,XT3,0,OUT,0

.Ldone_4x:
	lmg	%r6,%r7,6*8(SP)		# restore saved registers
	BR_EX	%r14			# expolined return

.Ltail_4x:
	# Partial final block: spill the 64-byte keystream block (currently
	# in XA0/XB0/XC0/XD0) to 64(SP) and XOR the remaining LEN bytes
	# (0 < LEN < 64 when we get here) one at a time.
	# NOTE(review): 64..127(SP) lies in the caller-allocated register
	# save area; only 48..63(SP) is used for %r6/%r7 above, so the rest
	# is presumably free scratch - confirm against the s390x ELF ABI
	# frame layout.
	VLR	XT0,XC0
	VLR	XT1,XD0

	VST	XA0,8*8+0x00,,SP
	VST	XB0,8*8+0x10,,SP
	VST	XT0,8*8+0x20,,SP
	VST	XT1,8*8+0x30,,SP

	lghi	%r1,0			# %r1 = byte index

.Loop_tail_4x:
	llgc	%r5,0(%r1,INP)		# input byte (KEY/%r5 is dead by now)
	llgc	%r6,8*8(%r1,SP)		# keystream byte
	xr	%r6,%r5
	stc	%r6,0(%r1,OUT)
	la	%r1,1(%r1)
	brct	LEN,.Loop_tail_4x

	lmg	%r6,%r7,6*8(SP)		# restore saved registers
	BR_EX	%r14			# expolined return
SYM_FUNC_END(chacha20_vx_4x)
408 | |
409 | #undef OUT |
410 | #undef INP |
411 | #undef LEN |
412 | #undef KEY |
413 | #undef COUNTER |
414 | |
415 | #undef BEPERM |
416 | |
417 | #undef K0 |
418 | #undef K1 |
419 | #undef K2 |
420 | #undef K3 |
421 | |
422 | |
423 | ############################################################################# |
# void chacha20_vx(u8 *out, const u8 *inp, size_t len,
#                  const u32 *key, const u32 *counter)
426 | |
427 | #define OUT %r2 |
428 | #define INP %r3 |
429 | #define LEN %r4 |
430 | #define KEY %r5 |
431 | #define COUNTER %r6 |
432 | |
433 | #define BEPERM %v31 |
434 | |
435 | #define K0 %v27 |
436 | #define K1 %v24 |
437 | #define K2 %v25 |
438 | #define K3 %v26 |
439 | |
440 | #define A0 %v0 |
441 | #define B0 %v1 |
442 | #define C0 %v2 |
443 | #define D0 %v3 |
444 | |
445 | #define A1 %v4 |
446 | #define B1 %v5 |
447 | #define C1 %v6 |
448 | #define D1 %v7 |
449 | |
450 | #define A2 %v8 |
451 | #define B2 %v9 |
452 | #define C2 %v10 |
453 | #define D2 %v11 |
454 | |
455 | #define A3 %v12 |
456 | #define B3 %v13 |
457 | #define C3 %v14 |
458 | #define D3 %v15 |
459 | |
460 | #define A4 %v16 |
461 | #define B4 %v17 |
462 | #define C4 %v18 |
463 | #define D4 %v19 |
464 | |
465 | #define A5 %v20 |
466 | #define B5 %v21 |
467 | #define C5 %v22 |
468 | #define D5 %v23 |
469 | |
470 | #define T0 %v27 |
471 | #define T1 %v28 |
472 | #define T2 %v29 |
473 | #define T3 %v30 |
474 | |
# chacha20_vx - 6-way vectorized ChaCha20 for arbitrary lengths.
#
# Inputs of at most 256 bytes are delegated to chacha20_vx_4x above.
# Longer inputs are processed six 64-byte blocks (384 bytes) per outer
# iteration.  Unlike the 4x variant, each vector register here holds one
# complete 16-byte state row of one block; the diagonal rounds are formed
# by rotating rows b/c/d with VSLDB.  A trailing partial block is handled
# byte by byte via stack scratch.
#
# In:  OUT (%r2) = out, INP (%r3) = inp, LEN (%r4) = len,
#      KEY (%r5) = key, COUNTER (%r6) = counter
# Register-alias hazard: T0 and K0 are both %v27, and T1..T3 double as
# the +1/+2/+3 increment rows, so loading input data into T0..T3
# clobbers them; they are reloaded from the constant pool (see the
# "re-load" VLM below).
SYM_FUNC_START(chacha20_vx)
	clgfi	LEN,256
	jle	chacha20_vx_4x		# tail-branch: nothing saved yet and
					# %r14 still holds our caller's address
	stmg	%r6,%r7,6*8(SP)

	lghi	%r1,-FRAME		# allocate our own stack frame
	lgr	%r0,SP
	la	SP,0(%r1,SP)
	stg	%r0,0(SP)		# back-chain

	larl	%r7,sigma		# %r7 = constant pool
	lhi	%r0,10			# 10 double rounds = 20 rounds

	VLM	K1,K2,0,KEY,0		# load key
	VL	K3,0,,COUNTER		# load counter

	VLM	K0,BEPERM,0,%r7,4	# load sigma, increments, ...
					# (K0=sigma, T1..T3 = +1/+2/+3 rows,
					# BEPERM = byte-swap mask)

.Loop_outer_vx:
	# Set up six independent block states: rows a/b/c are identical
	# across blocks; row d differs only by the block counter.
	VLR	A0,K0
	VLR	B0,K1
	VLR	A1,K0
	VLR	B1,K1
	VLR	A2,K0
	VLR	B2,K1
	VLR	A3,K0
	VLR	B3,K1
	VLR	A4,K0
	VLR	B4,K1
	VLR	A5,K0
	VLR	B5,K1

	VLR	D0,K3
	VAF	D1,K3,T1		# K[3]+1
	VAF	D2,K3,T2		# K[3]+2
	VAF	D3,K3,T3		# K[3]+3
	VAF	D4,D2,T2		# K[3]+4
	VAF	D5,D2,T3		# K[3]+5

	VLR	C0,K2
	VLR	C1,K2
	VLR	C2,K2
	VLR	C3,K2
	VLR	C4,K2
	VLR	C5,K2

	# Save initial d rows of blocks 1..3 for the final additions
	# (the +4/+5/+6 values are re-derived from these later).
	VLR	T1,D1
	VLR	T2,D2
	VLR	T3,D3

.Loop_vx:
	# Column phase: quarter-rounds on all four columns of each of the
	# six blocks at once.  Step 1: a += b; d ^= a; d <<<= 16
	VAF	A0,A0,B0
	VAF	A1,A1,B1
	VAF	A2,A2,B2
	VAF	A3,A3,B3
	VAF	A4,A4,B4
	VAF	A5,A5,B5
	VX	D0,D0,A0
	VX	D1,D1,A1
	VX	D2,D2,A2
	VX	D3,D3,A3
	VX	D4,D4,A4
	VX	D5,D5,A5
	VERLLF	D0,D0,16
	VERLLF	D1,D1,16
	VERLLF	D2,D2,16
	VERLLF	D3,D3,16
	VERLLF	D4,D4,16
	VERLLF	D5,D5,16

	# Step 2: c += d; b ^= c; b <<<= 12
	VAF	C0,C0,D0
	VAF	C1,C1,D1
	VAF	C2,C2,D2
	VAF	C3,C3,D3
	VAF	C4,C4,D4
	VAF	C5,C5,D5
	VX	B0,B0,C0
	VX	B1,B1,C1
	VX	B2,B2,C2
	VX	B3,B3,C3
	VX	B4,B4,C4
	VX	B5,B5,C5
	VERLLF	B0,B0,12
	VERLLF	B1,B1,12
	VERLLF	B2,B2,12
	VERLLF	B3,B3,12
	VERLLF	B4,B4,12
	VERLLF	B5,B5,12

	# Step 3: a += b; d ^= a; d <<<= 8
	VAF	A0,A0,B0
	VAF	A1,A1,B1
	VAF	A2,A2,B2
	VAF	A3,A3,B3
	VAF	A4,A4,B4
	VAF	A5,A5,B5
	VX	D0,D0,A0
	VX	D1,D1,A1
	VX	D2,D2,A2
	VX	D3,D3,A3
	VX	D4,D4,A4
	VX	D5,D5,A5
	VERLLF	D0,D0,8
	VERLLF	D1,D1,8
	VERLLF	D2,D2,8
	VERLLF	D3,D3,8
	VERLLF	D4,D4,8
	VERLLF	D5,D5,8

	# Step 4: c += d; b ^= c; b <<<= 7
	VAF	C0,C0,D0
	VAF	C1,C1,D1
	VAF	C2,C2,D2
	VAF	C3,C3,D3
	VAF	C4,C4,D4
	VAF	C5,C5,D5
	VX	B0,B0,C0
	VX	B1,B1,C1
	VX	B2,B2,C2
	VX	B3,B3,C3
	VX	B4,B4,C4
	VX	B5,B5,C5
	VERLLF	B0,B0,7
	VERLLF	B1,B1,7
	VERLLF	B2,B2,7
	VERLLF	B3,B3,7
	VERLLF	B4,B4,7
	VERLLF	B5,B5,7

	# Rotate rows c/b/d left by 8/4/12 bytes so the columns line up as
	# the ChaCha diagonals for the second phase.
	VSLDB	C0,C0,C0,8
	VSLDB	C1,C1,C1,8
	VSLDB	C2,C2,C2,8
	VSLDB	C3,C3,C3,8
	VSLDB	C4,C4,C4,8
	VSLDB	C5,C5,C5,8
	VSLDB	B0,B0,B0,4
	VSLDB	B1,B1,B1,4
	VSLDB	B2,B2,B2,4
	VSLDB	B3,B3,B3,4
	VSLDB	B4,B4,B4,4
	VSLDB	B5,B5,B5,4
	VSLDB	D0,D0,D0,12
	VSLDB	D1,D1,D1,12
	VSLDB	D2,D2,D2,12
	VSLDB	D3,D3,D3,12
	VSLDB	D4,D4,D4,12
	VSLDB	D5,D5,D5,12

	# Diagonal phase: the same four quarter-round steps on the rotated
	# rows.  Step 1: a += b; d ^= a; d <<<= 16
	VAF	A0,A0,B0
	VAF	A1,A1,B1
	VAF	A2,A2,B2
	VAF	A3,A3,B3
	VAF	A4,A4,B4
	VAF	A5,A5,B5
	VX	D0,D0,A0
	VX	D1,D1,A1
	VX	D2,D2,A2
	VX	D3,D3,A3
	VX	D4,D4,A4
	VX	D5,D5,A5
	VERLLF	D0,D0,16
	VERLLF	D1,D1,16
	VERLLF	D2,D2,16
	VERLLF	D3,D3,16
	VERLLF	D4,D4,16
	VERLLF	D5,D5,16

	# Step 2: c += d; b ^= c; b <<<= 12
	VAF	C0,C0,D0
	VAF	C1,C1,D1
	VAF	C2,C2,D2
	VAF	C3,C3,D3
	VAF	C4,C4,D4
	VAF	C5,C5,D5
	VX	B0,B0,C0
	VX	B1,B1,C1
	VX	B2,B2,C2
	VX	B3,B3,C3
	VX	B4,B4,C4
	VX	B5,B5,C5
	VERLLF	B0,B0,12
	VERLLF	B1,B1,12
	VERLLF	B2,B2,12
	VERLLF	B3,B3,12
	VERLLF	B4,B4,12
	VERLLF	B5,B5,12

	# Step 3: a += b; d ^= a; d <<<= 8
	VAF	A0,A0,B0
	VAF	A1,A1,B1
	VAF	A2,A2,B2
	VAF	A3,A3,B3
	VAF	A4,A4,B4
	VAF	A5,A5,B5
	VX	D0,D0,A0
	VX	D1,D1,A1
	VX	D2,D2,A2
	VX	D3,D3,A3
	VX	D4,D4,A4
	VX	D5,D5,A5
	VERLLF	D0,D0,8
	VERLLF	D1,D1,8
	VERLLF	D2,D2,8
	VERLLF	D3,D3,8
	VERLLF	D4,D4,8
	VERLLF	D5,D5,8

	# Step 4: c += d; b ^= c; b <<<= 7
	VAF	C0,C0,D0
	VAF	C1,C1,D1
	VAF	C2,C2,D2
	VAF	C3,C3,D3
	VAF	C4,C4,D4
	VAF	C5,C5,D5
	VX	B0,B0,C0
	VX	B1,B1,C1
	VX	B2,B2,C2
	VX	B3,B3,C3
	VX	B4,B4,C4
	VX	B5,B5,C5
	VERLLF	B0,B0,7
	VERLLF	B1,B1,7
	VERLLF	B2,B2,7
	VERLLF	B3,B3,7
	VERLLF	B4,B4,7
	VERLLF	B5,B5,7

	# Rotate rows back (8/12/4 bytes) to column order.
	VSLDB	C0,C0,C0,8
	VSLDB	C1,C1,C1,8
	VSLDB	C2,C2,C2,8
	VSLDB	C3,C3,C3,8
	VSLDB	C4,C4,C4,8
	VSLDB	C5,C5,C5,8
	VSLDB	B0,B0,B0,12
	VSLDB	B1,B1,B1,12
	VSLDB	B2,B2,B2,12
	VSLDB	B3,B3,B3,12
	VSLDB	B4,B4,B4,12
	VSLDB	B5,B5,B5,12
	VSLDB	D0,D0,D0,4
	VSLDB	D1,D1,D1,4
	VSLDB	D2,D2,D2,4
	VSLDB	D3,D3,D3,4
	VSLDB	D4,D4,D4,4
	VSLDB	D5,D5,D5,4
	brct	%r0,.Loop_vx

	# Finalize block 0; block 1's a/d rows are finalized early because
	# K0 (= T0) and T1 get clobbered by the input load below.
	VAF	A0,A0,K0
	VAF	B0,B0,K1
	VAF	C0,C0,K2
	VAF	D0,D0,K3
	VAF	A1,A1,K0
	VAF	D1,D1,T1		# +K[3]+1

	VPERM	A0,A0,A0,BEPERM
	VPERM	B0,B0,B0,BEPERM
	VPERM	C0,C0,C0,BEPERM
	VPERM	D0,D0,D0,BEPERM

	clgfi	LEN,0x40		# full block left?
	jl	.Ltail_vx

	VAF	D2,D2,T2		# +K[3]+2
	VAF	D3,D3,T3		# +K[3]+3
	VLM	T0,T3,0,INP,0		# input load clobbers K0 and T1..T3

	VX	A0,A0,T0
	VX	B0,B0,T1
	VX	C0,C0,T2
	VX	D0,D0,T3

	VLM	K0,T3,0,%r7,4		# re-load sigma and increments

	VSTM	A0,D0,0,OUT,0

	la	INP,0x40(INP)
	la	OUT,0x40(OUT)
	aghi	LEN,-0x40
	je	.Ldone_vx

	# Block 1 (a/d rows already finalized above)
	VAF	B1,B1,K1
	VAF	C1,C1,K2

	VPERM	A0,A1,A1,BEPERM
	VPERM	B0,B1,B1,BEPERM
	VPERM	C0,C1,C1,BEPERM
	VPERM	D0,D1,D1,BEPERM

	clgfi	LEN,0x40
	jl	.Ltail_vx

	VLM	A1,D1,0,INP,0

	VX	A0,A0,A1
	VX	B0,B0,B1
	VX	C0,C0,C1
	VX	D0,D0,D1

	VSTM	A0,D0,0,OUT,0

	la	INP,0x40(INP)
	la	OUT,0x40(OUT)
	aghi	LEN,-0x40
	je	.Ldone_vx

	# Block 2 (its d row was finalized before the first input load)
	VAF	A2,A2,K0
	VAF	B2,B2,K1
	VAF	C2,C2,K2

	VPERM	A0,A2,A2,BEPERM
	VPERM	B0,B2,B2,BEPERM
	VPERM	C0,C2,C2,BEPERM
	VPERM	D0,D2,D2,BEPERM

	clgfi	LEN,0x40
	jl	.Ltail_vx

	VLM	A1,D1,0,INP,0

	VX	A0,A0,A1
	VX	B0,B0,B1
	VX	C0,C0,C1
	VX	D0,D0,D1

	VSTM	A0,D0,0,OUT,0

	la	INP,0x40(INP)
	la	OUT,0x40(OUT)
	aghi	LEN,-0x40
	je	.Ldone_vx

	# Block 3; also start rebuilding the counter rows (D2/D3 serve as
	# scratch for K[3]+3 / K[3]+4 from here on)
	VAF	A3,A3,K0
	VAF	B3,B3,K1
	VAF	C3,C3,K2
	VAF	D2,K3,T3		# K[3]+3

	VPERM	A0,A3,A3,BEPERM
	VPERM	B0,B3,B3,BEPERM
	VPERM	C0,C3,C3,BEPERM
	VPERM	D0,D3,D3,BEPERM

	clgfi	LEN,0x40
	jl	.Ltail_vx

	VAF	D3,D2,T1		# K[3]+4
	VLM	A1,D1,0,INP,0

	VX	A0,A0,A1
	VX	B0,B0,B1
	VX	C0,C0,C1
	VX	D0,D0,D1

	VSTM	A0,D0,0,OUT,0

	la	INP,0x40(INP)
	la	OUT,0x40(OUT)
	aghi	LEN,-0x40
	je	.Ldone_vx

	# Block 4; advance K3 by 6 blocks for the next outer iteration
	VAF	A4,A4,K0
	VAF	B4,B4,K1
	VAF	C4,C4,K2
	VAF	D4,D4,D3		# +K[3]+4
	VAF	D3,D3,T1		# K[3]+5
	VAF	K3,D2,T3		# K[3]+=6

	VPERM	A0,A4,A4,BEPERM
	VPERM	B0,B4,B4,BEPERM
	VPERM	C0,C4,C4,BEPERM
	VPERM	D0,D4,D4,BEPERM

	clgfi	LEN,0x40
	jl	.Ltail_vx

	VLM	A1,D1,0,INP,0

	VX	A0,A0,A1
	VX	B0,B0,B1
	VX	C0,C0,C1
	VX	D0,D0,D1

	VSTM	A0,D0,0,OUT,0

	la	INP,0x40(INP)
	la	OUT,0x40(OUT)
	aghi	LEN,-0x40
	je	.Ldone_vx

	# Block 5
	VAF	A5,A5,K0
	VAF	B5,B5,K1
	VAF	C5,C5,K2
	VAF	D5,D5,D3		# +K[3]+5

	VPERM	A0,A5,A5,BEPERM
	VPERM	B0,B5,B5,BEPERM
	VPERM	C0,C5,C5,BEPERM
	VPERM	D0,D5,D5,BEPERM

	clgfi	LEN,0x40
	jl	.Ltail_vx

	VLM	A1,D1,0,INP,0

	VX	A0,A0,A1
	VX	B0,B0,B1
	VX	C0,C0,C1
	VX	D0,D0,D1

	VSTM	A0,D0,0,OUT,0

	la	INP,0x40(INP)
	la	OUT,0x40(OUT)
	lhi	%r0,10			# reset round counter
	aghi	LEN,-0x40
	jne	.Loop_outer_vx		# CC from aghi: bytes remaining?

.Ldone_vx:
	lmg	%r6,%r7,FRAME+6*8(SP)	# restore from the caller's save area
	la	SP,FRAME(SP)		# drop our frame
	BR_EX	%r14			# expolined return

.Ltail_vx:
	# Partial final block: spill the 64-byte keystream (A0..D0) into
	# our own frame at 64(SP) and XOR the remaining LEN (< 64) bytes.
	VSTM	A0,D0,8*8,SP,3
	lghi	%r1,0			# %r1 = byte index

.Loop_tail_vx:
	llgc	%r5,0(%r1,INP)		# input byte (KEY/%r5 is dead by now)
	llgc	%r6,8*8(%r1,SP)		# keystream byte
	xr	%r6,%r5
	stc	%r6,0(%r1,OUT)
	la	%r1,1(%r1)
	brct	LEN,.Loop_tail_vx

	lmg	%r6,%r7,FRAME+6*8(SP)	# restore from the caller's save area
	la	SP,FRAME(SP)		# drop our frame
	BR_EX	%r14			# expolined return
SYM_FUNC_END(chacha20_vx)
907 | |
908 | .previous |
909 | |