1 | /* SPDX-License-Identifier: GPL-2.0-or-later */ |
2 | /* |
3 | * SM4 Cipher Algorithm for ARMv8 with Crypto Extensions |
4 | * as specified in |
5 | * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html |
6 | * |
7 | * Copyright (C) 2022, Alibaba Group. |
8 | * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com> |
9 | */ |
10 | |
11 | #include <linux/linkage.h> |
12 | #include <asm/assembler.h> |
13 | #include "sm4-ce-asm.h" |
14 | |
	.arch	armv8-a+crypto

/*
 * Give each vector register a numeric alias (.LvN.4s = N) so the sm4e /
 * sm4ekey macros below can splice the register number straight into a raw
 * opcode.  Only registers that appear as sm4e/sm4ekey operands somewhere
 * in this file are listed; the others (v16-v19, v21-v23) are omitted.
 */
.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
	20, 24, 25, 26, 27, 28, 29, 30, 31
	.set .Lv\b\().4s, \b
.endr

/*
 * sm4e vd.4s, vn.4s: SM4 round computation (vn supplies the round keys).
 * Emitted as a raw .inst word so the file still assembles with toolchains
 * that lack SM4 Crypto Extension support.
 */
.macro sm4e, vd, vn
	.inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm

/* sm4ekey vd.4s, vn.4s, vm.4s: SM4 key-schedule step; vm holds CK words. */
.macro sm4ekey, vd, vn, vm
	.inst 0xce60c800 | (.L\vm << 16) | (.L\vn << 5) | .L\vd
.endm
29 | |
/* Register macros */

#define RTMP0	v16	/* scratch */
#define RTMP1	v17	/* scratch */
#define RTMP2	v18	/* scratch */
#define RTMP3	v19	/* scratch */

#define RIV	v20	/* chaining value: CBC IV / XTS-CTS state */
#define RMAC	v20	/* MAC state; aliases RIV (used by disjoint functions) */
#define RMASK	v21	/* XTS reduction constant {1, 0x87}, set up in the XTS code */
40 | |
41 | |
.align 3
SYM_FUNC_START(sm4_ce_expand_key)
	/* Expand a 128-bit user key into the 32 encryption round keys and
	 * the 32 decryption round keys (the encryption keys in reverse).
	 *
	 * input:
	 *   x0: 128-bit key
	 *   x1: rkey_enc
	 *   x2: rkey_dec
	 *   x3: fk array
	 *   x4: ck array
	 */
	ld1		{v0.16b}, [x0];
	rev32		v0.16b, v0.16b;		/* key bytes are big endian; swap to 32-bit words */
	ld1		{v1.16b}, [x3];
	/* load ck */
	ld1		{v24.16b-v27.16b}, [x4], #64;
	ld1		{v28.16b-v31.16b}, [x4];

	/* input ^ fk */
	eor		v0.16b, v0.16b, v1.16b;

	/* each sm4ekey derives 4 round keys from the previous 4 */
	sm4ekey		v0.4s, v0.4s, v24.4s;
	sm4ekey		v1.4s, v0.4s, v25.4s;
	sm4ekey		v2.4s, v1.4s, v26.4s;
	sm4ekey		v3.4s, v2.4s, v27.4s;
	sm4ekey		v4.4s, v3.4s, v28.4s;
	sm4ekey		v5.4s, v4.4s, v29.4s;
	sm4ekey		v6.4s, v5.4s, v30.4s;
	sm4ekey		v7.4s, v6.4s, v31.4s;

	adr_l		x5, .Lbswap128_mask
	ld1		{v24.16b}, [x5]

	/* store the encryption key schedule */
	st1		{v0.16b-v3.16b}, [x1], #64;
	st1		{v4.16b-v7.16b}, [x1];

	/* Build the decryption key schedule: reverse the order of the four
	 * 32-bit round keys inside each vector (tbl with .Lbswap128_mask)
	 * and store the vectors themselves last-to-first.
	 */
	tbl		v16.16b, {v7.16b}, v24.16b
	tbl		v17.16b, {v6.16b}, v24.16b
	tbl		v18.16b, {v5.16b}, v24.16b
	tbl		v19.16b, {v4.16b}, v24.16b
	tbl		v20.16b, {v3.16b}, v24.16b
	tbl		v21.16b, {v2.16b}, v24.16b
	tbl		v22.16b, {v1.16b}, v24.16b
	tbl		v23.16b, {v0.16b}, v24.16b

	st1		{v16.16b-v19.16b}, [x2], #64
	st1		{v20.16b-v23.16b}, [x2]

	ret;
SYM_FUNC_END(sm4_ce_expand_key)
90 | |
.align 3
SYM_FUNC_START(sm4_ce_crypt_block)
	/* Process a single 16-byte block (encrypt, or decrypt when given
	 * the reversed key schedule from sm4_ce_expand_key).
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 */
	SM4_PREPARE(x0)		/* macro from sm4-ce-asm.h; sets up round keys */

	ld1		{v0.16b}, [x2];
	SM4_CRYPT_BLK(v0);
	st1		{v0.16b}, [x1];

	ret;
SYM_FUNC_END(sm4_ce_crypt_block)
106 | |
.align 3
SYM_FUNC_START(sm4_ce_crypt)
	/* Process nblocks independent blocks (ECB fashion): 8 at a time,
	 * then at most one group of 4, then single blocks.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   w3: nblocks
	 */
	SM4_PREPARE(x0)

.Lcrypt_loop_blk:
	sub		w3, w3, #8;
	tbnz		w3, #31, .Lcrypt_tail8;	/* sign bit set: fewer than 8 left */

	ld1		{v0.16b-v3.16b}, [x2], #64;
	ld1		{v4.16b-v7.16b}, [x2], #64;

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

	st1		{v0.16b-v3.16b}, [x1], #64;
	st1		{v4.16b-v7.16b}, [x1], #64;

	cbz		w3, .Lcrypt_end;
	b		.Lcrypt_loop_blk;

.Lcrypt_tail8:
	add		w3, w3, #8;	/* undo the speculative subtract */
	cmp		w3, #4;
	blt		.Lcrypt_tail4;

	sub		w3, w3, #4;

	ld1		{v0.16b-v3.16b}, [x2], #64;
	SM4_CRYPT_BLK4(v0, v1, v2, v3);
	st1		{v0.16b-v3.16b}, [x1], #64;

	cbz		w3, .Lcrypt_end;

.Lcrypt_tail4:
	/* 1..3 blocks remain (callers presumably never pass nblocks == 0
	 * into this tail - confirm at call sites) */
	sub		w3, w3, #1;

	ld1		{v0.16b}, [x2], #16;
	SM4_CRYPT_BLK(v0);
	st1		{v0.16b}, [x1], #16;

	cbnz		w3, .Lcrypt_tail4;

.Lcrypt_end:
	ret;
SYM_FUNC_END(sm4_ce_crypt)
157 | |
.align 3
SYM_FUNC_START(sm4_ce_cbc_enc)
	/* CBC encryption.  Inherently serial (each ciphertext feeds the
	 * next block), so the 4x path only batches the loads/stores; the
	 * cipher invocations remain one block at a time.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE(x0)

	ld1		{RIV.16b}, [x3]

.Lcbc_enc_loop_4x:
	cmp		w4, #4
	blt		.Lcbc_enc_loop_1x

	sub		w4, w4, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	/* C[i] = E(P[i] ^ C[i-1]); computed in place in v0..v3 */
	eor		v0.16b, v0.16b, RIV.16b
	SM4_CRYPT_BLK(v0)
	eor		v1.16b, v1.16b, v0.16b
	SM4_CRYPT_BLK(v1)
	eor		v2.16b, v2.16b, v1.16b
	SM4_CRYPT_BLK(v2)
	eor		v3.16b, v3.16b, v2.16b
	SM4_CRYPT_BLK(v3)

	st1		{v0.16b-v3.16b}, [x1], #64
	mov		RIV.16b, v3.16b		/* carry chaining value forward */

	cbz		w4, .Lcbc_enc_end
	b		.Lcbc_enc_loop_4x

.Lcbc_enc_loop_1x:
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16

	eor		RIV.16b, RIV.16b, v0.16b
	SM4_CRYPT_BLK(RIV)

	st1		{RIV.16b}, [x1], #16

	cbnz		w4, .Lcbc_enc_loop_1x

.Lcbc_enc_end:
	/* store new IV */
	st1		{RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_ce_cbc_enc)
212 | |
.align 3
SYM_FUNC_START(sm4_ce_cbc_dec)
	/* CBC decryption.  Unlike encryption this parallelizes: up to 8
	 * blocks are decrypted at once, then XORed with the preceding
	 * ciphertext blocks.  The raw ciphertext is kept in v0..v7 for that
	 * chaining XOR while byte-swapped copies in v8..v15 go through the
	 * cipher (the _BE crypt variants take pre-swapped input).
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE(x0)

	ld1		{RIV.16b}, [x3]

.Lcbc_dec_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lcbc_dec_4x	/* fewer than 8 blocks left */

	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64

	rev32		v8.16b, v0.16b
	rev32		v9.16b, v1.16b
	rev32		v10.16b, v2.16b
	rev32		v11.16b, v3.16b
	rev32		v12.16b, v4.16b
	rev32		v13.16b, v5.16b
	rev32		v14.16b, v6.16b
	rev32		v15.16b, v7.16b

	SM4_CRYPT_BLK8_BE(v8, v9, v10, v11, v12, v13, v14, v15)

	/* P[i] = D(C[i]) ^ C[i-1] */
	eor		v8.16b, v8.16b, RIV.16b
	eor		v9.16b, v9.16b, v0.16b
	eor		v10.16b, v10.16b, v1.16b
	eor		v11.16b, v11.16b, v2.16b
	eor		v12.16b, v12.16b, v3.16b
	eor		v13.16b, v13.16b, v4.16b
	eor		v14.16b, v14.16b, v5.16b
	eor		v15.16b, v15.16b, v6.16b

	st1		{v8.16b-v11.16b}, [x1], #64
	st1		{v12.16b-v15.16b}, [x1], #64

	mov		RIV.16b, v7.16b		/* last ciphertext becomes next IV */

	cbz		w4, .Lcbc_dec_end
	b		.Lcbc_dec_loop_8x

.Lcbc_dec_4x:
	add		w4, w4, #8		/* undo the speculative subtract */
	cmp		w4, #4
	blt		.Lcbc_dec_loop_1x

	sub		w4, w4, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	rev32		v8.16b, v0.16b
	rev32		v9.16b, v1.16b
	rev32		v10.16b, v2.16b
	rev32		v11.16b, v3.16b

	SM4_CRYPT_BLK4_BE(v8, v9, v10, v11)

	eor		v8.16b, v8.16b, RIV.16b
	eor		v9.16b, v9.16b, v0.16b
	eor		v10.16b, v10.16b, v1.16b
	eor		v11.16b, v11.16b, v2.16b

	st1		{v8.16b-v11.16b}, [x1], #64

	mov		RIV.16b, v3.16b

	cbz		w4, .Lcbc_dec_end

.Lcbc_dec_loop_1x:
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16

	rev32		v8.16b, v0.16b

	SM4_CRYPT_BLK_BE(v8)

	eor		v8.16b, v8.16b, RIV.16b
	st1		{v8.16b}, [x1], #16

	mov		RIV.16b, v0.16b

	cbnz		w4, .Lcbc_dec_loop_1x

.Lcbc_dec_end:
	/* store new IV */
	st1		{RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_ce_cbc_dec)
310 | |
.align 3
SYM_FUNC_START(sm4_ce_cbc_cts_enc)
	/* CBC ciphertext-stealing (final segment): encrypts one full block
	 * plus a partial tail of Ln = nbytes - 16 bytes.
	 * NOTE(review): assumes 16 < nbytes < 32, i.e. the caller handles
	 * block-aligned and shorter inputs elsewhere - confirm at call sites.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nbytes
	 */
	SM4_PREPARE(x0)

	sub		w5, w4, #16	/* x5 = Ln, length of the partial tail */
	uxtw		x5, w5

	ld1		{RIV.16b}, [x3]

	/* En-1 = E(Pn-1 ^ IV) */
	ld1		{v0.16b}, [x2]
	eor		RIV.16b, RIV.16b, v0.16b
	SM4_CRYPT_BLK(RIV)

	/* load permute table: v3 selects the first Ln bytes, v4 moves the
	 * Ln tail bytes to the front with zero padding */
	adr_l		x6, .Lcts_permute_table
	add		x7, x6, #32
	add		x6, x6, x5
	sub		x7, x7, x5
	ld1		{v3.16b}, [x6]
	ld1		{v4.16b}, [x7]

	/* overlapping loads */
	add		x2, x2, x5
	ld1		{v1.16b}, [x2]

	/* create Cn from En-1 */
	tbl		v0.16b, {RIV.16b}, v3.16b
	/* padding Pn with zeros */
	tbl		v1.16b, {v1.16b}, v4.16b

	/* Cn-1 = E(Pn || 0* ^ En-1) */
	eor		v1.16b, v1.16b, RIV.16b
	SM4_CRYPT_BLK(v1)

	/* overlapping stores: Cn-1 first, then Cn (truncated) at offset Ln */
	add		x5, x1, x5
	st1		{v0.16b}, [x5]
	st1		{v1.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_cbc_cts_enc)
358 | |
.align 3
SYM_FUNC_START(sm4_ce_cbc_cts_dec)
	/* CBC ciphertext-stealing decryption (final segment): recovers one
	 * full block plus a partial tail of Ln = nbytes - 16 bytes.
	 * NOTE(review): assumes 16 < nbytes < 32, matching the encrypt side.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nbytes
	 */
	SM4_PREPARE(x0)

	sub		w5, w4, #16	/* x5 = Ln, length of the partial tail */
	uxtw		x5, w5

	ld1		{RIV.16b}, [x3]

	/* load permute table (see .Lcts_permute_table) */
	adr_l		x6, .Lcts_permute_table
	add		x7, x6, #32
	add		x6, x6, x5
	sub		x7, x7, x5
	ld1		{v3.16b}, [x6]
	ld1		{v4.16b}, [x7]

	/* overlapping loads: v0 = Cn-1, v1 = last 16 bytes (contains Cn) */
	ld1		{v0.16b}, [x2], x5
	ld1		{v1.16b}, [x2]

	/* Xn = D(Cn-1) */
	SM4_CRYPT_BLK(v0)
	/* select the first Ln bytes of Xn to create Pn */
	tbl		v2.16b, {v0.16b}, v3.16b
	eor		v2.16b, v2.16b, v1.16b

	/* overwrite the first Ln bytes with Cn to create En-1 */
	tbx		v0.16b, {v1.16b}, v4.16b
	/* Pn-1 = D(En-1) ^ IV */
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, RIV.16b

	/* overlapping stores: Pn-1 first, then Pn (truncated) at offset Ln */
	add		x5, x1, x5
	st1		{v2.16b}, [x5]
	st1		{v0.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_cbc_cts_dec)
404 | |
.align 3
SYM_FUNC_START(sm4_ce_ctr_enc)
	/* CTR mode: keystream = E(ctr), ctr incremented as a 128-bit
	 * big-endian integer.  The counter is kept in x7:x8 (host order,
	 * x7 = high half) and written back to [x3] at the end.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE(x0)

	ldp		x7, x8, [x3]
	rev		x7, x7			/* big endian -> host order */
	rev		x8, x8

.Lctr_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lctr_4x	/* fewer than 8 blocks left */

	/* Materialize the current counter into vctr (rev64 restores the
	 * big-endian byte order the cipher input expects), then advance
	 * x7:x8 by one with carry from the low half (x8) into the high
	 * half (x7). */
#define inc_le128(vctr) \
	mov vctr.d[1], x8; \
	mov vctr.d[0], x7; \
	adds x8, x8, #1; \
	rev64 vctr.16b, vctr.16b; \
	adc x7, x7, xzr;

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */
	inc_le128(v4)			/* +4 */
	inc_le128(v5)			/* +5 */
	inc_le128(v6)			/* +6 */
	inc_le128(v7)			/* +7 */

	ld1		{v8.16b-v11.16b}, [x2], #64
	ld1		{v12.16b-v15.16b}, [x2], #64

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	/* dst = src ^ keystream */
	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w4, .Lctr_end
	b		.Lctr_loop_8x

.Lctr_4x:
	add		w4, w4, #8		/* undo the speculative subtract */
	cmp		w4, #4
	blt		.Lctr_loop_1x

	sub		w4, w4, #4

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */

	ld1		{v8.16b-v11.16b}, [x2], #64

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b

	st1		{v0.16b-v3.16b}, [x1], #64

	cbz		w4, .Lctr_end

.Lctr_loop_1x:
	sub		w4, w4, #1

	/* construct CTRs */
	inc_le128(v0)

	ld1		{v8.16b}, [x2], #16

	SM4_CRYPT_BLK(v0)

	eor		v0.16b, v0.16b, v8.16b
	st1		{v0.16b}, [x1], #16

	cbnz		w4, .Lctr_loop_1x

.Lctr_end:
	/* store new CTR (back in big-endian byte order) */
	rev		x7, x7
	rev		x8, x8
	stp		x7, x8, [x3]

	ret
SYM_FUNC_END(sm4_ce_ctr_enc)
510 | |
511 | |
/*
 * tweak_next(vt, vin, RTMP): XTS tweak update, vt = vin * x in GF(2^128).
 * add .2d doubles each 64-bit half (dropping its top bit); sshr/and pick
 * up those top bits as {1, 0x87} masks from RMASK; ext #8 swaps them into
 * the opposite halves (carry low->high, reduction by 0x87 into the low
 * half).  RMASK must already hold {1, 0x87} in its 64-bit lanes.
 */
#define tweak_next(vt, vin, RTMP) \
	sshr RTMP.2d, vin.2d, #63; \
	and RTMP.16b, RTMP.16b, RMASK.16b; \
	add vt.2d, vin.2d, vin.2d; \
	ext RTMP.16b, RTMP.16b, RTMP.16b, #8; \
	eor vt.16b, vt.16b, RTMP.16b;
518 | |
.align 3
SYM_FUNC_START(sm4_ce_xts_enc)
	/* XTS encryption with optional ciphertext stealing for a non-
	 * block-aligned tail.  If x5 is non-NULL this is the first call for
	 * the message: the tweak is first encrypted with the second (IV)
	 * key schedule.  v8 holds the current tweak throughout.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: tweak (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: round key array for IV
	 */
	ld1		{v8.16b}, [x3]

	cbz		x5, .Lxts_enc_nofirst

	SM4_PREPARE(x5)

	/* Generate first tweak */
	SM4_CRYPT_BLK(v8)

.Lxts_enc_nofirst:
	SM4_PREPARE(x0)

	/* w5 = nbytes % 16 (tail), w4 = full blocks; if a tail exists,
	 * hold the last full block back for the CTS step (w4 -= 1) */
	ands		w5, w4, #15
	lsr		w4, w4, #4
	sub		w6, w4, #1
	csel		w4, w4, w6, eq
	uxtw		x5, w5

	/* RMASK = {1, 0x87} per 64-bit lane, for tweak_next() */
	movi		RMASK.2s, #0x1
	movi		RTMP0.2s, #0x87
	uzp1		RMASK.4s, RMASK.4s, RTMP0.4s

	cbz		w4, .Lxts_enc_cts

.Lxts_enc_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lxts_enc_4x	/* fewer than 8 blocks left */

	/* derive the next 7 tweaks from v8 */
	tweak_next( v9,  v8, RTMP0)
	tweak_next(v10,  v9, RTMP1)
	tweak_next(v11, v10, RTMP2)
	tweak_next(v12, v11, RTMP3)
	tweak_next(v13, v12, RTMP0)
	tweak_next(v14, v13, RTMP1)
	tweak_next(v15, v14, RTMP2)

	/* C = E(P ^ T) ^ T, eight blocks in parallel */
	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64
	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b
	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	tweak_next(v8, v15, RTMP3)

	cbz		w4, .Lxts_enc_cts
	b		.Lxts_enc_loop_8x

.Lxts_enc_4x:
	add		w4, w4, #8		/* undo the speculative subtract */
	cmp		w4, #4
	blt		.Lxts_enc_loop_1x

	sub		w4, w4, #4

	tweak_next( v9,  v8, RTMP0)
	tweak_next(v10,  v9, RTMP1)
	tweak_next(v11, v10, RTMP2)

	ld1		{v0.16b-v3.16b}, [x2], #64
	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	st1		{v0.16b-v3.16b}, [x1], #64

	tweak_next(v8, v11, RTMP3)

	cbz		w4, .Lxts_enc_cts

.Lxts_enc_loop_1x:
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16
	eor		v0.16b, v0.16b, v8.16b

	SM4_CRYPT_BLK(v0)

	eor		v0.16b, v0.16b, v8.16b
	st1		{v0.16b}, [x1], #16

	tweak_next(v8, v8, RTMP0)

	cbnz		w4, .Lxts_enc_loop_1x

.Lxts_enc_cts:
	cbz		x5, .Lxts_enc_end	/* no tail: aligned message */

	/* cipher text stealing */

	tweak_next(v9, v8, RTMP0)
	/* En-1 = E(Pn-1 ^ T) ^ T with the current tweak */
	ld1		{v0.16b}, [x2]
	eor		v0.16b, v0.16b, v8.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v8.16b

	/* load permute table (see .Lcts_permute_table) */
	adr_l		x6, .Lcts_permute_table
	add		x7, x6, #32
	add		x6, x6, x5
	sub		x7, x7, x5
	ld1		{v3.16b}, [x6]
	ld1		{v4.16b}, [x7]

	/* overlapping loads */
	add		x2, x2, x5
	ld1		{v1.16b}, [x2]

	/* create Cn from En-1 */
	tbl		v2.16b, {v0.16b}, v3.16b
	/* padding Pn with En-1 at the end */
	tbx		v0.16b, {v1.16b}, v4.16b

	/* Cn-1, using the next tweak v9 */
	eor		v0.16b, v0.16b, v9.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v9.16b


	/* overlapping stores */
	add		x5, x1, x5
	st1		{v2.16b}, [x5]
	st1		{v0.16b}, [x1]

	b		.Lxts_enc_ret

.Lxts_enc_end:
	/* store new tweak */
	st1		{v8.16b}, [x3]

.Lxts_enc_ret:
	ret
SYM_FUNC_END(sm4_ce_xts_enc)
685 | |
.align 3
SYM_FUNC_START(sm4_ce_xts_dec)
	/* XTS decryption; mirrors sm4_ce_xts_enc except that in the
	 * ciphertext-stealing step the final two tweaks are applied in
	 * swapped order (v9 for the last full ciphertext block, v8 for
	 * the reconstructed En-1).
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: tweak (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: round key array for IV
	 */
	ld1		{v8.16b}, [x3]

	cbz		x5, .Lxts_dec_nofirst

	SM4_PREPARE(x5)

	/* Generate first tweak */
	SM4_CRYPT_BLK(v8)

.Lxts_dec_nofirst:
	SM4_PREPARE(x0)

	/* w5 = nbytes % 16 (tail), w4 = full blocks; if a tail exists,
	 * hold the last full block back for the CTS step (w4 -= 1) */
	ands		w5, w4, #15
	lsr		w4, w4, #4
	sub		w6, w4, #1
	csel		w4, w4, w6, eq
	uxtw		x5, w5

	/* RMASK = {1, 0x87} per 64-bit lane, for tweak_next() */
	movi		RMASK.2s, #0x1
	movi		RTMP0.2s, #0x87
	uzp1		RMASK.4s, RMASK.4s, RTMP0.4s

	cbz		w4, .Lxts_dec_cts

.Lxts_dec_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lxts_dec_4x	/* fewer than 8 blocks left */

	/* derive the next 7 tweaks from v8 */
	tweak_next( v9,  v8, RTMP0)
	tweak_next(v10,  v9, RTMP1)
	tweak_next(v11, v10, RTMP2)
	tweak_next(v12, v11, RTMP3)
	tweak_next(v13, v12, RTMP0)
	tweak_next(v14, v13, RTMP1)
	tweak_next(v15, v14, RTMP2)

	/* P = D(C ^ T) ^ T, eight blocks in parallel */
	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64
	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b
	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	tweak_next(v8, v15, RTMP3)

	cbz		w4, .Lxts_dec_cts
	b		.Lxts_dec_loop_8x

.Lxts_dec_4x:
	add		w4, w4, #8		/* undo the speculative subtract */
	cmp		w4, #4
	blt		.Lxts_dec_loop_1x

	sub		w4, w4, #4

	tweak_next( v9,  v8, RTMP0)
	tweak_next(v10,  v9, RTMP1)
	tweak_next(v11, v10, RTMP2)

	ld1		{v0.16b-v3.16b}, [x2], #64
	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	st1		{v0.16b-v3.16b}, [x1], #64

	tweak_next(v8, v11, RTMP3)

	cbz		w4, .Lxts_dec_cts

.Lxts_dec_loop_1x:
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16
	eor		v0.16b, v0.16b, v8.16b

	SM4_CRYPT_BLK(v0)

	eor		v0.16b, v0.16b, v8.16b
	st1		{v0.16b}, [x1], #16

	tweak_next(v8, v8, RTMP0)

	cbnz		w4, .Lxts_dec_loop_1x

.Lxts_dec_cts:
	cbz		x5, .Lxts_dec_end	/* no tail: aligned message */

	/* cipher text stealing */

	tweak_next(v9, v8, RTMP0)
	/* Xn = D(Cn-1 ^ Tn) ^ Tn: decrypt the last full ciphertext block
	 * with the LATER tweak (v9) - tweak order is swapped on decrypt */
	ld1		{v0.16b}, [x2]
	eor		v0.16b, v0.16b, v9.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v9.16b

	/* load permute table (see .Lcts_permute_table) */
	adr_l		x6, .Lcts_permute_table
	add		x7, x6, #32
	add		x6, x6, x5
	sub		x7, x7, x5
	ld1		{v3.16b}, [x6]
	ld1		{v4.16b}, [x7]

	/* overlapping loads */
	add		x2, x2, x5
	ld1		{v1.16b}, [x2]

	/* create Cn from En-1 */
	tbl		v2.16b, {v0.16b}, v3.16b
	/* padding Pn with En-1 at the end */
	tbx		v0.16b, {v1.16b}, v4.16b

	/* Pn-1 = D(En-1 ^ Tn-1) ^ Tn-1 with the earlier tweak (v8) */
	eor		v0.16b, v0.16b, v8.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v8.16b


	/* overlapping stores */
	add		x5, x1, x5
	st1		{v2.16b}, [x5]
	st1		{v0.16b}, [x1]

	b		.Lxts_dec_ret

.Lxts_dec_end:
	/* store new tweak */
	st1		{v8.16b}, [x3]

.Lxts_dec_ret:
	ret
SYM_FUNC_END(sm4_ce_xts_dec)
852 | |
.align 3
SYM_FUNC_START(sm4_ce_mac_update)
	/* Incremental CBC-MAC style update: digest = E(digest ^ block) for
	 * each input block.  Serial by construction; the 4x path only
	 * batches the loads.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: digest
	 *   x2: src
	 *   w3: nblocks
	 *   w4: enc_before - encrypt the incoming digest before absorbing
	 *   w5: enc_after  - if zero, the final block is only XORed into
	 *                    the digest, not encrypted (held back so a
	 *                    later call can finish it)
	 */
	SM4_PREPARE(x0)

	ld1		{RMAC.16b}, [x1]

	cbz		w4, .Lmac_update

	SM4_CRYPT_BLK(RMAC)

.Lmac_update:
	cbz		w3, .Lmac_ret

	/* if !enc_after, process one block fewer in the main loops */
	sub		w6, w3, #1
	cmp		w5, wzr
	csel		w3, w3, w6, ne

	cbz		w3, .Lmac_end

.Lmac_loop_4x:
	cmp		w3, #4
	blt		.Lmac_loop_1x

	sub		w3, w3, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	eor		RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v1.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v2.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v3.16b
	SM4_CRYPT_BLK(RMAC)

	cbz		w3, .Lmac_end
	b		.Lmac_loop_4x

.Lmac_loop_1x:
	sub		w3, w3, #1

	ld1		{v0.16b}, [x2], #16

	eor		RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK(RMAC)

	cbnz		w3, .Lmac_loop_1x


.Lmac_end:
	cbnz		w5, .Lmac_ret

	/* enc_after == 0: absorb the held-back block without encrypting */
	ld1		{v0.16b}, [x2], #16
	eor		RMAC.16b, RMAC.16b, v0.16b

.Lmac_ret:
	st1		{RMAC.16b}, [x1]
	ret
SYM_FUNC_END(sm4_ce_mac_update)
921 | |
922 | |
	.section	".rodata", "a"
	.align 4

/* tbl index vector that reverses the order of the four 32-bit words in a
 * 128-bit vector; used by sm4_ce_expand_key to build the decryption key
 * schedule. */
.Lbswap128_mask:
	.byte		0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b
	.byte		0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03

/* Sliding window for ciphertext stealing: a 16-byte load at offset Ln
 * yields indices selecting the first Ln bytes, and a load at 32 - Ln
 * yields indices moving the last Ln bytes to the front.  0xff lanes give
 * zero with tbl and leave the destination untouched with tbx. */
.Lcts_permute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
936 | |