/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4-GCM AEAD Algorithm using ARMv8 Crypto Extensions
 * as specified in rfc8998
 * https://datatracker.ietf.org/doc/html/rfc8998
 *
 * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/assembler.h>
#include "sm4-ce-asm.h"

.arch	armv8-a+crypto

.irp b, 0, 1, 2, 3, 24, 25, 26, 27, 28, 29, 30, 31
	.set .Lv\b\().4s, \b
.endr

.macro sm4e, vd, vn
	.inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm
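
/*
 * SM4E is emitted by hand with .inst, using the .Lv<n>.4s constants
 * defined above to map register names to encoding fields, so that this
 * file assembles even with toolchains whose assembler lacks the SM4
 * extension.  Each SM4E performs four SM4 rounds on its first operand,
 * consuming the four round keys held in its second operand.
 */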

/* Register macros */

/* Used for both encryption and decryption */
#define RHASH		v21
#define RRCONST		v22
#define RZERO		v23

/* Helper macros. */

/*
 * input: m0, m1
 * output: r0:r1 (low 128 bits in r0, high 128 bits in r1)
 */
#define PMUL_128x128(r0, r1, m0, m1, T0, T1)			\
		ext		T0.16b, m1.16b, m1.16b, #8;	\
		pmull		r0.1q, m0.1d, m1.1d;		\
		pmull		T1.1q, m0.1d, T0.1d;		\
		pmull2		T0.1q, m0.2d, T0.2d;		\
		pmull2		r1.1q, m0.2d, m1.2d;		\
		eor		T0.16b, T0.16b, T1.16b;	\
		ext		T1.16b, RZERO.16b, T0.16b, #8;	\
		ext		T0.16b, T0.16b, RZERO.16b, #8;	\
		eor		r0.16b, r0.16b, T1.16b;	\
		eor		r1.16b, r1.16b, T0.16b;
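
/*
 * PMUL_128x128 is a schoolbook 128x128 -> 256-bit carry-less multiply
 * built from four 64x64-bit pmull/pmull2 instructions.  With
 * m0 = a1:a0 and m1 = b1:b0 (64-bit halves), it computes
 *
 *   r1:r0 = a0*b0 ^ ((a0*b1 ^ a1*b0) << 64) ^ (a1*b1 << 128)
 *
 * The ext of m1 with itself swaps its halves so that one pmull/pmull2
 * pair yields both cross products; the two ext instructions against
 * RZERO then split the 128-bit middle term across the low (r0) and
 * high (r1) halves of the result.
 */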

#define PMUL_128x128_4x(r0, r1, m0, m1, T0, T1,			\
			r2, r3, m2, m3, T2, T3,			\
			r4, r5, m4, m5, T4, T5,			\
			r6, r7, m6, m7, T6, T7)			\
		ext		T0.16b, m1.16b, m1.16b, #8;	\
		ext		T2.16b, m3.16b, m3.16b, #8;	\
		ext		T4.16b, m5.16b, m5.16b, #8;	\
		ext		T6.16b, m7.16b, m7.16b, #8;	\
		pmull		r0.1q, m0.1d, m1.1d;		\
		pmull		r2.1q, m2.1d, m3.1d;		\
		pmull		r4.1q, m4.1d, m5.1d;		\
		pmull		r6.1q, m6.1d, m7.1d;		\
		pmull		T1.1q, m0.1d, T0.1d;		\
		pmull		T3.1q, m2.1d, T2.1d;		\
		pmull		T5.1q, m4.1d, T4.1d;		\
		pmull		T7.1q, m6.1d, T6.1d;		\
		pmull2		T0.1q, m0.2d, T0.2d;		\
		pmull2		T2.1q, m2.2d, T2.2d;		\
		pmull2		T4.1q, m4.2d, T4.2d;		\
		pmull2		T6.1q, m6.2d, T6.2d;		\
		pmull2		r1.1q, m0.2d, m1.2d;		\
		pmull2		r3.1q, m2.2d, m3.2d;		\
		pmull2		r5.1q, m4.2d, m5.2d;		\
		pmull2		r7.1q, m6.2d, m7.2d;		\
		eor		T0.16b, T0.16b, T1.16b;	\
		eor		T2.16b, T2.16b, T3.16b;	\
		eor		T4.16b, T4.16b, T5.16b;	\
		eor		T6.16b, T6.16b, T7.16b;	\
		ext		T1.16b, RZERO.16b, T0.16b, #8;	\
		ext		T3.16b, RZERO.16b, T2.16b, #8;	\
		ext		T5.16b, RZERO.16b, T4.16b, #8;	\
		ext		T7.16b, RZERO.16b, T6.16b, #8;	\
		ext		T0.16b, T0.16b, RZERO.16b, #8;	\
		ext		T2.16b, T2.16b, RZERO.16b, #8;	\
		ext		T4.16b, T4.16b, RZERO.16b, #8;	\
		ext		T6.16b, T6.16b, RZERO.16b, #8;	\
		eor		r0.16b, r0.16b, T1.16b;	\
		eor		r2.16b, r2.16b, T3.16b;	\
		eor		r4.16b, r4.16b, T5.16b;	\
		eor		r6.16b, r6.16b, T7.16b;	\
		eor		r1.16b, r1.16b, T0.16b;	\
		eor		r3.16b, r3.16b, T2.16b;	\
		eor		r5.16b, r5.16b, T4.16b;	\
		eor		r7.16b, r7.16b, T6.16b;
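
/*
 * Four-way variant of PMUL_128x128: the same dataflow replicated for
 * four independent multiplies and interleaved stage by stage, so the
 * pmull latencies of the four products overlap.
 */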

/*
 * input: r0:r1 (low 128 bits in r0, high 128 bits in r1)
 * output: a
 */
#define REDUCTION(a, r0, r1, rconst, T0, T1)			\
		pmull2		T0.1q, r1.2d, rconst.2d;	\
		ext		T1.16b, T0.16b, RZERO.16b, #8;	\
		ext		T0.16b, RZERO.16b, T0.16b, #8;	\
		eor		r1.16b, r1.16b, T1.16b;	\
		eor		r0.16b, r0.16b, T0.16b;	\
		pmull		T0.1q, r1.1d, rconst.1d;	\
		eor		a.16b, r0.16b, T0.16b;
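
/*
 * REDUCTION folds the 256-bit product r1:r0 back to 128 bits modulo
 * the GHASH polynomial x^128 + x^7 + x^2 + x + 1.  All GHASH values
 * here are kept bit-reflected (rbit), which is what allows the plain
 * constant 0x87 (x^7 + x^2 + x + 1, from .Lghash_rconst) to be used:
 * the high half is multiplied by the constant and folded down in two
 * steps.
 */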

#define SM4_CRYPT_PMUL_128x128_BLK(b0, r0, r1, m0, m1, T0, T1)	\
		rev32		b0.16b, b0.16b;			\
		ext		T0.16b, m1.16b, m1.16b, #8;	\
		sm4e		b0.4s, v24.4s;			\
		pmull		r0.1q, m0.1d, m1.1d;		\
		sm4e		b0.4s, v25.4s;			\
		pmull		T1.1q, m0.1d, T0.1d;		\
		sm4e		b0.4s, v26.4s;			\
		pmull2		T0.1q, m0.2d, T0.2d;		\
		sm4e		b0.4s, v27.4s;			\
		pmull2		r1.1q, m0.2d, m1.2d;		\
		sm4e		b0.4s, v28.4s;			\
		eor		T0.16b, T0.16b, T1.16b;	\
		sm4e		b0.4s, v29.4s;			\
		ext		T1.16b, RZERO.16b, T0.16b, #8;	\
		sm4e		b0.4s, v30.4s;			\
		ext		T0.16b, T0.16b, RZERO.16b, #8;	\
		sm4e		b0.4s, v31.4s;			\
		eor		r0.16b, r0.16b, T1.16b;	\
		rev64		b0.4s, b0.4s;			\
		eor		r1.16b, r1.16b, T0.16b;	\
		ext		b0.16b, b0.16b, b0.16b, #8;	\
		rev32		b0.16b, b0.16b;
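
/*
 * SM4_CRYPT_PMUL_128x128_BLK interleaves one SM4 block encryption with
 * one 128x128 carry-less multiply.  The two computations are
 * data-independent, so alternating sm4e and pmull instructions hides
 * the latency of both.  The trailing rev64/ext/rev32 sequence reverses
 * the word order of the SM4 state, mirroring the final swap performed
 * by the SM4_CRYPT_BLK helpers.
 */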

#define SM4_CRYPT_PMUL_128x128_BLK3(b0, b1, b2,			\
				    r0, r1, m0, m1, T0, T1,	\
				    r2, r3, m2, m3, T2, T3,	\
				    r4, r5, m4, m5, T4, T5)	\
		rev32		b0.16b, b0.16b;			\
		rev32		b1.16b, b1.16b;			\
		rev32		b2.16b, b2.16b;			\
		ext		T0.16b, m1.16b, m1.16b, #8;	\
		ext		T2.16b, m3.16b, m3.16b, #8;	\
		ext		T4.16b, m5.16b, m5.16b, #8;	\
		sm4e		b0.4s, v24.4s;			\
		sm4e		b1.4s, v24.4s;			\
		sm4e		b2.4s, v24.4s;			\
		pmull		r0.1q, m0.1d, m1.1d;		\
		pmull		r2.1q, m2.1d, m3.1d;		\
		pmull		r4.1q, m4.1d, m5.1d;		\
		sm4e		b0.4s, v25.4s;			\
		sm4e		b1.4s, v25.4s;			\
		sm4e		b2.4s, v25.4s;			\
		pmull		T1.1q, m0.1d, T0.1d;		\
		pmull		T3.1q, m2.1d, T2.1d;		\
		pmull		T5.1q, m4.1d, T4.1d;		\
		sm4e		b0.4s, v26.4s;			\
		sm4e		b1.4s, v26.4s;			\
		sm4e		b2.4s, v26.4s;			\
		pmull2		T0.1q, m0.2d, T0.2d;		\
		pmull2		T2.1q, m2.2d, T2.2d;		\
		pmull2		T4.1q, m4.2d, T4.2d;		\
		sm4e		b0.4s, v27.4s;			\
		sm4e		b1.4s, v27.4s;			\
		sm4e		b2.4s, v27.4s;			\
		pmull2		r1.1q, m0.2d, m1.2d;		\
		pmull2		r3.1q, m2.2d, m3.2d;		\
		pmull2		r5.1q, m4.2d, m5.2d;		\
		sm4e		b0.4s, v28.4s;			\
		sm4e		b1.4s, v28.4s;			\
		sm4e		b2.4s, v28.4s;			\
		eor		T0.16b, T0.16b, T1.16b;	\
		eor		T2.16b, T2.16b, T3.16b;	\
		eor		T4.16b, T4.16b, T5.16b;	\
		sm4e		b0.4s, v29.4s;			\
		sm4e		b1.4s, v29.4s;			\
		sm4e		b2.4s, v29.4s;			\
		ext		T1.16b, RZERO.16b, T0.16b, #8;	\
		ext		T3.16b, RZERO.16b, T2.16b, #8;	\
		ext		T5.16b, RZERO.16b, T4.16b, #8;	\
		sm4e		b0.4s, v30.4s;			\
		sm4e		b1.4s, v30.4s;			\
		sm4e		b2.4s, v30.4s;			\
		ext		T0.16b, T0.16b, RZERO.16b, #8;	\
		ext		T2.16b, T2.16b, RZERO.16b, #8;	\
		ext		T4.16b, T4.16b, RZERO.16b, #8;	\
		sm4e		b0.4s, v31.4s;			\
		sm4e		b1.4s, v31.4s;			\
		sm4e		b2.4s, v31.4s;			\
		eor		r0.16b, r0.16b, T1.16b;	\
		eor		r2.16b, r2.16b, T3.16b;	\
		eor		r4.16b, r4.16b, T5.16b;	\
		rev64		b0.4s, b0.4s;			\
		rev64		b1.4s, b1.4s;			\
		rev64		b2.4s, b2.4s;			\
		eor		r1.16b, r1.16b, T0.16b;	\
		eor		r3.16b, r3.16b, T2.16b;	\
		eor		r5.16b, r5.16b, T4.16b;	\
		ext		b0.16b, b0.16b, b0.16b, #8;	\
		ext		b1.16b, b1.16b, b1.16b, #8;	\
		ext		b2.16b, b2.16b, b2.16b, #8;	\
		eor		r0.16b, r0.16b, r2.16b;	\
		eor		r1.16b, r1.16b, r3.16b;	\
		rev32		b0.16b, b0.16b;			\
		rev32		b1.16b, b1.16b;			\
		rev32		b2.16b, b2.16b;			\
		eor		r0.16b, r0.16b, r4.16b;	\
		eor		r1.16b, r1.16b, r5.16b;
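
/*
 * Three-block variant: in addition to the per-block work it XORs the
 * r2:r3 and r4:r5 partial products into r0:r1, so the caller needs
 * only a single REDUCTION for all three blocks.
 */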

#define inc32_le128(vctr)					\
		mov		vctr.d[1], x9;			\
		add		w6, w9, #1;			\
		mov		vctr.d[0], x8;			\
		bfi		x9, x6, #0, #32;		\
		rev64		vctr.16b, vctr.16b;
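
/*
 * x8:x9 hold the counter block as native integers (the big-endian IV
 * is byte-swapped on load).  inc32_le128 emits the current counter
 * into vctr and post-increments: per GCM inc32, only the low 32 bits
 * of x9 are bumped (bfi), and rev64 swaps the assembled vector back
 * to big-endian byte order for encryption.
 */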

#define GTAG_HASH_LENGTHS(vctr0, vlen)				\
		ld1		{vlen.16b}, [x7];		\
		/* construct CTR0 */				\
		/* the lower 32 bits of the initial IV are always be32(1) */ \
		mov		x6, #0x1;			\
		bfi		x9, x6, #0, #32;		\
		mov		vctr0.d[0], x8;			\
		mov		vctr0.d[1], x9;			\
		rbit		vlen.16b, vlen.16b;		\
		rev64		vctr0.16b, vctr0.16b;		\
		/* authtag = GCTR(CTR0, GHASH) */		\
		eor		RHASH.16b, RHASH.16b, vlen.16b;	\
		SM4_CRYPT_PMUL_128x128_BLK(vctr0, RR0, RR1, RHASH, RH1,	\
					   RTMP0, RTMP1);	\
		REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3); \
		rbit		RHASH.16b, RHASH.16b;		\
		eor		RHASH.16b, RHASH.16b, vctr0.16b;
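
/*
 * GTAG_HASH_LENGTHS implements the final GCM step: absorb the lengths
 * block (len(A) || len(C), loaded from x7) into GHASH, then compute
 * the tag as E(K, CTR0) ^ GHASH, where CTR0 is the IV with a 32-bit
 * counter value of 1.
 */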


/* Register macros for encrypt and ghash */

/* can be the same as input v0-v3 */
#define RR1		v0
#define RR3		v1
#define RR5		v2
#define RR7		v3

#define RR0		v4
#define RR2		v5
#define RR4		v6
#define RR6		v7

#define RTMP0		v8
#define RTMP1		v9
#define RTMP2		v10
#define RTMP3		v11
#define RTMP4		v12
#define RTMP5		v13
#define RTMP6		v14
#define RTMP7		v15

#define RH1		v16
#define RH2		v17
#define RH3		v18
#define RH4		v19

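/*
 * Precompute the GHASH key material: the hash key H = E(K, 0^128) and
 * its powers H^2..H^4 (H^3 = H^2 * H, H^4 = H^2 * H^2).  Storing four
 * powers lets the bulk loops multiply four blocks by descending powers
 * of H and perform a single reduction per iteration.
 */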
.align 3
SYM_FUNC_START(sm4_ce_pmull_ghash_setup)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: ghash table
	 */
	SM4_PREPARE(x0)

	adr_l		x2, .Lghash_rconst
	ld1r		{RRCONST.2d}, [x2]

	eor		RZERO.16b, RZERO.16b, RZERO.16b

	/* H = E(K, 0^128) */
	rev32		v0.16b, RZERO.16b
	SM4_CRYPT_BLK_BE(v0)

	/* H ^ 1 */
	rbit		RH1.16b, v0.16b

	/* H ^ 2 */
	PMUL_128x128(RR0, RR1, RH1, RH1, RTMP0, RTMP1)
	REDUCTION(RH2, RR0, RR1, RRCONST, RTMP2, RTMP3)

	/* H ^ 3 */
	PMUL_128x128(RR0, RR1, RH2, RH1, RTMP0, RTMP1)
	REDUCTION(RH3, RR0, RR1, RRCONST, RTMP2, RTMP3)

	/* H ^ 4 */
	PMUL_128x128(RR0, RR1, RH2, RH2, RTMP0, RTMP1)
	REDUCTION(RH4, RR0, RR1, RRCONST, RTMP2, RTMP3)

	st1		{RH1.16b-RH4.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_pmull_ghash_setup)

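/*
 * GHASH over full 16-byte blocks.  The 4x loop rewrites four Horner
 * steps ((((X ^ c0)*H ^ c1)*H ^ c2)*H ^ c3)*H as
 *
 *   (X ^ c0)*H^4 ^ c1*H^3 ^ c2*H^2 ^ c3*H
 *
 * so that the four multiplies are independent and share one reduction.
 */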
.align 3
SYM_FUNC_START(sm4_ce_pmull_ghash_update)
	/* input:
	 *   x0: ghash table
	 *   x1: ghash result
	 *   x2: src
	 *   w3: nblocks
	 */
	ld1		{RH1.16b-RH4.16b}, [x0]

	ld1		{RHASH.16b}, [x1]
	rbit		RHASH.16b, RHASH.16b

	adr_l		x4, .Lghash_rconst
	ld1r		{RRCONST.2d}, [x4]

	eor		RZERO.16b, RZERO.16b, RZERO.16b

.Lghash_loop_4x:
	cmp		w3, #4
	blt		.Lghash_loop_1x

	sub		w3, w3, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	rbit		v0.16b, v0.16b
	rbit		v1.16b, v1.16b
	rbit		v2.16b, v2.16b
	rbit		v3.16b, v3.16b

	/*
	 * (in0 ^ HASH) * H^4 => rr0:rr1
	 * (in1)        * H^3 => rr2:rr3
	 * (in2)        * H^2 => rr4:rr5
	 * (in3)        * H^1 => rr6:rr7
	 */
	eor		RHASH.16b, RHASH.16b, v0.16b

	PMUL_128x128_4x(RR0, RR1, RHASH, RH4, RTMP0, RTMP1,
			RR2, RR3, v1, RH3, RTMP2, RTMP3,
			RR4, RR5, v2, RH2, RTMP4, RTMP5,
			RR6, RR7, v3, RH1, RTMP6, RTMP7)

	eor		RR0.16b, RR0.16b, RR2.16b
	eor		RR1.16b, RR1.16b, RR3.16b
	eor		RR0.16b, RR0.16b, RR4.16b
	eor		RR1.16b, RR1.16b, RR5.16b
	eor		RR0.16b, RR0.16b, RR6.16b
	eor		RR1.16b, RR1.16b, RR7.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

	cbz		w3, .Lghash_end
	b		.Lghash_loop_4x

.Lghash_loop_1x:
	sub		w3, w3, #1

	ld1		{v0.16b}, [x2], #16
	rbit		v0.16b, v0.16b
	eor		RHASH.16b, RHASH.16b, v0.16b

	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

	cbnz		w3, .Lghash_loop_1x

.Lghash_end:
	rbit		RHASH.16b, RHASH.16b
	st1		{RHASH.2d}, [x1]

	ret
SYM_FUNC_END(sm4_ce_pmull_ghash_update)

.align 3
SYM_TYPED_FUNC_START(sm4_ce_pmull_gcm_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: ghash result
	 *   x6: ghash table
	 *   x7: lengths (only for last block)
	 */
	SM4_PREPARE(x0)

	ldp		x8, x9, [x3]
	rev		x8, x8
	rev		x9, x9

	ld1		{RH1.16b-RH4.16b}, [x6]

	ld1		{RHASH.16b}, [x5]
	rbit		RHASH.16b, RHASH.16b

	adr_l		x6, .Lghash_rconst
	ld1r		{RRCONST.2d}, [x6]

	eor		RZERO.16b, RZERO.16b, RZERO.16b

	cbz		w4, .Lgcm_enc_hash_len

.Lgcm_enc_loop_4x:
	cmp		w4, #(4 * 16)
	blt		.Lgcm_enc_loop_1x

	sub		w4, w4, #(4 * 16)

	/* construct CTRs */
	inc32_le128(v0)			/* +0 */
	inc32_le128(v1)			/* +1 */
	inc32_le128(v2)			/* +2 */
	inc32_le128(v3)			/* +3 */

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, RTMP0.16b
	eor		v1.16b, v1.16b, RTMP1.16b
	eor		v2.16b, v2.16b, RTMP2.16b
	eor		v3.16b, v3.16b, RTMP3.16b
	st1		{v0.16b-v3.16b}, [x1], #64

	/* ghash update */

	rbit		v0.16b, v0.16b
	rbit		v1.16b, v1.16b
	rbit		v2.16b, v2.16b
	rbit		v3.16b, v3.16b

	/*
	 * (in0 ^ HASH) * H^4 => rr0:rr1
	 * (in1)        * H^3 => rr2:rr3
	 * (in2)        * H^2 => rr4:rr5
	 * (in3)        * H^1 => rr6:rr7
	 */
	eor		RHASH.16b, RHASH.16b, v0.16b

	PMUL_128x128_4x(RR0, RR1, RHASH, RH4, RTMP0, RTMP1,
			RR2, RR3, v1, RH3, RTMP2, RTMP3,
			RR4, RR5, v2, RH2, RTMP4, RTMP5,
			RR6, RR7, v3, RH1, RTMP6, RTMP7)

	eor		RR0.16b, RR0.16b, RR2.16b
	eor		RR1.16b, RR1.16b, RR3.16b
	eor		RR0.16b, RR0.16b, RR4.16b
	eor		RR1.16b, RR1.16b, RR5.16b
	eor		RR0.16b, RR0.16b, RR6.16b
	eor		RR1.16b, RR1.16b, RR7.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

	cbz		w4, .Lgcm_enc_hash_len
	b		.Lgcm_enc_loop_4x

.Lgcm_enc_loop_1x:
	cmp		w4, #16
	blt		.Lgcm_enc_tail

	sub		w4, w4, #16

	/* construct CTRs */
	inc32_le128(v0)

	ld1		{RTMP0.16b}, [x2], #16

	SM4_CRYPT_BLK(v0)

	eor		v0.16b, v0.16b, RTMP0.16b
	st1		{v0.16b}, [x1], #16

	/* ghash update */
	rbit		v0.16b, v0.16b
	eor		RHASH.16b, RHASH.16b, v0.16b
	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

	cbz		w4, .Lgcm_enc_hash_len
	b		.Lgcm_enc_loop_1x

.Lgcm_enc_tail:
	/* construct CTRs */
	inc32_le128(v0)
	SM4_CRYPT_BLK(v0)

	/* load permute table */
	adr_l		x0, .Lcts_permute_table
	add		x0, x0, #32
	sub		x0, x0, w4, uxtw
	ld1		{v3.16b}, [x0]
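
	/*
	 * With n = w4 tail bytes, v3 now holds the indices 16-n..15
	 * followed by 0xff: the tbl below moves the n ciphertext bytes
	 * collected at the top of v0 down to the start of the block and
	 * zeroes the rest, forming the padded final block for GHASH.
	 */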

.Lgcm_enc_tail_loop:
	/* do encrypt */
	ldrb		w0, [x2], #1	/* load one input byte */
	umov		w6, v0.b[0]	/* get the current keystream byte */
	eor		w6, w6, w0	/* w6 = keystream ^ input */
	strb		w6, [x1], #1	/* store the output byte */

	/* rotate the next keystream byte into v0.b[0] */
	ext		v0.16b, v0.16b, v0.16b, #1
	/* accumulate the ciphertext bytes in the high bytes of v0 */
	ins		v0.b[15], w6

	subs		w4, w4, #1
	bne		.Lgcm_enc_tail_loop

	/* pad the last block with zeros */
	tbl		v0.16b, {v0.16b}, v3.16b

	/* ghash update */
	rbit		v0.16b, v0.16b
	eor		RHASH.16b, RHASH.16b, v0.16b
	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

.Lgcm_enc_hash_len:
	cbz		x7, .Lgcm_enc_end

	GTAG_HASH_LENGTHS(v1, v3)

	b		.Lgcm_enc_ret

.Lgcm_enc_end:
	/* store new CTR */
	rev		x8, x8
	rev		x9, x9
	stp		x8, x9, [x3]

	rbit		RHASH.16b, RHASH.16b

.Lgcm_enc_ret:
	/* store new MAC */
	st1		{RHASH.2d}, [x5]

	ret
SYM_FUNC_END(sm4_ce_pmull_gcm_enc)

#undef RR1
#undef RR3
#undef RR5
#undef RR7
#undef RR0
#undef RR2
#undef RR4
#undef RR6
#undef RTMP0
#undef RTMP1
#undef RTMP2
#undef RTMP3
#undef RTMP4
#undef RTMP5
#undef RTMP6
#undef RTMP7
#undef RH1
#undef RH2
#undef RH3
#undef RH4


/* Register macros for decrypt */

/* v0-v2 for building CTRs, v3-v5 for saving inputs */

#define RR1		v6
#define RR3		v7
#define RR5		v8

#define RR0		v9
#define RR2		v10
#define RR4		v11

#define RTMP0		v12
#define RTMP1		v13
#define RTMP2		v14
#define RTMP3		v15
#define RTMP4		v16
#define RTMP5		v17

#define RH1		v18
#define RH2		v19
#define RH3		v20

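/*
 * Decryption works on 3 blocks at a time: GHASH runs over the
 * ciphertext, which is already available before decryption, so the
 * SM4 rounds and the PMULL multiplies can be fused in a single pass.
 * The raw ciphertext must additionally be kept in v3-v5 for the final
 * XOR, which is presumably why only three blocks (rather than four,
 * as on the encrypt side) are processed in parallel here.
 */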
.align 3
SYM_TYPED_FUNC_START(sm4_ce_pmull_gcm_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: ghash result
	 *   x6: ghash table
	 *   x7: lengths (only for last block)
	 */
	SM4_PREPARE(x0)

	ldp		x8, x9, [x3]
	rev		x8, x8
	rev		x9, x9

	ld1		{RH1.16b-RH3.16b}, [x6]

	ld1		{RHASH.16b}, [x5]
	rbit		RHASH.16b, RHASH.16b

	adr_l		x6, .Lghash_rconst
	ld1r		{RRCONST.2d}, [x6]

	eor		RZERO.16b, RZERO.16b, RZERO.16b

	cbz		w4, .Lgcm_dec_hash_len

.Lgcm_dec_loop_3x:
	cmp		w4, #(3 * 16)
	blt		.Lgcm_dec_loop_1x

	sub		w4, w4, #(3 * 16)

	ld1		{v3.16b-v5.16b}, [x2], #(3 * 16)

	/* construct CTRs */
	inc32_le128(v0)			/* +0 */
	rbit		v6.16b, v3.16b
	inc32_le128(v1)			/* +1 */
	rbit		v7.16b, v4.16b
	inc32_le128(v2)			/* +2 */
	rbit		v8.16b, v5.16b

	eor		RHASH.16b, RHASH.16b, v6.16b

	/* decrypt & ghash update */
	SM4_CRYPT_PMUL_128x128_BLK3(v0, v1, v2,
				    RR0, RR1, RHASH, RH3, RTMP0, RTMP1,
				    RR2, RR3, v7, RH2, RTMP2, RTMP3,
				    RR4, RR5, v8, RH1, RTMP4, RTMP5)

	eor		v0.16b, v0.16b, v3.16b
	eor		v1.16b, v1.16b, v4.16b
	eor		v2.16b, v2.16b, v5.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

	st1		{v0.16b-v2.16b}, [x1], #(3 * 16)

	cbz		w4, .Lgcm_dec_hash_len
	b		.Lgcm_dec_loop_3x

.Lgcm_dec_loop_1x:
	cmp		w4, #16
	blt		.Lgcm_dec_tail

	sub		w4, w4, #16

	ld1		{v3.16b}, [x2], #16

	/* construct CTRs */
	inc32_le128(v0)
	rbit		v6.16b, v3.16b

	eor		RHASH.16b, RHASH.16b, v6.16b

	SM4_CRYPT_PMUL_128x128_BLK(v0, RR0, RR1, RHASH, RH1, RTMP0, RTMP1)

	eor		v0.16b, v0.16b, v3.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

	st1		{v0.16b}, [x1], #16

	cbz		w4, .Lgcm_dec_hash_len
	b		.Lgcm_dec_loop_1x

.Lgcm_dec_tail:
	/* construct CTRs */
	inc32_le128(v0)
	SM4_CRYPT_BLK(v0)

	/* load permute table */
	adr_l		x0, .Lcts_permute_table
	add		x0, x0, #32
	sub		x0, x0, w4, uxtw
	ld1		{v3.16b}, [x0]

.Lgcm_dec_tail_loop:
	/* do decrypt */
	ldrb		w0, [x2], #1	/* load one ciphertext byte */
	umov		w6, v0.b[0]	/* get the current keystream byte */
	eor		w6, w6, w0	/* w6 = keystream ^ input */
	strb		w6, [x1], #1	/* store the output byte */

	/* rotate the next keystream byte into v0.b[0] */
	ext		v0.16b, v0.16b, v0.16b, #1
	/* accumulate the ciphertext bytes in the high bytes of v0 */
	ins		v0.b[15], w0

	subs		w4, w4, #1
	bne		.Lgcm_dec_tail_loop

	/* pad the last block with zeros */
	tbl		v0.16b, {v0.16b}, v3.16b

	/* ghash update */
	rbit		v0.16b, v0.16b
	eor		RHASH.16b, RHASH.16b, v0.16b
	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

.Lgcm_dec_hash_len:
	cbz		x7, .Lgcm_dec_end

	GTAG_HASH_LENGTHS(v1, v3)

	b		.Lgcm_dec_ret

.Lgcm_dec_end:
	/* store new CTR */
	rev		x8, x8
	rev		x9, x9
	stp		x8, x9, [x3]

	rbit		RHASH.16b, RHASH.16b

.Lgcm_dec_ret:
	/* store new MAC */
	st1		{RHASH.2d}, [x5]

	ret
SYM_FUNC_END(sm4_ce_pmull_gcm_dec)

	.section ".rodata", "a"
	.align 4
.Lcts_permute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
	.byte		0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff

.Lghash_rconst:
	.quad		0x87
743 | |