1 | /* SPDX-License-Identifier: GPL-2.0-or-later */ |
2 | /* |
3 | * SM4-CCM AEAD Algorithm using ARMv8 Crypto Extensions |
4 | * as specified in rfc8998 |
5 | * https://datatracker.ietf.org/doc/html/rfc8998 |
6 | * |
7 | * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com> |
8 | */ |
9 | |
10 | #include <linux/linkage.h> |
11 | #include <linux/cfi_types.h> |
12 | #include <asm/assembler.h> |
13 | #include "sm4-ce-asm.h" |
14 | |
15 | .arch armv8-a+crypto |
16 | |
/*
 * Map the vector register names used in this file (via their ".4s"
 * aliases) to their encoding numbers, so they can be spliced into the
 * hand-assembled .inst word below.  Only the registers actually used
 * here and by the shared sm4-ce-asm.h macros are listed.
 */
.irp b, 0, 1, 8, 9, 10, 11, 12, 13, 14, 15, 16, 24, 25, 26, 27, 28, 29, 30, 31
	.set .Lv\b\().4s, \b
.endr

/*
 * sm4e vd, vn: SM4 round encryption instruction, emitted as a raw
 * .inst word (base opcode 0xcec08400, Rn in bits [9:5], Rd in [4:0])
 * — presumably so the file still assembles with toolchains that lack
 * the SM4 crypto-extension mnemonics.
 */
.macro sm4e, vd, vn
	.inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm
24 | |
25 | /* Register macros */ |
26 | |
#define RMAC	v16		/* running CBC-MAC state, live across all functions */

/* Helper macros. */

/*
 * inc_le128(vctr): emit the current counter value into vctr and
 * advance the counter.
 *
 * The 128-bit counter is kept byte-swapped in GPRs: x7 = high 64
 * bits, x8 = low 64 bits.  The pre-increment value is copied into
 * vctr and converted back to big endian with rev64 (byte-reverse
 * within each 64-bit lane), while x8/x7 are incremented as a single
 * 128-bit integer via add-with-carry.  rev64 is a SIMD op and does
 * not disturb the NZCV carry between adds and adc.
 */
#define inc_le128(vctr) \
	mov vctr.d[1], x8; \
	mov vctr.d[0], x7; \
	adds x8, x8, #1; \
	rev64 vctr.16b, vctr.16b; \
	adc x7, x7, xzr;
37 | |
38 | |
.align 3
SYM_FUNC_START(sm4_ce_cbcmac_update)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: mac
	 *   x2: src
	 *   w3: nblocks
	 *
	 * Fold nblocks 16-byte blocks from src into the CBC-MAC state at
	 * mac.  Note the operation order: mac = E(mac) ^ block, i.e. the
	 * final block-cipher call of the classic CBC-MAC (E(mac ^ block))
	 * is deferred — sm4_ce_ccm_final() applies it.
	 *
	 * nblocks == 0 is handled as a no-op (mac left unchanged); the
	 * unguarded original would underflow the counter and run away.
	 */
	SM4_PREPARE(x0)

	ld1		{RMAC.16b}, [x1]

.Lcbcmac_loop_4x:
	cmp		w3, #4
	blt		.Lcbcmac_loop_1x

	sub		w3, w3, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	/* mac = E(mac) ^ p[i], four blocks per iteration */
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v1.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v2.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v3.16b

	cbz		w3, .Lcbcmac_end
	b		.Lcbcmac_loop_4x

.Lcbcmac_loop_1x:
	/* guard: w3 may be 0 here (entry with nblocks == 0) —
	 * without this check the sub below would wrap to 0xffffffff */
	cbz		w3, .Lcbcmac_end

	sub		w3, w3, #1

	ld1		{v0.16b}, [x2], #16

	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v0.16b

	cbnz		w3, .Lcbcmac_loop_1x

.Lcbcmac_end:
	st1		{RMAC.16b}, [x1]
	ret
SYM_FUNC_END(sm4_ce_cbcmac_update)
85 | |
.align 3
SYM_FUNC_START(sm4_ce_ccm_final)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: ctr0 (big endian, 128 bit)
	 *   x2: mac
	 *
	 * Finish the CCM tag: mac = E(mac) ^ E(ctr0).  E(mac) is the
	 * block-cipher call deferred by sm4_ce_cbcmac_update(), and
	 * E(ctr0) is the S0 keystream block CCM uses to en-/decrypt the
	 * authentication tag.
	 */
	SM4_PREPARE(x0)

	ld1		{RMAC.16b}, [x2]
	ld1		{v0.16b}, [x1]

	/* encrypt the MAC state and ctr0 in parallel */
	SM4_CRYPT_BLK2(RMAC, v0)

	/* en-/decrypt the mac with ctr0 */
	eor		RMAC.16b, RMAC.16b, v0.16b
	st1		{RMAC.16b}, [x2]

	ret
SYM_FUNC_END(sm4_ce_ccm_final)
106 | |
.align 3
SYM_TYPED_FUNC_START(sm4_ce_ccm_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: mac
	 *
	 * CTR-encrypt nbytes from src to dst while folding the plaintext
	 * into the CBC-MAC state at mac (mac = E(mac) ^ plaintext).
	 *
	 * NOTE(review): assumes w4 > 0 on entry — w4 == 0 would fall
	 * through to the tail path and underflow the byte counter;
	 * presumably guaranteed by the C glue, confirm against callers.
	 */
	SM4_PREPARE(x0)

	/* load the counter byte-swapped: x7 = high, x8 = low 64 bits */
	ldp		x7, x8, [x3]
	rev		x7, x7
	rev		x8, x8

	ld1		{RMAC.16b}, [x5]

.Lccm_enc_loop_4x:
	cmp		w4, #(4 * 16)
	blt		.Lccm_enc_loop_1x

	sub		w4, w4, #(4 * 16)

	/* construct CTRs */
	inc_le128(v8)			/* +0 */
	inc_le128(v9)			/* +1 */
	inc_le128(v10)			/* +2 */
	inc_le128(v11)			/* +3 */

	ld1		{v0.16b-v3.16b}, [x2], #64

	/*
	 * Per block: encrypt the CTR block and the MAC state in
	 * parallel, then ct = E(ctr) ^ pt and mac = E(mac) ^ pt.
	 */
	SM4_CRYPT_BLK2(v8, RMAC)
	eor		v8.16b, v8.16b, v0.16b
	eor		RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK2(v9, RMAC)
	eor		v9.16b, v9.16b, v1.16b
	eor		RMAC.16b, RMAC.16b, v1.16b
	SM4_CRYPT_BLK2(v10, RMAC)
	eor		v10.16b, v10.16b, v2.16b
	eor		RMAC.16b, RMAC.16b, v2.16b
	SM4_CRYPT_BLK2(v11, RMAC)
	eor		v11.16b, v11.16b, v3.16b
	eor		RMAC.16b, RMAC.16b, v3.16b

	st1		{v8.16b-v11.16b}, [x1], #64

	cbz		w4, .Lccm_enc_end
	b		.Lccm_enc_loop_4x

.Lccm_enc_loop_1x:
	cmp		w4, #16
	blt		.Lccm_enc_tail

	sub		w4, w4, #16

	/* construct CTRs */
	inc_le128(v8)

	ld1		{v0.16b}, [x2], #16

	SM4_CRYPT_BLK2(v8, RMAC)
	eor		v8.16b, v8.16b, v0.16b
	eor		RMAC.16b, RMAC.16b, v0.16b

	st1		{v8.16b}, [x1], #16

	cbz		w4, .Lccm_enc_end
	b		.Lccm_enc_loop_1x

.Lccm_enc_tail:
	/* final partial block (1..15 bytes left in w4) */
	/* construct CTRs */
	inc_le128(v8)

	/* encrypt the MAC state and the last keystream block together */
	SM4_CRYPT_BLK2(RMAC, v8)

	/* store new MAC */
	st1		{RMAC.16b}, [x5]

	/*
	 * Consume the tail one byte at a time, rotating the keystream
	 * and MAC vectors with ext after each byte.  Combined with the
	 * full-MAC store above, the stored mac ends up as
	 * E(mac) ^ (zero-padded partial block), i.e. CBC-MAC over the
	 * zero-padded final block.
	 */
.Lccm_enc_tail_loop:
	ldrb		w0, [x2], #1		/* get 1 byte from input */
	umov		w9, v8.b[0]		/* get top crypted CTR byte */
	umov		w6, RMAC.b[0]		/* get top MAC byte */

	eor		w9, w9, w0		/* w9 = CTR ^ input */
	eor		w6, w6, w0		/* w6 = MAC ^ input */

	strb		w9, [x1], #1		/* store out byte */
	strb		w6, [x5], #1		/* store MAC byte */

	subs		w4, w4, #1
	beq		.Lccm_enc_ret

	/* shift out one byte */
	ext		RMAC.16b, RMAC.16b, RMAC.16b, #1
	ext		v8.16b, v8.16b, v8.16b, #1

	b		.Lccm_enc_tail_loop

.Lccm_enc_end:
	/* full-block exit only: the tail path skips the CTR writeback */
	/* store new MAC */
	st1		{RMAC.16b}, [x5]

	/* store new CTR */
	rev		x7, x7
	rev		x8, x8
	stp		x7, x8, [x3]

.Lccm_enc_ret:
	ret
SYM_FUNC_END(sm4_ce_ccm_enc)
218 | |
.align 3
SYM_TYPED_FUNC_START(sm4_ce_ccm_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: mac
	 *
	 * CTR-decrypt nbytes from src to dst while folding the recovered
	 * PLAINTEXT into the CBC-MAC state (mac = E(mac) ^ plaintext) —
	 * the only difference from sm4_ce_ccm_enc, which folds the input.
	 *
	 * NOTE(review): assumes w4 > 0 on entry — w4 == 0 would fall
	 * through to the tail path and underflow the byte counter;
	 * presumably guaranteed by the C glue, confirm against callers.
	 */
	SM4_PREPARE(x0)

	/* load the counter byte-swapped: x7 = high, x8 = low 64 bits */
	ldp		x7, x8, [x3]
	rev		x7, x7
	rev		x8, x8

	ld1		{RMAC.16b}, [x5]

.Lccm_dec_loop_4x:
	cmp		w4, #(4 * 16)
	blt		.Lccm_dec_loop_1x

	sub		w4, w4, #(4 * 16)

	/* construct CTRs */
	inc_le128(v8)			/* +0 */
	inc_le128(v9)			/* +1 */
	inc_le128(v10)			/* +2 */
	inc_le128(v11)			/* +3 */

	ld1		{v0.16b-v3.16b}, [x2], #64

	/*
	 * Per block: pt = E(ctr) ^ ct (v8..v11 become plaintext), then
	 * mac = E(mac) ^ pt.
	 */
	SM4_CRYPT_BLK2(v8, RMAC)
	eor		v8.16b, v8.16b, v0.16b
	eor		RMAC.16b, RMAC.16b, v8.16b
	SM4_CRYPT_BLK2(v9, RMAC)
	eor		v9.16b, v9.16b, v1.16b
	eor		RMAC.16b, RMAC.16b, v9.16b
	SM4_CRYPT_BLK2(v10, RMAC)
	eor		v10.16b, v10.16b, v2.16b
	eor		RMAC.16b, RMAC.16b, v10.16b
	SM4_CRYPT_BLK2(v11, RMAC)
	eor		v11.16b, v11.16b, v3.16b
	eor		RMAC.16b, RMAC.16b, v11.16b

	st1		{v8.16b-v11.16b}, [x1], #64

	cbz		w4, .Lccm_dec_end
	b		.Lccm_dec_loop_4x

.Lccm_dec_loop_1x:
	cmp		w4, #16
	blt		.Lccm_dec_tail

	sub		w4, w4, #16

	/* construct CTRs */
	inc_le128(v8)

	ld1		{v0.16b}, [x2], #16

	SM4_CRYPT_BLK2(v8, RMAC)
	eor		v8.16b, v8.16b, v0.16b
	eor		RMAC.16b, RMAC.16b, v8.16b

	st1		{v8.16b}, [x1], #16

	cbz		w4, .Lccm_dec_end
	b		.Lccm_dec_loop_1x

.Lccm_dec_tail:
	/* final partial block (1..15 bytes left in w4) */
	/* construct CTRs */
	inc_le128(v8)

	/* encrypt the MAC state and the last keystream block together */
	SM4_CRYPT_BLK2(RMAC, v8)

	/* store new MAC */
	st1		{RMAC.16b}, [x5]

	/*
	 * Consume the tail one byte at a time, rotating the keystream
	 * and MAC vectors with ext after each byte.  Combined with the
	 * full-MAC store above, the stored mac ends up as
	 * E(mac) ^ (zero-padded partial PLAINTEXT block).
	 */
.Lccm_dec_tail_loop:
	ldrb		w0, [x2], #1		/* get 1 byte from input */
	umov		w9, v8.b[0]		/* get top crypted CTR byte */
	umov		w6, RMAC.b[0]		/* get top MAC byte */

	eor		w9, w9, w0		/* w9 = CTR ^ input */
	eor		w6, w6, w9		/* w6 = MAC ^ output */

	strb		w9, [x1], #1		/* store out byte */
	strb		w6, [x5], #1		/* store MAC byte */

	subs		w4, w4, #1
	beq		.Lccm_dec_ret

	/* shift out one byte */
	ext		RMAC.16b, RMAC.16b, RMAC.16b, #1
	ext		v8.16b, v8.16b, v8.16b, #1

	b		.Lccm_dec_tail_loop

.Lccm_dec_end:
	/* full-block exit only: the tail path skips the CTR writeback */
	/* store new MAC */
	st1		{RMAC.16b}, [x5]

	/* store new CTR */
	rev		x7, x7
	rev		x8, x8
	stp		x7, x8, [x3]

.Lccm_dec_ret:
	ret
SYM_FUNC_END(sm4_ce_ccm_dec)
330 | |