/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm for ARMv8 NEON
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2022, Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/* Register macros */

#define RTMP0   v8
#define RTMP1   v9
#define RTMP2   v10
#define RTMP3   v11

#define RTMP4   v12
#define RTMP5   v13
#define RTMP6   v14
#define RTMP7   v15

#define RX0     v12
#define RX1     v13
#define RKEY    v14
#define RIV     v15
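
/*
 * Note that RX0/RX1/RKEY/RIV are aliases of RTMP4-RTMP7 (v12-v15), so
 * the two register sets must never be live at the same time.  This is
 * why CBC decryption below uses the "norotate" 8-block variant: the
 * 2x rotate helper clobbers RTMP4-RTMP7 and would destroy RIV.
 */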

/* Helper macros. */

#define SM4_PREPARE()                                           \
        adr_l           x5, crypto_sm4_sbox;                    \
        ld1             {v16.16b-v19.16b}, [x5], #64;           \
        ld1             {v20.16b-v23.16b}, [x5], #64;           \
        ld1             {v24.16b-v27.16b}, [x5], #64;           \
        ld1             {v28.16b-v31.16b}, [x5];
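
/*
 * SM4_PREPARE() pins the whole 256-byte crypto_sm4_sbox table in
 * v16-v31, so the round macros can run the S-box substitution for all
 * 16 byte lanes at once with tbl/tbx lookups, without touching memory
 * again.
 */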

#define transpose_4x4(s0, s1, s2, s3)                           \
        zip1            RTMP0.4s, s0.4s, s1.4s;                 \
        zip1            RTMP1.4s, s2.4s, s3.4s;                 \
        zip2            RTMP2.4s, s0.4s, s1.4s;                 \
        zip2            RTMP3.4s, s2.4s, s3.4s;                 \
        zip1            s0.2d, RTMP0.2d, RTMP1.2d;              \
        zip2            s1.2d, RTMP0.2d, RTMP1.2d;              \
        zip1            s2.2d, RTMP2.2d, RTMP3.2d;              \
        zip2            s3.2d, RTMP2.2d, RTMP3.2d;
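
/*
 * transpose_4x4() transposes a 4x4 matrix of 32-bit words with the
 * zip1/zip2 interleave trick, so that each vector ends up holding the
 * same word index of four different blocks:
 *
 *   s0 = (a0 a1 a2 a3)           s0 = (a0 b0 c0 d0)
 *   s1 = (b0 b1 b2 b3)    ==>    s1 = (a1 b1 c1 d1)
 *   s2 = (c0 c1 c2 c3)           s2 = (a2 b2 c2 d2)
 *   s3 = (d0 d1 d2 d3)           s3 = (a3 b3 c3 d3)
 *
 * It is only needed on the ld1-loaded tail paths; the bulk paths use
 * ld4, which de-interleaves on load.  transpose_4x4_2x() below is the
 * same operation on two independent sets of four blocks, with the
 * instructions interleaved.
 */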

#define transpose_4x4_2x(s0, s1, s2, s3, s4, s5, s6, s7)        \
        zip1            RTMP0.4s, s0.4s, s1.4s;                 \
        zip1            RTMP1.4s, s2.4s, s3.4s;                 \
        zip2            RTMP2.4s, s0.4s, s1.4s;                 \
        zip2            RTMP3.4s, s2.4s, s3.4s;                 \
        zip1            RTMP4.4s, s4.4s, s5.4s;                 \
        zip1            RTMP5.4s, s6.4s, s7.4s;                 \
        zip2            RTMP6.4s, s4.4s, s5.4s;                 \
        zip2            RTMP7.4s, s6.4s, s7.4s;                 \
        zip1            s0.2d, RTMP0.2d, RTMP1.2d;              \
        zip2            s1.2d, RTMP0.2d, RTMP1.2d;              \
        zip1            s2.2d, RTMP2.2d, RTMP3.2d;              \
        zip2            s3.2d, RTMP2.2d, RTMP3.2d;              \
        zip1            s4.2d, RTMP4.2d, RTMP5.2d;              \
        zip2            s5.2d, RTMP4.2d, RTMP5.2d;              \
        zip1            s6.2d, RTMP6.2d, RTMP7.2d;              \
        zip2            s7.2d, RTMP6.2d, RTMP7.2d;

#define rotate_clockwise_4x4(s0, s1, s2, s3)                    \
        zip1            RTMP0.4s, s1.4s, s0.4s;                 \
        zip2            RTMP1.4s, s1.4s, s0.4s;                 \
        zip1            RTMP2.4s, s3.4s, s2.4s;                 \
        zip2            RTMP3.4s, s3.4s, s2.4s;                 \
        zip1            s0.2d, RTMP2.2d, RTMP0.2d;              \
        zip2            s1.2d, RTMP2.2d, RTMP0.2d;              \
        zip1            s2.2d, RTMP3.2d, RTMP1.2d;              \
        zip2            s3.2d, RTMP3.2d, RTMP1.2d;

#define rotate_clockwise_4x4_2x(s0, s1, s2, s3, s4, s5, s6, s7) \
        zip1            RTMP0.4s, s1.4s, s0.4s;                 \
        zip1            RTMP2.4s, s3.4s, s2.4s;                 \
        zip2            RTMP1.4s, s1.4s, s0.4s;                 \
        zip2            RTMP3.4s, s3.4s, s2.4s;                 \
        zip1            RTMP4.4s, s5.4s, s4.4s;                 \
        zip1            RTMP6.4s, s7.4s, s6.4s;                 \
        zip2            RTMP5.4s, s5.4s, s4.4s;                 \
        zip2            RTMP7.4s, s7.4s, s6.4s;                 \
        zip1            s0.2d, RTMP2.2d, RTMP0.2d;              \
        zip2            s1.2d, RTMP2.2d, RTMP0.2d;              \
        zip1            s2.2d, RTMP3.2d, RTMP1.2d;              \
        zip2            s3.2d, RTMP3.2d, RTMP1.2d;              \
        zip1            s4.2d, RTMP6.2d, RTMP4.2d;              \
        zip2            s5.2d, RTMP6.2d, RTMP4.2d;              \
        zip1            s6.2d, RTMP7.2d, RTMP5.2d;              \
        zip2            s7.2d, RTMP7.2d, RTMP5.2d;
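
/*
 * rotate_clockwise_4x4() is the inverse of transpose_4x4() combined
 * with a reversal of the word order within each block:
 *
 *   s0 = (a0 b0 c0 d0)           s0 = (a3 a2 a1 a0)
 *   s1 = (a1 b1 c1 d1)    ==>    s1 = (b3 b2 b1 b0)
 *   s2 = (a2 b2 c2 d2)           s2 = (c3 c2 c1 c0)
 *   s3 = (a3 b3 c3 d3)           s3 = (d3 d2 d1 d0)
 *
 * This implements SM4's final reverse transform R while returning the
 * state from the transposed (column-major) layout to one block per
 * vector.  rotate_clockwise_4x4_2x() does the same for eight vectors,
 * but note that it uses RTMP4-RTMP7, which alias RKEY/RIV.
 */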

#define ROUND4(round, s0, s1, s2, s3)                           \
        dup             RX0.4s, RKEY.s[round];                  \
        /* rk ^ s1 ^ s2 ^ s3 */                                 \
        eor             RTMP1.16b, s2.16b, s3.16b;              \
        eor             RX0.16b, RX0.16b, s1.16b;               \
        eor             RX0.16b, RX0.16b, RTMP1.16b;            \
                                                                \
        /* sbox, non-linear part */                             \
        movi            RTMP3.16b, #64; /* sizeof(sbox) / 4 */  \
        tbl             RTMP0.16b, {v16.16b-v19.16b}, RX0.16b;  \
        sub             RX0.16b, RX0.16b, RTMP3.16b;            \
        tbx             RTMP0.16b, {v20.16b-v23.16b}, RX0.16b;  \
        sub             RX0.16b, RX0.16b, RTMP3.16b;            \
        tbx             RTMP0.16b, {v24.16b-v27.16b}, RX0.16b;  \
        sub             RX0.16b, RX0.16b, RTMP3.16b;            \
        tbx             RTMP0.16b, {v28.16b-v31.16b}, RX0.16b;  \
                                                                \
        /* linear part */                                       \
        shl             RTMP1.4s, RTMP0.4s, #8;                 \
        shl             RTMP2.4s, RTMP0.4s, #16;                \
        shl             RTMP3.4s, RTMP0.4s, #24;                \
        sri             RTMP1.4s, RTMP0.4s, #(32-8);            \
        sri             RTMP2.4s, RTMP0.4s, #(32-16);           \
        sri             RTMP3.4s, RTMP0.4s, #(32-24);           \
        /* RTMP1 = x ^ rol32(x, 8) ^ rol32(x, 16) */            \
        eor             RTMP1.16b, RTMP1.16b, RTMP0.16b;        \
        eor             RTMP1.16b, RTMP1.16b, RTMP2.16b;        \
        /* RTMP3 = x ^ rol32(x, 24) ^ rol32(RTMP1, 2) */        \
        eor             RTMP3.16b, RTMP3.16b, RTMP0.16b;        \
        shl             RTMP2.4s, RTMP1.4s, #2;                 \
        sri             RTMP2.4s, RTMP1.4s, #(32-2);            \
        eor             RTMP3.16b, RTMP3.16b, RTMP2.16b;        \
        /* s0 ^= RTMP3 */                                       \
        eor             s0.16b, s0.16b, RTMP3.16b;
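
/*
 * ROUND4 computes one SM4 round for four blocks in parallel.  In
 * C-like pseudocode (a sketch of the dataflow, not kernel code):
 *
 *	u32 x = rk ^ s1 ^ s2 ^ s3;
 *	x = sbox_each_byte(x);	// tau, done above by chained tbl/tbx:
 *				// tbl zeroes lanes with index >= 64, and
 *				// after each "sub #64" index rebase, tbx
 *				// merges in the next 64 table bytes
 *	s0 ^= x ^ rol32(x, 2) ^ rol32(x, 10) ^ rol32(x, 18) ^ rol32(x, 24);
 *
 * The five-term linear transform L is evaluated above through the
 * equivalent factorization
 * L(x) = (x ^ rol32(x, 24)) ^ rol32(x ^ rol32(x, 8) ^ rol32(x, 16), 2).
 */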

#define SM4_CRYPT_BLK4_BE(b0, b1, b2, b3)                       \
        mov             x6, 8;                                  \
4:                                                              \
        ld1             {RKEY.4s}, [x0], #16;                   \
        subs            x6, x6, #1;                             \
                                                                \
        ROUND4(0, b0, b1, b2, b3);                              \
        ROUND4(1, b1, b2, b3, b0);                              \
        ROUND4(2, b2, b3, b0, b1);                              \
        ROUND4(3, b3, b0, b1, b2);                              \
                                                                \
        bne             4b;                                     \
                                                                \
        rev32           b0.16b, b0.16b;                         \
        rev32           b1.16b, b1.16b;                         \
        rev32           b2.16b, b2.16b;                         \
        rev32           b3.16b, b3.16b;                         \
                                                                \
        rotate_clockwise_4x4(b0, b1, b2, b3);                   \
                                                                \
        /* repoint to rkey */                                   \
        sub             x0, x0, #128;

#define SM4_CRYPT_BLK4(b0, b1, b2, b3)                          \
        rev32           b0.16b, b0.16b;                         \
        rev32           b1.16b, b1.16b;                         \
        rev32           b2.16b, b2.16b;                         \
        rev32           b3.16b, b3.16b;                         \
        SM4_CRYPT_BLK4_BE(b0, b1, b2, b3);
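
/*
 * SM4_CRYPT_BLK4_BE() runs the full cipher on four transposed blocks:
 * eight iterations of four rotated ROUND4 invocations give the 32 SM4
 * rounds, consuming the 32-word round key array four words at a time
 * and rewinding x0 by 32 * 4 = 128 bytes at the end.  The _BE variant
 * expects the state words already converted to big-endian;
 * SM4_CRYPT_BLK4() wraps it with the rev32 byte swaps.  ROUND8 below
 * applies the same round to two independent sets of four blocks, with
 * the instruction streams interleaved so the two dependency chains
 * can overlap in the pipeline.
 */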

#define ROUND8(round, s0, s1, s2, s3, t0, t1, t2, t3)           \
        /* rk ^ s1 ^ s2 ^ s3 */                                 \
        dup             RX0.4s, RKEY.s[round];                  \
        eor             RTMP0.16b, s2.16b, s3.16b;              \
        mov             RX1.16b, RX0.16b;                       \
        eor             RTMP1.16b, t2.16b, t3.16b;              \
        eor             RX0.16b, RX0.16b, s1.16b;               \
        eor             RX1.16b, RX1.16b, t1.16b;               \
        eor             RX0.16b, RX0.16b, RTMP0.16b;            \
        eor             RX1.16b, RX1.16b, RTMP1.16b;            \
                                                                \
        /* sbox, non-linear part */                             \
        movi            RTMP3.16b, #64; /* sizeof(sbox) / 4 */  \
        tbl             RTMP0.16b, {v16.16b-v19.16b}, RX0.16b;  \
        tbl             RTMP1.16b, {v16.16b-v19.16b}, RX1.16b;  \
        sub             RX0.16b, RX0.16b, RTMP3.16b;            \
        sub             RX1.16b, RX1.16b, RTMP3.16b;            \
        tbx             RTMP0.16b, {v20.16b-v23.16b}, RX0.16b;  \
        tbx             RTMP1.16b, {v20.16b-v23.16b}, RX1.16b;  \
        sub             RX0.16b, RX0.16b, RTMP3.16b;            \
        sub             RX1.16b, RX1.16b, RTMP3.16b;            \
        tbx             RTMP0.16b, {v24.16b-v27.16b}, RX0.16b;  \
        tbx             RTMP1.16b, {v24.16b-v27.16b}, RX1.16b;  \
        sub             RX0.16b, RX0.16b, RTMP3.16b;            \
        sub             RX1.16b, RX1.16b, RTMP3.16b;            \
        tbx             RTMP0.16b, {v28.16b-v31.16b}, RX0.16b;  \
        tbx             RTMP1.16b, {v28.16b-v31.16b}, RX1.16b;  \
                                                                \
        /* linear part */                                       \
        shl             RX0.4s, RTMP0.4s, #8;                   \
        shl             RX1.4s, RTMP1.4s, #8;                   \
        shl             RTMP2.4s, RTMP0.4s, #16;                \
        shl             RTMP3.4s, RTMP1.4s, #16;                \
        sri             RX0.4s, RTMP0.4s, #(32 - 8);            \
        sri             RX1.4s, RTMP1.4s, #(32 - 8);            \
        sri             RTMP2.4s, RTMP0.4s, #(32 - 16);         \
        sri             RTMP3.4s, RTMP1.4s, #(32 - 16);         \
        /* RX = x ^ rol32(x, 8) ^ rol32(x, 16) */               \
        eor             RX0.16b, RX0.16b, RTMP0.16b;            \
        eor             RX1.16b, RX1.16b, RTMP1.16b;            \
        eor             RX0.16b, RX0.16b, RTMP2.16b;            \
        eor             RX1.16b, RX1.16b, RTMP3.16b;            \
        /* RTMP0/1 ^= x ^ rol32(x, 24) ^ rol32(RX, 2) */        \
        shl             RTMP2.4s, RTMP0.4s, #24;                \
        shl             RTMP3.4s, RTMP1.4s, #24;                \
        sri             RTMP2.4s, RTMP0.4s, #(32 - 24);         \
        sri             RTMP3.4s, RTMP1.4s, #(32 - 24);         \
        eor             RTMP0.16b, RTMP0.16b, RTMP2.16b;        \
        eor             RTMP1.16b, RTMP1.16b, RTMP3.16b;        \
        shl             RTMP2.4s, RX0.4s, #2;                   \
        shl             RTMP3.4s, RX1.4s, #2;                   \
        sri             RTMP2.4s, RX0.4s, #(32 - 2);            \
        sri             RTMP3.4s, RX1.4s, #(32 - 2);            \
        eor             RTMP0.16b, RTMP0.16b, RTMP2.16b;        \
        eor             RTMP1.16b, RTMP1.16b, RTMP3.16b;        \
        /* s0/t0 ^= RTMP0/1 */                                  \
        eor             s0.16b, s0.16b, RTMP0.16b;              \
        eor             t0.16b, t0.16b, RTMP1.16b;

#define SM4_CRYPT_BLK8_norotate(b0, b1, b2, b3, b4, b5, b6, b7) \
        rev32           b0.16b, b0.16b;                         \
        rev32           b1.16b, b1.16b;                         \
        rev32           b2.16b, b2.16b;                         \
        rev32           b3.16b, b3.16b;                         \
        rev32           b4.16b, b4.16b;                         \
        rev32           b5.16b, b5.16b;                         \
        rev32           b6.16b, b6.16b;                         \
        rev32           b7.16b, b7.16b;                         \
                                                                \
        mov             x6, 8;                                  \
8:                                                              \
        ld1             {RKEY.4s}, [x0], #16;                   \
        subs            x6, x6, #1;                             \
                                                                \
        ROUND8(0, b0, b1, b2, b3, b4, b5, b6, b7);              \
        ROUND8(1, b1, b2, b3, b0, b5, b6, b7, b4);              \
        ROUND8(2, b2, b3, b0, b1, b6, b7, b4, b5);              \
        ROUND8(3, b3, b0, b1, b2, b7, b4, b5, b6);              \
                                                                \
        bne             8b;                                     \
                                                                \
        rev32           b0.16b, b0.16b;                         \
        rev32           b1.16b, b1.16b;                         \
        rev32           b2.16b, b2.16b;                         \
        rev32           b3.16b, b3.16b;                         \
        rev32           b4.16b, b4.16b;                         \
        rev32           b5.16b, b5.16b;                         \
        rev32           b6.16b, b6.16b;                         \
        rev32           b7.16b, b7.16b;                         \
                                                                \
        /* repoint to rkey */                                   \
        sub             x0, x0, #128;
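
/*
 * SM4_CRYPT_BLK8_norotate() leaves out the final rotate_clockwise
 * step so that callers which must keep RIV live (the CBC path) can do
 * it as two rotate_clockwise_4x4() calls, which only use RTMP0-RTMP3.
 */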

#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7)          \
        SM4_CRYPT_BLK8_norotate(b0, b1, b2, b3, b4, b5, b6, b7); \
        rotate_clockwise_4x4_2x(b0, b1, b2, b3, b4, b5, b6, b7);


.align 3
SYM_FUNC_START(sm4_neon_crypt)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   w3: nblocks
         */
        SM4_PREPARE()
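
        /*
         * Bulk path: ld4 de-interleaves while loading, so the blocks
         * arrive already in the transposed layout the round macros
         * expect.  Subtracting 8 from the block count and testing the
         * sign bit (tbnz #31) detects when fewer than 8 blocks remain;
         * a 4-block pass and a 1-3 block tail (loaded with ld1 and
         * transposed explicitly) finish the job.
         */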

.Lcrypt_loop_8x:
        sub             w3, w3, #8
        tbnz            w3, #31, .Lcrypt_4x

        ld4             {v0.4s-v3.4s}, [x2], #64
        ld4             {v4.4s-v7.4s}, [x2], #64

        SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

        st1             {v0.16b-v3.16b}, [x1], #64
        st1             {v4.16b-v7.16b}, [x1], #64

        cbz             w3, .Lcrypt_end
        b               .Lcrypt_loop_8x

.Lcrypt_4x:
        add             w3, w3, #8
        cmp             w3, #4
        blt             .Lcrypt_tail

        sub             w3, w3, #4

        ld4             {v0.4s-v3.4s}, [x2], #64

        SM4_CRYPT_BLK4(v0, v1, v2, v3)

        st1             {v0.16b-v3.16b}, [x1], #64

        cbz             w3, .Lcrypt_end

.Lcrypt_tail:
        cmp             w3, #2
        ld1             {v0.16b}, [x2], #16
        blt             .Lcrypt_tail_load_done
        ld1             {v1.16b}, [x2], #16
        beq             .Lcrypt_tail_load_done
        ld1             {v2.16b}, [x2], #16

.Lcrypt_tail_load_done:
        transpose_4x4(v0, v1, v2, v3)

        SM4_CRYPT_BLK4(v0, v1, v2, v3)

        cmp             w3, #2
        st1             {v0.16b}, [x1], #16
        blt             .Lcrypt_end
        st1             {v1.16b}, [x1], #16
        beq             .Lcrypt_end
        st1             {v2.16b}, [x1], #16

.Lcrypt_end:
        ret
SYM_FUNC_END(sm4_neon_crypt)

.align 3
SYM_FUNC_START(sm4_neon_cbc_dec)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: iv (big endian, 128 bit)
         *   w4: nblocks
         */
        SM4_PREPARE()

        ld1             {RIV.16b}, [x3]
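
        /*
         * CBC decryption: P[i] = SM4_decrypt(C[i]) ^ C[i-1], with the
         * IV standing in for C[-1].  The ciphertext therefore has to
         * stay around (or be reloaded) to be XORed into the following
         * block's output, and the last ciphertext block becomes the
         * new IV.
         */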

.Lcbc_dec_loop_8x:
        sub             w4, w4, #8
        tbnz            w4, #31, .Lcbc_dec_4x

        ld4             {v0.4s-v3.4s}, [x2], #64
        ld4             {v4.4s-v7.4s}, [x2]

        SM4_CRYPT_BLK8_norotate(v0, v1, v2, v3, v4, v5, v6, v7)
        /*
         * Avoid overwriting the RIV register: rotate_clockwise_4x4_2x()
         * uses RTMP4-RTMP7, which alias RIV, so rotate the two 4-block
         * halves separately.
         */
        rotate_clockwise_4x4(v0, v1, v2, v3)
        rotate_clockwise_4x4(v4, v5, v6, v7)

        sub             x2, x2, #64

        eor             v0.16b, v0.16b, RIV.16b

        ld1             {RTMP0.16b-RTMP3.16b}, [x2], #64
        ld1             {RTMP4.16b-RTMP7.16b}, [x2], #64

        eor             v1.16b, v1.16b, RTMP0.16b
        eor             v2.16b, v2.16b, RTMP1.16b
        eor             v3.16b, v3.16b, RTMP2.16b
        eor             v4.16b, v4.16b, RTMP3.16b
        eor             v5.16b, v5.16b, RTMP4.16b
        eor             v6.16b, v6.16b, RTMP5.16b
        eor             v7.16b, v7.16b, RTMP6.16b

        mov             RIV.16b, RTMP7.16b

        st1             {v0.16b-v3.16b}, [x1], #64
        st1             {v4.16b-v7.16b}, [x1], #64

        cbz             w4, .Lcbc_dec_end
        b               .Lcbc_dec_loop_8x

.Lcbc_dec_4x:
        add             w4, w4, #8
        cmp             w4, #4
        blt             .Lcbc_dec_tail

        sub             w4, w4, #4

        ld1             {v0.16b-v3.16b}, [x2], #64
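        /*
         * Keep the ciphertext in v0-v3 for the chaining XORs and the
         * next IV; decrypt a byte-swapped copy in v4-v7 instead.
         */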

        rev32           v4.16b, v0.16b
        rev32           v5.16b, v1.16b
        rev32           v6.16b, v2.16b
        rev32           v7.16b, v3.16b

        transpose_4x4(v4, v5, v6, v7)

        SM4_CRYPT_BLK4_BE(v4, v5, v6, v7)

        eor             v4.16b, v4.16b, RIV.16b
        eor             v5.16b, v5.16b, v0.16b
        eor             v6.16b, v6.16b, v1.16b
        eor             v7.16b, v7.16b, v2.16b

        mov             RIV.16b, v3.16b

        st1             {v4.16b-v7.16b}, [x1], #64

        cbz             w4, .Lcbc_dec_end

.Lcbc_dec_tail:
        cmp             w4, #2
        ld1             {v0.16b}, [x2], #16
        blt             .Lcbc_dec_tail_load_done
        ld1             {v1.16b}, [x2], #16
        beq             .Lcbc_dec_tail_load_done
        ld1             {v2.16b}, [x2], #16

.Lcbc_dec_tail_load_done:
        rev32           v4.16b, v0.16b
        rev32           v5.16b, v1.16b
        rev32           v6.16b, v2.16b

        transpose_4x4(v4, v5, v6, v7)

        SM4_CRYPT_BLK4_BE(v4, v5, v6, v7)

        cmp             w4, #2
        eor             v4.16b, v4.16b, RIV.16b
        mov             RIV.16b, v0.16b
        st1             {v4.16b}, [x1], #16
        blt             .Lcbc_dec_end

        eor             v5.16b, v5.16b, v0.16b
        mov             RIV.16b, v1.16b
        st1             {v5.16b}, [x1], #16
        beq             .Lcbc_dec_end

        eor             v6.16b, v6.16b, v1.16b
        mov             RIV.16b, v2.16b
        st1             {v6.16b}, [x1], #16

.Lcbc_dec_end:
        /* store new IV */
        st1             {RIV.16b}, [x3]

        ret
SYM_FUNC_END(sm4_neon_cbc_dec)

.align 3
SYM_FUNC_START(sm4_neon_ctr_crypt)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: ctr (big endian, 128 bit)
         *   w4: nblocks
         */
        SM4_PREPARE()

        ldp             x7, x8, [x3]
        rev             x7, x7
        rev             x8, x8
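
        /*
         * CTR mode: out[i] = in[i] ^ SM4_encrypt(ctr + i).  The
         * 128-bit counter is kept native-endian in x7 (high half) and
         * x8 (low half) so it can be incremented with a plain adds/adc
         * pair; inc_le128() re-emits it big-endian into a vector for
         * encryption.
         */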

.Lctr_crypt_loop_8x:
        sub             w4, w4, #8
        tbnz            w4, #31, .Lctr_crypt_4x

#define inc_le128(vctr)                                         \
        mov             vctr.d[1], x8;                          \
        mov             vctr.d[0], x7;                          \
        adds            x8, x8, #1;                             \
        rev64           vctr.16b, vctr.16b;                     \
        adc             x7, x7, xzr;
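
/*
 * inc_le128() stores the current counter big-endian into vctr (rev64
 * byte-swaps each 64-bit half, and the high half sits in d[0], i.e.
 * first in memory), then post-increments the 128-bit counter across
 * x8/x7.  Beware that the adds/adc pair clobbers the NZCV flags.
 */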

        /* construct CTRs */
        inc_le128(v0)                   /* +0 */
        inc_le128(v1)                   /* +1 */
        inc_le128(v2)                   /* +2 */
        inc_le128(v3)                   /* +3 */
        inc_le128(v4)                   /* +4 */
        inc_le128(v5)                   /* +5 */
        inc_le128(v6)                   /* +6 */
        inc_le128(v7)                   /* +7 */

        transpose_4x4_2x(v0, v1, v2, v3, v4, v5, v6, v7)

        SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

        ld1             {RTMP0.16b-RTMP3.16b}, [x2], #64
        ld1             {RTMP4.16b-RTMP7.16b}, [x2], #64

        eor             v0.16b, v0.16b, RTMP0.16b
        eor             v1.16b, v1.16b, RTMP1.16b
        eor             v2.16b, v2.16b, RTMP2.16b
        eor             v3.16b, v3.16b, RTMP3.16b
        eor             v4.16b, v4.16b, RTMP4.16b
        eor             v5.16b, v5.16b, RTMP5.16b
        eor             v6.16b, v6.16b, RTMP6.16b
        eor             v7.16b, v7.16b, RTMP7.16b

        st1             {v0.16b-v3.16b}, [x1], #64
        st1             {v4.16b-v7.16b}, [x1], #64

        cbz             w4, .Lctr_crypt_end
        b               .Lctr_crypt_loop_8x

.Lctr_crypt_4x:
        add             w4, w4, #8
        cmp             w4, #4
        blt             .Lctr_crypt_tail

        sub             w4, w4, #4

        /* construct CTRs */
        inc_le128(v0)                   /* +0 */
        inc_le128(v1)                   /* +1 */
        inc_le128(v2)                   /* +2 */
        inc_le128(v3)                   /* +3 */

        ld1             {v4.16b-v7.16b}, [x2], #64

        transpose_4x4(v0, v1, v2, v3)

        SM4_CRYPT_BLK4(v0, v1, v2, v3)

        eor             v0.16b, v0.16b, v4.16b
        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
        eor             v3.16b, v3.16b, v7.16b

        st1             {v0.16b-v3.16b}, [x1], #64

        cbz             w4, .Lctr_crypt_end

.Lctr_crypt_tail:
        /* inc_le128 clobbers the NZCV flags, so compare after each increment */
        ld1             {v4.16b}, [x2], #16
        inc_le128(v0)
        cmp             w4, #2
        blt             .Lctr_crypt_tail_load_done

        ld1             {v5.16b}, [x2], #16
        inc_le128(v1)
        cmp             w4, #2
        beq             .Lctr_crypt_tail_load_done

        ld1             {v6.16b}, [x2], #16
        inc_le128(v2)

.Lctr_crypt_tail_load_done:
        transpose_4x4(v0, v1, v2, v3)

        SM4_CRYPT_BLK4(v0, v1, v2, v3)

        cmp             w4, #2

        eor             v0.16b, v0.16b, v4.16b
        st1             {v0.16b}, [x1], #16
        blt             .Lctr_crypt_end

        eor             v1.16b, v1.16b, v5.16b
        st1             {v1.16b}, [x1], #16
        beq             .Lctr_crypt_end

        eor             v2.16b, v2.16b, v6.16b
        st1             {v2.16b}, [x1], #16

.Lctr_crypt_end:
        /* store new CTR */
        rev             x7, x7
        rev             x8, x8
        stp             x7, x8, [x3]

        ret
SYM_FUNC_END(sm4_neon_ctr_crypt)