1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* |
3 | * ARM NEON accelerated ChaCha and XChaCha stream ciphers, |
4 | * including ChaCha20 (RFC7539) |
5 | * |
6 | * Copyright (C) 2016-2019 Linaro, Ltd. <ard.biesheuvel@linaro.org> |
7 | * Copyright (C) 2015 Martin Willi |
8 | */ |
9 | |
10 | #include <crypto/algapi.h> |
11 | #include <crypto/internal/chacha.h> |
12 | #include <crypto/internal/simd.h> |
13 | #include <crypto/internal/skcipher.h> |
14 | #include <linux/jump_label.h> |
15 | #include <linux/kernel.h> |
16 | #include <linux/module.h> |
17 | |
18 | #include <asm/cputype.h> |
19 | #include <asm/hwcap.h> |
20 | #include <asm/neon.h> |
21 | #include <asm/simd.h> |
22 | |
23 | asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src, |
24 | int nrounds); |
25 | asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src, |
26 | int nrounds, unsigned int nbytes); |
27 | asmlinkage void hchacha_block_arm(const u32 *state, u32 *out, int nrounds); |
28 | asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds); |
29 | |
30 | asmlinkage void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes, |
31 | const u32 *state, int nrounds); |
32 | |
33 | static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_neon); |
34 | |
35 | static inline bool neon_usable(void) |
36 | { |
37 | return static_branch_likely(&use_neon) && crypto_simd_usable(); |
38 | } |
39 | |
40 | static void chacha_doneon(u32 *state, u8 *dst, const u8 *src, |
41 | unsigned int bytes, int nrounds) |
42 | { |
43 | u8 buf[CHACHA_BLOCK_SIZE]; |
44 | |
45 | while (bytes > CHACHA_BLOCK_SIZE) { |
46 | unsigned int l = min(bytes, CHACHA_BLOCK_SIZE * 4U); |
47 | |
48 | chacha_4block_xor_neon(state, dst, src, nrounds, nbytes: l); |
49 | bytes -= l; |
50 | src += l; |
51 | dst += l; |
52 | state[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE); |
53 | } |
54 | if (bytes) { |
55 | const u8 *s = src; |
56 | u8 *d = dst; |
57 | |
58 | if (bytes != CHACHA_BLOCK_SIZE) |
59 | s = d = memcpy(buf, src, bytes); |
60 | chacha_block_xor_neon(state, dst: d, src: s, nrounds); |
61 | if (d != dst) |
62 | memcpy(dst, buf, bytes); |
63 | state[12]++; |
64 | } |
65 | } |
66 | |
67 | void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds) |
68 | { |
69 | if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable()) { |
70 | hchacha_block_arm(state, out: stream, nrounds); |
71 | } else { |
72 | kernel_neon_begin(); |
73 | hchacha_block_neon(state, out: stream, nrounds); |
74 | kernel_neon_end(); |
75 | } |
76 | } |
77 | EXPORT_SYMBOL(hchacha_block_arch); |
78 | |
/*
 * Initialise the ChaCha state matrix from key and IV.  No SIMD benefit
 * here, so simply defer to the generic implementation.
 */
void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
{
	chacha_init_generic(state, key, iv);
}
EXPORT_SYMBOL(chacha_init_arch);
84 | |
85 | void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes, |
86 | int nrounds) |
87 | { |
88 | if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable() || |
89 | bytes <= CHACHA_BLOCK_SIZE) { |
90 | chacha_doarm(dst, src, bytes, state, nrounds); |
91 | state[12] += DIV_ROUND_UP(bytes, CHACHA_BLOCK_SIZE); |
92 | return; |
93 | } |
94 | |
95 | do { |
96 | unsigned int todo = min_t(unsigned int, bytes, SZ_4K); |
97 | |
98 | kernel_neon_begin(); |
99 | chacha_doneon(state, dst, src, bytes: todo, nrounds); |
100 | kernel_neon_end(); |
101 | |
102 | bytes -= todo; |
103 | src += todo; |
104 | dst += todo; |
105 | } while (bytes); |
106 | } |
107 | EXPORT_SYMBOL(chacha_crypt_arch); |
108 | |
109 | static int chacha_stream_xor(struct skcipher_request *req, |
110 | const struct chacha_ctx *ctx, const u8 *iv, |
111 | bool neon) |
112 | { |
113 | struct skcipher_walk walk; |
114 | u32 state[16]; |
115 | int err; |
116 | |
117 | err = skcipher_walk_virt(walk: &walk, req, atomic: false); |
118 | |
119 | chacha_init_generic(state, key: ctx->key, iv); |
120 | |
121 | while (walk.nbytes > 0) { |
122 | unsigned int nbytes = walk.nbytes; |
123 | |
124 | if (nbytes < walk.total) |
125 | nbytes = round_down(nbytes, walk.stride); |
126 | |
127 | if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon) { |
128 | chacha_doarm(dst: walk.dst.virt.addr, src: walk.src.virt.addr, |
129 | bytes: nbytes, state, nrounds: ctx->nrounds); |
130 | state[12] += DIV_ROUND_UP(nbytes, CHACHA_BLOCK_SIZE); |
131 | } else { |
132 | kernel_neon_begin(); |
133 | chacha_doneon(state, dst: walk.dst.virt.addr, |
134 | src: walk.src.virt.addr, bytes: nbytes, nrounds: ctx->nrounds); |
135 | kernel_neon_end(); |
136 | } |
137 | err = skcipher_walk_done(walk: &walk, err: walk.nbytes - nbytes); |
138 | } |
139 | |
140 | return err; |
141 | } |
142 | |
143 | static int do_chacha(struct skcipher_request *req, bool neon) |
144 | { |
145 | struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); |
146 | struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); |
147 | |
148 | return chacha_stream_xor(req, ctx, iv: req->iv, neon); |
149 | } |
150 | |
151 | static int chacha_arm(struct skcipher_request *req) |
152 | { |
153 | return do_chacha(req, neon: false); |
154 | } |
155 | |
/* NEON-capable ChaCha; removed the invalid "neon:" argument label. */
static int chacha_neon(struct skcipher_request *req)
{
	return do_chacha(req, neon_usable());
}
160 | |
161 | static int do_xchacha(struct skcipher_request *req, bool neon) |
162 | { |
163 | struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); |
164 | struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); |
165 | struct chacha_ctx subctx; |
166 | u32 state[16]; |
167 | u8 real_iv[16]; |
168 | |
169 | chacha_init_generic(state, key: ctx->key, iv: req->iv); |
170 | |
171 | if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon) { |
172 | hchacha_block_arm(state, out: subctx.key, nrounds: ctx->nrounds); |
173 | } else { |
174 | kernel_neon_begin(); |
175 | hchacha_block_neon(state, out: subctx.key, nrounds: ctx->nrounds); |
176 | kernel_neon_end(); |
177 | } |
178 | subctx.nrounds = ctx->nrounds; |
179 | |
180 | memcpy(&real_iv[0], req->iv + 24, 8); |
181 | memcpy(&real_iv[8], req->iv + 16, 8); |
182 | return chacha_stream_xor(req, ctx: &subctx, iv: real_iv, neon); |
183 | } |
184 | |
185 | static int xchacha_arm(struct skcipher_request *req) |
186 | { |
187 | return do_xchacha(req, neon: false); |
188 | } |
189 | |
/* NEON-capable XChaCha; removed the invalid "neon:" argument label. */
static int xchacha_neon(struct skcipher_request *req)
{
	return do_xchacha(req, neon_usable());
}
194 | |
195 | static struct skcipher_alg arm_algs[] = { |
196 | { |
197 | .base.cra_name = "chacha20" , |
198 | .base.cra_driver_name = "chacha20-arm" , |
199 | .base.cra_priority = 200, |
200 | .base.cra_blocksize = 1, |
201 | .base.cra_ctxsize = sizeof(struct chacha_ctx), |
202 | .base.cra_module = THIS_MODULE, |
203 | |
204 | .min_keysize = CHACHA_KEY_SIZE, |
205 | .max_keysize = CHACHA_KEY_SIZE, |
206 | .ivsize = CHACHA_IV_SIZE, |
207 | .chunksize = CHACHA_BLOCK_SIZE, |
208 | .setkey = chacha20_setkey, |
209 | .encrypt = chacha_arm, |
210 | .decrypt = chacha_arm, |
211 | }, { |
212 | .base.cra_name = "xchacha20" , |
213 | .base.cra_driver_name = "xchacha20-arm" , |
214 | .base.cra_priority = 200, |
215 | .base.cra_blocksize = 1, |
216 | .base.cra_ctxsize = sizeof(struct chacha_ctx), |
217 | .base.cra_module = THIS_MODULE, |
218 | |
219 | .min_keysize = CHACHA_KEY_SIZE, |
220 | .max_keysize = CHACHA_KEY_SIZE, |
221 | .ivsize = XCHACHA_IV_SIZE, |
222 | .chunksize = CHACHA_BLOCK_SIZE, |
223 | .setkey = chacha20_setkey, |
224 | .encrypt = xchacha_arm, |
225 | .decrypt = xchacha_arm, |
226 | }, { |
227 | .base.cra_name = "xchacha12" , |
228 | .base.cra_driver_name = "xchacha12-arm" , |
229 | .base.cra_priority = 200, |
230 | .base.cra_blocksize = 1, |
231 | .base.cra_ctxsize = sizeof(struct chacha_ctx), |
232 | .base.cra_module = THIS_MODULE, |
233 | |
234 | .min_keysize = CHACHA_KEY_SIZE, |
235 | .max_keysize = CHACHA_KEY_SIZE, |
236 | .ivsize = XCHACHA_IV_SIZE, |
237 | .chunksize = CHACHA_BLOCK_SIZE, |
238 | .setkey = chacha12_setkey, |
239 | .encrypt = xchacha_arm, |
240 | .decrypt = xchacha_arm, |
241 | }, |
242 | }; |
243 | |
244 | static struct skcipher_alg neon_algs[] = { |
245 | { |
246 | .base.cra_name = "chacha20" , |
247 | .base.cra_driver_name = "chacha20-neon" , |
248 | .base.cra_priority = 300, |
249 | .base.cra_blocksize = 1, |
250 | .base.cra_ctxsize = sizeof(struct chacha_ctx), |
251 | .base.cra_module = THIS_MODULE, |
252 | |
253 | .min_keysize = CHACHA_KEY_SIZE, |
254 | .max_keysize = CHACHA_KEY_SIZE, |
255 | .ivsize = CHACHA_IV_SIZE, |
256 | .chunksize = CHACHA_BLOCK_SIZE, |
257 | .walksize = 4 * CHACHA_BLOCK_SIZE, |
258 | .setkey = chacha20_setkey, |
259 | .encrypt = chacha_neon, |
260 | .decrypt = chacha_neon, |
261 | }, { |
262 | .base.cra_name = "xchacha20" , |
263 | .base.cra_driver_name = "xchacha20-neon" , |
264 | .base.cra_priority = 300, |
265 | .base.cra_blocksize = 1, |
266 | .base.cra_ctxsize = sizeof(struct chacha_ctx), |
267 | .base.cra_module = THIS_MODULE, |
268 | |
269 | .min_keysize = CHACHA_KEY_SIZE, |
270 | .max_keysize = CHACHA_KEY_SIZE, |
271 | .ivsize = XCHACHA_IV_SIZE, |
272 | .chunksize = CHACHA_BLOCK_SIZE, |
273 | .walksize = 4 * CHACHA_BLOCK_SIZE, |
274 | .setkey = chacha20_setkey, |
275 | .encrypt = xchacha_neon, |
276 | .decrypt = xchacha_neon, |
277 | }, { |
278 | .base.cra_name = "xchacha12" , |
279 | .base.cra_driver_name = "xchacha12-neon" , |
280 | .base.cra_priority = 300, |
281 | .base.cra_blocksize = 1, |
282 | .base.cra_ctxsize = sizeof(struct chacha_ctx), |
283 | .base.cra_module = THIS_MODULE, |
284 | |
285 | .min_keysize = CHACHA_KEY_SIZE, |
286 | .max_keysize = CHACHA_KEY_SIZE, |
287 | .ivsize = XCHACHA_IV_SIZE, |
288 | .chunksize = CHACHA_BLOCK_SIZE, |
289 | .walksize = 4 * CHACHA_BLOCK_SIZE, |
290 | .setkey = chacha12_setkey, |
291 | .encrypt = xchacha_neon, |
292 | .decrypt = xchacha_neon, |
293 | } |
294 | }; |
295 | |
296 | static int __init chacha_simd_mod_init(void) |
297 | { |
298 | int err = 0; |
299 | |
300 | if (IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER)) { |
301 | err = crypto_register_skciphers(algs: arm_algs, ARRAY_SIZE(arm_algs)); |
302 | if (err) |
303 | return err; |
304 | } |
305 | |
306 | if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) { |
307 | int i; |
308 | |
309 | switch (read_cpuid_part()) { |
310 | case ARM_CPU_PART_CORTEX_A7: |
311 | case ARM_CPU_PART_CORTEX_A5: |
312 | /* |
313 | * The Cortex-A7 and Cortex-A5 do not perform well with |
314 | * the NEON implementation but do incredibly with the |
315 | * scalar one and use less power. |
316 | */ |
317 | for (i = 0; i < ARRAY_SIZE(neon_algs); i++) |
318 | neon_algs[i].base.cra_priority = 0; |
319 | break; |
320 | default: |
321 | static_branch_enable(&use_neon); |
322 | } |
323 | |
324 | if (IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER)) { |
325 | err = crypto_register_skciphers(algs: neon_algs, ARRAY_SIZE(neon_algs)); |
326 | if (err) |
327 | crypto_unregister_skciphers(algs: arm_algs, ARRAY_SIZE(arm_algs)); |
328 | } |
329 | } |
330 | return err; |
331 | } |
332 | |
333 | static void __exit chacha_simd_mod_fini(void) |
334 | { |
335 | if (IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER)) { |
336 | crypto_unregister_skciphers(algs: arm_algs, ARRAY_SIZE(arm_algs)); |
337 | if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) |
338 | crypto_unregister_skciphers(algs: neon_algs, ARRAY_SIZE(neon_algs)); |
339 | } |
340 | } |
341 | |
342 | module_init(chacha_simd_mod_init); |
343 | module_exit(chacha_simd_mod_fini); |
344 | |
345 | MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (scalar and NEON accelerated)" ); |
346 | MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>" ); |
347 | MODULE_LICENSE("GPL v2" ); |
348 | MODULE_ALIAS_CRYPTO("chacha20" ); |
349 | MODULE_ALIAS_CRYPTO("chacha20-arm" ); |
350 | MODULE_ALIAS_CRYPTO("xchacha20" ); |
351 | MODULE_ALIAS_CRYPTO("xchacha20-arm" ); |
352 | MODULE_ALIAS_CRYPTO("xchacha12" ); |
353 | MODULE_ALIAS_CRYPTO("xchacha12-arm" ); |
354 | #ifdef CONFIG_KERNEL_MODE_NEON |
355 | MODULE_ALIAS_CRYPTO("chacha20-neon" ); |
356 | MODULE_ALIAS_CRYPTO("xchacha20-neon" ); |
357 | MODULE_ALIAS_CRYPTO("xchacha12-neon" ); |
358 | #endif |
359 | |