1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* |
3 | * x64 SIMD accelerated ChaCha and XChaCha stream ciphers, |
4 | * including ChaCha20 (RFC7539) |
5 | * |
6 | * Copyright (C) 2015 Martin Willi |
7 | */ |
8 | |
9 | #include <crypto/algapi.h> |
10 | #include <crypto/internal/chacha.h> |
11 | #include <crypto/internal/simd.h> |
12 | #include <crypto/internal/skcipher.h> |
13 | #include <linux/kernel.h> |
14 | #include <linux/module.h> |
15 | #include <linux/sizes.h> |
16 | #include <asm/simd.h> |
17 | |
/*
 * asmlinkage entry points used by this file (presumably implemented in the
 * accompanying assembly sources -- confirm against the build).
 *
 * All chacha_*block_xor_* routines share one signature: the ChaCha state
 * words, destination buffer, source buffer, byte length and round count.
 */

/* SSSE3 variants, plus the HChaCha block function. */
asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
				       unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
					unsigned int len, int nrounds);
asmlinkage void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds);

/* AVX2 variants: 2, 4 and 8 blocks per call. */
asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
				       unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
				       unsigned int len, int nrounds);
asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
				       unsigned int len, int nrounds);

/* AVX-512VL variants: 2, 4 and 8 blocks per call. */
asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
					   unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
					   unsigned int len, int nrounds);
asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
					   unsigned int len, int nrounds);
37 | |
/*
 * Feature switches, all initially false and flipped at most once by
 * chacha_simd_mod_init():
 *   chacha_use_simd     - SSSE3 was detected
 *   chacha_use_avx2     - AVX, AVX2 and the SSE|YMM xfeatures were detected
 *   chacha_use_avx512vl - additionally CONFIG_AS_AVX512 is enabled and
 *                         AVX512VL and AVX512BW were detected
 */
static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl);
41 | |
42 | static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks) |
43 | { |
44 | len = min(len, maxblocks * CHACHA_BLOCK_SIZE); |
45 | return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE; |
46 | } |
47 | |
48 | static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src, |
49 | unsigned int bytes, int nrounds) |
50 | { |
51 | if (IS_ENABLED(CONFIG_AS_AVX512) && |
52 | static_branch_likely(&chacha_use_avx512vl)) { |
53 | while (bytes >= CHACHA_BLOCK_SIZE * 8) { |
54 | chacha_8block_xor_avx512vl(state, dst, src, len: bytes, |
55 | nrounds); |
56 | bytes -= CHACHA_BLOCK_SIZE * 8; |
57 | src += CHACHA_BLOCK_SIZE * 8; |
58 | dst += CHACHA_BLOCK_SIZE * 8; |
59 | state[12] += 8; |
60 | } |
61 | if (bytes > CHACHA_BLOCK_SIZE * 4) { |
62 | chacha_8block_xor_avx512vl(state, dst, src, len: bytes, |
63 | nrounds); |
64 | state[12] += chacha_advance(len: bytes, maxblocks: 8); |
65 | return; |
66 | } |
67 | if (bytes > CHACHA_BLOCK_SIZE * 2) { |
68 | chacha_4block_xor_avx512vl(state, dst, src, len: bytes, |
69 | nrounds); |
70 | state[12] += chacha_advance(len: bytes, maxblocks: 4); |
71 | return; |
72 | } |
73 | if (bytes) { |
74 | chacha_2block_xor_avx512vl(state, dst, src, len: bytes, |
75 | nrounds); |
76 | state[12] += chacha_advance(len: bytes, maxblocks: 2); |
77 | return; |
78 | } |
79 | } |
80 | |
81 | if (static_branch_likely(&chacha_use_avx2)) { |
82 | while (bytes >= CHACHA_BLOCK_SIZE * 8) { |
83 | chacha_8block_xor_avx2(state, dst, src, len: bytes, nrounds); |
84 | bytes -= CHACHA_BLOCK_SIZE * 8; |
85 | src += CHACHA_BLOCK_SIZE * 8; |
86 | dst += CHACHA_BLOCK_SIZE * 8; |
87 | state[12] += 8; |
88 | } |
89 | if (bytes > CHACHA_BLOCK_SIZE * 4) { |
90 | chacha_8block_xor_avx2(state, dst, src, len: bytes, nrounds); |
91 | state[12] += chacha_advance(len: bytes, maxblocks: 8); |
92 | return; |
93 | } |
94 | if (bytes > CHACHA_BLOCK_SIZE * 2) { |
95 | chacha_4block_xor_avx2(state, dst, src, len: bytes, nrounds); |
96 | state[12] += chacha_advance(len: bytes, maxblocks: 4); |
97 | return; |
98 | } |
99 | if (bytes > CHACHA_BLOCK_SIZE) { |
100 | chacha_2block_xor_avx2(state, dst, src, len: bytes, nrounds); |
101 | state[12] += chacha_advance(len: bytes, maxblocks: 2); |
102 | return; |
103 | } |
104 | } |
105 | |
106 | while (bytes >= CHACHA_BLOCK_SIZE * 4) { |
107 | chacha_4block_xor_ssse3(state, dst, src, len: bytes, nrounds); |
108 | bytes -= CHACHA_BLOCK_SIZE * 4; |
109 | src += CHACHA_BLOCK_SIZE * 4; |
110 | dst += CHACHA_BLOCK_SIZE * 4; |
111 | state[12] += 4; |
112 | } |
113 | if (bytes > CHACHA_BLOCK_SIZE) { |
114 | chacha_4block_xor_ssse3(state, dst, src, len: bytes, nrounds); |
115 | state[12] += chacha_advance(len: bytes, maxblocks: 4); |
116 | return; |
117 | } |
118 | if (bytes) { |
119 | chacha_block_xor_ssse3(state, dst, src, len: bytes, nrounds); |
120 | state[12]++; |
121 | } |
122 | } |
123 | |
124 | void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds) |
125 | { |
126 | if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable()) { |
127 | hchacha_block_generic(state, out: stream, nrounds); |
128 | } else { |
129 | kernel_fpu_begin(); |
130 | hchacha_block_ssse3(state, out: stream, nrounds); |
131 | kernel_fpu_end(); |
132 | } |
133 | } |
134 | EXPORT_SYMBOL(hchacha_block_arch); |
135 | |
/*
 * chacha_init_arch - initialise a ChaCha state from @key and @iv.
 *
 * No SIMD-specific work is done here; the arguments are forwarded unchanged
 * to chacha_init_generic().
 */
void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
{
	chacha_init_generic(state, key, iv);
}
EXPORT_SYMBOL(chacha_init_arch);
141 | |
142 | void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes, |
143 | int nrounds) |
144 | { |
145 | if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable() || |
146 | bytes <= CHACHA_BLOCK_SIZE) |
147 | return chacha_crypt_generic(state, dst, src, bytes, nrounds); |
148 | |
149 | do { |
150 | unsigned int todo = min_t(unsigned int, bytes, SZ_4K); |
151 | |
152 | kernel_fpu_begin(); |
153 | chacha_dosimd(state, dst, src, bytes: todo, nrounds); |
154 | kernel_fpu_end(); |
155 | |
156 | bytes -= todo; |
157 | src += todo; |
158 | dst += todo; |
159 | } while (bytes); |
160 | } |
161 | EXPORT_SYMBOL(chacha_crypt_arch); |
162 | |
163 | static int chacha_simd_stream_xor(struct skcipher_request *req, |
164 | const struct chacha_ctx *ctx, const u8 *iv) |
165 | { |
166 | u32 state[CHACHA_STATE_WORDS] __aligned(8); |
167 | struct skcipher_walk walk; |
168 | int err; |
169 | |
170 | err = skcipher_walk_virt(walk: &walk, req, atomic: false); |
171 | |
172 | chacha_init_generic(state, key: ctx->key, iv); |
173 | |
174 | while (walk.nbytes > 0) { |
175 | unsigned int nbytes = walk.nbytes; |
176 | |
177 | if (nbytes < walk.total) |
178 | nbytes = round_down(nbytes, walk.stride); |
179 | |
180 | if (!static_branch_likely(&chacha_use_simd) || |
181 | !crypto_simd_usable()) { |
182 | chacha_crypt_generic(state, dst: walk.dst.virt.addr, |
183 | src: walk.src.virt.addr, bytes: nbytes, |
184 | nrounds: ctx->nrounds); |
185 | } else { |
186 | kernel_fpu_begin(); |
187 | chacha_dosimd(state, dst: walk.dst.virt.addr, |
188 | src: walk.src.virt.addr, bytes: nbytes, |
189 | nrounds: ctx->nrounds); |
190 | kernel_fpu_end(); |
191 | } |
192 | err = skcipher_walk_done(walk: &walk, err: walk.nbytes - nbytes); |
193 | } |
194 | |
195 | return err; |
196 | } |
197 | |
198 | static int chacha_simd(struct skcipher_request *req) |
199 | { |
200 | struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); |
201 | struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); |
202 | |
203 | return chacha_simd_stream_xor(req, ctx, iv: req->iv); |
204 | } |
205 | |
206 | static int xchacha_simd(struct skcipher_request *req) |
207 | { |
208 | struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); |
209 | struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); |
210 | u32 state[CHACHA_STATE_WORDS] __aligned(8); |
211 | struct chacha_ctx subctx; |
212 | u8 real_iv[16]; |
213 | |
214 | chacha_init_generic(state, key: ctx->key, iv: req->iv); |
215 | |
216 | if (req->cryptlen > CHACHA_BLOCK_SIZE && crypto_simd_usable()) { |
217 | kernel_fpu_begin(); |
218 | hchacha_block_ssse3(state, out: subctx.key, nrounds: ctx->nrounds); |
219 | kernel_fpu_end(); |
220 | } else { |
221 | hchacha_block_generic(state, out: subctx.key, nrounds: ctx->nrounds); |
222 | } |
223 | subctx.nrounds = ctx->nrounds; |
224 | |
225 | memcpy(&real_iv[0], req->iv + 24, 8); |
226 | memcpy(&real_iv[8], req->iv + 16, 8); |
227 | return chacha_simd_stream_xor(req, ctx: &subctx, iv: real_iv); |
228 | } |
229 | |
/*
 * skcipher algorithms registered by chacha_simd_mod_init(). All three use
 * priority 300, a block size of 1, a chunk size of CHACHA_BLOCK_SIZE and the
 * same handler for encryption and decryption.
 */
static struct skcipher_alg algs[] = {
	{
		/* ChaCha20 with a CHACHA_IV_SIZE IV. */
		.base.cra_name		= "chacha20",
		.base.cra_driver_name	= "chacha20-simd",
		.base.cra_priority	= 300,
		.base.cra_blocksize	= 1,
		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
		.base.cra_module	= THIS_MODULE,

		.min_keysize		= CHACHA_KEY_SIZE,
		.max_keysize		= CHACHA_KEY_SIZE,
		.ivsize			= CHACHA_IV_SIZE,
		.chunksize		= CHACHA_BLOCK_SIZE,
		.setkey			= chacha20_setkey,
		.encrypt		= chacha_simd,
		.decrypt		= chacha_simd,
	}, {
		/* XChaCha20: XCHACHA_IV_SIZE IV, handled by xchacha_simd(). */
		.base.cra_name		= "xchacha20",
		.base.cra_driver_name	= "xchacha20-simd",
		.base.cra_priority	= 300,
		.base.cra_blocksize	= 1,
		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
		.base.cra_module	= THIS_MODULE,

		.min_keysize		= CHACHA_KEY_SIZE,
		.max_keysize		= CHACHA_KEY_SIZE,
		.ivsize			= XCHACHA_IV_SIZE,
		.chunksize		= CHACHA_BLOCK_SIZE,
		.setkey			= chacha20_setkey,
		.encrypt		= xchacha_simd,
		.decrypt		= xchacha_simd,
	}, {
		/* XChaCha12: as above, but keyed through chacha12_setkey(). */
		.base.cra_name		= "xchacha12",
		.base.cra_driver_name	= "xchacha12-simd",
		.base.cra_priority	= 300,
		.base.cra_blocksize	= 1,
		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
		.base.cra_module	= THIS_MODULE,

		.min_keysize		= CHACHA_KEY_SIZE,
		.max_keysize		= CHACHA_KEY_SIZE,
		.ivsize			= XCHACHA_IV_SIZE,
		.chunksize		= CHACHA_BLOCK_SIZE,
		.setkey			= chacha12_setkey,
		.encrypt		= xchacha_simd,
		.decrypt		= xchacha_simd,
	},
};
278 | |
279 | static int __init chacha_simd_mod_init(void) |
280 | { |
281 | if (!boot_cpu_has(X86_FEATURE_SSSE3)) |
282 | return 0; |
283 | |
284 | static_branch_enable(&chacha_use_simd); |
285 | |
286 | if (boot_cpu_has(X86_FEATURE_AVX) && |
287 | boot_cpu_has(X86_FEATURE_AVX2) && |
288 | cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { |
289 | static_branch_enable(&chacha_use_avx2); |
290 | |
291 | if (IS_ENABLED(CONFIG_AS_AVX512) && |
292 | boot_cpu_has(X86_FEATURE_AVX512VL) && |
293 | boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */ |
294 | static_branch_enable(&chacha_use_avx512vl); |
295 | } |
296 | return IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) ? |
297 | crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0; |
298 | } |
299 | |
300 | static void __exit chacha_simd_mod_fini(void) |
301 | { |
302 | if (IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) && boot_cpu_has(X86_FEATURE_SSSE3)) |
303 | crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); |
304 | } |
305 | |
/* Module entry and exit points. */
module_init(chacha_simd_mod_init);
module_exit(chacha_simd_mod_fini);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (x64 SIMD accelerated)");
/* One crypto alias per cra_name and cra_driver_name listed in algs[]. */
MODULE_ALIAS_CRYPTO("chacha20");
MODULE_ALIAS_CRYPTO("chacha20-simd");
MODULE_ALIAS_CRYPTO("xchacha20");
MODULE_ALIAS_CRYPTO("xchacha20-simd");
MODULE_ALIAS_CRYPTO("xchacha12");
MODULE_ALIAS_CRYPTO("xchacha12-simd");
318 | |