aegis128-neon-inner.c source code [linux/crypto/aegis128-neon-inner.c]

1	// SPDX-License-Identifier: GPL-2.0-or-later
2	/*
3	* Copyright (C) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
4	*/
5
6	#ifdef CONFIG_ARM64
7	#include <asm/neon-intrinsics.h>
8
9	#define AES_ROUND "aese %0.16b, %1.16b \n\t aesmc %0.16b, %0.16b"
10	#else
11	#include <arm_neon.h>
12
13	#define AES_ROUND "aese.8 %q0, %q1 \n\t aesmc.8 %q0, %q0"
14	#endif
15
16	#define AEGIS_BLOCK_SIZE 16
17
18	#include <stddef.h>
19	#include "aegis-neon.h"
20
21	extern int aegis128_have_aes_insn;
22
/*
 * Local prototype for memcpy(). The partial-block paths in the chunk
 * functions below use the returned destination pointer, so the return type
 * must be 'void *'.
 */
void *memcpy(void *dest, const void *src, size_t n);
24
25	struct aegis128_state {
26	uint8x16_t v[`5`];
27	};
28
29	extern const uint8_t crypto_aes_sbox[];
30
31	static struct aegis128_state aegis128_load_state_neon(const void *state)
32	{
33	return (struct aegis128_state){ {
34	vld1q_u8(state),
35	vld1q_u8(state + `16`),
36	vld1q_u8(state + `32`),
37	vld1q_u8(state + `48`),
38	vld1q_u8(state + `64`)
39	} };
40	}
41
42	static void aegis128_save_state_neon(struct aegis128_state st, void *state)
43	{
44	vst1q_u8(state, st.v[`0`]);
45	vst1q_u8(state + `16`, st.v[`1`]);
46	vst1q_u8(state + `32`, st.v[`2`]);
47	vst1q_u8(state + `48`, st.v[`3`]);
48	vst1q_u8(state + `64`, st.v[`4`]);
49	}
50
51	static inline __attribute__((always_inline))
52	uint8x16_t aegis_aes_round(uint8x16_t w)
53	{
54	uint8x16_t z = {};
55
56	#ifdef CONFIG_ARM64
57	if (!__builtin_expect(aegis128_have_aes_insn, `1`)) {
58	static const uint8_t shift_rows[] = {
59	`0x0`, `0x5`, `0xa`, `0xf`, `0x4`, `0x9`, `0xe`, `0x3`,
60	`0x8`, `0xd`, `0x2`, `0x7`, `0xc`, `0x1`, `0x6`, `0xb`,
61	};
62	static const uint8_t ror32by8[] = {
63	`0x1`, `0x2`, `0x3`, `0x0`, `0x5`, `0x6`, `0x7`, `0x4`,
64	`0x9`, `0xa`, `0xb`, `0x8`, `0xd`, `0xe`, `0xf`, `0xc`,
65	};
66	uint8x16_t v;
67
68	// shift rows
69	w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
70
71	// sub bytes
72	#ifndef CONFIG_CC_IS_GCC
73	v = vqtbl4q_u8(vld1q_u8_x4(crypto_aes_sbox), w);
74	v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + `0x40`), w - `0x40`);
75	v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + `0x80`), w - `0x80`);
76	v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + `0xc0`), w - `0xc0`);
77	#else
78	asm("tbl %0.16b, {v16.16b-v19.16b}, %1.16b" : "=w"(v) : "w"(w));
79	w -= `0x40`;
80	asm("tbx %0.16b, {v20.16b-v23.16b}, %1.16b" : "+w"(v) : "w"(w));
81	w -= `0x40`;
82	asm("tbx %0.16b, {v24.16b-v27.16b}, %1.16b" : "+w"(v) : "w"(w));
83	w -= `0x40`;
84	asm("tbx %0.16b, {v28.16b-v31.16b}, %1.16b" : "+w"(v) : "w"(w));
85	#endif
86
87	// mix columns
88	w = (v << `1`) ^ (uint8x16_t)(((int8x16_t)v >> `7`) & `0x1b`);
89	w ^= (uint8x16_t)vrev32q_u16((uint16x8_t)v);
90	w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
91
92	return w;
93	}
94	#endif
95
96	/*
97	* We use inline asm here instead of the vaeseq_u8/vaesmcq_u8 intrinsics
98	* to force the compiler to issue the aese/aesmc instructions in pairs.
99	* This is much faster on many cores, where the instruction pair can
100	* execute in a single cycle.
101	*/
102	asm(AES_ROUND : "+w"(w) : "w"(z));
103	return w;
104	}
105
106	static inline __attribute__((always_inline))
107	struct aegis128_state aegis128_update_neon(struct aegis128_state st,
108	uint8x16_t m)
109	{
110	m ^= aegis_aes_round(st.v[`4`]);
111	st.v[`4`] ^= aegis_aes_round(st.v[`3`]);
112	st.v[`3`] ^= aegis_aes_round(st.v[`2`]);
113	st.v[`2`] ^= aegis_aes_round(st.v[`1`]);
114	st.v[`1`] ^= aegis_aes_round(st.v[`0`]);
115	st.v[`0`] ^= m;
116
117	return st;
118	}
119
120	static inline __attribute__((always_inline))
121	void preload_sbox(void)
122	{
123	if (!IS_ENABLED(CONFIG_ARM64) \|\|
124	!IS_ENABLED(CONFIG_CC_IS_GCC) \|\|
125	__builtin_expect(aegis128_have_aes_insn, `1`))
126	return;
127
128	asm("ld1 {v16.16b-v19.16b}, [%0], #64 \n\t"
129	"ld1 {v20.16b-v23.16b}, [%0], #64 \n\t"
130	"ld1 {v24.16b-v27.16b}, [%0], #64 \n\t"
131	"ld1 {v28.16b-v31.16b}, [%0] \n\t"
132	:: "r"(crypto_aes_sbox));
133	}
134
135	void crypto_aegis128_init_neon(void state, const* void key, const* void *iv)
136	{
137	static const uint8_t const0[] = {
138	`0x00`, `0x01`, `0x01`, `0x02`, `0x03`, `0x05`, `0x08`, `0x0d`,
139	`0x15`, `0x22`, `0x37`, `0x59`, `0x90`, `0xe9`, `0x79`, `0x62`,
140	};
141	static const uint8_t const1[] = {
142	`0xdb`, `0x3d`, `0x18`, `0x55`, `0x6d`, `0xc2`, `0x2f`, `0xf1`,
143	`0x20`, `0x11`, `0x31`, `0x42`, `0x73`, `0xb5`, `0x28`, `0xdd`,
144	};
145	uint8x16_t k = vld1q_u8(key);
146	uint8x16_t kiv = k ^ vld1q_u8(iv);
147	struct aegis128_state st = {{
148	kiv,
149	vld1q_u8(const1),
150	vld1q_u8(const0),
151	k ^ vld1q_u8(const0),
152	k ^ vld1q_u8(const1),
153	}};
154	int i;
155
156	preload_sbox();
157
158	for (i = `0`; i < `5`; i++) {
159	st = aegis128_update_neon(st, k);
160	st = aegis128_update_neon(st, kiv);
161	}
162	aegis128_save_state_neon(st, state);
163	}
164
165	void crypto_aegis128_update_neon(void state, const* void *msg)
166	{
167	struct aegis128_state st = aegis128_load_state_neon(state);
168
169	preload_sbox();
170
171	st = aegis128_update_neon(st, vld1q_u8(msg));
172
173	aegis128_save_state_neon(st, state);
174	}
175
176	#ifdef CONFIG_ARM
/*
 * AArch32 does not provide these intrinsics natively because it does not
 * implement the underlying instructions. AArch32 only provides the 64-bit
 * wide vtbl.8/vtbx.8 instructions, so use those instead.
 */
182	static uint8x16_t vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
183	{
184	union {
185	uint8x16_t val;
186	uint8x8x2_t pair;
187	} __a = { a };
188
189	return vcombine_u8(vtbl2_u8(__a.pair, vget_low_u8(b)),
190	vtbl2_u8(__a.pair, vget_high_u8(b)));
191	}
192
193	static uint8x16_t vqtbx1q_u8(uint8x16_t v, uint8x16_t a, uint8x16_t b)
194	{
195	union {
196	uint8x16_t val;
197	uint8x8x2_t pair;
198	} __a = { a };
199
200	return vcombine_u8(vtbx2_u8(vget_low_u8(v), __a.pair, vget_low_u8(b)),
201	vtbx2_u8(vget_high_u8(v), __a.pair, vget_high_u8(b)));
202	}
203
204	static int8_t vminvq_s8(int8x16_t v)
205	{
206	int8x8_t s = vpmin_s8(vget_low_s8(v), vget_high_s8(v));
207
208	s = vpmin_s8(s, s);
209	s = vpmin_s8(s, s);
210	s = vpmin_s8(s, s);
211
212	return vget_lane_s8(s, `0`);
213	}
214	#endif
215
216	static const uint8_t permute[] __aligned(`64`) = {
217	-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`,
218	`0`, `1`, `2`, `3`, `4`, `5`, `6`, `7`, `8`, `9`, `10`, `11`, `12`, `13`, `14`, `15`,
219	-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`,
220	};
221
222	void crypto_aegis128_encrypt_chunk_neon(void state, void* dst, const* void *src,
223	unsigned int size)
224	{
225	struct aegis128_state st = aegis128_load_state_neon(state);
226	const int short_input = size < AEGIS_BLOCK_SIZE;
227	uint8x16_t msg;
228
229	preload_sbox();
230
231	while (size >= AEGIS_BLOCK_SIZE) {
232	uint8x16_t s = st.v[`1`] ^ (st.v[`2`] & st.v[`3`]) ^ st.v[`4`];
233
234	msg = vld1q_u8(src);
235	st = aegis128_update_neon(st, msg);
236	msg ^= s;
237	vst1q_u8(dst, msg);
238
239	size -= AEGIS_BLOCK_SIZE;
240	src += AEGIS_BLOCK_SIZE;
241	dst += AEGIS_BLOCK_SIZE;
242	}
243
244	if (size > `0`) {
245	uint8x16_t s = st.v[`1`] ^ (st.v[`2`] & st.v[`3`]) ^ st.v[`4`];
246	uint8_t buf[AEGIS_BLOCK_SIZE];
247	const void *in = src;
248	void *out = dst;
249	uint8x16_t m;
250
251	if (__builtin_expect(short_input, `0`))
252	in = out = memcpy(buf + AEGIS_BLOCK_SIZE - size, src, size);
253
254	m = vqtbl1q_u8(vld1q_u8(in + size - AEGIS_BLOCK_SIZE),
255	vld1q_u8(permute + `32` - size));
256
257	st = aegis128_update_neon(st, m);
258
259	vst1q_u8(out + size - AEGIS_BLOCK_SIZE,
260	vqtbl1q_u8(m ^ s, vld1q_u8(permute + size)));
261
262	if (__builtin_expect(short_input, `0`))
263	memcpy(dst, out, size);
264	else
265	vst1q_u8(out - AEGIS_BLOCK_SIZE, msg);
266	}
267
268	aegis128_save_state_neon(st, state);
269	}
270
271	void crypto_aegis128_decrypt_chunk_neon(void state, void* dst, const* void *src,
272	unsigned int size)
273	{
274	struct aegis128_state st = aegis128_load_state_neon(state);
275	const int short_input = size < AEGIS_BLOCK_SIZE;
276	uint8x16_t msg;
277
278	preload_sbox();
279
280	while (size >= AEGIS_BLOCK_SIZE) {
281	msg = vld1q_u8(src) ^ st.v[`1`] ^ (st.v[`2`] & st.v[`3`]) ^ st.v[`4`];
282	st = aegis128_update_neon(st, msg);
283	vst1q_u8(dst, msg);
284
285	size -= AEGIS_BLOCK_SIZE;
286	src += AEGIS_BLOCK_SIZE;
287	dst += AEGIS_BLOCK_SIZE;
288	}
289
290	if (size > `0`) {
291	uint8x16_t s = st.v[`1`] ^ (st.v[`2`] & st.v[`3`]) ^ st.v[`4`];
292	uint8_t buf[AEGIS_BLOCK_SIZE];
293	const void *in = src;
294	void *out = dst;
295	uint8x16_t m;
296
297	if (__builtin_expect(short_input, `0`))
298	in = out = memcpy(buf + AEGIS_BLOCK_SIZE - size, src, size);
299
300	m = s ^ vqtbx1q_u8(s, vld1q_u8(in + size - AEGIS_BLOCK_SIZE),
301	vld1q_u8(permute + `32` - size));
302
303	st = aegis128_update_neon(st, m);
304
305	vst1q_u8(out + size - AEGIS_BLOCK_SIZE,
306	vqtbl1q_u8(m, vld1q_u8(permute + size)));
307
308	if (__builtin_expect(short_input, `0`))
309	memcpy(dst, out, size);
310	else
311	vst1q_u8(out - AEGIS_BLOCK_SIZE, msg);
312	}
313
314	aegis128_save_state_neon(st, state);
315	}
316
317	int crypto_aegis128_final_neon(void state, void* *tag_xor,
318	unsigned int assoclen,
319	unsigned int cryptlen,
320	unsigned int authsize)
321	{
322	struct aegis128_state st = aegis128_load_state_neon(state);
323	uint8x16_t v;
324	int i;
325
326	preload_sbox();
327
328	v = st.v[`3`] ^ (uint8x16_t)vcombine_u64(vmov_n_u64(`8ULL` * assoclen),
329	vmov_n_u64(`8ULL` * cryptlen));
330
331	for (i = `0`; i < `7`; i++)
332	st = aegis128_update_neon(st, v);
333
334	v = st.v[`0`] ^ st.v[`1`] ^ st.v[`2`] ^ st.v[`3`] ^ st.v[`4`];
335
336	if (authsize > `0`) {
337	v = vqtbl1q_u8(~vceqq_u8(v, vld1q_u8(tag_xor)),
338	vld1q_u8(permute + authsize));
339
340	return vminvq_s8((int8x16_t)v);
341	}
342
343	vst1q_u8(tag_xor, v);
344	return `0`;
345	}
346

source code of linux/crypto/aegis128-neon-inner.c