nh-neon-core.S source code [linux/arch/arm64/crypto/nh-neon-core.S]

/* SPDX-License-Identifier: GPL-2.0 */
2	/*
3	* NH - ε-almost-universal hash function, ARM64 NEON accelerated version
4	*
5	* Copyright 2018 Google LLC
6	*
7	* Author: Eric Biggers <ebiggers@google.com>
8	*/
9
10	#include <linux/linkage.h>
11	#include <linux/cfi_types.h>
12
13	KEY .req x0
14	MESSAGE .req x1
15	MESSAGE_LEN .req x2
16	HASH .req x3
17
18	PASS0_SUMS .req v0
19	PASS1_SUMS .req v1
20	PASS2_SUMS .req v2
21	PASS3_SUMS .req v3
22	K0 .req v4
23	K1 .req v5
24	K2 .req v6
25	K3 .req v7
26	T0 .req v8
27	T1 .req v9
28	T2 .req v10
29	T3 .req v11
30	T4 .req v12
31	T5 .req v13
32	T6 .req v14
33	T7 .req v15
34
/*
 * _nh_stride - fold one 16-byte message stride into all four pass sums
 *
 * \key_a, \key_b, \key_c: vectors already holding the key strides used by
 *			   passes 0, 1 and 2
 * \key_new:		   vector that receives the next 16 bytes from [KEY]
 *			   and is then used by pass 3
 *
 * Post-increments KEY and MESSAGE by 16 each.  Overwrites T0-T7 and
 * \key_new; accumulates into PASS0_SUMS..PASS3_SUMS.
 */
.macro _nh_stride	key_a, key_b, key_c, key_new

	// Pull in the next key stride and the next message stride
	ld1		{\key_new\().4s}, [KEY], #16
	ld1		{T3.16b}, [MESSAGE], #16

	// Per-pass sums of message words and key words (32-bit lanes).
	// T3 holds the message, so the add that overwrites it comes last.
	add		T0.4s, T3.4s, \key_a\().4s
	add		T1.4s, T3.4s, \key_b\().4s
	add		T2.4s, T3.4s, \key_c\().4s
	add		T3.4s, T3.4s, \key_new\().4s

	// Bring the upper two words of each sum down to the low half of a
	// second register so they line up with the lower two words
	ins		T4.d[0], T0.d[1]
	ins		T5.d[0], T1.d[1]
	ins		T6.d[0], T2.d[1]
	ins		T7.d[0], T3.d[1]

	// Unsigned 32x32 => 64 multiply of word 0 by word 2 and word 1 by
	// word 3, accumulated into the two 64-bit lanes of each pass
	umlal		PASS0_SUMS.2d, T0.2s, T4.2s
	umlal		PASS1_SUMS.2d, T1.2s, T5.2s
	umlal		PASS2_SUMS.2d, T2.2s, T6.2s
	umlal		PASS3_SUMS.2d, T3.2s, T7.2s
.endm
59
/*
 * void nh_neon(const u32 *key, const u8 *message, size_t message_len,
 *		__le64 hash[NH_NUM_PASSES])
 *
 * In:   x0 = key, x1 = message, x2 = message_len, x3 = hash
 * Out:  four 64-bit sums (32 bytes), one per pass, stored at hash
 *
 * It's guaranteed that message_len % 16 == 0.
 *
 * Key consumption: 48 bytes are read up front and a further 16 bytes for
 * every 16-byte message stride, i.e. 48 + message_len bytes in total.
 *
 * Modifies x0-x2, the condition flags, v0-v7 and T0-T7.  x3 is only read.
 */
SYM_TYPED_FUNC_START(nh_neon)

	// Start every pass from a zero sum
	movi		PASS0_SUMS.2d, #0
	movi		PASS1_SUMS.2d, #0
	movi		PASS2_SUMS.2d, #0
	movi		PASS3_SUMS.2d, #0

	// Prime the key window with three strides; each _nh_stride loads
	// the fourth one itself
	ld1		{K0.4s,K1.4s}, [KEY], #32
	ld1		{K2.4s}, [KEY], #16

	// MESSAGE_LEN is kept biased by -64 while the unrolled loop runs
	subs		MESSAGE_LEN, MESSAGE_LEN, #64
	b.lt		.Lnh_tail
.Lnh_quad:
	// Four strides per iteration, after which the key window roles are
	// back where they started
	_nh_stride	K0, K1, K2, K3
	_nh_stride	K1, K2, K3, K0
	_nh_stride	K2, K3, K0, K1
	_nh_stride	K3, K0, K1, K2
	subs		MESSAGE_LEN, MESSAGE_LEN, #64
	b.ge		.Lnh_quad

.Lnh_tail:
	// The bias does not disturb the low six bits, so masking recovers
	// the leftover byte count (0, 16, 32 or 48 for a valid length)
	and		MESSAGE_LEN, MESSAGE_LEN, #63
	cbz		MESSAGE_LEN, .Lnh_finish
	_nh_stride	K0, K1, K2, K3

	sub		MESSAGE_LEN, MESSAGE_LEN, #16
	cbz		MESSAGE_LEN, .Lnh_finish
	_nh_stride	K1, K2, K3, K0

	sub		MESSAGE_LEN, MESSAGE_LEN, #16
	cbz		MESSAGE_LEN, .Lnh_finish
	_nh_stride	K2, K3, K0, K1

.Lnh_finish:
	// Collapse the two 64-bit lanes of each pass into a single sum,
	// packing passes 0/1 into T0 and passes 2/3 into T1, then write all
	// four sums to 'hash'
	addp		T0.2d, PASS0_SUMS.2d, PASS1_SUMS.2d
	addp		T1.2d, PASS2_SUMS.2d, PASS3_SUMS.2d
	st1		{T0.16b,T1.16b}, [HASH]
	ret
SYM_FUNC_END(nh_neon)
105

source code of linux/arch/arm64/crypto/nh-neon-core.S