1 | /* SPDX-License-Identifier: GPL-2.0 */ |
2 | /* |
3 | * NH - ε-almost-universal hash function, ARM64 NEON accelerated version |
4 | * |
5 | * Copyright 2018 Google LLC |
6 | * |
7 | * Author: Eric Biggers <ebiggers@google.com> |
8 | */ |
9 | |
10 | #include <linux/linkage.h> |
11 | #include <linux/cfi_types.h> |
12 | |
13 | KEY .req x0 |
14 | MESSAGE .req x1 |
15 | MESSAGE_LEN .req x2 |
16 | HASH .req x3 |
17 | |
18 | PASS0_SUMS .req v0 |
19 | PASS1_SUMS .req v1 |
20 | PASS2_SUMS .req v2 |
21 | PASS3_SUMS .req v3 |
22 | K0 .req v4 |
23 | K1 .req v5 |
24 | K2 .req v6 |
25 | K3 .req v7 |
26 | T0 .req v8 |
27 | T1 .req v9 |
28 | T2 .req v10 |
29 | T3 .req v11 |
30 | T4 .req v12 |
31 | T5 .req v13 |
32 | T6 .req v14 |
33 | T7 .req v15 |
34 | |
/*
 * _nh_stride - fold one 16-byte message stride into all four passes
 *
 * key_a, key_b and key_c name key-stride registers that already hold data;
 * key_new names the register that receives the next 16 bytes read from KEY.
 * Pass 0 uses key_a, pass 1 key_b, pass 2 key_c and pass 3 key_new.
 *
 * Advances MESSAGE and KEY by 16 bytes each and overwrites T0-T7.
 */
.macro _nh_stride	key_a, key_b, key_c, key_new

	// Fetch the next 16 message bytes and the newest key stride
	ld1		{T3.16b}, [MESSAGE], #16
	ld1		{\key_new\().4s}, [KEY], #16

	// Lane-wise 32-bit sums of message words and key words, one per pass.
	// T3 holds the message itself, so it has to be overwritten last.
	add		T0.4s, T3.4s, \key_a\().4s
	add		T1.4s, T3.4s, \key_b\().4s
	add		T2.4s, T3.4s, \key_c\().4s
	add		T3.4s, T3.4s, \key_new\().4s

	// Bring the upper two words of each sum down into the low half of a
	// second register so that they pair up with the lower two words
	mov		T4.d[0], T0.d[1]
	mov		T5.d[0], T1.d[1]
	mov		T6.d[0], T2.d[1]
	mov		T7.d[0], T3.d[1]

	// Widening 32x32 => 64 multiply-accumulate:
	// sums.d[j] += sum.s[j] * sum.s[j + 2] for j = 0, 1
	umlal		PASS0_SUMS.2d, T0.2s, T4.2s
	umlal		PASS1_SUMS.2d, T1.2s, T5.2s
	umlal		PASS2_SUMS.2d, T2.2s, T6.2s
	umlal		PASS3_SUMS.2d, T3.2s, T7.2s
.endm
59 | |
/*
 * void nh_neon(const u32 *key, const u8 *message, size_t message_len,
 *		__le64 hash[NH_NUM_PASSES])
 *
 * Feeds the message through _nh_stride 16 bytes at a time, then reduces
 * each pass's two accumulator lanes to one 64-bit sum and stores the four
 * sums (32 bytes) to 'hash'.
 *
 * It's guaranteed that message_len % 16 == 0.
 */
SYM_TYPED_FUNC_START(nh_neon)

	// Clear the per-pass accumulators
	movi		PASS0_SUMS.2d, #0
	movi		PASS1_SUMS.2d, #0
	movi		PASS2_SUMS.2d, #0
	movi		PASS3_SUMS.2d, #0

	// Preload three key strides; each _nh_stride loads the fourth itself
	ld1		{K0.4s,K1.4s}, [KEY], #32
	ld1		{K2.4s}, [KEY], #16

	// Main loop: four strides (64 bytes) per iteration.  The key register
	// roles rotate by one per stride, so after four strides K0-K2 are
	// once again the three preloaded strides.
	subs		MESSAGE_LEN, MESSAGE_LEN, #64
	b.lt		.Lnh_tail
.Lnh_loop64:
	_nh_stride	K0, K1, K2, K3
	_nh_stride	K1, K2, K3, K0
	_nh_stride	K2, K3, K0, K1
	_nh_stride	K3, K0, K1, K2
	subs		MESSAGE_LEN, MESSAGE_LEN, #64
	b.ge		.Lnh_loop64

.Lnh_tail:
	// The counter is negative here; its low six bits are the number of
	// bytes left over (0, 16, 32 or 48).  Handle up to three more strides.
	ands		MESSAGE_LEN, MESSAGE_LEN, #63
	b.eq		.Lnh_finish
	_nh_stride	K0, K1, K2, K3

	subs		MESSAGE_LEN, MESSAGE_LEN, #16
	b.eq		.Lnh_finish
	_nh_stride	K1, K2, K3, K0

	subs		MESSAGE_LEN, MESSAGE_LEN, #16
	b.eq		.Lnh_finish
	_nh_stride	K2, K3, K0, K1

.Lnh_finish:
	// Add the two 64-bit lanes of each accumulator together, packing the
	// results for passes 0/1 into T0 and passes 2/3 into T1, then write
	// all four sums to 'hash'
	addp		T0.2d, PASS0_SUMS.2d, PASS1_SUMS.2d
	addp		T1.2d, PASS2_SUMS.2d, PASS3_SUMS.2d
	st1		{T0.16b,T1.16b}, [HASH]
	ret
SYM_FUNC_END(nh_neon)
105 | |