1 | /* SPDX-License-Identifier: GPL-2.0 */ |
2 | /* |
3 | * NH - ε-almost-universal hash function, NEON accelerated version |
4 | * |
5 | * Copyright 2018 Google LLC |
6 | * |
7 | * Author: Eric Biggers <ebiggers@google.com> |
8 | */ |
9 | |
10 | #include <linux/linkage.h> |
11 | |
/*
 * Assembler setup and symbolic register names used by the code in this file.
 * Q registers are grouped first by role; the D-register aliases that follow
 * name the low/high 64-bit halves of the corresponding Q register
 * (qN overlays d(2N) and d(2N+1)).
 */
	.text
	.fpu		neon

	// Core registers carrying the four function arguments
	KEY		.req	r0
	MESSAGE		.req	r1
	MESSAGE_LEN	.req	r2
	HASH		.req	r3

	// One accumulator per pass; each holds two 64-bit partial sums
	PASS0_SUMS	.req	q0
	PASS1_SUMS	.req	q1
	PASS2_SUMS	.req	q2
	PASS3_SUMS	.req	q3

	// The two 64-bit halves of each accumulator (A = low, B = high)
	PASS0_SUM_A	.req	d0
	PASS0_SUM_B	.req	d1
	PASS1_SUM_A	.req	d2
	PASS1_SUM_B	.req	d3
	PASS2_SUM_A	.req	d4
	PASS2_SUM_B	.req	d5
	PASS3_SUM_A	.req	d6
	PASS3_SUM_B	.req	d7

	// Sliding window of four 16-byte key strides
	K0		.req	q4
	K1		.req	q5
	K2		.req	q6
	K3		.req	q7

	// Temporaries, one per pass
	T0		.req	q8
	T1		.req	q9
	T2		.req	q10
	T3		.req	q11

	// Low (_L) and high (_H) 64-bit halves of each temporary
	T0_L		.req	d16
	T0_H		.req	d17
	T1_L		.req	d18
	T1_H		.req	d19
	T2_L		.req	d20
	T2_H		.req	d21
	T3_L		.req	d22
	T3_H		.req	d23
48 | |
/*
 * _nh_stride k0, k1, k2, k3
 *
 * Consume one 16-byte message stride and update all four pass accumulators.
 *
 * k0, k1, k2	Q registers already holding the key strides for passes 0-2.
 * k3		Q register that is overwritten with the next 16 key bytes
 *		loaded from KEY; it supplies the key stride for pass 3.
 *
 * MESSAGE and KEY are each post-incremented by 16 bytes.  T0-T3 are
 * overwritten and PASS0_SUMS-PASS3_SUMS are accumulated into.
 */
.macro _nh_stride	k0, k1, k2, k3

	// T3 = next 16 message bytes
	vld1.8		{T3}, [MESSAGE]!

	// k3 = next 16 key bytes, loaded as four 32-bit words
	vld1.32		{\k3}, [KEY]!

	// Tn = message words + key words for pass n (lane-wise 32-bit add).
	// T3 is written last because it is also the message source operand.
	vadd.u32	T0, T3, \k0
	vadd.u32	T1, T3, \k1
	vadd.u32	T2, T3, \k2
	vadd.u32	T3, T3, \k3

	// Per pass: sums[i] += Tn[i] * Tn[i + 2] for i = 0, 1, using
	// unsigned 32x32 => 64-bit multiply-accumulate.
	vmlal.u32	PASS0_SUMS, T0_L, T0_H
	vmlal.u32	PASS1_SUMS, T1_L, T1_H
	vmlal.u32	PASS2_SUMS, T2_L, T2_H
	vmlal.u32	PASS3_SUMS, T3_L, T3_H
.endm
69 | |
/*
 * void nh_neon(const u32 *key, const u8 *message, size_t message_len,
 *		__le64 hash[NH_NUM_PASSES])
 *
 * Runs four NH passes over 'message' in parallel and stores the four 64-bit
 * pass sums (32 bytes) to 'hash'.
 *
 * In:	r0 = key, r1 = message, r2 = message_len in bytes, r3 = hash
 * Out:	hash[0..3] written with vst1.8, i.e. least-significant byte first
 *
 * It's guaranteed that message_len % 16 == 0.
 *
 * Key usage: 48 bytes are loaded up front and a further 16 bytes per message
 * stride, so message_len + 48 bytes of key must be readable.
 *
 * Clobbers: r0-r2, the condition flags and q0-q11.
 * NOTE(review): q4-q7 (d8-d15) are callee-saved under the AAPCS and are not
 * preserved here; presumably every caller brackets this call with
 * kernel_neon_begin()/kernel_neon_end() - confirm against the C glue code.
 */
ENTRY(nh_neon)

	// Prime K0-K2 with the first three key strides and clear the
	// accumulators (the zeroing is interleaved with the key loads).
	vld1.32		{K0,K1}, [KEY]!
	vmov.u64	PASS0_SUMS, #0
	vmov.u64	PASS1_SUMS, #0
	vld1.32		{K2}, [KEY]!
	vmov.u64	PASS2_SUMS, #0
	vmov.u64	PASS3_SUMS, #0

	// message_len is a size_t, so the length checks below use the
	// unsigned conditions (lo/hs).  Signed conditions would misread
	// lengths with bit 31 set and skip the main loop.
	subs		MESSAGE_LEN, MESSAGE_LEN, #64
	blo		.Lloop4_done		// fewer than 64 bytes in total

	// Main loop: four strides (64 bytes) per iteration.  The key window
	// rotates by one register per stride and is back in its starting
	// arrangement after four strides.
.Lloop4:
	_nh_stride	K0, K1, K2, K3
	_nh_stride	K1, K2, K3, K0
	_nh_stride	K2, K3, K0, K1
	_nh_stride	K3, K0, K1, K2
	subs		MESSAGE_LEN, MESSAGE_LEN, #64
	bhs		.Lloop4			// at least 64 more bytes remained

.Lloop4_done:
	// MESSAGE_LEN has wrapped below zero here; its low six bits are the
	// number of bytes still to process (0, 16, 32 or 48).
	ands		MESSAGE_LEN, MESSAGE_LEN, #63
	beq		.Ldone
	_nh_stride	K0, K1, K2, K3

	subs		MESSAGE_LEN, MESSAGE_LEN, #16
	beq		.Ldone
	_nh_stride	K1, K2, K3, K0

	subs		MESSAGE_LEN, MESSAGE_LEN, #16
	beq		.Ldone
	_nh_stride	K2, K3, K0, K1

.Ldone:
	// Sum the accumulators for each pass, then store the sums to 'hash'
	vadd.u64	T0_L, PASS0_SUM_A, PASS0_SUM_B
	vadd.u64	T0_H, PASS1_SUM_A, PASS1_SUM_B
	vadd.u64	T1_L, PASS2_SUM_A, PASS2_SUM_B
	vadd.u64	T1_H, PASS3_SUM_A, PASS3_SUM_B
	vst1.8		{T0-T1}, [HASH]
	bx		lr
ENDPROC(nh_neon)
117 | |