// SPDX-License-Identifier: GPL-2.0-only
/*
 * arch/arm64/lib/xor-neon.c
 *
 * Authors: Jackie Liu <liuyun01@kylinos.cn>
 * Copyright (C) 2018, Tianjin KYLIN Information Technology Co., Ltd.
 */

#include <linux/raid/xor.h>
#include <linux/module.h>
#include <asm/neon-intrinsics.h>
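
/*
 * Plain NEON fallbacks: each routine below consumes 64 bytes per loop
 * iteration (four uint64x2_t vectors), XOR-accumulating the sources into
 * p1. The do/while structure assumes 'bytes' is a non-zero multiple of 64.
 */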
static void xor_arm64_neon_2(unsigned long bytes, unsigned long * __restrict p1,
			     const unsigned long * __restrict p2)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
	} while (--lines > 0);
}

static void xor_arm64_neon_3(unsigned long bytes, unsigned long * __restrict p1,
			     const unsigned long * __restrict p2,
			     const unsigned long * __restrict p3)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
	} while (--lines > 0);
}

static void xor_arm64_neon_4(unsigned long bytes, unsigned long * __restrict p1,
			     const unsigned long * __restrict p2,
			     const unsigned long * __restrict p3,
			     const unsigned long * __restrict p4)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
	} while (--lines > 0);
}

static void xor_arm64_neon_5(unsigned long bytes, unsigned long * __restrict p1,
			     const unsigned long * __restrict p2,
			     const unsigned long * __restrict p3,
			     const unsigned long * __restrict p4,
			     const unsigned long * __restrict p5)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;
	uint64_t *dp5 = (uint64_t *)p5;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* p1 ^= p5 */
		v0 = veorq_u64(v0, vld1q_u64(dp5 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp5 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp5 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp5 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
		dp5 += 8;
	} while (--lines > 0);
}
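
/*
 * Default template built from the plain NEON routines. It is exported so
 * the wrapper in asm/xor.h can use it; the routines touch NEON state, so
 * callers must bracket them with kernel_neon_begin()/kernel_neon_end().
 */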
struct xor_block_template xor_block_inner_neon __ro_after_init = {
	.name	= "__inner_neon__",
	.do_2	= xor_arm64_neon_2,
	.do_3	= xor_arm64_neon_3,
	.do_4	= xor_arm64_neon_4,
	.do_5	= xor_arm64_neon_5,
};
EXPORT_SYMBOL(xor_block_inner_neon);
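
/*
 * Three-way XOR helper: the ARMv8.2 SHA3 extension's EOR3 instruction
 * computes p ^ q ^ r in a single instruction, roughly halving the number
 * of XOR operations compared with the plain NEON routines above.
 */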
static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
{
	uint64x2_t res;

	asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n"
	    "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
	    : "=w"(res) : "w"(p), "w"(q), "w"(r));
	return res;
}

static void xor_arm64_eor3_3(unsigned long bytes,
			     unsigned long * __restrict p1,
			     const unsigned long * __restrict p2,
			     const unsigned long * __restrict p3)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 ^ p3 */
		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
			  vld1q_u64(dp3 + 0));
		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
			  vld1q_u64(dp3 + 2));
		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
			  vld1q_u64(dp3 + 4));
		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
			  vld1q_u64(dp3 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
	} while (--lines > 0);
}

static void xor_arm64_eor3_4(unsigned long bytes,
			     unsigned long * __restrict p1,
			     const unsigned long * __restrict p2,
			     const unsigned long * __restrict p3,
			     const unsigned long * __restrict p4)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 ^ p3 */
		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
			  vld1q_u64(dp3 + 0));
		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
			  vld1q_u64(dp3 + 2));
		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
			  vld1q_u64(dp3 + 4));
		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
			  vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
	} while (--lines > 0);
}

static void xor_arm64_eor3_5(unsigned long bytes,
			     unsigned long * __restrict p1,
			     const unsigned long * __restrict p2,
			     const unsigned long * __restrict p3,
			     const unsigned long * __restrict p4,
			     const unsigned long * __restrict p5)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;
	uint64_t *dp5 = (uint64_t *)p5;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 ^ p3 */
		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
			  vld1q_u64(dp3 + 0));
		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
			  vld1q_u64(dp3 + 2));
		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
			  vld1q_u64(dp3 + 4));
		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
			  vld1q_u64(dp3 + 6));

		/* p1 ^= p4 ^ p5 */
		v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0));
		v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2));
		v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4));
		v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
		dp5 += 8;
	} while (--lines > 0);
}
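
/*
 * Prefer the EOR3-based routines for the 3/4/5-source cases when the
 * assembler can encode SHA3 instructions and the CPU advertises the
 * SHA3 feature.
 */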
static int __init xor_neon_init(void)
{
	if (IS_ENABLED(CONFIG_AS_HAS_SHA3) && cpu_have_named_feature(SHA3)) {
		xor_block_inner_neon.do_3 = xor_arm64_eor3_3;
		xor_block_inner_neon.do_4 = xor_arm64_eor3_4;
		xor_block_inner_neon.do_5 = xor_arm64_eor3_5;
	}
	return 0;
}
module_init(xor_neon_init);

static void __exit xor_neon_exit(void)
{
}
module_exit(xor_neon_exit);

MODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>");
MODULE_DESCRIPTION("ARMv8 XOR Extensions");
MODULE_LICENSE("GPL");
