1 | /* SPDX-License-Identifier: GPL-2.0-or-later */ |
2 | #ifndef _ASM_X86_XOR_H |
3 | #define _ASM_X86_XOR_H |
4 | |
5 | /* |
6 | * Optimized RAID-5 checksumming functions for SSE. |
7 | */ |
8 | |
9 | /* |
10 | * Cache avoiding checksumming functions utilizing KNI instructions |
11 | * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo) |
12 | */ |
13 | |
14 | /* |
15 | * Based on |
16 | * High-speed RAID5 checksumming functions utilizing SSE instructions. |
17 | * Copyright (C) 1998 Ingo Molnar. |
18 | */ |
19 | |
20 | /* |
21 | * x86-64 changes / gcc fixes from Andi Kleen. |
22 | * Copyright 2002 Andi Kleen, SuSE Labs. |
23 | * |
24 | * This hasn't been optimized for the hammer yet, but there are likely |
25 | * no advantages to be gotten from x86-64 here anyways. |
26 | */ |
27 | |
28 | #include <asm/fpu/api.h> |
29 | |
#ifdef CONFIG_X86_32
/*
 * reduce register pressure: on 32-bit force the loop increment to be
 * an immediate ("i") instead of burning one of the few GP registers.
 */
# define XOR_CONSTANT_CONSTRAINT "i"
#else
# define XOR_CONSTANT_CONSTRAINT "re"
#endif

/* OFFS(x): byte offset of the x'th 16-byte xmm word in the current 256-byte chunk */
#define OFFS(x) "16*("#x")"
/* PF_OFFS(x): same offset, but one full 256-byte chunk ahead (the prefetch distance) */
#define PF_OFFS(x) "256+16*("#x")"
/* PF0..PF4: non-temporal prefetch ahead within buffers p1..p5 (minimal cache pollution) */
#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
/* LD/ST: move 16 aligned bytes between destination p1 and register xmm<y> */
#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
/* XO1..XO4: xmm<y> ^= 16 bytes at offset x of source p2..p5 */
#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
/* NOP(x): empty placeholder so BLK64() can skip its prefetch slot */
#define NOP(x)

/*
 * BLK64(pf, op, i): one prefetch for 64-byte sub-block @i, then @op
 * applied to that sub-block's four 16-byte words in xmm0..xmm3.
 * Used by the *_pf64 variants ("prefetch64-sse").
 */
#define BLK64(pf, op, i) \
pf(i) \
op(i, 0) \
op(i + 1, 1) \
op(i + 2, 2) \
op(i + 3, 3)
58 | |
/*
 * xor_sse_2 - XOR @p2 into @p1, 256 bytes per loop iteration, using SSE.
 * @bytes: length of both buffers; processed as bytes >> 8 chunks of
 *         256 bytes -- any tail smaller than 256 bytes is NOT touched.
 * @p1:    destination buffer, updated in place (p1[i] ^= p2[i]).
 * @p2:    source buffer, read only.
 *
 * Works 16 bytes at a time in xmm0..xmm3 (movaps/xorps), with
 * prefetchnta issued one 256-byte chunk ahead to limit cache pollution
 * ("cache avoiding", see the file header).  SSE register state is
 * protected by kernel_fpu_begin()/kernel_fpu_end().
 *
 * NOTE(review): movaps faults on unaligned addresses, so both buffers
 * are presumably 16-byte aligned, and @bytes is presumably a non-zero
 * multiple of 256 (lines == 0 would still run the loop once before
 * "dec; jnz") -- callers must guarantee this.
 */
static void
xor_sse_2(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2)
{
unsigned long lines = bytes >> 8; /* number of 256-byte chunks */

kernel_fpu_begin();

asm volatile(
#undef BLOCK
#define BLOCK(i) \
LD(i, 0) \
LD(i + 1, 1) \
PF1(i) \
PF1(i + 2) \
LD(i + 2, 2) \
LD(i + 3, 3) \
PF0(i + 4) \
PF0(i + 6) \
XO1(i, 0) \
XO1(i + 1, 1) \
XO1(i + 2, 2) \
XO1(i + 3, 3) \
ST(i, 0) \
ST(i + 1, 1) \
ST(i + 2, 2) \
ST(i + 3, 3) \


/* prime the prefetch ahead of the first iteration */
PF0(0)
PF0(2)

" .align 32 ;\n"
" 1: ;\n"

/* four BLOCKs = 16 xmm words = 256 bytes per iteration */
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)

/* advance both pointers one chunk and loop while chunks remain */
" add %[inc], %[p1] ;\n"
" add %[inc], %[p2] ;\n"
" dec %[cnt] ;\n"
" jnz 1b ;\n"
: [cnt] "+r" (lines),
[p1] "+r" (p1), [p2] "+r" (p2)
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
: "memory" );

kernel_fpu_end();
}
110 | |
/*
 * xor_sse_2_pf64 - same contract as xor_sse_2() (p1 ^= p2 in 256-byte
 * chunks), but with a different prefetch schedule: BLK64() interleaves
 * one prefetchnta per 64-byte sub-block immediately before operating on
 * it, instead of batching prefetches at the top of each chunk.  This is
 * the "prefetch64-sse" template variant.
 *
 * @bytes: buffer length; only full 256-byte chunks are processed.
 * @p1:    destination (read/modify/write).
 * @p2:    source (read only).
 *
 * NOTE(review): same alignment / size-multiple-of-256 assumptions as
 * xor_sse_2() -- confirm at the call sites.
 */
static void
xor_sse_2_pf64(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2)
{
unsigned long lines = bytes >> 8; /* number of 256-byte chunks */

kernel_fpu_begin();

asm volatile(
#undef BLOCK
#define BLOCK(i) \
BLK64(PF0, LD, i) \
BLK64(PF1, XO1, i) \
BLK64(NOP, ST, i) \

/* four BLOCKs = 256 bytes per iteration */
" .align 32 ;\n"
" 1: ;\n"

BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)

/* advance both pointers one chunk and loop while chunks remain */
" add %[inc], %[p1] ;\n"
" add %[inc], %[p2] ;\n"
" dec %[cnt] ;\n"
" jnz 1b ;\n"
: [cnt] "+r" (lines),
[p1] "+r" (p1), [p2] "+r" (p2)
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
: "memory" );

kernel_fpu_end();
}
145 | |
/*
 * xor_sse_3 - XOR @p2 and @p3 into @p1 (p1 ^= p2 ^ p3), 256 bytes per
 * loop iteration, using SSE xmm0..xmm3 with prefetchnta one chunk ahead.
 *
 * @bytes: buffer length; processed as bytes >> 8 chunks of 256 bytes,
 *         any smaller tail is skipped.
 * @p1:    destination, updated in place.
 * @p2, @p3: source buffers, read only.
 *
 * SSE state is guarded by kernel_fpu_begin()/kernel_fpu_end().
 *
 * NOTE(review): assumes 16-byte aligned buffers (movaps) and @bytes a
 * non-zero multiple of 256 -- callers must guarantee this.
 */
static void
xor_sse_3(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3)
{
unsigned long lines = bytes >> 8; /* number of 256-byte chunks */

kernel_fpu_begin();

asm volatile(
#undef BLOCK
#define BLOCK(i) \
PF1(i) \
PF1(i + 2) \
LD(i, 0) \
LD(i + 1, 1) \
LD(i + 2, 2) \
LD(i + 3, 3) \
PF2(i) \
PF2(i + 2) \
PF0(i + 4) \
PF0(i + 6) \
XO1(i, 0) \
XO1(i + 1, 1) \
XO1(i + 2, 2) \
XO1(i + 3, 3) \
XO2(i, 0) \
XO2(i + 1, 1) \
XO2(i + 2, 2) \
XO2(i + 3, 3) \
ST(i, 0) \
ST(i + 1, 1) \
ST(i + 2, 2) \
ST(i + 3, 3) \


/* prime the prefetch ahead of the first iteration */
PF0(0)
PF0(2)

" .align 32 ;\n"
" 1: ;\n"

/* four BLOCKs = 16 xmm words = 256 bytes per iteration */
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)

/* advance all three pointers one chunk and loop */
" add %[inc], %[p1] ;\n"
" add %[inc], %[p2] ;\n"
" add %[inc], %[p3] ;\n"
" dec %[cnt] ;\n"
" jnz 1b ;\n"
: [cnt] "+r" (lines),
[p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
: "memory" );

kernel_fpu_end();
}
205 | |
/*
 * xor_sse_3_pf64 - same contract as xor_sse_3() (p1 ^= p2 ^ p3 in
 * 256-byte chunks) with the BLK64() prefetch schedule: one prefetchnta
 * per 64-byte sub-block, issued just before that sub-block is used
 * ("prefetch64-sse" template variant).
 *
 * @bytes: buffer length; only full 256-byte chunks are processed.
 * @p1:    destination (read/modify/write).
 * @p2, @p3: sources (read only).
 *
 * NOTE(review): same alignment / size assumptions as xor_sse_3().
 */
static void
xor_sse_3_pf64(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3)
{
unsigned long lines = bytes >> 8; /* number of 256-byte chunks */

kernel_fpu_begin();

asm volatile(
#undef BLOCK
#define BLOCK(i) \
BLK64(PF0, LD, i) \
BLK64(PF1, XO1, i) \
BLK64(PF2, XO2, i) \
BLK64(NOP, ST, i) \

/* four BLOCKs = 256 bytes per iteration */
" .align 32 ;\n"
" 1: ;\n"

BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)

/* advance all three pointers one chunk and loop */
" add %[inc], %[p1] ;\n"
" add %[inc], %[p2] ;\n"
" add %[inc], %[p3] ;\n"
" dec %[cnt] ;\n"
" jnz 1b ;\n"
: [cnt] "+r" (lines),
[p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
: "memory" );

kernel_fpu_end();
}
243 | |
/*
 * xor_sse_4 - XOR @p2, @p3 and @p4 into @p1 (p1 ^= p2 ^ p3 ^ p4),
 * 256 bytes per loop iteration, using SSE xmm0..xmm3 with prefetchnta
 * one chunk ahead on every buffer.
 *
 * @bytes: buffer length; processed as bytes >> 8 chunks of 256 bytes,
 *         any smaller tail is skipped.
 * @p1:    destination, updated in place.
 * @p2..@p4: source buffers, read only.
 *
 * SSE state is guarded by kernel_fpu_begin()/kernel_fpu_end().
 *
 * NOTE(review): assumes 16-byte aligned buffers (movaps) and @bytes a
 * non-zero multiple of 256 -- callers must guarantee this.
 */
static void
xor_sse_4(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4)
{
unsigned long lines = bytes >> 8; /* number of 256-byte chunks */

kernel_fpu_begin();

asm volatile(
#undef BLOCK
#define BLOCK(i) \
PF1(i) \
PF1(i + 2) \
LD(i, 0) \
LD(i + 1, 1) \
LD(i + 2, 2) \
LD(i + 3, 3) \
PF2(i) \
PF2(i + 2) \
XO1(i, 0) \
XO1(i + 1, 1) \
XO1(i + 2, 2) \
XO1(i + 3, 3) \
PF3(i) \
PF3(i + 2) \
PF0(i + 4) \
PF0(i + 6) \
XO2(i, 0) \
XO2(i + 1, 1) \
XO2(i + 2, 2) \
XO2(i + 3, 3) \
XO3(i, 0) \
XO3(i + 1, 1) \
XO3(i + 2, 2) \
XO3(i + 3, 3) \
ST(i, 0) \
ST(i + 1, 1) \
ST(i + 2, 2) \
ST(i + 3, 3) \


/* prime the prefetch ahead of the first iteration */
PF0(0)
PF0(2)

" .align 32 ;\n"
" 1: ;\n"

/* four BLOCKs = 16 xmm words = 256 bytes per iteration */
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)

/* advance all four pointers one chunk and loop */
" add %[inc], %[p1] ;\n"
" add %[inc], %[p2] ;\n"
" add %[inc], %[p3] ;\n"
" add %[inc], %[p4] ;\n"
" dec %[cnt] ;\n"
" jnz 1b ;\n"
: [cnt] "+r" (lines), [p1] "+r" (p1),
[p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
: "memory" );

kernel_fpu_end();
}
311 | |
/*
 * xor_sse_4_pf64 - same contract as xor_sse_4() (p1 ^= p2 ^ p3 ^ p4 in
 * 256-byte chunks) with the BLK64() prefetch schedule: one prefetchnta
 * per 64-byte sub-block, issued just before that sub-block is used
 * ("prefetch64-sse" template variant).
 *
 * @bytes: buffer length; only full 256-byte chunks are processed.
 * @p1:    destination (read/modify/write).
 * @p2..@p4: sources (read only).
 *
 * NOTE(review): same alignment / size assumptions as xor_sse_4().
 */
static void
xor_sse_4_pf64(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4)
{
unsigned long lines = bytes >> 8; /* number of 256-byte chunks */

kernel_fpu_begin();

asm volatile(
#undef BLOCK
#define BLOCK(i) \
BLK64(PF0, LD, i) \
BLK64(PF1, XO1, i) \
BLK64(PF2, XO2, i) \
BLK64(PF3, XO3, i) \
BLK64(NOP, ST, i) \

/* four BLOCKs = 256 bytes per iteration */
" .align 32 ;\n"
" 1: ;\n"

BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)

/* advance all four pointers one chunk and loop */
" add %[inc], %[p1] ;\n"
" add %[inc], %[p2] ;\n"
" add %[inc], %[p3] ;\n"
" add %[inc], %[p4] ;\n"
" dec %[cnt] ;\n"
" jnz 1b ;\n"
: [cnt] "+r" (lines), [p1] "+r" (p1),
[p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
: "memory" );

kernel_fpu_end();
}
352 | |
/*
 * xor_sse_5 - XOR @p2..@p5 into @p1 (p1 ^= p2 ^ p3 ^ p4 ^ p5),
 * 256 bytes per loop iteration, using SSE xmm0..xmm3 with prefetchnta
 * one chunk ahead on every buffer.
 *
 * @bytes: buffer length; processed as bytes >> 8 chunks of 256 bytes,
 *         any smaller tail is skipped.
 * @p1:    destination, updated in place.
 * @p2..@p5: source buffers, read only.
 *
 * SSE state is guarded by kernel_fpu_begin()/kernel_fpu_end().
 *
 * NOTE(review): assumes 16-byte aligned buffers (movaps) and @bytes a
 * non-zero multiple of 256 -- callers must guarantee this.
 */
static void
xor_sse_5(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4,
const unsigned long * __restrict p5)
{
unsigned long lines = bytes >> 8; /* number of 256-byte chunks */

kernel_fpu_begin();

asm volatile(
#undef BLOCK
#define BLOCK(i) \
PF1(i) \
PF1(i + 2) \
LD(i, 0) \
LD(i + 1, 1) \
LD(i + 2, 2) \
LD(i + 3, 3) \
PF2(i) \
PF2(i + 2) \
XO1(i, 0) \
XO1(i + 1, 1) \
XO1(i + 2, 2) \
XO1(i + 3, 3) \
PF3(i) \
PF3(i + 2) \
XO2(i, 0) \
XO2(i + 1, 1) \
XO2(i + 2, 2) \
XO2(i + 3, 3) \
PF4(i) \
PF4(i + 2) \
PF0(i + 4) \
PF0(i + 6) \
XO3(i, 0) \
XO3(i + 1, 1) \
XO3(i + 2, 2) \
XO3(i + 3, 3) \
XO4(i, 0) \
XO4(i + 1, 1) \
XO4(i + 2, 2) \
XO4(i + 3, 3) \
ST(i, 0) \
ST(i + 1, 1) \
ST(i + 2, 2) \
ST(i + 3, 3) \


/* prime the prefetch ahead of the first iteration */
PF0(0)
PF0(2)

" .align 32 ;\n"
" 1: ;\n"

/* four BLOCKs = 16 xmm words = 256 bytes per iteration */
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)

/* advance all five pointers one chunk and loop */
" add %[inc], %[p1] ;\n"
" add %[inc], %[p2] ;\n"
" add %[inc], %[p3] ;\n"
" add %[inc], %[p4] ;\n"
" add %[inc], %[p5] ;\n"
" dec %[cnt] ;\n"
" jnz 1b ;\n"
: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
[p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
: "memory" );

kernel_fpu_end();
}
428 | |
/*
 * xor_sse_5_pf64 - same contract as xor_sse_5() (p1 ^= p2 ^ p3 ^ p4
 * ^ p5 in 256-byte chunks) with the BLK64() prefetch schedule: one
 * prefetchnta per 64-byte sub-block, issued just before that sub-block
 * is used ("prefetch64-sse" template variant).
 *
 * @bytes: buffer length; only full 256-byte chunks are processed.
 * @p1:    destination (read/modify/write).
 * @p2..@p5: sources (read only).
 *
 * NOTE(review): same alignment / size assumptions as xor_sse_5().
 */
static void
xor_sse_5_pf64(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4,
const unsigned long * __restrict p5)
{
unsigned long lines = bytes >> 8; /* number of 256-byte chunks */

kernel_fpu_begin();

asm volatile(
#undef BLOCK
#define BLOCK(i) \
BLK64(PF0, LD, i) \
BLK64(PF1, XO1, i) \
BLK64(PF2, XO2, i) \
BLK64(PF3, XO3, i) \
BLK64(PF4, XO4, i) \
BLK64(NOP, ST, i) \

/* four BLOCKs = 256 bytes per iteration */
" .align 32 ;\n"
" 1: ;\n"

BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)

/* advance all five pointers one chunk and loop */
" add %[inc], %[p1] ;\n"
" add %[inc], %[p2] ;\n"
" add %[inc], %[p3] ;\n"
" add %[inc], %[p4] ;\n"
" add %[inc], %[p5] ;\n"
" dec %[cnt] ;\n"
" jnz 1b ;\n"
: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
[p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
: "memory" );

kernel_fpu_end();
}
472 | |
473 | static struct xor_block_template xor_block_sse_pf64 = { |
474 | .name = "prefetch64-sse" , |
475 | .do_2 = xor_sse_2_pf64, |
476 | .do_3 = xor_sse_3_pf64, |
477 | .do_4 = xor_sse_4_pf64, |
478 | .do_5 = xor_sse_5_pf64, |
479 | }; |
480 | |
481 | #undef LD |
482 | #undef XO1 |
483 | #undef XO2 |
484 | #undef XO3 |
485 | #undef XO4 |
486 | #undef ST |
487 | #undef NOP |
488 | #undef BLK64 |
489 | #undef BLOCK |
490 | |
491 | #undef XOR_CONSTANT_CONSTRAINT |
492 | |
493 | #ifdef CONFIG_X86_32 |
494 | # include <asm/xor_32.h> |
495 | #else |
496 | # include <asm/xor_64.h> |
497 | #endif |
498 | |
/*
 * Pick the xor template for this machine.  AVX_SELECT() is provided by
 * the xor_32.h/xor_64.h include above -- presumably it returns an AVX
 * template when AVX is usable and falls back to @FASTEST (the
 * benchmark winner) otherwise; confirm in asm/xor_avx.h.
 */
#define XOR_SELECT_TEMPLATE(FASTEST) \
AVX_SELECT(FASTEST)
501 | |
502 | #endif /* _ASM_X86_XOR_H */ |
503 | |