// SPDX-License-Identifier: GPL-2.0-only
/*
 * RAID6 recovery algorithms in LoongArch SIMD (LSX & LASX)
 *
 * Copyright (C) 2023 WANG Xuerui <git@xen0n.name>
 *
 * Originally based on recov_avx2.c and recov_ssse3.c:
 *
 * Copyright (C) 2012 Intel Corporation
 * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
 */

#include <linux/raid/pq.h>
#include "loongarch.h"

/*
 * Unlike with the syndrome calculation algorithms, there is no boot-time
 * selection of recovery algorithms by benchmarking, so we have to specify
 * the priorities ourselves and hope that future cores will all have decent
 * vector support (i.e. no core where LASX is slower than LSX, or where
 * either is slower than scalar code).
 */

#ifdef CONFIG_CPU_HAS_LSX
static int raid6_has_lsx(void)
{
	return cpu_has_lsx;
}

static void raid6_2data_recov_lsx(int disks, size_t bytes, int faila,
				  int failb, void **ptrs)
{
	u8 *p, *q, *dp, *dq;
	const u8 *pbmul;	/* P multiplier table for B data */
	const u8 *qmul;		/* Q multiplier table (for both) */

	p = (u8 *)ptrs[disks - 2];
	q = (u8 *)ptrs[disks - 1];

	/*
	 * Compute syndrome with zero for the missing data pages
	 * Use the dead data pages as temporary storage for
	 * delta p and delta q
	 */
	dp = (u8 *)ptrs[faila];
	ptrs[faila] = (void *)raid6_empty_zero_page;
	ptrs[disks - 2] = dp;
	dq = (u8 *)ptrs[failb];
	ptrs[failb] = (void *)raid6_empty_zero_page;
	ptrs[disks - 1] = dq;

	raid6_call.gen_syndrome(disks, bytes, ptrs);

	/* Restore pointer table */
	ptrs[faila] = dp;
	ptrs[failb] = dq;
	ptrs[disks - 2] = p;
	ptrs[disks - 1] = q;

	/* Now, pick the proper data tables */
	pbmul = raid6_vgfmul[raid6_gfexi[failb - faila]];
	qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ raid6_gfexp[failb]]];
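
	/*
	 * Recovery math, as in the scalar lib/raid6/recov.c ("+" is
	 * GF(2^8) addition, i.e. XOR): with faila = a and failb = b,
	 *
	 *	P + Pxy = Da + Db
	 *	Q + Qxy = g^a * Da + g^b * Db
	 *
	 * which solves to Db = A*(P + Pxy) + B*(Q + Qxy) and
	 * Da = Db + (P + Pxy), where A = 1/(g^(b-a) + 1) (the pbmul
	 * table) and B = 1/(g^a + g^b) (the qmul table).  The scalar
	 * per-byte equivalent of the vector loop below, using the
	 * 256-entry raid6_gfmul tables instead of the nibble-split
	 * raid6_vgfmul ones, is:
	 *
	 *	px  = *p ^ *dp;
	 *	qx  = qmul[*q ^ *dq];
	 *	*dq = db = pbmul[px] ^ qx;	(failb's data, "Dx")
	 *	*dp = db ^ px;			(faila's data, "Dy")
	 */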

	kernel_fpu_begin();

	/*
	 * vr20, vr21: qmul
	 * vr22, vr23: pbmul
	 */
	asm volatile("vld $vr20, %0" : : "m" (qmul[0]));
	asm volatile("vld $vr21, %0" : : "m" (qmul[16]));
	asm volatile("vld $vr22, %0" : : "m" (pbmul[0]));
	asm volatile("vld $vr23, %0" : : "m" (pbmul[16]));
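
	/*
	 * Each raid6_vgfmul[c] row is 32 bytes: the first 16 bytes map a
	 * low nibble l to the product c * l, the second 16 map a high
	 * nibble h to c * (h << 4).  Since the multiply distributes over
	 * XOR, c * v = tbl_lo[v & 0x0f] ^ tbl_hi[v >> 4] for a full byte
	 * v.  The loop below applies this to 64 bytes (four 16-byte
	 * vectors) per iteration, so bytes must be a multiple of 64.
	 */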

	while (bytes) {
		/* vr4 - vr7: Q */
		asm volatile("vld $vr4, %0" : : "m" (q[0]));
		asm volatile("vld $vr5, %0" : : "m" (q[16]));
		asm volatile("vld $vr6, %0" : : "m" (q[32]));
		asm volatile("vld $vr7, %0" : : "m" (q[48]));
		/* vr4 - vr7: Q + Qxy */
		asm volatile("vld $vr8, %0" : : "m" (dq[0]));
		asm volatile("vld $vr9, %0" : : "m" (dq[16]));
		asm volatile("vld $vr10, %0" : : "m" (dq[32]));
		asm volatile("vld $vr11, %0" : : "m" (dq[48]));
		asm volatile("vxor.v $vr4, $vr4, $vr8");
		asm volatile("vxor.v $vr5, $vr5, $vr9");
		asm volatile("vxor.v $vr6, $vr6, $vr10");
		asm volatile("vxor.v $vr7, $vr7, $vr11");
		/* vr0 - vr3: P */
		asm volatile("vld $vr0, %0" : : "m" (p[0]));
		asm volatile("vld $vr1, %0" : : "m" (p[16]));
		asm volatile("vld $vr2, %0" : : "m" (p[32]));
		asm volatile("vld $vr3, %0" : : "m" (p[48]));
		/* vr0 - vr3: P + Pxy */
		asm volatile("vld $vr8, %0" : : "m" (dp[0]));
		asm volatile("vld $vr9, %0" : : "m" (dp[16]));
		asm volatile("vld $vr10, %0" : : "m" (dp[32]));
		asm volatile("vld $vr11, %0" : : "m" (dp[48]));
		asm volatile("vxor.v $vr0, $vr0, $vr8");
		asm volatile("vxor.v $vr1, $vr1, $vr9");
		asm volatile("vxor.v $vr2, $vr2, $vr10");
		asm volatile("vxor.v $vr3, $vr3, $vr11");

		/* vr8 - vr11: higher 4 bits of each byte of (Q + Qxy) */
		asm volatile("vsrli.b $vr8, $vr4, 4");
		asm volatile("vsrli.b $vr9, $vr5, 4");
		asm volatile("vsrli.b $vr10, $vr6, 4");
		asm volatile("vsrli.b $vr11, $vr7, 4");
		/* vr4 - vr7: lower 4 bits of each byte of (Q + Qxy) */
		asm volatile("vandi.b $vr4, $vr4, 0x0f");
		asm volatile("vandi.b $vr5, $vr5, 0x0f");
		asm volatile("vandi.b $vr6, $vr6, 0x0f");
		asm volatile("vandi.b $vr7, $vr7, 0x0f");
		/* lookup from qmul[0] */
		asm volatile("vshuf.b $vr4, $vr20, $vr20, $vr4");
		asm volatile("vshuf.b $vr5, $vr20, $vr20, $vr5");
		asm volatile("vshuf.b $vr6, $vr20, $vr20, $vr6");
		asm volatile("vshuf.b $vr7, $vr20, $vr20, $vr7");
		/* lookup from qmul[16] */
		asm volatile("vshuf.b $vr8, $vr21, $vr21, $vr8");
		asm volatile("vshuf.b $vr9, $vr21, $vr21, $vr9");
		asm volatile("vshuf.b $vr10, $vr21, $vr21, $vr10");
		asm volatile("vshuf.b $vr11, $vr21, $vr21, $vr11");
		/* vr16 - vr19: B(Q + Qxy) */
		asm volatile("vxor.v $vr16, $vr8, $vr4");
		asm volatile("vxor.v $vr17, $vr9, $vr5");
		asm volatile("vxor.v $vr18, $vr10, $vr6");
		asm volatile("vxor.v $vr19, $vr11, $vr7");
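
		/*
		 * Each vshuf.b above acts as a 16-entry byte table
		 * lookup (the LSX counterpart of x86 pshufb): the
		 * shifted/masked indices are all in 0-15, so every
		 * byte selects the matching product from the table
		 * register.  XORing the low- and high-nibble products
		 * completes the per-byte GF(2^8) multiply.
		 */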

		/* vr4 - vr7: higher 4 bits of each byte of (P + Pxy) */
		asm volatile("vsrli.b $vr4, $vr0, 4");
		asm volatile("vsrli.b $vr5, $vr1, 4");
		asm volatile("vsrli.b $vr6, $vr2, 4");
		asm volatile("vsrli.b $vr7, $vr3, 4");
		/* vr12 - vr15: lower 4 bits of each byte of (P + Pxy) */
		asm volatile("vandi.b $vr12, $vr0, 0x0f");
		asm volatile("vandi.b $vr13, $vr1, 0x0f");
		asm volatile("vandi.b $vr14, $vr2, 0x0f");
		asm volatile("vandi.b $vr15, $vr3, 0x0f");
		/* lookup from pbmul[0] */
		asm volatile("vshuf.b $vr12, $vr22, $vr22, $vr12");
		asm volatile("vshuf.b $vr13, $vr22, $vr22, $vr13");
		asm volatile("vshuf.b $vr14, $vr22, $vr22, $vr14");
		asm volatile("vshuf.b $vr15, $vr22, $vr22, $vr15");
		/* lookup from pbmul[16] */
		asm volatile("vshuf.b $vr4, $vr23, $vr23, $vr4");
		asm volatile("vshuf.b $vr5, $vr23, $vr23, $vr5");
		asm volatile("vshuf.b $vr6, $vr23, $vr23, $vr6");
		asm volatile("vshuf.b $vr7, $vr23, $vr23, $vr7");
		/* vr4 - vr7: A(P + Pxy) */
		asm volatile("vxor.v $vr4, $vr4, $vr12");
		asm volatile("vxor.v $vr5, $vr5, $vr13");
		asm volatile("vxor.v $vr6, $vr6, $vr14");
		asm volatile("vxor.v $vr7, $vr7, $vr15");

		/* vr4 - vr7: A(P + Pxy) + B(Q + Qxy) = Dx */
		asm volatile("vxor.v $vr4, $vr4, $vr16");
		asm volatile("vxor.v $vr5, $vr5, $vr17");
		asm volatile("vxor.v $vr6, $vr6, $vr18");
		asm volatile("vxor.v $vr7, $vr7, $vr19");
		asm volatile("vst $vr4, %0" : "=m" (dq[0]));
		asm volatile("vst $vr5, %0" : "=m" (dq[16]));
		asm volatile("vst $vr6, %0" : "=m" (dq[32]));
		asm volatile("vst $vr7, %0" : "=m" (dq[48]));

		/* vr0 - vr3: P + Pxy + Dx = Dy */
		asm volatile("vxor.v $vr0, $vr0, $vr4");
		asm volatile("vxor.v $vr1, $vr1, $vr5");
		asm volatile("vxor.v $vr2, $vr2, $vr6");
		asm volatile("vxor.v $vr3, $vr3, $vr7");
		asm volatile("vst $vr0, %0" : "=m" (dp[0]));
		asm volatile("vst $vr1, %0" : "=m" (dp[16]));
		asm volatile("vst $vr2, %0" : "=m" (dp[32]));
		asm volatile("vst $vr3, %0" : "=m" (dp[48]));

		bytes -= 64;
		p += 64;
		q += 64;
		dp += 64;
		dq += 64;
	}

	kernel_fpu_end();
}

static void raid6_datap_recov_lsx(int disks, size_t bytes, int faila,
				  void **ptrs)
{
	u8 *p, *q, *dq;
	const u8 *qmul;		/* Q multiplier table */

	p = (u8 *)ptrs[disks - 2];
	q = (u8 *)ptrs[disks - 1];

	/*
	 * Compute syndrome with zero for the missing data page
	 * Use the dead data page as temporary storage for delta q
	 */
	dq = (u8 *)ptrs[faila];
	ptrs[faila] = (void *)raid6_empty_zero_page;
	ptrs[disks - 1] = dq;

	raid6_call.gen_syndrome(disks, bytes, ptrs);

	/* Restore pointer table */
	ptrs[faila] = dq;
	ptrs[disks - 1] = q;

	/* Now, pick the proper data tables */
	qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
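
	/*
	 * Here the failed pair is data disk faila = a plus P, so
	 * gen_syndrome() above has rebuilt p as the parity of the
	 * surviving data (P + Dx) and left Qx in dq.  Since
	 * Q + Qx = g^a * Dx, qmul is the table for multiplying by
	 * g^(-a) = raid6_gfinv[raid6_gfexp[a]], giving Dx directly;
	 * the true P then follows as (P + Dx) + Dx.
	 */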

	kernel_fpu_begin();

	/* vr22, vr23: qmul */
	asm volatile("vld $vr22, %0" : : "m" (qmul[0]));
	asm volatile("vld $vr23, %0" : : "m" (qmul[16]));

	while (bytes) {
		/* vr0 - vr3: P + Dx */
		asm volatile("vld $vr0, %0" : : "m" (p[0]));
		asm volatile("vld $vr1, %0" : : "m" (p[16]));
		asm volatile("vld $vr2, %0" : : "m" (p[32]));
		asm volatile("vld $vr3, %0" : : "m" (p[48]));
		/* vr4 - vr7: Qx */
		asm volatile("vld $vr4, %0" : : "m" (dq[0]));
		asm volatile("vld $vr5, %0" : : "m" (dq[16]));
		asm volatile("vld $vr6, %0" : : "m" (dq[32]));
		asm volatile("vld $vr7, %0" : : "m" (dq[48]));
		/* vr4 - vr7: Q + Qx */
		asm volatile("vld $vr8, %0" : : "m" (q[0]));
		asm volatile("vld $vr9, %0" : : "m" (q[16]));
		asm volatile("vld $vr10, %0" : : "m" (q[32]));
		asm volatile("vld $vr11, %0" : : "m" (q[48]));
		asm volatile("vxor.v $vr4, $vr4, $vr8");
		asm volatile("vxor.v $vr5, $vr5, $vr9");
		asm volatile("vxor.v $vr6, $vr6, $vr10");
		asm volatile("vxor.v $vr7, $vr7, $vr11");

		/* vr8 - vr11: higher 4 bits of each byte of (Q + Qx) */
		asm volatile("vsrli.b $vr8, $vr4, 4");
		asm volatile("vsrli.b $vr9, $vr5, 4");
		asm volatile("vsrli.b $vr10, $vr6, 4");
		asm volatile("vsrli.b $vr11, $vr7, 4");
		/* vr4 - vr7: lower 4 bits of each byte of (Q + Qx) */
		asm volatile("vandi.b $vr4, $vr4, 0x0f");
		asm volatile("vandi.b $vr5, $vr5, 0x0f");
		asm volatile("vandi.b $vr6, $vr6, 0x0f");
		asm volatile("vandi.b $vr7, $vr7, 0x0f");
		/* lookup from qmul[0] */
		asm volatile("vshuf.b $vr4, $vr22, $vr22, $vr4");
		asm volatile("vshuf.b $vr5, $vr22, $vr22, $vr5");
		asm volatile("vshuf.b $vr6, $vr22, $vr22, $vr6");
		asm volatile("vshuf.b $vr7, $vr22, $vr22, $vr7");
		/* lookup from qmul[16] */
		asm volatile("vshuf.b $vr8, $vr23, $vr23, $vr8");
		asm volatile("vshuf.b $vr9, $vr23, $vr23, $vr9");
		asm volatile("vshuf.b $vr10, $vr23, $vr23, $vr10");
		asm volatile("vshuf.b $vr11, $vr23, $vr23, $vr11");
		/* vr4 - vr7: qmul(Q + Qx) = Dx */
		asm volatile("vxor.v $vr4, $vr4, $vr8");
		asm volatile("vxor.v $vr5, $vr5, $vr9");
		asm volatile("vxor.v $vr6, $vr6, $vr10");
		asm volatile("vxor.v $vr7, $vr7, $vr11");
		asm volatile("vst $vr4, %0" : "=m" (dq[0]));
		asm volatile("vst $vr5, %0" : "=m" (dq[16]));
		asm volatile("vst $vr6, %0" : "=m" (dq[32]));
		asm volatile("vst $vr7, %0" : "=m" (dq[48]));

		/* vr0 - vr3: P + Dx + Dx = P */
		asm volatile("vxor.v $vr0, $vr0, $vr4");
		asm volatile("vxor.v $vr1, $vr1, $vr5");
		asm volatile("vxor.v $vr2, $vr2, $vr6");
		asm volatile("vxor.v $vr3, $vr3, $vr7");
		asm volatile("vst $vr0, %0" : "=m" (p[0]));
		asm volatile("vst $vr1, %0" : "=m" (p[16]));
		asm volatile("vst $vr2, %0" : "=m" (p[32]));
		asm volatile("vst $vr3, %0" : "=m" (p[48]));

		bytes -= 64;
		p += 64;
		q += 64;
		dq += 64;
	}

	kernel_fpu_end();
}

const struct raid6_recov_calls raid6_recov_lsx = {
	.data2 = raid6_2data_recov_lsx,
	.datap = raid6_datap_recov_lsx,
	.valid = raid6_has_lsx,
	.name = "lsx",
	.priority = 1,
};
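
/*
 * Note: the raid6 core picks the valid recovery implementation with the
 * highest ->priority (see lib/raid6/algos.c), so LASX (priority 2) is
 * preferred over LSX (priority 1) on cores supporting both.
 */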
#endif /* CONFIG_CPU_HAS_LSX */

#ifdef CONFIG_CPU_HAS_LASX
static int raid6_has_lasx(void)
{
	return cpu_has_lasx;
}

static void raid6_2data_recov_lasx(int disks, size_t bytes, int faila,
				   int failb, void **ptrs)
{
	u8 *p, *q, *dp, *dq;
	const u8 *pbmul;	/* P multiplier table for B data */
	const u8 *qmul;		/* Q multiplier table (for both) */

	p = (u8 *)ptrs[disks - 2];
	q = (u8 *)ptrs[disks - 1];

	/*
	 * Compute syndrome with zero for the missing data pages
	 * Use the dead data pages as temporary storage for
	 * delta p and delta q
	 */
	dp = (u8 *)ptrs[faila];
	ptrs[faila] = (void *)raid6_empty_zero_page;
	ptrs[disks - 2] = dp;
	dq = (u8 *)ptrs[failb];
	ptrs[failb] = (void *)raid6_empty_zero_page;
	ptrs[disks - 1] = dq;

	raid6_call.gen_syndrome(disks, bytes, ptrs);

	/* Restore pointer table */
	ptrs[faila] = dp;
	ptrs[failb] = dq;
	ptrs[disks - 2] = p;
	ptrs[disks - 1] = q;

	/* Now, pick the proper data tables */
	pbmul = raid6_vgfmul[raid6_gfexi[failb - faila]];
	qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ raid6_gfexp[failb]]];
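	/* Same A and B coefficient tables as in raid6_2data_recov_lsx() above */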

	kernel_fpu_begin();

	/*
	 * xr20, xr21: qmul
	 * xr22, xr23: pbmul
	 */
	asm volatile("vld $vr20, %0" : : "m" (qmul[0]));
	asm volatile("vld $vr21, %0" : : "m" (qmul[16]));
	asm volatile("vld $vr22, %0" : : "m" (pbmul[0]));
	asm volatile("vld $vr23, %0" : : "m" (pbmul[16]));
	asm volatile("xvreplve0.q $xr20, $xr20");
	asm volatile("xvreplve0.q $xr21, $xr21");
	asm volatile("xvreplve0.q $xr22, $xr22");
	asm volatile("xvreplve0.q $xr23, $xr23");
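
	/*
	 * xvshuf.b looks up within each 128-bit lane separately, so the
	 * 16-byte tables loaded into the low lane are replicated into
	 * the high lane with xvreplve0.q.  Each iteration then handles
	 * the same 64 bytes as the LSX loop, but in two 32-byte vectors.
	 */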

	while (bytes) {
		/* xr0, xr1: Q */
		asm volatile("xvld $xr0, %0" : : "m" (q[0]));
		asm volatile("xvld $xr1, %0" : : "m" (q[32]));
		/* xr0, xr1: Q + Qxy */
		asm volatile("xvld $xr4, %0" : : "m" (dq[0]));
		asm volatile("xvld $xr5, %0" : : "m" (dq[32]));
		asm volatile("xvxor.v $xr0, $xr0, $xr4");
		asm volatile("xvxor.v $xr1, $xr1, $xr5");
		/* xr2, xr3: P */
		asm volatile("xvld $xr2, %0" : : "m" (p[0]));
		asm volatile("xvld $xr3, %0" : : "m" (p[32]));
		/* xr2, xr3: P + Pxy */
		asm volatile("xvld $xr4, %0" : : "m" (dp[0]));
		asm volatile("xvld $xr5, %0" : : "m" (dp[32]));
		asm volatile("xvxor.v $xr2, $xr2, $xr4");
		asm volatile("xvxor.v $xr3, $xr3, $xr5");

		/* xr4, xr5: higher 4 bits of each byte of (Q + Qxy) */
		asm volatile("xvsrli.b $xr4, $xr0, 4");
		asm volatile("xvsrli.b $xr5, $xr1, 4");
		/* xr0, xr1: lower 4 bits of each byte of (Q + Qxy) */
		asm volatile("xvandi.b $xr0, $xr0, 0x0f");
		asm volatile("xvandi.b $xr1, $xr1, 0x0f");
		/* lookup from qmul[0] */
		asm volatile("xvshuf.b $xr0, $xr20, $xr20, $xr0");
		asm volatile("xvshuf.b $xr1, $xr20, $xr20, $xr1");
		/* lookup from qmul[16] */
		asm volatile("xvshuf.b $xr4, $xr21, $xr21, $xr4");
		asm volatile("xvshuf.b $xr5, $xr21, $xr21, $xr5");
		/* xr6, xr7: B(Q + Qxy) */
		asm volatile("xvxor.v $xr6, $xr4, $xr0");
		asm volatile("xvxor.v $xr7, $xr5, $xr1");

		/* xr4, xr5: higher 4 bits of each byte of (P + Pxy) */
		asm volatile("xvsrli.b $xr4, $xr2, 4");
		asm volatile("xvsrli.b $xr5, $xr3, 4");
		/* xr0, xr1: lower 4 bits of each byte of (P + Pxy) */
		asm volatile("xvandi.b $xr0, $xr2, 0x0f");
		asm volatile("xvandi.b $xr1, $xr3, 0x0f");
		/* lookup from pbmul[0] */
		asm volatile("xvshuf.b $xr0, $xr22, $xr22, $xr0");
		asm volatile("xvshuf.b $xr1, $xr22, $xr22, $xr1");
		/* lookup from pbmul[16] */
		asm volatile("xvshuf.b $xr4, $xr23, $xr23, $xr4");
		asm volatile("xvshuf.b $xr5, $xr23, $xr23, $xr5");
		/* xr0, xr1: A(P + Pxy) */
		asm volatile("xvxor.v $xr0, $xr0, $xr4");
		asm volatile("xvxor.v $xr1, $xr1, $xr5");

		/* xr0, xr1: A(P + Pxy) + B(Q + Qxy) = Dx */
		asm volatile("xvxor.v $xr0, $xr0, $xr6");
		asm volatile("xvxor.v $xr1, $xr1, $xr7");

		/* xr2, xr3: P + Pxy + Dx = Dy */
		asm volatile("xvxor.v $xr2, $xr2, $xr0");
		asm volatile("xvxor.v $xr3, $xr3, $xr1");

		asm volatile("xvst $xr0, %0" : "=m" (dq[0]));
		asm volatile("xvst $xr1, %0" : "=m" (dq[32]));
		asm volatile("xvst $xr2, %0" : "=m" (dp[0]));
		asm volatile("xvst $xr3, %0" : "=m" (dp[32]));

		bytes -= 64;
		p += 64;
		q += 64;
		dp += 64;
		dq += 64;
	}

	kernel_fpu_end();
}

static void raid6_datap_recov_lasx(int disks, size_t bytes, int faila,
				   void **ptrs)
{
	u8 *p, *q, *dq;
	const u8 *qmul;		/* Q multiplier table */

	p = (u8 *)ptrs[disks - 2];
	q = (u8 *)ptrs[disks - 1];

	/*
	 * Compute syndrome with zero for the missing data page
	 * Use the dead data page as temporary storage for delta q
	 */
	dq = (u8 *)ptrs[faila];
	ptrs[faila] = (void *)raid6_empty_zero_page;
	ptrs[disks - 1] = dq;

	raid6_call.gen_syndrome(disks, bytes, ptrs);

	/* Restore pointer table */
	ptrs[faila] = dq;
	ptrs[disks - 1] = q;

	/* Now, pick the proper data tables */
	qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
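	/* qmul multiplies by g^(-faila); see raid6_datap_recov_lsx() above */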

	kernel_fpu_begin();

	/* xr22, xr23: qmul */
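	/* (replicated into both 128-bit lanes for the lane-wise xvshuf.b) */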
	asm volatile("vld $vr22, %0" : : "m" (qmul[0]));
	asm volatile("xvreplve0.q $xr22, $xr22");
	asm volatile("vld $vr23, %0" : : "m" (qmul[16]));
	asm volatile("xvreplve0.q $xr23, $xr23");

	while (bytes) {
		/* xr0, xr1: P + Dx */
		asm volatile("xvld $xr0, %0" : : "m" (p[0]));
		asm volatile("xvld $xr1, %0" : : "m" (p[32]));
		/* xr2, xr3: Qx */
		asm volatile("xvld $xr2, %0" : : "m" (dq[0]));
		asm volatile("xvld $xr3, %0" : : "m" (dq[32]));
		/* xr2, xr3: Q + Qx */
		asm volatile("xvld $xr4, %0" : : "m" (q[0]));
		asm volatile("xvld $xr5, %0" : : "m" (q[32]));
		asm volatile("xvxor.v $xr2, $xr2, $xr4");
		asm volatile("xvxor.v $xr3, $xr3, $xr5");

		/* xr4, xr5: higher 4 bits of each byte of (Q + Qx) */
		asm volatile("xvsrli.b $xr4, $xr2, 4");
		asm volatile("xvsrli.b $xr5, $xr3, 4");
		/* xr2, xr3: lower 4 bits of each byte of (Q + Qx) */
		asm volatile("xvandi.b $xr2, $xr2, 0x0f");
		asm volatile("xvandi.b $xr3, $xr3, 0x0f");
		/* lookup from qmul[0] */
		asm volatile("xvshuf.b $xr2, $xr22, $xr22, $xr2");
		asm volatile("xvshuf.b $xr3, $xr22, $xr22, $xr3");
		/* lookup from qmul[16] */
		asm volatile("xvshuf.b $xr4, $xr23, $xr23, $xr4");
		asm volatile("xvshuf.b $xr5, $xr23, $xr23, $xr5");
		/* xr2, xr3: qmul(Q + Qx) = Dx */
		asm volatile("xvxor.v $xr2, $xr2, $xr4");
		asm volatile("xvxor.v $xr3, $xr3, $xr5");

		/* xr0, xr1: P + Dx + Dx = P */
		asm volatile("xvxor.v $xr0, $xr0, $xr2");
		asm volatile("xvxor.v $xr1, $xr1, $xr3");

		asm volatile("xvst $xr2, %0" : "=m" (dq[0]));
		asm volatile("xvst $xr3, %0" : "=m" (dq[32]));
		asm volatile("xvst $xr0, %0" : "=m" (p[0]));
		asm volatile("xvst $xr1, %0" : "=m" (p[32]));

		bytes -= 64;
		p += 64;
		q += 64;
		dq += 64;
	}

	kernel_fpu_end();
}

const struct raid6_recov_calls raid6_recov_lasx = {
	.data2 = raid6_2data_recov_lasx,
	.datap = raid6_datap_recov_lasx,
	.valid = raid6_has_lasx,
	.name = "lasx",
	.priority = 2,
};
#endif /* CONFIG_CPU_HAS_LASX */