1 | /* SPDX-License-Identifier: GPL-2.0 */ |
2 | /* NG4memcpy.S: Niagara-4 optimized memcpy. |
3 | * |
4 | * Copyright (C) 2012 David S. Miller (davem@davemloft.net) |
5 | */ |
6 | |
/* Kernel builds pull the ASI numbers and the VIS save/restore helpers
 * (VISEntryHalf/VISExitHalf) from the asm headers; standalone test
 * builds of this file define local equivalents below.
 */
#ifdef __KERNEL__
#include <linux/linkage.h>
#include <asm/visasm.h>
#include <asm/asi.h>
/* Extra scratch register available to the copy loops in-kernel. */
#define GLOBAL_SPARE	%g7
#else
#define ASI_BLK_INIT_QUAD_LDD_P 0xe2
#define FPRS_FEF  0x04

/* On T4 it is very expensive to access ASRs like %fprs and
 * %asi, avoiding a read or a write can save ~50 cycles.
 */
/* Set FPRS_FEF only if it is not already set; the previous %fprs
 * value is left in %o5 for VISExitHalf to restore from.
 */
#define FPU_ENTER			\
	rd	%fprs, %o5;		\
	andcc	%o5, FPRS_FEF, %g0;	\
	be,a,pn	%icc, 999f;		\
	 wr	%g0, FPRS_FEF, %fprs;	\
	999:

#ifdef MEMCPY_DEBUG
/* Debug variant also clears the global scratch registers on entry. */
#define VISEntryHalf FPU_ENTER; \
		     clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0;
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#else
#define VISEntryHalf FPU_ENTER
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#endif

#define GLOBAL_SPARE	%g5
#endif

/* ASI used by STORE_INIT; 0x80 (ASI_P, a normal primary-space store)
 * stands in for the block-init ASI when simulating on non-Niagara.
 */
#ifndef STORE_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#else
#define STORE_ASI	0x80		/* ASI_P */
#endif
#endif

/* Neither EX_LD nor EX_ST defined means we are building plain memcpy
 * (no userspace fault fixups), not a copy_{to,from}_user variant.
 */
#if !defined(EX_LD) && !defined(EX_ST)
#define NON_USER_COPY
#endif

/* EX_LD(x,y)/EX_ST(x,y) wrap access x with exception fixup label y;
 * the plain-memcpy defaults below simply discard the label.
 */
#ifndef EX_LD
#define EX_LD(x,y)	x
#endif
#ifndef EX_LD_FP
#define EX_LD_FP(x,y)	x
#endif

#ifndef EX_ST
#define EX_ST(x,y)	x
#endif
#ifndef EX_ST_FP
#define EX_ST_FP(x,y)	x
#endif


#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

#ifndef STORE
#ifndef MEMCPY_DEBUG
#define STORE(type,src,addr)	type src, [addr]
#else
/* MEMCPY_DEBUG routes stores through %asi (set to 0x80/ASI_P at
 * function entry), using the alternate-space form of the opcode.
 */
#define STORE(type,src,addr)	type##a src, [addr] %asi
#endif
#endif

/* Initializing store via STORE_ASI (see above). */
#ifndef STORE_INIT
#define STORE_INIT(src,addr)	stxa src, [addr] STORE_ASI
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	NG4memcpy
#endif
#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif

	.register	%g2,#scratch
	.register	%g3,#scratch

	.text
#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
#endif
	.align		64
100 | |
101 | .globl FUNC_NAME |
102 | .type FUNC_NAME,#function |
103 | FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ |
104 | #ifdef MEMCPY_DEBUG |
105 | wr %g0, 0x80, %asi |
106 | #endif |
107 | srlx %o2, 31, %g2 |
108 | cmp %g2, 0 |
109 | tne %XCC, 5 |
110 | PREAMBLE |
111 | mov %o0, %o3 |
112 | brz,pn %o2, .Lexit |
113 | cmp %o2, 3 |
114 | ble,pn %icc, .Ltiny |
115 | cmp %o2, 19 |
116 | ble,pn %icc, .Lsmall |
117 | or %o0, %o1, %g2 |
118 | cmp %o2, 128 |
119 | bl,pn %icc, .Lmedium |
120 | nop |
121 | |
122 | .Llarge:/* len >= 0x80 */ |
123 | /* First get dest 8 byte aligned. */ |
124 | sub %g0, %o0, %g1 |
125 | and %g1, 0x7, %g1 |
126 | brz,pt %g1, 51f |
127 | sub %o2, %g1, %o2 |
128 | |
129 | |
130 | 1: EX_LD(LOAD(ldub, %o1 + 0x00, %g2), memcpy_retl_o2_plus_g1) |
131 | add %o1, 1, %o1 |
132 | subcc %g1, 1, %g1 |
133 | add %o0, 1, %o0 |
134 | bne,pt %icc, 1b |
135 | EX_ST(STORE(stb, %g2, %o0 - 0x01), memcpy_retl_o2_plus_g1_plus_1) |
136 | |
137 | 51: LOAD(prefetch, %o1 + 0x040, #n_reads_strong) |
138 | LOAD(prefetch, %o1 + 0x080, #n_reads_strong) |
139 | LOAD(prefetch, %o1 + 0x0c0, #n_reads_strong) |
140 | LOAD(prefetch, %o1 + 0x100, #n_reads_strong) |
141 | LOAD(prefetch, %o1 + 0x140, #n_reads_strong) |
142 | LOAD(prefetch, %o1 + 0x180, #n_reads_strong) |
143 | LOAD(prefetch, %o1 + 0x1c0, #n_reads_strong) |
144 | LOAD(prefetch, %o1 + 0x200, #n_reads_strong) |
145 | |
146 | /* Check if we can use the straight fully aligned |
147 | * loop, or we require the alignaddr/faligndata variant. |
148 | */ |
149 | andcc %o1, 0x7, %o5 |
150 | bne,pn %icc, .Llarge_src_unaligned |
151 | sub %g0, %o0, %g1 |
152 | |
153 | /* Legitimize the use of initializing stores by getting dest |
154 | * to be 64-byte aligned. |
155 | */ |
156 | and %g1, 0x3f, %g1 |
157 | brz,pt %g1, .Llarge_aligned |
158 | sub %o2, %g1, %o2 |
159 | |
160 | 1: EX_LD(LOAD(ldx, %o1 + 0x00, %g2), memcpy_retl_o2_plus_g1) |
161 | add %o1, 8, %o1 |
162 | subcc %g1, 8, %g1 |
163 | add %o0, 8, %o0 |
164 | bne,pt %icc, 1b |
165 | EX_ST(STORE(stx, %g2, %o0 - 0x08), memcpy_retl_o2_plus_g1_plus_8) |
166 | |
167 | .Llarge_aligned: |
168 | /* len >= 0x80 && src 8-byte aligned && dest 8-byte aligned */ |
169 | andn %o2, 0x3f, %o4 |
170 | sub %o2, %o4, %o2 |
171 | |
172 | 1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1), memcpy_retl_o2_plus_o4) |
173 | add %o1, 0x40, %o1 |
174 | EX_LD(LOAD(ldx, %o1 - 0x38, %g2), memcpy_retl_o2_plus_o4) |
175 | subcc %o4, 0x40, %o4 |
176 | EX_LD(LOAD(ldx, %o1 - 0x30, %g3), memcpy_retl_o2_plus_o4_plus_64) |
177 | EX_LD(LOAD(ldx, %o1 - 0x28, GLOBAL_SPARE), memcpy_retl_o2_plus_o4_plus_64) |
178 | EX_LD(LOAD(ldx, %o1 - 0x20, %o5), memcpy_retl_o2_plus_o4_plus_64) |
179 | EX_ST(STORE_INIT(%g1, %o0), memcpy_retl_o2_plus_o4_plus_64) |
180 | add %o0, 0x08, %o0 |
181 | EX_ST(STORE_INIT(%g2, %o0), memcpy_retl_o2_plus_o4_plus_56) |
182 | add %o0, 0x08, %o0 |
183 | EX_LD(LOAD(ldx, %o1 - 0x18, %g2), memcpy_retl_o2_plus_o4_plus_48) |
184 | EX_ST(STORE_INIT(%g3, %o0), memcpy_retl_o2_plus_o4_plus_48) |
185 | add %o0, 0x08, %o0 |
186 | EX_LD(LOAD(ldx, %o1 - 0x10, %g3), memcpy_retl_o2_plus_o4_plus_40) |
187 | EX_ST(STORE_INIT(GLOBAL_SPARE, %o0), memcpy_retl_o2_plus_o4_plus_40) |
188 | add %o0, 0x08, %o0 |
189 | EX_LD(LOAD(ldx, %o1 - 0x08, GLOBAL_SPARE), memcpy_retl_o2_plus_o4_plus_32) |
190 | EX_ST(STORE_INIT(%o5, %o0), memcpy_retl_o2_plus_o4_plus_32) |
191 | add %o0, 0x08, %o0 |
192 | EX_ST(STORE_INIT(%g2, %o0), memcpy_retl_o2_plus_o4_plus_24) |
193 | add %o0, 0x08, %o0 |
194 | EX_ST(STORE_INIT(%g3, %o0), memcpy_retl_o2_plus_o4_plus_16) |
195 | add %o0, 0x08, %o0 |
196 | EX_ST(STORE_INIT(GLOBAL_SPARE, %o0), memcpy_retl_o2_plus_o4_plus_8) |
197 | add %o0, 0x08, %o0 |
198 | bne,pt %icc, 1b |
199 | LOAD(prefetch, %o1 + 0x200, #n_reads_strong) |
200 | |
201 | membar #StoreLoad | #StoreStore |
202 | |
203 | brz,pn %o2, .Lexit |
204 | cmp %o2, 19 |
205 | ble,pn %icc, .Lsmall_unaligned |
206 | nop |
207 | ba,a,pt %icc, .Lmedium_noprefetch |
208 | |
209 | .Lexit: retl |
210 | mov EX_RETVAL(%o3), %o0 |
211 | |
212 | .Llarge_src_unaligned: |
213 | #ifdef NON_USER_COPY |
214 | VISEntryHalfFast(.Lmedium_vis_entry_fail) |
215 | #else |
216 | VISEntryHalf |
217 | #endif |
218 | andn %o2, 0x3f, %o4 |
219 | sub %o2, %o4, %o2 |
220 | alignaddr %o1, %g0, %g1 |
221 | add %o1, %o4, %o1 |
222 | EX_LD_FP(LOAD(ldd, %g1 + 0x00, %f0), memcpy_retl_o2_plus_o4) |
223 | 1: EX_LD_FP(LOAD(ldd, %g1 + 0x08, %f2), memcpy_retl_o2_plus_o4) |
224 | subcc %o4, 0x40, %o4 |
225 | EX_LD_FP(LOAD(ldd, %g1 + 0x10, %f4), memcpy_retl_o2_plus_o4_plus_64) |
226 | EX_LD_FP(LOAD(ldd, %g1 + 0x18, %f6), memcpy_retl_o2_plus_o4_plus_64) |
227 | EX_LD_FP(LOAD(ldd, %g1 + 0x20, %f8), memcpy_retl_o2_plus_o4_plus_64) |
228 | EX_LD_FP(LOAD(ldd, %g1 + 0x28, %f10), memcpy_retl_o2_plus_o4_plus_64) |
229 | EX_LD_FP(LOAD(ldd, %g1 + 0x30, %f12), memcpy_retl_o2_plus_o4_plus_64) |
230 | EX_LD_FP(LOAD(ldd, %g1 + 0x38, %f14), memcpy_retl_o2_plus_o4_plus_64) |
231 | faligndata %f0, %f2, %f16 |
232 | EX_LD_FP(LOAD(ldd, %g1 + 0x40, %f0), memcpy_retl_o2_plus_o4_plus_64) |
233 | faligndata %f2, %f4, %f18 |
234 | add %g1, 0x40, %g1 |
235 | faligndata %f4, %f6, %f20 |
236 | faligndata %f6, %f8, %f22 |
237 | faligndata %f8, %f10, %f24 |
238 | faligndata %f10, %f12, %f26 |
239 | faligndata %f12, %f14, %f28 |
240 | faligndata %f14, %f0, %f30 |
241 | EX_ST_FP(STORE(std, %f16, %o0 + 0x00), memcpy_retl_o2_plus_o4_plus_64) |
242 | EX_ST_FP(STORE(std, %f18, %o0 + 0x08), memcpy_retl_o2_plus_o4_plus_56) |
243 | EX_ST_FP(STORE(std, %f20, %o0 + 0x10), memcpy_retl_o2_plus_o4_plus_48) |
244 | EX_ST_FP(STORE(std, %f22, %o0 + 0x18), memcpy_retl_o2_plus_o4_plus_40) |
245 | EX_ST_FP(STORE(std, %f24, %o0 + 0x20), memcpy_retl_o2_plus_o4_plus_32) |
246 | EX_ST_FP(STORE(std, %f26, %o0 + 0x28), memcpy_retl_o2_plus_o4_plus_24) |
247 | EX_ST_FP(STORE(std, %f28, %o0 + 0x30), memcpy_retl_o2_plus_o4_plus_16) |
248 | EX_ST_FP(STORE(std, %f30, %o0 + 0x38), memcpy_retl_o2_plus_o4_plus_8) |
249 | add %o0, 0x40, %o0 |
250 | bne,pt %icc, 1b |
251 | LOAD(prefetch, %g1 + 0x200, #n_reads_strong) |
252 | #ifdef NON_USER_COPY |
253 | VISExitHalfFast |
254 | #else |
255 | VISExitHalf |
256 | #endif |
257 | brz,pn %o2, .Lexit |
258 | cmp %o2, 19 |
259 | ble,pn %icc, .Lsmall_unaligned |
260 | nop |
261 | ba,a,pt %icc, .Lmedium_unaligned |
262 | |
263 | #ifdef NON_USER_COPY |
264 | .Lmedium_vis_entry_fail: |
265 | or %o0, %o1, %g2 |
266 | #endif |
267 | .Lmedium: |
268 | LOAD(prefetch, %o1 + 0x40, #n_reads_strong) |
269 | andcc %g2, 0x7, %g0 |
270 | bne,pn %icc, .Lmedium_unaligned |
271 | nop |
272 | .Lmedium_noprefetch: |
273 | andncc %o2, 0x20 - 1, %o5 |
274 | be,pn %icc, 2f |
275 | sub %o2, %o5, %o2 |
276 | 1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1), memcpy_retl_o2_plus_o5) |
277 | EX_LD(LOAD(ldx, %o1 + 0x08, %g2), memcpy_retl_o2_plus_o5) |
278 | EX_LD(LOAD(ldx, %o1 + 0x10, GLOBAL_SPARE), memcpy_retl_o2_plus_o5) |
279 | EX_LD(LOAD(ldx, %o1 + 0x18, %o4), memcpy_retl_o2_plus_o5) |
280 | add %o1, 0x20, %o1 |
281 | subcc %o5, 0x20, %o5 |
282 | EX_ST(STORE(stx, %g1, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_32) |
283 | EX_ST(STORE(stx, %g2, %o0 + 0x08), memcpy_retl_o2_plus_o5_plus_24) |
284 | EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x10), memcpy_retl_o2_plus_o5_plus_24) |
285 | EX_ST(STORE(stx, %o4, %o0 + 0x18), memcpy_retl_o2_plus_o5_plus_8) |
286 | bne,pt %icc, 1b |
287 | add %o0, 0x20, %o0 |
288 | 2: andcc %o2, 0x18, %o5 |
289 | be,pt %icc, 3f |
290 | sub %o2, %o5, %o2 |
291 | |
292 | 1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1), memcpy_retl_o2_plus_o5) |
293 | add %o1, 0x08, %o1 |
294 | add %o0, 0x08, %o0 |
295 | subcc %o5, 0x08, %o5 |
296 | bne,pt %icc, 1b |
297 | EX_ST(STORE(stx, %g1, %o0 - 0x08), memcpy_retl_o2_plus_o5_plus_8) |
298 | 3: brz,pt %o2, .Lexit |
299 | cmp %o2, 0x04 |
300 | bl,pn %icc, .Ltiny |
301 | nop |
302 | EX_LD(LOAD(lduw, %o1 + 0x00, %g1), memcpy_retl_o2) |
303 | add %o1, 0x04, %o1 |
304 | add %o0, 0x04, %o0 |
305 | subcc %o2, 0x04, %o2 |
306 | bne,pn %icc, .Ltiny |
307 | EX_ST(STORE(stw, %g1, %o0 - 0x04), memcpy_retl_o2_plus_4) |
308 | ba,a,pt %icc, .Lexit |
309 | .Lmedium_unaligned: |
310 | /* First get dest 8 byte aligned. */ |
311 | sub %g0, %o0, %g1 |
312 | and %g1, 0x7, %g1 |
313 | brz,pt %g1, 2f |
314 | sub %o2, %g1, %o2 |
315 | |
316 | 1: EX_LD(LOAD(ldub, %o1 + 0x00, %g2), memcpy_retl_o2_plus_g1) |
317 | add %o1, 1, %o1 |
318 | subcc %g1, 1, %g1 |
319 | add %o0, 1, %o0 |
320 | bne,pt %icc, 1b |
321 | EX_ST(STORE(stb, %g2, %o0 - 0x01), memcpy_retl_o2_plus_g1_plus_1) |
322 | 2: |
323 | and %o1, 0x7, %g1 |
324 | brz,pn %g1, .Lmedium_noprefetch |
325 | sll %g1, 3, %g1 |
326 | mov 64, %g2 |
327 | sub %g2, %g1, %g2 |
328 | andn %o1, 0x7, %o1 |
329 | EX_LD(LOAD(ldx, %o1 + 0x00, %o4), memcpy_retl_o2) |
330 | sllx %o4, %g1, %o4 |
331 | andn %o2, 0x08 - 1, %o5 |
332 | sub %o2, %o5, %o2 |
333 | 1: EX_LD(LOAD(ldx, %o1 + 0x08, %g3), memcpy_retl_o2_plus_o5) |
334 | add %o1, 0x08, %o1 |
335 | subcc %o5, 0x08, %o5 |
336 | srlx %g3, %g2, GLOBAL_SPARE |
337 | or GLOBAL_SPARE, %o4, GLOBAL_SPARE |
338 | EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_8) |
339 | add %o0, 0x08, %o0 |
340 | bne,pt %icc, 1b |
341 | sllx %g3, %g1, %o4 |
342 | srl %g1, 3, %g1 |
343 | add %o1, %g1, %o1 |
344 | brz,pn %o2, .Lexit |
345 | nop |
346 | ba,pt %icc, .Lsmall_unaligned |
347 | |
348 | .Ltiny: |
349 | EX_LD(LOAD(ldub, %o1 + 0x00, %g1), memcpy_retl_o2) |
350 | subcc %o2, 1, %o2 |
351 | be,pn %icc, .Lexit |
352 | EX_ST(STORE(stb, %g1, %o0 + 0x00), memcpy_retl_o2_plus_1) |
353 | EX_LD(LOAD(ldub, %o1 + 0x01, %g1), memcpy_retl_o2) |
354 | subcc %o2, 1, %o2 |
355 | be,pn %icc, .Lexit |
356 | EX_ST(STORE(stb, %g1, %o0 + 0x01), memcpy_retl_o2_plus_1) |
357 | EX_LD(LOAD(ldub, %o1 + 0x02, %g1), memcpy_retl_o2) |
358 | ba,pt %icc, .Lexit |
359 | EX_ST(STORE(stb, %g1, %o0 + 0x02), memcpy_retl_o2) |
360 | |
361 | .Lsmall: |
362 | andcc %g2, 0x3, %g0 |
363 | bne,pn %icc, .Lsmall_unaligned |
364 | andn %o2, 0x4 - 1, %o5 |
365 | sub %o2, %o5, %o2 |
366 | 1: |
367 | EX_LD(LOAD(lduw, %o1 + 0x00, %g1), memcpy_retl_o2_plus_o5) |
368 | add %o1, 0x04, %o1 |
369 | subcc %o5, 0x04, %o5 |
370 | add %o0, 0x04, %o0 |
371 | bne,pt %icc, 1b |
372 | EX_ST(STORE(stw, %g1, %o0 - 0x04), memcpy_retl_o2_plus_o5_plus_4) |
373 | brz,pt %o2, .Lexit |
374 | nop |
375 | ba,a,pt %icc, .Ltiny |
376 | |
377 | .Lsmall_unaligned: |
378 | 1: EX_LD(LOAD(ldub, %o1 + 0x00, %g1), memcpy_retl_o2) |
379 | add %o1, 1, %o1 |
380 | add %o0, 1, %o0 |
381 | subcc %o2, 1, %o2 |
382 | bne,pt %icc, 1b |
383 | EX_ST(STORE(stb, %g1, %o0 - 0x01), memcpy_retl_o2_plus_1) |
384 | ba,a,pt %icc, .Lexit |
385 | nop |
386 | .size FUNC_NAME, .-FUNC_NAME |
387 | |