1 | /* SPDX-License-Identifier: GPL-2.0 */ |
2 | /* memcpy.S: Sparc optimized memcpy and memmove code |
3 | * Hand optimized from GNU libc's memcpy and memmove |
4 | * Copyright (C) 1991,1996 Free Software Foundation |
5 | * Copyright (C) 1995 Linus Torvalds (Linus.Torvalds@helsinki.fi) |
6 | * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu) |
7 | * Copyright (C) 1996 Eddie C. Dost (ecd@skynet.be) |
8 | * Copyright (C) 1996 Jakub Jelinek (jj@sunsite.mff.cuni.cz) |
9 | */ |
10 | |
11 | #include <linux/export.h> |
12 | |
13 | #define FUNC(x) /* open function x: emits the directives below followed by the label x */ \ |
14 | .globl x; /* declare x as a global symbol */ \ |
15 | .type x,@function; /* give the symbol function type */ \ |
16 | .align 4; /* align the entry point */ \ |
17 | x: |
18 | |
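19 | /* Both these macros have to start with exactly the same insn: memcpy's "be 82f + 4" executes the first insn of MOVE_BIGCHUNK in its delay slot in place of the first insn of MOVE_BIGALIGNCHUNK */ |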
20 | #define MOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) /* copy 32 bytes from src+offset to dst+offset: four ldd loads, eight word stores; register names are passed without the % prefix */ \ |
21 | ldd [%src + (offset) + 0x00], %t0; /* ldd fills the pair t0/t1; memcpy also uses this insn as the delay slot of "be 82f + 4" */ \ |
22 | ldd [%src + (offset) + 0x08], %t2; /* pair t2/t3 */ \ |
23 | ldd [%src + (offset) + 0x10], %t4; /* pair t4/t5 */ \ |
24 | ldd [%src + (offset) + 0x18], %t6; /* pair t6/t7 */ \ |
25 | st %t0, [%dst + (offset) + 0x00]; /* stores are word sized: used when dst is not 8-byte aligned */ \ |
26 | st %t1, [%dst + (offset) + 0x04]; \ |
27 | st %t2, [%dst + (offset) + 0x08]; \ |
28 | st %t3, [%dst + (offset) + 0x0c]; \ |
29 | st %t4, [%dst + (offset) + 0x10]; \ |
30 | st %t5, [%dst + (offset) + 0x14]; \ |
31 | st %t6, [%dst + (offset) + 0x18]; \ |
32 | st %t7, [%dst + (offset) + 0x1c]; |
33 | |
34 | #define MOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) /* copy 32 bytes from src+offset to dst+offset with doubleword loads and doubleword stores; t1, t3, t5, t7 are not named in the body, they are the second halves of the ldd/std pairs */ \ |
35 | ldd [%src + (offset) + 0x00], %t0; /* skipped when memcpy enters at "82f + 4"; the identical insn has then already run in the branch delay slot */ \ |
36 | ldd [%src + (offset) + 0x08], %t2; \ |
37 | ldd [%src + (offset) + 0x10], %t4; \ |
38 | ldd [%src + (offset) + 0x18], %t6; \ |
39 | std %t0, [%dst + (offset) + 0x00]; /* std writes the pair t0/t1 as one doubleword */ \ |
40 | std %t2, [%dst + (offset) + 0x08]; \ |
41 | std %t4, [%dst + (offset) + 0x10]; \ |
42 | std %t6, [%dst + (offset) + 0x18]; |
43 | |
44 | #define MOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3) /* copy the 16 bytes that end at (pointer - offset); exactly 6 insns, which the computed jump into memcpy_table relies on (it scales the byte count by 1.5) */ \ |
45 | ldd [%src - (offset) - 0x10], %t0; /* pair t0/t1 */ \ |
46 | ldd [%src - (offset) - 0x08], %t2; /* pair t2/t3 */ \ |
47 | st %t0, [%dst - (offset) - 0x10]; /* word stores, as in MOVE_BIGCHUNK */ \ |
48 | st %t1, [%dst - (offset) - 0x0c]; \ |
49 | st %t2, [%dst - (offset) - 0x08]; \ |
50 | st %t3, [%dst - (offset) - 0x04]; |
51 | |
52 | #define MOVE_LASTALIGNCHUNK(src, dst, offset, t0, t1, t2, t3) /* copy the 16 bytes that end at (pointer - offset) with ldd/std; exactly 4 insns, so the computed jump into amemcpy_table subtracts the byte count unscaled */ \ |
53 | ldd [%src - (offset) - 0x10], %t0; /* pair t0/t1 */ \ |
54 | ldd [%src - (offset) - 0x08], %t2; /* pair t2/t3 */ \ |
55 | std %t0, [%dst - (offset) - 0x10]; \ |
56 | std %t2, [%dst - (offset) - 0x08]; |
57 | |
58 | #define MOVE_SHORTCHUNK(src, dst, offset, t0, t1) /* copy the 2 bytes that end at (pointer - offset), one byte at a time; exactly 4 insns, which the computed jump in short_end relies on (byte count shifted left by 3) */ \ |
59 | ldub [%src - (offset) - 0x02], %t0; \ |
60 | ldub [%src - (offset) - 0x01], %t1; \ |
61 | stb %t0, [%dst - (offset) - 0x02]; \ |
62 | stb %t1, [%dst - (offset) - 0x01]; |
63 | |
64 | .text |
65 | .align 4 |
66 | |
67 | FUNC(memmove) /* %o0=dst %o1=src %o2=len; returns the original dst in %o0 */ |
68 | EXPORT_SYMBOL(memmove) |
69 | cmp %o0, %o1 /* dst vs src, tested unsigned below */ |
70 | mov %o0, %g7 /* keep the original dst for the return value */ |
71 | bleu 9f /* dst <= src: a forward copy is safe, join memcpy at 9: */ |
72 | sub %o0, %o1, %o4 /* delay slot: %o4 = dst - src, the value memcpy computes before its 9: */ |
73 | /* dst > src from here on */ |
74 | add %o1, %o2, %o3 /* %o3 = src + len */ |
75 | cmp %o3, %o0 |
76 | bleu 0f /* src + len <= dst: no overlap, join memcpy at 0: */ |
77 | andcc %o4, 3, %o5 /* delay slot: same insn as memcpy's 9:, sets the flags that 0: branches on */ |
78 | /* src < dst < src + len: copy backwards, one byte at a time */ |
79 | add %o1, %o2, %o1 |
80 | add %o0, %o2, %o0 |
81 | sub %o1, 1, %o1 /* %o1 = address of the last source byte */ |
82 | sub %o0, 1, %o0 /* %o0 = address of the last destination byte */ |
83 | /* len is nonzero here: with len == 0 the "bleu 0f" above is taken because src < dst */ |
84 | 1: /* reverse_bytes */ |
85 | |
86 | ldub [%o1], %o4 |
87 | subcc %o2, 1, %o2 /* len--, sets the flags tested by the bne below */ |
88 | stb %o4, [%o0] |
89 | sub %o1, 1, %o1 |
90 | bne 1b /* loop until len reaches zero */ |
91 | sub %o0, 1, %o0 /* delay slot */ |
92 | |
93 | retl |
94 | mov %g7, %o0 /* delay slot: return the original dst */ |
95 | |
96 | /* NOTE: This code is executed just for the cases, |
97 | where %src (=%o1) & 3 is != 0. |
98 | We need to align it to 4. So, for (%src & 3) |
99 | 1 we need to do ldub,lduh |
100 | 2 lduh |
101 | 3 just ldub |
102 | so even if it looks weird, the branches |
103 | are correct here. -jj |
104 | */ |
105 | 78: /* dword_align */ |
106 | /* reached from memcpy's "bne 78b" when %o1 & 3 != 0; copies 1 to 3 bytes so that src becomes word aligned, then continues at memcpy's 3: */ |
107 | andcc %o1, 1, %g0 /* is src odd? */ |
108 | be 4f /* even, so src & 3 == 2: a single halfword is enough */ |
109 | andcc %o1, 2, %g0 /* delay slot: flags = bit 1 of the not yet advanced src, consumed by the bne below */ |
110 | |
111 | ldub [%o1], %g2 /* copy one byte */ |
112 | add %o1, 1, %o1 |
113 | stb %g2, [%o0] |
114 | sub %o2, 1, %o2 /* plain sub/add do not touch the flags set at the andcc above */ |
115 | bne 3f /* src & 3 was 3: aligned after this one byte */ |
116 | add %o0, 1, %o0 /* delay slot */ |
117 | 4: |
118 | lduh [%o1], %g2 /* copy one halfword */ |
119 | add %o1, 2, %o1 |
120 | sth %g2, [%o0] |
121 | sub %o2, 2, %o2 |
122 | b 3f /* src is word aligned now */ |
123 | add %o0, 2, %o0 /* delay slot */ |
124 | |
125 | FUNC(memcpy) /* %o0=dst %o1=src %o2=len */ |
126 | EXPORT_SYMBOL(memcpy) |
127 | /* every exit path hands the original dst, kept in %g7, back in %o0 */ |
128 | sub %o0, %o1, %o4 /* %o4 = dst - src */ |
129 | mov %o0, %g7 /* keep the original dst for the return value */ |
130 | 9: |
131 | andcc %o4, 3, %o5 /* low two bits of (dst - src); memmove enters at 9: with %o4 and %g7 already set */ |
132 | 0: |
133 | bne 86f /* nonzero: src and dst can never be word aligned together -> non_aligned; memmove enters at 0: with these flags set */ |
134 | cmp %o2, 15 /* delay slot */ |
135 | |
136 | bleu 90f /* len <= 15 -> short_aligned_end */ |
137 | andcc %o1, 3, %g0 /* delay slot: is src word aligned? 90: tests these flags as well */ |
138 | |
139 | bne 78b /* no -> dword_align, which branches back to 3: */ |
140 | 3: |
141 | andcc %o1, 4, %g0 /* delay slot of the bne above and first insn of 3:; src is word aligned at 3:, so this tests 8-byte alignment */ |
142 | |
143 | be 2f /* bit 2 clear: src is 8-byte aligned already */ |
144 | mov %o2, %g1 /* delay slot: %g1 = bytes still to copy */ |
145 | |
146 | ld [%o1], %o4 /* copy one word so that src becomes 8-byte aligned */ |
147 | sub %g1, 4, %g1 |
148 | st %o4, [%o0] |
149 | add %o1, 4, %o1 |
150 | add %o0, 4, %o0 |
151 | 2: |
152 | andcc %g1, 0xffffff80, %g0 /* any bit at or above 128 set in the remaining length? */ |
153 | be 3f /* no: skip the 128-byte loops, go to the 16-byte chunk table */ |
154 | andcc %o0, 4, %g0 /* delay slot: bit 2 of dst */ |
155 | |
156 | be 82f + 4 /* dst 8-byte aligned too: use the ldd/std loop, entered past its first insn */ |
157 | 5: |
158 | MOVE_BIGCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5) /* its first ldd is also the delay slot of the be above, standing in for the skipped insn at 82: */ |
159 | MOVE_BIGCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5) |
160 | MOVE_BIGCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5) |
161 | MOVE_BIGCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5) |
162 | sub %g1, 128, %g1 /* 128 bytes copied by the four macros above */ |
163 | add %o1, 128, %o1 |
164 | cmp %g1, 128 |
165 | bge 5b /* signed compare: repeat while at least 128 bytes remain */ |
166 | add %o0, 128, %o0 /* delay slot */ |
167 | 3: |
168 | andcc %g1, 0x70, %g4 /* %g4 = bytes to copy in whole 16-byte chunks (0 to 112) */ |
169 | be 80f /* none */ |
170 | andcc %g1, 8, %g0 /* delay slot: flags for the be at 80:; nothing before 80: modifies them */ |
171 | |
172 | sethi %hi(80f), %o5 |
173 | srl %g4, 1, %o4 |
174 | add %g4, %o4, %o4 /* %o4 = 1.5 * %g4: each MOVE_LASTCHUNK is 6 insns (24 bytes of code) per 16 bytes copied */ |
175 | add %o1, %g4, %o1 /* move src past the chunks; the table addresses them with negative offsets */ |
176 | sub %o5, %o4, %o5 |
177 | jmpl %o5 + %lo(80f), %g0 /* jump to the table entry that lies %o4 bytes of code before 80: */ |
178 | add %o0, %g4, %o0 /* delay slot: move dst past the chunks as well */ |
179 | |
180 | 79: /* memcpy_table */ |
181 | |
182 | MOVE_LASTCHUNK(o1, o0, 0x60, g2, g3, g4, g5) |
183 | MOVE_LASTCHUNK(o1, o0, 0x50, g2, g3, g4, g5) |
184 | MOVE_LASTCHUNK(o1, o0, 0x40, g2, g3, g4, g5) |
185 | MOVE_LASTCHUNK(o1, o0, 0x30, g2, g3, g4, g5) |
186 | MOVE_LASTCHUNK(o1, o0, 0x20, g2, g3, g4, g5) |
187 | MOVE_LASTCHUNK(o1, o0, 0x10, g2, g3, g4, g5) |
188 | MOVE_LASTCHUNK(o1, o0, 0x00, g2, g3, g4, g5) |
189 | |
190 | 80: /* memcpy_table_end */ |
191 | be 81f /* bit 3 of %g1 clear: no 8-byte piece */ |
192 | andcc %g1, 4, %g0 /* delay slot: flags for the be at 81: */ |
193 | |
194 | ldd [%o1], %g2 /* copy 8 bytes: doubleword load into %g2/%g3, two word stores */ |
195 | add %o0, 8, %o0 |
196 | st %g2, [%o0 - 0x08] |
197 | add %o1, 8, %o1 |
198 | st %g3, [%o0 - 0x04] |
199 | |
200 | 81: /* memcpy_last7 */ |
201 | /* also entered from short_aligned_end with %g1 = len and the flags of "andcc %o2, 4" */ |
202 | be 1f /* bit 2 clear: no word */ |
203 | andcc %g1, 2, %g0 /* delay slot: flags for the next be */ |
204 | |
205 | ld [%o1], %g2 /* copy 4 bytes */ |
206 | add %o1, 4, %o1 |
207 | st %g2, [%o0] |
208 | add %o0, 4, %o0 |
209 | 1: |
210 | be 1f /* bit 1 clear: no halfword */ |
211 | andcc %g1, 1, %g0 /* delay slot: flags for the next be */ |
212 | |
213 | lduh [%o1], %g2 /* copy 2 bytes */ |
214 | add %o1, 2, %o1 |
215 | sth %g2, [%o0] |
216 | add %o0, 2, %o0 |
217 | 1: |
218 | be 1f /* bit 0 clear: no final byte */ |
219 | nop |
220 | |
221 | ldub [%o1], %g2 /* copy the last byte */ |
222 | stb %g2, [%o0] |
223 | 1: |
224 | retl |
225 | mov %g7, %o0 /* delay slot: return the original dst */ |
226 | |
227 | 82: /* ldd_std */ |
228 | MOVE_BIGALIGNCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5) /* memcpy first enters at 82b + 4, after running the identical first ldd in a branch delay slot */ |
229 | MOVE_BIGALIGNCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5) |
230 | MOVE_BIGALIGNCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5) |
231 | MOVE_BIGALIGNCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5) |
232 | subcc %g1, 128, %g1 /* 128 bytes copied; the flags set here are overwritten by the cmp below */ |
233 | add %o1, 128, %o1 |
234 | cmp %g1, 128 |
235 | bge 82b /* signed compare: repeat while at least 128 bytes remain */ |
236 | add %o0, 128, %o0 /* delay slot */ |
237 | |
238 | andcc %g1, 0x70, %g4 /* %g4 = bytes to copy in whole 16-byte chunks (0 to 112) */ |
239 | be 84f /* none */ |
240 | andcc %g1, 8, %g0 /* delay slot: flags for the be at 84:; nothing before 84: modifies them */ |
241 | |
242 | sethi %hi(84f), %o5 |
243 | add %o1, %g4, %o1 /* move src past the chunks; the table addresses them with negative offsets */ |
244 | sub %o5, %g4, %o5 /* no scaling: each MOVE_LASTALIGNCHUNK is 4 insns (16 bytes of code) per 16 bytes copied */ |
245 | jmpl %o5 + %lo(84f), %g0 /* jump to the table entry that lies %g4 bytes of code before 84: */ |
246 | add %o0, %g4, %o0 /* delay slot: move dst past the chunks as well */ |
247 | |
248 | 83: /* amemcpy_table */ |
249 | |
250 | MOVE_LASTALIGNCHUNK(o1, o0, 0x60, g2, g3, g4, g5) |
251 | MOVE_LASTALIGNCHUNK(o1, o0, 0x50, g2, g3, g4, g5) |
252 | MOVE_LASTALIGNCHUNK(o1, o0, 0x40, g2, g3, g4, g5) |
253 | MOVE_LASTALIGNCHUNK(o1, o0, 0x30, g2, g3, g4, g5) |
254 | MOVE_LASTALIGNCHUNK(o1, o0, 0x20, g2, g3, g4, g5) |
255 | MOVE_LASTALIGNCHUNK(o1, o0, 0x10, g2, g3, g4, g5) |
256 | MOVE_LASTALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5) |
257 | |
258 | 84: /* amemcpy_table_end */ |
259 | be 85f /* bit 3 of %g1 clear: no 8-byte piece */ |
260 | andcc %g1, 4, %g0 /* delay slot: flags for the be at 85: */ |
261 | |
262 | ldd [%o1], %g2 /* copy 8 bytes with one doubleword load and one doubleword store */ |
263 | add %o0, 8, %o0 |
264 | std %g2, [%o0 - 0x08] |
265 | add %o1, 8, %o1 |
266 | 85: /* amemcpy_last7 */ |
267 | be 1f /* bit 2 clear: no word */ |
268 | andcc %g1, 2, %g0 /* delay slot: flags for the next be */ |
269 | |
270 | ld [%o1], %g2 /* copy 4 bytes */ |
271 | add %o1, 4, %o1 |
272 | st %g2, [%o0] |
273 | add %o0, 4, %o0 |
274 | 1: |
275 | be 1f /* bit 1 clear: no halfword */ |
276 | andcc %g1, 1, %g0 /* delay slot: flags for the next be */ |
277 | |
278 | lduh [%o1], %g2 /* copy 2 bytes */ |
279 | add %o1, 2, %o1 |
280 | sth %g2, [%o0] |
281 | add %o0, 2, %o0 |
282 | 1: |
283 | be 1f /* bit 0 clear: no final byte */ |
284 | nop |
285 | |
286 | ldub [%o1], %g2 /* copy the last byte */ |
287 | stb %g2, [%o0] |
288 | 1: |
289 | retl |
290 | mov %g7, %o0 /* delay slot: return the original dst */ |
291 | |
292 | 86: /* non_aligned */ |
293 | cmp %o2, 6 /* reached when (dst - src) & 3 != 0 */ |
294 | bleu 88f /* len <= 6 -> byte-pair table at short_end */ |
295 | nop |
296 | /* len >= 7: align dst to a word, then build each dst word from two neighbouring src words with shifts */ |
297 | save %sp, -96, %sp /* new register window: dst/src/len are now %i0/%i1/%i2 */ |
298 | andcc %i0, 3, %g0 |
299 | be 61f /* dst is word aligned already */ |
300 | andcc %i0, 1, %g0 /* delay slot */ |
301 | be 60f /* dst even but not word aligned: copy two bytes */ |
302 | andcc %i0, 2, %g0 /* delay slot: flags = bit 1 of the not yet advanced dst, consumed by the bne below */ |
303 | |
304 | ldub [%i1], %g5 /* dst odd: copy one byte */ |
305 | add %i1, 1, %i1 |
306 | stb %g5, [%i0] |
307 | sub %i2, 1, %i2 |
308 | bne 61f /* dst & 3 was 3: aligned after this one byte */ |
309 | add %i0, 1, %i0 /* delay slot */ |
310 | 60: |
311 | ldub [%i1], %g3 /* copy two bytes with byte accesses */ |
312 | add %i1, 2, %i1 |
313 | stb %g3, [%i0] |
314 | sub %i2, 2, %i2 |
315 | ldub [%i1 - 1], %g3 |
316 | add %i0, 2, %i0 |
317 | stb %g3, [%i0 - 1] |
318 | 61: |
319 | and %i1, 3, %g2 /* %g2 = src & 3; dst is word aligned here and at most 3 of the >= 7 bytes are gone */ |
320 | and %i2, 0xc, %g3 /* %g3 = len & 0xc, selects where the unrolled loop is entered */ |
321 | and %i1, -4, %i1 /* round src down to a word boundary */ |
322 | cmp %g3, 4 /* flags for the be and blu below */ |
323 | sll %g2, 3, %g4 /* %g4 = 8 * (src & 3): left shift count in bits */ |
324 | mov 32, %g2 |
325 | be 4f /* len & 0xc == 4 */ |
326 | sub %g2, %g4, %l0 /* delay slot: %l0 = 32 - %g4: right shift count in bits */ |
327 | |
328 | blu 3f /* len & 0xc == 0 */ |
329 | cmp %g3, 0x8 /* delay slot */ |
330 | |
331 | be 2f /* len & 0xc == 8 */ |
332 | srl %i2, 2, %g3 /* delay slot: %g3 = len / 4 */ |
333 | /* len & 0xc == 0xc: preload two words, bias dst by -8 and enter the loop at 8: */ |
334 | ld [%i1], %i3 |
335 | add %i0, -8, %i0 |
336 | ld [%i1 + 4], %i4 |
337 | b 8f |
338 | add %g3, 1, %g3 /* delay slot: bump the word count for this entry point */ |
339 | 2: |
340 | ld [%i1], %i4 /* preload two words, bias dst by -12 and src by -4, enter the loop at 9: */ |
341 | add %i0, -12, %i0 |
342 | ld [%i1 + 4], %i5 |
343 | add %g3, 2, %g3 |
344 | b 9f |
345 | add %i1, -4, %i1 /* delay slot */ |
346 | 3: |
347 | ld [%i1], %g1 /* preload two words, bias dst by -4 and src by +4, enter the loop at 7: */ |
348 | add %i0, -4, %i0 |
349 | ld [%i1 + 4], %i3 |
350 | srl %i2, 2, %g3 /* %g3 = len / 4 */ |
351 | b 7f |
352 | add %i1, 4, %i1 /* delay slot */ |
353 | 4: |
354 | ld [%i1], %i5 /* preload two words for an entry at the top of the loop */ |
355 | cmp %i2, 7 |
356 | ld [%i1 + 4], %g1 |
357 | srl %i2, 2, %g3 /* %g3 = len / 4 */ |
358 | bleu 10f /* len <= 7: only one word to store, go straight to the final merge */ |
359 | add %i1, 8, %i1 /* delay slot */ |
360 | |
361 | ld [%i1], %i3 |
362 | add %g3, -1, %g3 |
363 | 5: |
364 | sll %i5, %g4, %g2 /* each stage: (earlier word << %g4) | (later word >> %l0) gives one dst word */ |
365 | srl %g1, %l0, %g5 |
366 | or %g2, %g5, %g2 |
367 | st %g2, [%i0] |
368 | 7: |
369 | ld [%i1 + 4], %i4 |
370 | sll %g1, %g4, %g2 |
371 | srl %i3, %l0, %g5 |
372 | or %g2, %g5, %g2 |
373 | st %g2, [%i0 + 4] |
374 | 8: |
375 | ld [%i1 + 8], %i5 |
376 | sll %i3, %g4, %g2 |
377 | srl %i4, %l0, %g5 |
378 | or %g2, %g5, %g2 |
379 | st %g2, [%i0 + 8] |
380 | 9: |
381 | ld [%i1 + 12], %g1 |
382 | sll %i4, %g4, %g2 |
383 | srl %i5, %l0, %g5 |
384 | addcc %g3, -4, %g3 /* four words per pass; sets the flags for the bne,a below */ |
385 | or %g2, %g5, %g2 |
386 | add %i1, 16, %i1 |
387 | st %g2, [%i0 + 12] |
388 | add %i0, 16, %i0 |
389 | bne,a 5b /* loop while the adjusted word count is nonzero */ |
390 | ld [%i1], %i3 /* annulled delay slot: executed only when the branch is taken */ |
391 | 10: |
392 | sll %i5, %g4, %g2 /* merge and store the last whole word */ |
393 | srl %g1, %l0, %g5 |
394 | srl %l0, 3, %g3 /* %g3 = %l0 / 8 = 4 - (src & 3) */ |
395 | or %g2, %g5, %g2 |
396 | sub %i1, %g3, %i1 /* step src back by %g3 bytes; the ldub/stb pairs below read [%i1] for the byte stored at [%i0 + 4] */ |
397 | andcc %i2, 2, %g0 /* flags for the be below; the st in between does not change them */ |
398 | st %g2, [%i0] |
399 | be 1f /* bit 1 of len clear: no byte pair */ |
400 | andcc %i2, 1, %g0 /* delay slot: flags for the next be */ |
401 | |
402 | ldub [%i1], %g2 /* copy two trailing bytes */ |
403 | add %i1, 2, %i1 |
404 | stb %g2, [%i0 + 4] |
405 | add %i0, 2, %i0 |
406 | ldub [%i1 - 1], %g2 |
407 | stb %g2, [%i0 + 3] |
408 | 1: |
409 | be 1f /* bit 0 of len clear: no final byte */ |
410 | nop |
411 | ldub [%i1], %g2 /* copy the last byte */ |
412 | stb %g2, [%i0 + 4] |
413 | 1: |
414 | ret |
415 | restore %g7, %g0, %o0 /* delay slot: leave the window and return the original dst in the caller's %o0 */ |
416 | |
417 | 88: /* short_end */ |
418 | /* byte-wise copy through a jump table; entered from non_aligned (len <= 6) and from short_aligned_end (len <= 15, src not word aligned) */ |
419 | and %o2, 0xe, %o3 /* %o3 = len with bit 0 cleared: bytes handled as pairs */ |
420 | 20: |
421 | sethi %hi(89f), %o5 |
422 | sll %o3, 3, %o4 /* %o4 = 8 * %o3: each MOVE_SHORTCHUNK is 4 insns (16 bytes of code) per 2 bytes copied */ |
423 | add %o0, %o3, %o0 /* move dst past the pairs; the table addresses them with negative offsets */ |
424 | sub %o5, %o4, %o5 |
425 | add %o1, %o3, %o1 /* move src past the pairs as well */ |
426 | jmpl %o5 + %lo(89f), %g0 /* jump to the table entry that lies %o4 bytes of code before 89: */ |
427 | andcc %o2, 1, %g0 /* delay slot: flags for the be at 89:; the table's ldub/stb do not change them */ |
428 | |
429 | MOVE_SHORTCHUNK(o1, o0, 0x0c, g2, g3) |
430 | MOVE_SHORTCHUNK(o1, o0, 0x0a, g2, g3) |
431 | MOVE_SHORTCHUNK(o1, o0, 0x08, g2, g3) |
432 | MOVE_SHORTCHUNK(o1, o0, 0x06, g2, g3) |
433 | MOVE_SHORTCHUNK(o1, o0, 0x04, g2, g3) |
434 | MOVE_SHORTCHUNK(o1, o0, 0x02, g2, g3) |
435 | MOVE_SHORTCHUNK(o1, o0, 0x00, g2, g3) |
436 | |
437 | 89: /* short_table_end */ |
438 | |
439 | be 1f /* len even: nothing left */ |
440 | nop |
441 | |
442 | ldub [%o1], %g2 /* copy the odd last byte */ |
443 | stb %g2, [%o0] |
444 | 1: |
445 | retl |
446 | mov %g7, %o0 /* delay slot: return the original dst */ |
447 | |
448 | 90: /* short_aligned_end */ |
449 | bne 88b /* entered from memcpy with len <= 15 and the flags of "andcc %o1, 3": src not word aligned -> byte-pair table */ |
450 | andcc %o2, 8, %g0 /* delay slot: bit 3 of len */ |
451 | |
452 | be 1f /* bit clear: no 8-byte piece */ |
453 | andcc %o2, 4, %g0 /* delay slot: flags for the be at 81:; the ld/add/st below do not change them */ |
454 | |
455 | ld [%o1 + 0x00], %g2 /* copy 8 bytes as two words */ |
456 | ld [%o1 + 0x04], %g3 |
457 | add %o1, 8, %o1 |
458 | st %g2, [%o0 + 0x00] |
459 | st %g3, [%o0 + 0x04] |
460 | add %o0, 8, %o0 |
461 | 1: |
462 | b 81b /* finish the remaining 0 to 7 bytes in memcpy_last7 */ |
463 | mov %o2, %g1 /* delay slot: memcpy_last7 reads the length bits from %g1 */ |
464 | |