/* A Thunderx2 Optimized memcpy implementation for AARCH64.
   Copyright (C) 2018-2024 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#define dstin x0
#define src x1
#define count x2
#define dst x3
#define srcend x4
#define dstend x5
#define tmp2 x6
#define tmp3 x7
#define tmp3w w7
#define A_l x6
#define A_lw w6
#define A_h x7
#define A_hw w7
#define B_l x8
#define B_lw w8
#define B_h x9
#define C_l x10
#define C_h x11
#define D_l x12
#define D_h x13
#define E_l src
#define E_h count
#define F_l srcend
#define F_h dst
#define G_l count
#define G_h dst
#define tmp1 x14

#define A_q q0
#define B_q q1
#define C_q q2
#define D_q q3
#define E_q q4
#define F_q q5
#define G_q q6
#define H_q q7
#define I_q q16
#define J_q q17

#define A_v v0
#define B_v v1
#define C_v v2
#define D_v v3
#define E_v v4
#define F_v v5
#define G_v v6
#define H_v v7
#define I_v v16
#define J_v v17

/* Overlapping large forward memmoves use a loop that copies backwards.
   Otherwise memcpy is used.  Small moves branch to memcopy16 directly.
   The longer memcpy cases fall through to the memcpy head.
*/

ENTRY (__memmove_thunderx2)

	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	add srcend, src, count
	cmp count, 16
	b.ls L(memcopy16)
	sub tmp1, dstin, src
	cmp count, 96
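	/* If count > 96, check for a forward overlap: dstin - src < count
	   (unsigned) means dst lies inside the source region, so a forward
	   copy would clobber bytes that have not been read yet and the
	   backward-copying move_long path is taken.  Roughly:
	     if (count > 96 && (uintptr_t) dstin - (uintptr_t) src < count)
	       goto move_long;
	   Otherwise fall through into memcpy.  */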
	ccmp tmp1, count, 2, hi
	b.lo L(move_long)

END (__memmove_thunderx2)


/* Copies are split into 3 main cases: small copies of up to 16 bytes,
   medium copies of 17..96 bytes, which are fully unrolled, and large
   copies of more than 96 bytes.  Large copies align the destination
   and, when the src and dst addresses are not equally aligned, use a
   load-and-merge approach so that the actual loads and stores are
   always aligned.  The large-copy loops process 64 bytes per iteration
   in the unaligned case and 128 bytes per iteration in the aligned one.
*/

#define MEMCPY_PREFETCH_LDR 640

ENTRY (__memcpy_thunderx2)

	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	add srcend, src, count
	cmp count, 16
	b.ls L(memcopy16)
	ldr A_q, [src], #16
	add dstend, dstin, count
	and tmp1, src, 15
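	/* tmp1 = src % 16.  The post-increment of src by 16 above does not
	   change the low bits, so this is the misalignment of the original
	   src.  */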
	cmp count, 96
	b.hi L(memcopy_long)

	/* Medium copies: 17..96 bytes.  */
	ldr E_q, [srcend, -16]
	cmp count, 64
	b.gt L(memcpy_copy96)
	cmp count, 48
	b.le L(bytes_17_to_48)
	/* 49..64 bytes.  */
	ldp B_q, C_q, [src]
	str E_q, [dstend, -16]
	stp A_q, B_q, [dstin]
	str C_q, [dstin, 32]
	ret

L(bytes_17_to_48):
	/* 17..48 bytes.  */
	cmp count, 32
	b.gt L(bytes_32_to_48)
	/* 17..32 bytes.  */
	str A_q, [dstin]
	str E_q, [dstend, -16]
	ret

L(bytes_32_to_48):
	/* 32..48 bytes.  */
	ldr B_q, [src]
	str A_q, [dstin]
	str E_q, [dstend, -16]
	str B_q, [dstin, 16]
	ret

	.p2align 4
	/* Small copies: 0..16 bytes.  */
L(memcopy16):
	cmp count, 8
	b.lo L(bytes_0_to_8)
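	/* 8..16 bytes: the first and last 8 bytes may overlap and together
	   cover the whole range.  */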
	ldr A_l, [src]
	ldr A_h, [srcend, -8]
	add dstend, dstin, count
	str A_l, [dstin]
	str A_h, [dstend, -8]
	ret
	.p2align 4

L(bytes_0_to_8):
	tbz count, 2, L(bytes_0_to_3)
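	/* 4..7 bytes: the first and last 4 bytes overlap and cover the
	   whole range.  */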
	ldr A_lw, [src]
	ldr A_hw, [srcend, -4]
	add dstend, dstin, count
	str A_lw, [dstin]
	str A_hw, [dstend, -4]
	ret

	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
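	/* Illustrative C sketch of the sequence below (not part of the
	   build); all bytes are loaded before any store, so overlapping
	   moves are handled too:
	     if (count != 0) {
	       unsigned char first = src[0];
	       unsigned char mid = src[count >> 1];
	       unsigned char last = src[count - 1];
	       dstin[count >> 1] = mid;
	       dstin[count - 1] = last;
	       dstin[0] = first;
	     }  */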
L(bytes_0_to_3):
	cbz count, 1f
	lsr tmp1, count, 1
	ldrb A_lw, [src]
	ldrb A_hw, [srcend, -1]
	add dstend, dstin, count
	ldrb B_lw, [src, tmp1]
	strb B_lw, [dstin, tmp1]
	strb A_hw, [dstend, -1]
	strb A_lw, [dstin]
1:
	ret

	.p2align 4

L(memcpy_copy96):
	/* Copying 65..96 bytes.  A_q (first 16 bytes) and
	   E_q (last 16 bytes) are already loaded.  The size
	   is large enough to benefit from aligned loads.  */
	bic src, src, 15
	ldp B_q, C_q, [src]
	/* 64 bytes have been loaded; the second 16-byte chunk
	   (B_q) can overlap the first one (A_q) by tmp1 bytes.  */
	sub dst, dstin, tmp1
	add count, count, tmp1
	/* The count range [65..96] becomes [65..111] after
	   tmp1 [0..15] is added to it; count is now
	   <bytes-left-to-load> + 48.  */
	cmp count, 80
	b.gt L(copy96_medium)
	ldr D_q, [src, 32]
	stp B_q, C_q, [dst, 16]
	str D_q, [dst, 48]
	str A_q, [dstin]
	str E_q, [dstend, -16]
	ret

	.p2align 4
L(copy96_medium):
	ldp D_q, G_q, [src, 32]
	cmp count, 96
	b.gt L(copy96_large)
	stp B_q, C_q, [dst, 16]
	stp D_q, G_q, [dst, 48]
	str A_q, [dstin]
	str E_q, [dstend, -16]
	ret

L(copy96_large):
	ldr F_q, [src, 64]
	str B_q, [dst, 16]
	stp C_q, D_q, [dst, 32]
	stp G_q, F_q, [dst, 64]
	str A_q, [dstin]
	str E_q, [dstend, -16]
	ret

	.p2align 4
L(memcopy_long):
	bic src, src, 15
	ldp B_q, C_q, [src], #32
	sub dst, dstin, tmp1
	add count, count, tmp1
	add dst, dst, 16
	and tmp1, dst, 15
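	/* tmp1 now holds dst % 16.  For long enough copies a nonzero value
	   selects the ext-merging unaligned-store path below; zero keeps
	   the fully aligned loops.  */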
	ldp D_q, E_q, [src], #32
	str A_q, [dstin]

	/* Already loaded 64+16 bytes.  Check whether at
	   least 64 more bytes are left.  */
	subs count, count, 64+64+16
	b.lt L(loop128_exit0)
	cmp count, MEMCPY_PREFETCH_LDR + 64 + 32
	b.lt L(loop128)
	cbnz tmp1, L(dst_unaligned)
	sub count, count, MEMCPY_PREFETCH_LDR + 64 + 32

	.p2align 4
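	/* Main aligned loop: 128 bytes per iteration, prefetching
	   MEMCPY_PREFETCH_LDR bytes ahead of the current src.  count was
	   biased down above so this loop stops early; the bias is added
	   back afterwards and the remainder is finished by loop128.  */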
L(loop128_prefetch):
	prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR]
	ldp F_q, G_q, [src], #32
	stp B_q, C_q, [dst], #32
	ldp H_q, I_q, [src], #32
	prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR]
	ldp B_q, C_q, [src], #32
	stp D_q, E_q, [dst], #32
	ldp D_q, E_q, [src], #32
	stp F_q, G_q, [dst], #32
	stp H_q, I_q, [dst], #32
	subs count, count, 128
	b.ge L(loop128_prefetch)

	add count, count, MEMCPY_PREFETCH_LDR + 64 + 32
	.p2align 4
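	/* 128 bytes per iteration in two 64-byte halves; the exit paths
	   copy the final 64 bytes relative to srcend/dstend.  */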
L(loop128):
	ldp F_q, G_q, [src], #32
	ldp H_q, I_q, [src], #32
	stp B_q, C_q, [dst], #32
	stp D_q, E_q, [dst], #32
	subs count, count, 64
	b.lt L(loop128_exit1)
	ldp B_q, C_q, [src], #32
	ldp D_q, E_q, [src], #32
	stp F_q, G_q, [dst], #32
	stp H_q, I_q, [dst], #32
	subs count, count, 64
	b.ge L(loop128)
L(loop128_exit0):
	ldp F_q, G_q, [srcend, -64]
	ldp H_q, I_q, [srcend, -32]
	stp B_q, C_q, [dst], #32
	stp D_q, E_q, [dst]
	stp F_q, G_q, [dstend, -64]
	stp H_q, I_q, [dstend, -32]
	ret
L(loop128_exit1):
	ldp B_q, C_q, [srcend, -64]
	ldp D_q, E_q, [srcend, -32]
	stp F_q, G_q, [dst], #32
	stp H_q, I_q, [dst]
	stp B_q, C_q, [dstend, -64]
	stp D_q, E_q, [dstend, -32]
	ret

L(dst_unaligned_tail):
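	/* Tail of the ext-merging path: store the registers that already
	   hold merged data, store the last raw source chunk (G_q) at its
	   unaligned offset, then copy the final 64 bytes via srcend and
	   dstend.  */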
	ldp C_q, D_q, [srcend, -64]
	ldp E_q, F_q, [srcend, -32]
	stp A_q, B_q, [dst], #32
	stp H_q, I_q, [dst], #16
	str G_q, [dst, tmp1]
	stp C_q, D_q, [dstend, -64]
	stp E_q, F_q, [dstend, -32]
	ret

L(dst_unaligned):
	/* For the unaligned store case the code loads two
	   aligned chunks and then merges them using the ext
	   instruction.  This can be up to 30% faster than
	   a simple unaligned store access.

	   Current state: tmp1 = dst % 16; C_q, D_q, E_q
	   contain data yet to be stored; src and dst point
	   to the next data to be processed; A_q, B_q contain
	   data already stored earlier; count = bytes left to
	   be loaded, decremented by 64.

	   Control is passed here if at least 64 bytes are left
	   to be loaded.  The code does two aligned loads and then
	   extracts (16-tmp1) bytes from the first register and
	   tmp1 bytes from the next register, forming the value
	   for the aligned store.

	   As the ext instruction can only have its index encoded
	   as an immediate, 15 code chunks process each possible
	   index value.  A computed goto is used to reach the
	   required code.  */

	/* Store B_q and C_q to dst and align dst for further
	   operations; several bytes will be stored at this
	   address once more.  */

	ldp F_q, G_q, [src], #32
	stp B_q, C_q, [dst], #32
	bic dst, dst, 15
	sub count, count, 32
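	/* Computed goto: each L(ext_table) entry holds the 32-bit offset
	   from the entry itself to the matching EXT_CHUNK handler.  Index
	   the table by tmp1, then add the sign-extended offset to the
	   entry's address to form the branch target.  */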
	adrp tmp2, L(ext_table)
	add tmp2, tmp2, :lo12:L(ext_table)
	add tmp2, tmp2, tmp1, LSL #2
	ldr tmp3w, [tmp2]
	add tmp2, tmp2, tmp3w, SXTW
	br tmp2

	.p2align 4
	/* To make the loop in each chunk 16-byte aligned.  */
	nop
#define EXT_CHUNK(shft) \
L(ext_size_ ## shft):;\
	ext A_v.16b, C_v.16b, D_v.16b, 16-shft;\
	ext B_v.16b, D_v.16b, E_v.16b, 16-shft;\
	ext H_v.16b, E_v.16b, F_v.16b, 16-shft;\
1:;\
	stp A_q, B_q, [dst], #32;\
	prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR];\
	ldp C_q, D_q, [src], #32;\
	ext I_v.16b, F_v.16b, G_v.16b, 16-shft;\
	stp H_q, I_q, [dst], #32;\
	ext A_v.16b, G_v.16b, C_v.16b, 16-shft;\
	ext B_v.16b, C_v.16b, D_v.16b, 16-shft;\
	ldp F_q, G_q, [src], #32;\
	ext H_v.16b, D_v.16b, F_v.16b, 16-shft;\
	subs count, count, 64;\
	b.ge 1b;\
2:;\
	ext I_v.16b, F_v.16b, G_v.16b, 16-shft;\
	b L(dst_unaligned_tail);
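
/* One merging loop is instantiated for each possible misalignment
   value 1..15.  */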
EXT_CHUNK(1)
EXT_CHUNK(2)
EXT_CHUNK(3)
EXT_CHUNK(4)
EXT_CHUNK(5)
EXT_CHUNK(6)
EXT_CHUNK(7)
EXT_CHUNK(8)
EXT_CHUNK(9)
EXT_CHUNK(10)
EXT_CHUNK(11)
EXT_CHUNK(12)
EXT_CHUNK(13)
EXT_CHUNK(14)
EXT_CHUNK(15)

L(move_long):
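	/* Backward copy for large overlapping moves where dst lies within
	   the source region.  If src == dst (tmp1 == 0) there is nothing
	   to do.  Otherwise copy the last 16 bytes first, align srcend
	   down so the bulk loads are aligned, copy backwards 64 bytes per
	   iteration, and finish with 64 bytes from the start of the
	   buffers.  */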
	.p2align 4
1:
	cbz tmp1, 3f

	add srcend, src, count
	add dstend, dstin, count

	and tmp1, srcend, 15
	ldr D_q, [srcend, -16]
	sub srcend, srcend, tmp1
	sub count, count, tmp1
	ldp A_q, B_q, [srcend, -32]
	str D_q, [dstend, -16]
	ldp C_q, D_q, [srcend, -64]!
	sub dstend, dstend, tmp1
	subs count, count, 128
	b.ls 2f

	.p2align 4
1:
	subs count, count, 64
	stp A_q, B_q, [dstend, -32]
	ldp A_q, B_q, [srcend, -32]
	stp C_q, D_q, [dstend, -64]!
	ldp C_q, D_q, [srcend, -64]!
	b.hi 1b

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the start even if
	   there is just 1 byte left.  */
2:
	ldp E_q, F_q, [src, 32]
	ldp G_q, H_q, [src]
	stp A_q, B_q, [dstend, -32]
	stp C_q, D_q, [dstend, -64]
	stp E_q, F_q, [dstin, 32]
	stp G_q, H_q, [dstin]
3: ret


END (__memcpy_thunderx2)
	.section .rodata
	.p2align 4

L(ext_table):
	/* The first entry is for the alignment of 0 and is never
	   actually used (could be any value).  */
	.word 0
	.word L(ext_size_1) -.
	.word L(ext_size_2) -.
	.word L(ext_size_3) -.
	.word L(ext_size_4) -.
	.word L(ext_size_5) -.
	.word L(ext_size_6) -.
	.word L(ext_size_7) -.
	.word L(ext_size_8) -.
	.word L(ext_size_9) -.
	.word L(ext_size_10) -.
	.word L(ext_size_11) -.
	.word L(ext_size_12) -.
	.word L(ext_size_13) -.
	.word L(ext_size_14) -.
	.word L(ext_size_15) -.