/* A Thunderx2 Optimized memcpy implementation for AARCH64.
   Copyright (C) 2018-2024 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define tmp2	x6
#define tmp3	x7
#define tmp3w	w7
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define A_hw	w7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	src
#define E_h	count
#define F_l	srcend
#define F_h	dst
#define G_l	count
#define G_h	dst
#define tmp1	x14

#define A_q	q0
#define B_q	q1
#define C_q	q2
#define D_q	q3
#define E_q	q4
#define F_q	q5
#define G_q	q6
#define H_q	q7
#define I_q	q16
#define J_q	q17

#define A_v	v0
#define B_v	v1
#define C_v	v2
#define D_v	v3
#define E_v	v4
#define F_v	v5
#define G_v	v6
#define H_v	v7
#define I_v	v16
#define J_v	v17

/* Overlapping large forward memmoves use a loop that copies backwards.
   Otherwise memcpy is used.  Small moves branch to memcopy16 directly.
   The longer memcpy cases fall through to the memcpy head.
*/
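
/* Illustrative C-like sketch of the dispatch performed by
   __memmove_thunderx2 below (not part of the build; the unsigned
   difference models the overlap test done with ccmp):

     if (count <= 16)
       goto memcopy16;                        // shared small-copy code
     if (count > 96 && (uint64_t) (dstin - src) < count)
       goto move_long;                        // forward overlap: copy backwards
     // otherwise fall through to __memcpy_thunderx2

   When count <= 96 the ccmp sets the C flag directly, so b.lo is not
   taken and the copy falls through to memcpy.  */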

ENTRY (__memmove_thunderx2)

	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	add	srcend, src, count
	cmp	count, 16
	b.ls	L(memcopy16)
	sub	tmp1, dstin, src
	cmp	count, 96
	ccmp	tmp1, count, 2, hi
	b.lo	L(move_long)

END (__memmove_thunderx2)


/* Copies are split into 3 main cases: small copies of up to 16 bytes,
   medium copies of 17..96 bytes, which are fully unrolled, and large
   copies of more than 96 bytes.  Large copies align the destination and
   use a load-and-merge approach when the src and dst addresses are not
   equally aligned, so that the actual loads and stores are always
   aligned.  Large copies use loops processing 64 bytes per iteration
   for the unaligned case and 128 bytes per iteration for the aligned
   one.
*/
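
/* Illustrative C-like sketch of the size dispatch below (not part of
   the build; "medium" stands for the fall-through 17..96 byte path):

     if (count <= 16)
       goto memcopy16;          // 0..16 bytes
     if (count > 96)
       goto memcopy_long;       // more than 96 bytes: align dst, wide loops
     goto medium;               // 17..96 bytes, fully unrolled
*/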

#define MEMCPY_PREFETCH_LDR 640

ENTRY (__memcpy_thunderx2)

	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	add	srcend, src, count
	cmp	count, 16
	b.ls	L(memcopy16)
	ldr	A_q, [src], #16
	add	dstend, dstin, count
	and	tmp1, src, 15
	cmp	count, 96
	b.hi	L(memcopy_long)

	/* Medium copies: 17..96 bytes.  */
	ldr	E_q, [srcend, -16]
	cmp	count, 64
	b.gt	L(memcpy_copy96)
	cmp	count, 48
	b.le	L(bytes_17_to_48)
	/* 49..64 bytes */
	ldp	B_q, C_q, [src]
	str	E_q, [dstend, -16]
	stp	A_q, B_q, [dstin]
	str	C_q, [dstin, 32]
	ret

L(bytes_17_to_48):
	/* 17..48 bytes */
	cmp	count, 32
	b.gt	L(bytes_32_to_48)
	/* 17..32 bytes */
	str	A_q, [dstin]
	str	E_q, [dstend, -16]
	ret

L(bytes_32_to_48):
	/* 33..48 bytes */
	ldr	B_q, [src]
	str	A_q, [dstin]
	str	E_q, [dstend, -16]
	str	B_q, [dstin, 16]
	ret

	.p2align 4
	/* Small copies: 0..16 bytes.  */
L(memcopy16):
	cmp	count, 8
	b.lo	L(bytes_0_to_8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	add	dstend, dstin, count
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret
	.p2align 4

L(bytes_0_to_8):
	tbz	count, 2, L(bytes_0_to_3)
	ldr	A_lw, [src]
	ldr	A_hw, [srcend, -4]
	add	dstend, dstin, count
	str	A_lw, [dstin]
	str	A_hw, [dstend, -4]
	ret

	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
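	/* C-like sketch of the branchless sequence below (illustrative
	   only; pointer names mirror the register aliases above):

	     if (count != 0)
	       {
	         size_t half = count >> 1;        // 0 if count == 1, else 1
	         unsigned char first = src[0];
	         unsigned char last  = src[count - 1];
	         unsigned char mid   = src[half];
	         dstin[half]      = mid;
	         dstin[count - 1] = last;
	         dstin[0]         = first;
	       }
	*/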
L(bytes_0_to_3):
	cbz	count, 1f
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	A_hw, [srcend, -1]
	add	dstend, dstin, count
	ldrb	B_lw, [src, tmp1]
	strb	B_lw, [dstin, tmp1]
	strb	A_hw, [dstend, -1]
	strb	A_lw, [dstin]
1:
	ret

	.p2align 4

L(memcpy_copy96):
	/* Copying 65..96 bytes.  A_q (first 16 bytes) and
	   E_q (last 16 bytes) are already loaded.  The size
	   is large enough to benefit from aligned loads.  */
	bic	src, src, 15
	ldp	B_q, C_q, [src]
	/* Loaded 64 bytes; the second 16-byte chunk can
	   overlap the first chunk by tmp1 bytes.
	   Stored 16 bytes.  */
	sub	dst, dstin, tmp1
	add	count, count, tmp1
	/* The range of count, [65..96], becomes [65..111]
	   after tmp1 [0..15] is added to it; count is now
	   <bytes-left-to-load> + 48.  */
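	/* For example, count = 96 with tmp1 = 15 becomes 111; the loads of
	   A_q, B_q and C_q so far cover 48 - tmp1 = 33 distinct source
	   bytes, so 96 - 33 = 63 bytes remain to be loaded, i.e. count - 48.  */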
	cmp	count, 80
	b.gt	L(copy96_medium)
	ldr	D_q, [src, 32]
	stp	B_q, C_q, [dst, 16]
	str	D_q, [dst, 48]
	str	A_q, [dstin]
	str	E_q, [dstend, -16]
	ret

	.p2align 4
L(copy96_medium):
	ldp	D_q, G_q, [src, 32]
	cmp	count, 96
	b.gt	L(copy96_large)
	stp	B_q, C_q, [dst, 16]
	stp	D_q, G_q, [dst, 48]
	str	A_q, [dstin]
	str	E_q, [dstend, -16]
	ret

L(copy96_large):
	ldr	F_q, [src, 64]
	str	B_q, [dst, 16]
	stp	C_q, D_q, [dst, 32]
	stp	G_q, F_q, [dst, 64]
	str	A_q, [dstin]
	str	E_q, [dstend, -16]
	ret

	.p2align 4
L(memcopy_long):
	bic	src, src, 15
	ldp	B_q, C_q, [src], #32
	sub	dst, dstin, tmp1
	add	count, count, tmp1
	add	dst, dst, 16
	and	tmp1, dst, 15
	ldp	D_q, E_q, [src], #32
	str	A_q, [dstin]

	/* Already loaded 64+16 bytes.  Check whether at least 64 more
	   bytes are left to load; count - 80 is the number of bytes
	   still to be loaded.  */
	subs	count, count, 64+64+16
	b.lt	L(loop128_exit0)
	cmp	count, MEMCPY_PREFETCH_LDR + 64 + 32
	b.lt	L(loop128)
	cbnz	tmp1, L(dst_unaligned)
	sub	count, count, MEMCPY_PREFETCH_LDR + 64 + 32

	.p2align 4

L(loop128_prefetch):
	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR]
	ldp	F_q, G_q, [src], #32
	stp	B_q, C_q, [dst], #32
	ldp	H_q, I_q, [src], #32
	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR]
	ldp	B_q, C_q, [src], #32
	stp	D_q, E_q, [dst], #32
	ldp	D_q, E_q, [src], #32
	stp	F_q, G_q, [dst], #32
	stp	H_q, I_q, [dst], #32
	subs	count, count, 128
	b.ge	L(loop128_prefetch)

	add	count, count, MEMCPY_PREFETCH_LDR + 64 + 32
	.p2align 4
L(loop128):
	ldp	F_q, G_q, [src], #32
	ldp	H_q, I_q, [src], #32
	stp	B_q, C_q, [dst], #32
	stp	D_q, E_q, [dst], #32
	subs	count, count, 64
	b.lt	L(loop128_exit1)
	ldp	B_q, C_q, [src], #32
	ldp	D_q, E_q, [src], #32
	stp	F_q, G_q, [dst], #32
	stp	H_q, I_q, [dst], #32
	subs	count, count, 64
	b.ge	L(loop128)
L(loop128_exit0):
	ldp	F_q, G_q, [srcend, -64]
	ldp	H_q, I_q, [srcend, -32]
	stp	B_q, C_q, [dst], #32
	stp	D_q, E_q, [dst]
	stp	F_q, G_q, [dstend, -64]
	stp	H_q, I_q, [dstend, -32]
	ret
L(loop128_exit1):
	ldp	B_q, C_q, [srcend, -64]
	ldp	D_q, E_q, [srcend, -32]
	stp	F_q, G_q, [dst], #32
	stp	H_q, I_q, [dst]
	stp	B_q, C_q, [dstend, -64]
	stp	D_q, E_q, [dstend, -32]
	ret

L(dst_unaligned_tail):
	ldp	C_q, D_q, [srcend, -64]
	ldp	E_q, F_q, [srcend, -32]
	stp	A_q, B_q, [dst], #32
	stp	H_q, I_q, [dst], #16
	str	G_q, [dst, tmp1]
	stp	C_q, D_q, [dstend, -64]
	stp	E_q, F_q, [dstend, -32]
	ret

L(dst_unaligned):
	/* For the unaligned store case the code loads two
	   aligned chunks and then merges them using the ext
	   instruction.  This can be up to 30% faster than
	   the simple unaligned store access.

	   Current state: tmp1 = dst % 16; C_q, D_q, E_q
	   contain data yet to be stored; src and dst point
	   to the next data to be processed; A_q, B_q contain
	   data already stored earlier; count = bytes left to
	   be loaded, decremented by 64.

	   Control is passed here if at least 64 bytes are left
	   to be loaded.  The code does two aligned loads and then
	   extracts the trailing tmp1 bytes from the first register
	   and the leading (16-tmp1) bytes from the next register,
	   forming the value for the aligned store.

	   As the ext instruction can only have its index encoded
	   as an immediate, 15 code chunks process each possible
	   index value.  A computed goto is used to reach the
	   required chunk.  */
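
	/* C-like sketch of one merged 16-byte store for shift s = tmp1
	   (illustrative only; prev and next are two consecutive 16-byte
	   aligned source chunks, out is a 16-byte aligned destination):

	     // ext Vd.16b, prev, next, 16-s places the last s bytes of
	     // prev in the low lanes and the first 16-s bytes of next in
	     // the high lanes, so the little-endian store writes:
	     for (int i = 0; i < s; i++)
	       out[i] = prev[16 - s + i];
	     for (int i = 0; i < 16 - s; i++)
	       out[s + i] = next[i];
	*/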

	/* Store the 16 bytes to dst and align dst for further
	   operations; several bytes will be stored at this
	   address once more.  */

	ldp	F_q, G_q, [src], #32
	stp	B_q, C_q, [dst], #32
	bic	dst, dst, 15
	sub	count, count, 32
	adrp	tmp2, L(ext_table)
	add	tmp2, tmp2, :lo12:L(ext_table)
	add	tmp2, tmp2, tmp1, LSL #2
	ldr	tmp3w, [tmp2]
	add	tmp2, tmp2, tmp3w, SXTW
	br	tmp2
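
	/* C-like sketch of the computed goto above (illustrative only;
	   ext_table is the table of 32-bit offsets defined at the end of
	   this file):

	     int32_t off = ext_table[tmp1];            // L(ext_size_tmp1) - .
	     goto *((char *) &ext_table[tmp1] + off);  // GCC computed goto
	*/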

.p2align 4
	/* To make the loop in each chunk 16-byte aligned.  */
	nop
#define EXT_CHUNK(shft) \
L(ext_size_ ## shft):;\
	ext	A_v.16b, C_v.16b, D_v.16b, 16-shft;\
	ext	B_v.16b, D_v.16b, E_v.16b, 16-shft;\
	ext	H_v.16b, E_v.16b, F_v.16b, 16-shft;\
1:;\
	stp	A_q, B_q, [dst], #32;\
	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR];\
	ldp	C_q, D_q, [src], #32;\
	ext	I_v.16b, F_v.16b, G_v.16b, 16-shft;\
	stp	H_q, I_q, [dst], #32;\
	ext	A_v.16b, G_v.16b, C_v.16b, 16-shft;\
	ext	B_v.16b, C_v.16b, D_v.16b, 16-shft;\
	ldp	F_q, G_q, [src], #32;\
	ext	H_v.16b, D_v.16b, F_v.16b, 16-shft;\
	subs	count, count, 64;\
	b.ge	1b;\
2:;\
	ext	I_v.16b, F_v.16b, G_v.16b, 16-shft;\
	b	L(dst_unaligned_tail);

EXT_CHUNK(1)
EXT_CHUNK(2)
EXT_CHUNK(3)
EXT_CHUNK(4)
EXT_CHUNK(5)
EXT_CHUNK(6)
EXT_CHUNK(7)
EXT_CHUNK(8)
EXT_CHUNK(9)
EXT_CHUNK(10)
EXT_CHUNK(11)
EXT_CHUNK(12)
EXT_CHUNK(13)
EXT_CHUNK(14)
EXT_CHUNK(15)

L(move_long):
	.p2align 4
1:
	cbz	tmp1, 3f

	add	srcend, src, count
	add	dstend, dstin, count

	and	tmp1, srcend, 15
	ldr	D_q, [srcend, -16]
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_q, B_q, [srcend, -32]
	str	D_q, [dstend, -16]
	ldp	C_q, D_q, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	2f

	.p2align 4
1:
	subs	count, count, 64
	stp	A_q, B_q, [dstend, -32]
	ldp	A_q, B_q, [srcend, -32]
	stp	C_q, D_q, [dstend, -64]!
	ldp	C_q, D_q, [srcend, -64]!
	b.hi	1b

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the start even if
	   there is just 1 byte left.  */
2:
	ldp	E_q, F_q, [src, 32]
	ldp	G_q, H_q, [src]
	stp	A_q, B_q, [dstend, -32]
	stp	C_q, D_q, [dstend, -64]
	stp	E_q, F_q, [dstin, 32]
	stp	G_q, H_q, [dstin]
3:	ret


END (__memcpy_thunderx2)
	.section .rodata
	.p2align 4

L(ext_table):
	/* The first entry is for the alignment of 0 and is never
	   actually used (could be any value).  */
	.word	0
	.word	L(ext_size_1) -.
	.word	L(ext_size_2) -.
	.word	L(ext_size_3) -.
	.word	L(ext_size_4) -.
	.word	L(ext_size_5) -.
	.word	L(ext_size_6) -.
	.word	L(ext_size_7) -.
	.word	L(ext_size_8) -.
	.word	L(ext_size_9) -.
	.word	L(ext_size_10) -.
	.word	L(ext_size_11) -.
	.word	L(ext_size_12) -.
	.word	L(ext_size_13) -.
	.word	L(ext_size_14) -.
	.word	L(ext_size_15) -.

