/* A Thunderx Optimized memcpy implementation for AARCH64.
   Copyright (C) 2017-2024 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* The actual code in this memcpy and memmove should be identical to the
   generic version except for the code under '#ifdef THUNDERX'.  This is
   to make it easier to keep this version and the generic version in sync
   for changes that are not specific to thunderx.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define A_hw	w7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	src
#define E_h	count
#define F_l	srcend
#define F_h	dst
#define G_l	count
#define G_h	dst
#define tmp1	x14
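
/* Note: the E/F/G pairs alias src, count, srcend and dst.  They are only
   used in the tail copies, after the values in those registers are no
   longer needed.  */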

/* Copies are split into 3 main cases: small copies of up to 16 bytes,
   medium copies of 17..96 bytes which are fully unrolled, and large
   copies of more than 96 bytes which align the destination and use an
   unrolled loop processing 64 bytes per iteration.
   In order to share code with memmove, small and medium copies read all
   data before writing, allowing any kind of overlap.  So small, medium
   and large backwards memmoves are handled by falling through into memcpy.
   Overlapping large forward memmoves use a loop that copies backwards.
*/
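
/* In rough C-like pseudocode the dispatch is (a sketch, not the exact
   flag logic):

     if (count <= 16)       copy with one access from each end;
     else if (count <= 96)  fully unrolled copy, all loads before stores;
     else                   align dst and loop over 64-byte blocks;

   memmove only takes a separate path for large forward-overlapping
   moves, which are copied backwards.  */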

ENTRY (__memmove_thunderx)

	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

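	/* Branch to the backward-copy path only when count > 96 and
	   dst - src is (unsigned) less than count, i.e. a large move whose
	   destination starts inside the source.  Every other case is safe
	   to fall through into memcpy.  */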
	sub	tmp1, dstin, src
	cmp	count, 96
	ccmp	tmp1, count, 2, hi
	b.lo	L(move_long)

	/* Common case falls through into memcpy.  */
END (__memmove_thunderx)

ENTRY (__memcpy_thunderx)

	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	prfm	PLDL1KEEP, [src]
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 16
	b.ls	L(copy16)
	cmp	count, 96
	b.hi	L(copy_long)

	/* Medium copies: 17..96 bytes.  */
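	/* tmp1 = count - 1: bit 6 set means count >= 65 (handled by copy96);
	   bit 5 set means count >= 33, so a second 16-byte pair is copied
	   from each end.  Accesses from the two ends may overlap in the
	   middle, which is harmless because every load is issued before
	   the first store.  */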
	sub	tmp1, count, 1
	ldp	A_l, A_h, [src]
	tbnz	tmp1, 6, L(copy96)
	ldp	D_l, D_h, [srcend, -16]
	tbz	tmp1, 5, 1f
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
1:
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Small copies: 0..16 bytes.  */
L(copy16):
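	/* 8..16 bytes: one 8-byte load/store from each end of the buffer.
	   The two accesses may overlap; that is fine because both loads
	   happen before either store.  4..7 bytes use the same trick with
	   4-byte accesses.  */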
	cmp	count, 8
	b.lo	1f
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret
	.p2align 4
1:
	tbz	count, 2, 1f
	ldr	A_lw, [src]
	ldr	A_hw, [srcend, -4]
	str	A_lw, [dstin]
	str	A_hw, [dstend, -4]
	ret

	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
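	/* For example, with count == 3, tmp1 == 1: src[0] -> dst[0],
	   src[1] -> dst[1], src[2] -> dst[2].  With count == 2, tmp1 == 1
	   and src[1] is written to dst[1] twice (via tmp1 and via dstend-1).
	   With count == 1, tmp1 == 0 and the single byte is written three
	   times.  */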
1:
	cbz	count, 2f
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	A_hw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	A_hw, [dstend, -1]
2:	ret

	.p2align 4
	/* Copy 64..96 bytes.  Copy 64 bytes from the start and
	   32 bytes from the end.  */
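	/* For counts below 96 the two halves overlap in the middle; the
	   overlapping stores rewrite bytes that were already copied, which
	   is safe because every load is issued before the first store.  */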
L(copy96):
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [src, 32]
	ldp	D_l, D_h, [src, 48]
	ldp	E_l, E_h, [srcend, -32]
	ldp	F_l, F_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin, 32]
	stp	D_l, D_h, [dstin, 48]
	stp	E_l, E_h, [dstend, -32]
	stp	F_l, F_h, [dstend, -16]
	ret

	/* Align DST to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are more than 96 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 64 bytes per iteration; the largest copies also software
	   prefetch ahead of the loads.  */

	.p2align 4
L(copy_long):

	/* On thunderx, large memcpys are helped by software prefetching.
	   This loop is identical to the one below it except that it also
	   issues prefetch instructions.  For copies of less than 32768
	   bytes the prefetching does not help and actually slows the code
	   down, so the prefetching loop is only used for the largest
	   copies.  */

	cmp	count, #32768
	b.lo	L(copy_long_without_prefetch)
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	ldp	D_l, D_h, [src]
	sub	src, src, tmp1
	prfm	pldl1strm, [src, 384]
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */
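	/* Before the subs above, count was dstend - dst (dst being the
	   aligned base).  The 128 + 16 covers the first 16 bytes above the
	   aligned base (already handled by the unaligned store of D), the
	   64 bytes loaded into A..D, and the 64-byte tail that L(last64)
	   always copies from the end.  */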

L(prefetch_loop64):
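	/* Prefetch 512 bytes ahead of the loads.  The tbz on bit 6 of src
	   issues the prfm only on alternate 64-byte iterations, i.e. once
	   per 128 bytes copied.  */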
	tbz	src, #6, 1f
	prfm	pldl1strm, [src, 512]
1:
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(prefetch_loop64)
	b	L(last64)

L(copy_long_without_prefetch):

	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	ldp	D_l, D_h, [src]
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(last64)
L(loop64):
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the end even if
	   there is just 1 byte left.  */
L(last64):
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.p2align 4
L(move_long):
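	/* tmp1 == 0 means src == dst, so there is nothing to move.  */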
	cbz	tmp1, 3f

	add	srcend, src, count
	add	dstend, dstin, count

	/* Align dstend to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are more than 96 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 64 bytes per iteration, working backwards from the end.  */

	and	tmp1, dstend, 15
	ldp	D_l, D_h, [srcend, -16]
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
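	/* Before the subs above, count was the distance from dstin to the
	   aligned dstend.  The 128 covers the 64 bytes already loaded into
	   A..D and the 64-byte head block that label 2 always copies from
	   the start.  */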
	b.ls	2f

	nop
1:
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	b.hi	1b

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the start even if
	   there is just 1 byte left.  */
2:
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
3:	ret

END (__memcpy_thunderx)