/* A Thunderx Optimized memcpy implementation for AARCH64.
   Copyright (C) 2017-2024 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* The actual code in this memcpy and memmove should be identical to the
   generic version except for the code under '#ifdef THUNDERX'.  This is
   to make it easier to keep this version and the generic version in sync
   for changes that are not specific to thunderx.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#define dstin    x0
#define src      x1
#define count    x2
#define dst      x3
#define srcend   x4
#define dstend   x5
#define A_l      x6
#define A_lw     w6
#define A_h      x7
#define A_hw     w7
#define B_l      x8
#define B_lw     w8
#define B_h      x9
#define C_l      x10
#define C_h      x11
#define D_l      x12
#define D_h      x13
#define E_l      src
#define E_h      count
#define F_l      srcend
#define F_h      dst
#define G_l      count
#define G_h      dst
#define tmp1     x14
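
/* Note that E_l..G_h alias argument registers (src, count, srcend and dst)
   whose values are dead by the time these temporaries are loaded, so the
   tail copies below get extra scratch registers without tying up more of
   the general register file.  */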

/* Copies are split into 3 main cases: small copies of up to 16 bytes,
   medium copies of 17..96 bytes, which are fully unrolled, and large
   copies of more than 96 bytes, which align the destination and use an
   unrolled loop processing 64 bytes per iteration.
   In order to share code with memmove, small and medium copies read all
   data before writing, allowing any kind of overlap.  So small, medium
   and large backwards memmoves are handled by falling through into memcpy.
   Overlapping large forward memmoves use a loop that copies backwards.
*/
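
/* A minimal C-level sketch of that dispatch (illustrative only; the helper
   names are informal and do not exist in this file):

     if (count <= 16)        // small
       copy_0_16 (dstin, src, count);
     else if (count <= 96)   // medium, fully unrolled
       copy_17_96 (dstin, src, count);
     else                    // large, 64 bytes per iteration
       copy_long (dstin, src, count);
 */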

ENTRY (__memmove_thunderx)

        PTR_ARG (0)
        PTR_ARG (1)
        SIZE_ARG (2)

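        /* Take the backward-copying path only when the copy is large
           (count > 96) and dstin - src, viewed as an unsigned value, is
           below count, i.e. the destination starts inside the source region
           (or coincides with it) and a forward copy would overwrite source
           bytes before they are read.  All other cases fall through into
           memcpy below.  */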
        sub     tmp1, dstin, src
        cmp     count, 96
        ccmp    tmp1, count, 2, hi
        b.lo    L(move_long)

        /* Common case falls through into memcpy.  */
END (__memmove_thunderx)

ENTRY (__memcpy_thunderx)

        PTR_ARG (0)
        PTR_ARG (1)
        SIZE_ARG (2)

        prfm    PLDL1KEEP, [src]
        add     srcend, src, count
        add     dstend, dstin, count
        cmp     count, 16
        b.ls    L(copy16)
        cmp     count, 96
        b.hi    L(copy_long)

        /* Medium copies: 17..96 bytes.  */
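        /* Single-bit tests on count - 1 pick the sub-case: bit 6 set means
           count is 65..96, handled by L(copy96); otherwise bit 5 set means
           count is 33..64, so 16 extra bytes are copied from each end on top
           of the 16 bytes from each end that already cover 17..32.  */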
        sub     tmp1, count, 1
        ldp     A_l, A_h, [src]
        tbnz    tmp1, 6, L(copy96)
        ldp     D_l, D_h, [srcend, -16]
        tbz     tmp1, 5, 1f
        ldp     B_l, B_h, [src, 16]
        ldp     C_l, C_h, [srcend, -32]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstend, -32]
1:
        stp     A_l, A_h, [dstin]
        stp     D_l, D_h, [dstend, -16]
        ret

        .p2align 4
        /* Small copies: 0..16 bytes.  */
L(copy16):
        cmp     count, 8
        b.lo    1f
        ldr     A_l, [src]
        ldr     A_h, [srcend, -8]
        str     A_l, [dstin]
        str     A_h, [dstend, -8]
        ret
        .p2align 4
1:
        tbz     count, 2, 1f
        ldr     A_lw, [src]
        ldr     A_hw, [srcend, -4]
        str     A_lw, [dstin]
        str     A_hw, [dstend, -4]
        ret

        /* Copy 0..3 bytes.  Use a branchless sequence that copies the same
           byte 3 times if count==1, or the 2nd byte twice if count==2.  */
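        /* Roughly, in C (a sketch; mid is just an illustrative name):

             size_t mid = count >> 1;
             dst[0]         = src[0];
             dst[mid]       = src[mid];
             dst[count - 1] = src[count - 1];

           For count==1 all three stores hit byte 0, for count==2 the last
           two both hit byte 1, and for count==3 each byte is written once.
           The assembly performs all loads before any store, so overlapping
           buffers remain safe.  */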
1:
        cbz     count, 2f
        lsr     tmp1, count, 1
        ldrb    A_lw, [src]
        ldrb    A_hw, [srcend, -1]
        ldrb    B_lw, [src, tmp1]
        strb    A_lw, [dstin]
        strb    B_lw, [dstin, tmp1]
        strb    A_hw, [dstend, -1]
2:      ret

        .p2align 4
        /* Copy 64..96 bytes.  Copy 64 bytes from the start and
           32 bytes from the end.  */
L(copy96):
        ldp     B_l, B_h, [src, 16]
        ldp     C_l, C_h, [src, 32]
        ldp     D_l, D_h, [src, 48]
        ldp     E_l, E_h, [srcend, -32]
        ldp     F_l, F_h, [srcend, -16]
        stp     A_l, A_h, [dstin]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstin, 32]
        stp     D_l, D_h, [dstin, 48]
        stp     E_l, E_h, [dstend, -32]
        stp     F_l, F_h, [dstend, -16]
        ret

/* Align DST to 16 byte alignment so that we don't cross cache line
   boundaries on both loads and stores.  There are at least 96 bytes
   to copy, so copy 16 bytes unaligned and then align.  The loop
   copies 64 bytes per iteration and prefetches one iteration ahead.  */
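
/* The alignment works by rounding dst down to a 16-byte boundary and moving
   src back by the same amount (dstin & 15), so the source-to-destination
   distance is unchanged; the first aligned store at [dst, 16] then overlaps
   or abuts the 16 bytes already stored unaligned at dstin, so no byte of the
   destination is skipped.  */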

        .p2align 4
L(copy_long):

        /* On thunderx, large memcpys are helped by software prefetching.
           This loop is identical to the one below it except that it also
           issues prefetch instructions.  For copies of less than 32768
           bytes the prefetching does not help and actually slows the code
           down, so the prefetching loop is only used for the largest
           copies.  */
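
        /* The priming prfm below fetches roughly 384 bytes ahead of the
           adjusted src; inside the loop a prefetch is issued only on
           iterations where bit 6 of src is set, i.e. once per 128 bytes
           copied, reaching 512 bytes ahead.  PLDL1STRM marks the data as
           streaming (used once) rather than likely to be reused.  */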

        cmp     count, #32768
        b.lo    L(copy_long_without_prefetch)
        and     tmp1, dstin, 15
        bic     dst, dstin, 15
        ldp     D_l, D_h, [src]
        sub     src, src, tmp1
        prfm    pldl1strm, [src, 384]
        add     count, count, tmp1      /* Count is now 16 too large.  */
        ldp     A_l, A_h, [src, 16]
        stp     D_l, D_h, [dstin]
        ldp     B_l, B_h, [src, 32]
        ldp     C_l, C_h, [src, 48]
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 128 + 16  /* Test and readjust count.  */
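        /* 128 + 16 accounts for the 16 bytes of over-count added above, the
           64 bytes already loaded into A..D, and the 64 bytes that L(last64)
           always copies from the end, so the loop exits once at most 64
           bytes remain beyond what is already in registers.  The
           non-prefetching path below makes the same adjustment.  */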

L(prefetch_loop64):
        tbz     src, #6, 1f
        prfm    pldl1strm, [src, 512]
1:
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [src, 16]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [src, 32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [src, 48]
        stp     D_l, D_h, [dst, 64]!
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 64
        b.hi    L(prefetch_loop64)
        b       L(last64)

L(copy_long_without_prefetch):

        and     tmp1, dstin, 15
        bic     dst, dstin, 15
        ldp     D_l, D_h, [src]
        sub     src, src, tmp1
        add     count, count, tmp1      /* Count is now 16 too large.  */
        ldp     A_l, A_h, [src, 16]
        stp     D_l, D_h, [dstin]
        ldp     B_l, B_h, [src, 32]
        ldp     C_l, C_h, [src, 48]
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 128 + 16  /* Test and readjust count.  */
        b.ls    L(last64)
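        /* Skip the loop entirely when what remains is already covered by the
           64 bytes held in A..D plus the 64 bytes that L(last64) copies from
           the end.  */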
L(loop64):
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [src, 16]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [src, 32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [src, 48]
        stp     D_l, D_h, [dst, 64]!
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 64
        b.hi    L(loop64)

        /* Write the last full set of 64 bytes.  The remainder is at most 64
           bytes, so it is safe to always copy 64 bytes from the end even if
           there is just 1 byte left.  */
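        /* On entry A..D still hold the 64 bytes loaded by the final loop
           iteration (or by the setup code if the loop was skipped); they are
           written out below, interleaved with reloading the last 64 bytes of
           the source, which are then stored relative to dstend.  */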
L(last64):
        ldp     E_l, E_h, [srcend, -64]
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [srcend, -48]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [srcend, -32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [srcend, -16]
        stp     D_l, D_h, [dst, 64]
        stp     E_l, E_h, [dstend, -64]
        stp     A_l, A_h, [dstend, -48]
        stp     B_l, B_h, [dstend, -32]
        stp     C_l, C_h, [dstend, -16]
        ret

        .p2align 4
L(move_long):
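        /* tmp1 still holds dstin - src from the memmove entry; zero means
           the source and destination are identical, so there is nothing to
           copy.  */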
        cbz     tmp1, 3f

        add     srcend, src, count
        add     dstend, dstin, count

        /* Align dstend to 16 byte alignment so that we don't cross cache line
           boundaries on both loads and stores.  There are at least 96 bytes
           to copy, so copy 16 bytes unaligned and then align.  The loop
           copies 64 bytes per iteration and prefetches one iteration ahead.  */

        and     tmp1, dstend, 15
        ldp     D_l, D_h, [srcend, -16]
        sub     srcend, srcend, tmp1
        sub     count, count, tmp1
        ldp     A_l, A_h, [srcend, -16]
        stp     D_l, D_h, [dstend, -16]
        ldp     B_l, B_h, [srcend, -32]
        ldp     C_l, C_h, [srcend, -48]
        ldp     D_l, D_h, [srcend, -64]!
        sub     dstend, dstend, tmp1
        subs    count, count, 128
        b.ls    2f

        nop
1:
        stp     A_l, A_h, [dstend, -16]
        ldp     A_l, A_h, [srcend, -16]
        stp     B_l, B_h, [dstend, -32]
        ldp     B_l, B_h, [srcend, -32]
        stp     C_l, C_h, [dstend, -48]
        ldp     C_l, C_h, [srcend, -48]
        stp     D_l, D_h, [dstend, -64]!
        ldp     D_l, D_h, [srcend, -64]!
        subs    count, count, 64
        b.hi    1b

        /* Write the last full set of 64 bytes.  The remainder is at most 64
           bytes, so it is safe to always copy 64 bytes from the start even if
           there is just 1 byte left.  */
2:
        ldp     G_l, G_h, [src, 48]
        stp     A_l, A_h, [dstend, -16]
        ldp     A_l, A_h, [src, 32]
        stp     B_l, B_h, [dstend, -32]
        ldp     B_l, B_h, [src, 16]
        stp     C_l, C_h, [dstend, -48]
        ldp     C_l, C_h, [src]
        stp     D_l, D_h, [dstend, -64]
        stp     G_l, G_h, [dstin, 48]
        stp     A_l, A_h, [dstin, 32]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstin]
3:      ret

END (__memcpy_thunderx)