/* A Thunderx Optimized memcpy implementation for AARCH64.
   Copyright (C) 2017-2024 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* The actual code in this memcpy and memmove should be identical to the
   generic version except for the code under '#ifdef THUNDERX'.  This is
   to make it easier to keep this version and the generic version in sync
   for changes that are not specific to thunderx.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#define dstin    x0
#define src      x1
#define count    x2
#define dst      x3
#define srcend   x4
#define dstend   x5
#define A_l      x6
#define A_lw     w6
#define A_h      x7
#define A_hw     w7
#define B_l      x8
#define B_lw     w8
#define B_h      x9
#define C_l      x10
#define C_h      x11
#define D_l      x12
#define D_h      x13
#define E_l      src
#define E_h      count
#define F_l      srcend
#define F_h      dst
#define G_l      count
#define G_h      dst
#define tmp1     x14
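
/* Note that E_l..G_h alias argument registers (src, count, srcend and dst)
   whose values are dead by the time these temporaries are loaded, so the
   tail copies below get extra scratch registers without tying up more of
   the general register file.  */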

/* Copies are split into 3 main cases: small copies of up to 16 bytes,
   medium copies of 17..96 bytes, which are fully unrolled, and large
   copies of more than 96 bytes, which align the destination and use an
   unrolled loop processing 64 bytes per iteration.
   In order to share code with memmove, small and medium copies read all
   data before writing, allowing any kind of overlap.  So small, medium
   and large backwards memmoves are handled by falling through into memcpy.
   Overlapping large forward memmoves use a loop that copies backwards.
*/
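
/* A minimal C-level sketch of that dispatch (illustrative only; the helper
   names are informal and do not exist in this file):

     if (count <= 16)        // small
       copy_0_16 (dstin, src, count);
     else if (count <= 96)   // medium, fully unrolled
       copy_17_96 (dstin, src, count);
     else                    // large, 64 bytes per iteration
       copy_long (dstin, src, count);
 */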

ENTRY (__memmove_thunderx)

        PTR_ARG (0)
        PTR_ARG (1)
        SIZE_ARG (2)

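        /* Take the backward-copying path only when the copy is large
           (count > 96) and dstin - src, viewed as an unsigned value, is
           below count, i.e. the destination starts inside the source region
           (or coincides with it) and a forward copy would overwrite source
           bytes before they are read.  All other cases fall through into
           memcpy below.  */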
        sub     tmp1, dstin, src
        cmp     count, 96
        ccmp    tmp1, count, 2, hi
        b.lo    L(move_long)

        /* Common case falls through into memcpy.  */
END (__memmove_thunderx)

ENTRY (__memcpy_thunderx)

        PTR_ARG (0)
        PTR_ARG (1)
        SIZE_ARG (2)

        prfm    PLDL1KEEP, [src]
        add     srcend, src, count
        add     dstend, dstin, count
        cmp     count, 16
        b.ls    L(copy16)
        cmp     count, 96
        b.hi    L(copy_long)

        /* Medium copies: 17..96 bytes.  */
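        /* Single-bit tests on count - 1 pick the sub-case: bit 6 set means
           count is 65..96, handled by L(copy96); otherwise bit 5 set means
           count is 33..64, so 16 extra bytes are copied from each end on top
           of the 16 bytes from each end that already cover 17..32.  */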
        sub     tmp1, count, 1
        ldp     A_l, A_h, [src]
        tbnz    tmp1, 6, L(copy96)
        ldp     D_l, D_h, [srcend, -16]
        tbz     tmp1, 5, 1f
        ldp     B_l, B_h, [src, 16]
        ldp     C_l, C_h, [srcend, -32]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstend, -32]
1:
        stp     A_l, A_h, [dstin]
        stp     D_l, D_h, [dstend, -16]
        ret

        .p2align 4
        /* Small copies: 0..16 bytes.  */
L(copy16):
        cmp     count, 8
        b.lo    1f
        ldr     A_l, [src]
        ldr     A_h, [srcend, -8]
        str     A_l, [dstin]
        str     A_h, [dstend, -8]
        ret
        .p2align 4
1:
        tbz     count, 2, 1f
        ldr     A_lw, [src]
        ldr     A_hw, [srcend, -4]
        str     A_lw, [dstin]
        str     A_hw, [dstend, -4]
        ret

        /* Copy 0..3 bytes.  Use a branchless sequence that copies the same
           byte 3 times if count==1, or the 2nd byte twice if count==2.  */
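        /* Roughly, in C (a sketch; mid is just an illustrative name):

             size_t mid = count >> 1;
             dst[0]         = src[0];
             dst[mid]       = src[mid];
             dst[count - 1] = src[count - 1];

           For count==1 all three stores hit byte 0, for count==2 the last
           two both hit byte 1, and for count==3 each byte is written once.
           The assembly performs all loads before any store, so overlapping
           buffers remain safe.  */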
1:
        cbz     count, 2f
        lsr     tmp1, count, 1
        ldrb    A_lw, [src]
        ldrb    A_hw, [srcend, -1]
        ldrb    B_lw, [src, tmp1]
        strb    A_lw, [dstin]
        strb    B_lw, [dstin, tmp1]
        strb    A_hw, [dstend, -1]
2:      ret

        .p2align 4
        /* Copy 64..96 bytes.  Copy 64 bytes from the start and
           32 bytes from the end.  */
L(copy96):
        ldp     B_l, B_h, [src, 16]
        ldp     C_l, C_h, [src, 32]
        ldp     D_l, D_h, [src, 48]
        ldp     E_l, E_h, [srcend, -32]
        ldp     F_l, F_h, [srcend, -16]
        stp     A_l, A_h, [dstin]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstin, 32]
        stp     D_l, D_h, [dstin, 48]
        stp     E_l, E_h, [dstend, -32]
        stp     F_l, F_h, [dstend, -16]
        ret

/* Align DST to 16 byte alignment so that we don't cross cache line
   boundaries on both loads and stores.  There are at least 96 bytes
   to copy, so copy 16 bytes unaligned and then align.  The loop
   copies 64 bytes per iteration and prefetches one iteration ahead.  */
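
/* The alignment works by rounding dst down to a 16-byte boundary and moving
   src back by the same amount (dstin & 15), so the source-to-destination
   distance is unchanged; the first aligned store at [dst, 16] then overlaps
   or abuts the 16 bytes already stored unaligned at dstin, so no byte of the
   destination is skipped.  */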

        .p2align 4
L(copy_long):

        /* On thunderx, large memcpys are helped by software prefetching.
           This loop is identical to the one below it except that it also
           issues prefetch instructions.  For copies of less than 32768
           bytes the prefetching does not help and actually slows the code
           down, so the prefetching loop is only used for the largest
           copies.  */
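
        /* The priming prfm below fetches roughly 384 bytes ahead of the
           adjusted src; inside the loop a prefetch is issued only on
           iterations where bit 6 of src is set, i.e. once per 128 bytes
           copied, reaching 512 bytes ahead.  PLDL1STRM marks the data as
           streaming (used once) rather than likely to be reused.  */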

        cmp     count, #32768
        b.lo    L(copy_long_without_prefetch)
        and     tmp1, dstin, 15
        bic     dst, dstin, 15
        ldp     D_l, D_h, [src]
        sub     src, src, tmp1
        prfm    pldl1strm, [src, 384]
        add     count, count, tmp1      /* Count is now 16 too large.  */
        ldp     A_l, A_h, [src, 16]
        stp     D_l, D_h, [dstin]
        ldp     B_l, B_h, [src, 32]
        ldp     C_l, C_h, [src, 48]
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 128 + 16  /* Test and readjust count.  */
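        /* 128 + 16 accounts for the 16 bytes of over-count added above, the
           64 bytes already loaded into A..D, and the 64 bytes that L(last64)
           always copies from the end, so the loop exits once at most 64
           bytes remain beyond what is already in registers.  The
           non-prefetching path below makes the same adjustment.  */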

L(prefetch_loop64):
        tbz     src, #6, 1f
        prfm    pldl1strm, [src, 512]
1:
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [src, 16]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [src, 32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [src, 48]
        stp     D_l, D_h, [dst, 64]!
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 64
        b.hi    L(prefetch_loop64)
        b       L(last64)

L(copy_long_without_prefetch):

        and     tmp1, dstin, 15
        bic     dst, dstin, 15
        ldp     D_l, D_h, [src]
        sub     src, src, tmp1
        add     count, count, tmp1      /* Count is now 16 too large.  */
        ldp     A_l, A_h, [src, 16]
        stp     D_l, D_h, [dstin]
        ldp     B_l, B_h, [src, 32]
        ldp     C_l, C_h, [src, 48]
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 128 + 16  /* Test and readjust count.  */
        b.ls    L(last64)
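        /* Skip the loop entirely when what remains is already covered by the
           64 bytes held in A..D plus the 64 bytes that L(last64) copies from
           the end.  */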
L(loop64):
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [src, 16]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [src, 32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [src, 48]
        stp     D_l, D_h, [dst, 64]!
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 64
        b.hi    L(loop64)

        /* Write the last full set of 64 bytes.  The remainder is at most 64
           bytes, so it is safe to always copy 64 bytes from the end even if
           there is just 1 byte left.  */
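        /* On entry A..D still hold the 64 bytes loaded by the final loop
           iteration (or by the setup code if the loop was skipped); they are
           written out below, interleaved with reloading the last 64 bytes of
           the source, which are then stored relative to dstend.  */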
L(last64):
        ldp     E_l, E_h, [srcend, -64]
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [srcend, -48]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [srcend, -32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [srcend, -16]
        stp     D_l, D_h, [dst, 64]
        stp     E_l, E_h, [dstend, -64]
        stp     A_l, A_h, [dstend, -48]
        stp     B_l, B_h, [dstend, -32]
        stp     C_l, C_h, [dstend, -16]
        ret

        .p2align 4
L(move_long):
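        /* tmp1 still holds dstin - src from the memmove entry; zero means
           the source and destination are identical, so there is nothing to
           copy.  */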
        cbz     tmp1, 3f

        add     srcend, src, count
        add     dstend, dstin, count

        /* Align dstend to 16 byte alignment so that we don't cross cache line
           boundaries on both loads and stores.  There are at least 96 bytes
           to copy, so copy 16 bytes unaligned and then align.  The loop
           copies 64 bytes per iteration and prefetches one iteration ahead.  */

        and     tmp1, dstend, 15
        ldp     D_l, D_h, [srcend, -16]
        sub     srcend, srcend, tmp1
        sub     count, count, tmp1
        ldp     A_l, A_h, [srcend, -16]
        stp     D_l, D_h, [dstend, -16]
        ldp     B_l, B_h, [srcend, -32]
        ldp     C_l, C_h, [srcend, -48]
        ldp     D_l, D_h, [srcend, -64]!
        sub     dstend, dstend, tmp1
        subs    count, count, 128
        b.ls    2f

        nop
1:
        stp     A_l, A_h, [dstend, -16]
        ldp     A_l, A_h, [srcend, -16]
        stp     B_l, B_h, [dstend, -32]
        ldp     B_l, B_h, [srcend, -32]
        stp     C_l, C_h, [dstend, -48]
        ldp     C_l, C_h, [srcend, -48]
        stp     D_l, D_h, [dstend, -64]!
        ldp     D_l, D_h, [srcend, -64]!
        subs    count, count, 64
        b.hi    1b

        /* Write the last full set of 64 bytes.  The remainder is at most 64
           bytes, so it is safe to always copy 64 bytes from the start even if
           there is just 1 byte left.  */
2:
        ldp     G_l, G_h, [src, 48]
        stp     A_l, A_h, [dstend, -16]
        ldp     A_l, A_h, [src, 32]
        stp     B_l, B_h, [dstend, -32]
        ldp     B_l, B_h, [src, 16]
        stp     C_l, C_h, [dstend, -48]
        ldp     C_l, C_h, [src]
        stp     D_l, D_h, [dstend, -64]
        stp     G_l, G_h, [dstin, 48]
        stp     A_l, A_h, [dstin, 32]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstin]
3:      ret

END (__memcpy_thunderx)