/* A Thunderx2 Optimized memcpy implementation for AARCH64.
   Copyright (C) 2018-2024 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define tmp2	x6
#define tmp3	x7
#define tmp3w	w7
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define A_hw	w7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	src
#define E_h	count
#define F_l	srcend
#define F_h	dst
#define G_l	count
#define G_h	dst
#define tmp1	x14

#define A_q	q0
#define B_q	q1
#define C_q	q2
#define D_q	q3
#define E_q	q4
#define F_q	q5
#define G_q	q6
#define H_q	q7
#define I_q	q16
#define J_q	q17

#define A_v	v0
#define B_v	v1
#define C_v	v2
#define D_v	v3
#define E_v	v4
#define F_v	v5
#define G_v	v6
#define H_v	v7
#define I_v	v16
#define J_v	v17

/* Overlapping large forward memmoves use a loop that copies backwards.
   Otherwise memcpy is used.  Small moves branch to memcopy16 directly.
   The longer memcpy cases fall through to the memcpy head.
*/
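
/* Illustrative C-like sketch of the dispatch performed by
   __memmove_thunderx2 below (not part of the build; the unsigned
   difference models the overlap test done with ccmp):

     if (count <= 16)
       goto memcopy16;                        // shared small-copy code
     if (count > 96 && (uint64_t) (dstin - src) < count)
       goto move_long;                        // forward overlap: copy backwards
     // otherwise fall through to __memcpy_thunderx2

   When count <= 96 the ccmp sets the C flag directly, so b.lo is not
   taken and the copy falls through to memcpy.  */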

ENTRY (__memmove_thunderx2)

	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	add	srcend, src, count
	cmp	count, 16
	b.ls	L(memcopy16)
	sub	tmp1, dstin, src
	cmp	count, 96
	ccmp	tmp1, count, 2, hi
	b.lo	L(move_long)

END (__memmove_thunderx2)


/* Copies are split into 3 main cases: small copies of up to 16 bytes,
   medium copies of 17..96 bytes, which are fully unrolled, and large
   copies of more than 96 bytes.  Large copies align the destination and
   use a load-and-merge approach when the src and dst addresses are not
   equally aligned, so that the actual loads and stores are always
   aligned.  Large copies use loops processing 64 bytes per iteration
   for the unaligned case and 128 bytes per iteration for the aligned
   one.
*/
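
/* Illustrative C-like sketch of the size dispatch below (not part of
   the build; "medium" stands for the fall-through 17..96 byte path):

     if (count <= 16)
       goto memcopy16;          // 0..16 bytes
     if (count > 96)
       goto memcopy_long;       // more than 96 bytes: align dst, wide loops
     goto medium;               // 17..96 bytes, fully unrolled
*/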

#define MEMCPY_PREFETCH_LDR 640

ENTRY (__memcpy_thunderx2)

	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	add	srcend, src, count
	cmp	count, 16
	b.ls	L(memcopy16)
	ldr	A_q, [src], #16
	add	dstend, dstin, count
	and	tmp1, src, 15
	cmp	count, 96
	b.hi	L(memcopy_long)

	/* Medium copies: 17..96 bytes.  */
	ldr	E_q, [srcend, -16]
	cmp	count, 64
	b.gt	L(memcpy_copy96)
	cmp	count, 48
	b.le	L(bytes_17_to_48)
	/* 49..64 bytes */
	ldp	B_q, C_q, [src]
	str	E_q, [dstend, -16]
	stp	A_q, B_q, [dstin]
	str	C_q, [dstin, 32]
	ret

L(bytes_17_to_48):
	/* 17..48 bytes */
	cmp	count, 32
	b.gt	L(bytes_32_to_48)
	/* 17..32 bytes */
	str	A_q, [dstin]
	str	E_q, [dstend, -16]
	ret

L(bytes_32_to_48):
	/* 33..48 bytes */
	ldr	B_q, [src]
	str	A_q, [dstin]
	str	E_q, [dstend, -16]
	str	B_q, [dstin, 16]
	ret

	.p2align 4
	/* Small copies: 0..16 bytes.  */
L(memcopy16):
	cmp	count, 8
	b.lo	L(bytes_0_to_8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	add	dstend, dstin, count
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret
	.p2align 4

L(bytes_0_to_8):
	tbz	count, 2, L(bytes_0_to_3)
	ldr	A_lw, [src]
	ldr	A_hw, [srcend, -4]
	add	dstend, dstin, count
	str	A_lw, [dstin]
	str	A_hw, [dstend, -4]
	ret

	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
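	/* C-like sketch of the branchless sequence below (illustrative
	   only; pointer names mirror the register aliases above):

	     if (count != 0)
	       {
	         size_t half = count >> 1;        // 0 if count == 1, else 1
	         unsigned char first = src[0];
	         unsigned char last  = src[count - 1];
	         unsigned char mid   = src[half];
	         dstin[half]      = mid;
	         dstin[count - 1] = last;
	         dstin[0]         = first;
	       }
	*/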
L(bytes_0_to_3):
	cbz	count, 1f
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	A_hw, [srcend, -1]
	add	dstend, dstin, count
	ldrb	B_lw, [src, tmp1]
	strb	B_lw, [dstin, tmp1]
	strb	A_hw, [dstend, -1]
	strb	A_lw, [dstin]
1:
	ret

	.p2align 4

L(memcpy_copy96):
	/* Copying 65..96 bytes.  A_q (first 16 bytes) and
	   E_q (last 16 bytes) are already loaded.  The size
	   is large enough to benefit from aligned loads.  */
	bic	src, src, 15
	ldp	B_q, C_q, [src]
	/* Loaded 64 bytes; the second 16-byte chunk can
	   overlap the first chunk by tmp1 bytes.
	   Stored 16 bytes.  */
	sub	dst, dstin, tmp1
	add	count, count, tmp1
	/* The range of count, [65..96], becomes [65..111]
	   after tmp1 [0..15] is added to it; count is now
	   <bytes-left-to-load> + 48.  */
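	/* For example, count = 96 with tmp1 = 15 becomes 111; the loads of
	   A_q, B_q and C_q so far cover 48 - tmp1 = 33 distinct source
	   bytes, so 96 - 33 = 63 bytes remain to be loaded, i.e. count - 48.  */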
	cmp	count, 80
	b.gt	L(copy96_medium)
	ldr	D_q, [src, 32]
	stp	B_q, C_q, [dst, 16]
	str	D_q, [dst, 48]
	str	A_q, [dstin]
	str	E_q, [dstend, -16]
	ret

	.p2align 4
L(copy96_medium):
	ldp	D_q, G_q, [src, 32]
	cmp	count, 96
	b.gt	L(copy96_large)
	stp	B_q, C_q, [dst, 16]
	stp	D_q, G_q, [dst, 48]
	str	A_q, [dstin]
	str	E_q, [dstend, -16]
	ret

L(copy96_large):
	ldr	F_q, [src, 64]
	str	B_q, [dst, 16]
	stp	C_q, D_q, [dst, 32]
	stp	G_q, F_q, [dst, 64]
	str	A_q, [dstin]
	str	E_q, [dstend, -16]
	ret

	.p2align 4
L(memcopy_long):
	bic	src, src, 15
	ldp	B_q, C_q, [src], #32
	sub	dst, dstin, tmp1
	add	count, count, tmp1
	add	dst, dst, 16
	and	tmp1, dst, 15
	ldp	D_q, E_q, [src], #32
	str	A_q, [dstin]

	/* Already loaded 64+16 bytes.  Check whether at least 64 more
	   bytes are left to load; count - 80 is the number of bytes
	   still to be loaded.  */
	subs	count, count, 64+64+16
	b.lt	L(loop128_exit0)
	cmp	count, MEMCPY_PREFETCH_LDR + 64 + 32
	b.lt	L(loop128)
	cbnz	tmp1, L(dst_unaligned)
	sub	count, count, MEMCPY_PREFETCH_LDR + 64 + 32

	.p2align 4

L(loop128_prefetch):
	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR]
	ldp	F_q, G_q, [src], #32
	stp	B_q, C_q, [dst], #32
	ldp	H_q, I_q, [src], #32
	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR]
	ldp	B_q, C_q, [src], #32
	stp	D_q, E_q, [dst], #32
	ldp	D_q, E_q, [src], #32
	stp	F_q, G_q, [dst], #32
	stp	H_q, I_q, [dst], #32
	subs	count, count, 128
	b.ge	L(loop128_prefetch)

	add	count, count, MEMCPY_PREFETCH_LDR + 64 + 32
	.p2align 4
L(loop128):
	ldp	F_q, G_q, [src], #32
	ldp	H_q, I_q, [src], #32
	stp	B_q, C_q, [dst], #32
	stp	D_q, E_q, [dst], #32
	subs	count, count, 64
	b.lt	L(loop128_exit1)
	ldp	B_q, C_q, [src], #32
	ldp	D_q, E_q, [src], #32
	stp	F_q, G_q, [dst], #32
	stp	H_q, I_q, [dst], #32
	subs	count, count, 64
	b.ge	L(loop128)
L(loop128_exit0):
	ldp	F_q, G_q, [srcend, -64]
	ldp	H_q, I_q, [srcend, -32]
	stp	B_q, C_q, [dst], #32
	stp	D_q, E_q, [dst]
	stp	F_q, G_q, [dstend, -64]
	stp	H_q, I_q, [dstend, -32]
	ret
L(loop128_exit1):
	ldp	B_q, C_q, [srcend, -64]
	ldp	D_q, E_q, [srcend, -32]
	stp	F_q, G_q, [dst], #32
	stp	H_q, I_q, [dst]
	stp	B_q, C_q, [dstend, -64]
	stp	D_q, E_q, [dstend, -32]
	ret

L(dst_unaligned_tail):
	ldp	C_q, D_q, [srcend, -64]
	ldp	E_q, F_q, [srcend, -32]
	stp	A_q, B_q, [dst], #32
	stp	H_q, I_q, [dst], #16
	str	G_q, [dst, tmp1]
	stp	C_q, D_q, [dstend, -64]
	stp	E_q, F_q, [dstend, -32]
	ret

L(dst_unaligned):
	/* For the unaligned store case the code loads two
	   aligned chunks and then merges them using the ext
	   instruction.  This can be up to 30% faster than
	   the simple unaligned store access.

	   Current state: tmp1 = dst % 16; C_q, D_q, E_q
	   contain data yet to be stored; src and dst point
	   to the next data to be processed; A_q, B_q contain
	   data already stored earlier; count = bytes left to
	   be loaded, decremented by 64.

	   Control is passed here if at least 64 bytes are left
	   to be loaded.  The code does two aligned loads and then
	   extracts the trailing tmp1 bytes from the first register
	   and the leading (16-tmp1) bytes from the next register,
	   forming the value for the aligned store.

	   As the ext instruction can only have its index encoded
	   as an immediate, 15 code chunks process each possible
	   index value.  A computed goto is used to reach the
	   required chunk.  */
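
	/* C-like sketch of one merged 16-byte store for shift s = tmp1
	   (illustrative only; prev and next are two consecutive 16-byte
	   aligned source chunks, out is a 16-byte aligned destination):

	     // ext Vd.16b, prev, next, 16-s places the last s bytes of
	     // prev in the low lanes and the first 16-s bytes of next in
	     // the high lanes, so the little-endian store writes:
	     for (int i = 0; i < s; i++)
	       out[i] = prev[16 - s + i];
	     for (int i = 0; i < 16 - s; i++)
	       out[s + i] = next[i];
	*/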

	/* Store the 16 bytes to dst and align dst for further
	   operations; several bytes will be stored at this
	   address once more.  */

	ldp	F_q, G_q, [src], #32
	stp	B_q, C_q, [dst], #32
	bic	dst, dst, 15
	sub	count, count, 32
	adrp	tmp2, L(ext_table)
	add	tmp2, tmp2, :lo12:L(ext_table)
	add	tmp2, tmp2, tmp1, LSL #2
	ldr	tmp3w, [tmp2]
	add	tmp2, tmp2, tmp3w, SXTW
	br	tmp2
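
	/* C-like sketch of the computed goto above (illustrative only;
	   ext_table is the table of 32-bit offsets defined at the end of
	   this file):

	     int32_t off = ext_table[tmp1];            // L(ext_size_tmp1) - .
	     goto *((char *) &ext_table[tmp1] + off);  // GCC computed goto
	*/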

.p2align 4
	/* To make the loop in each chunk 16-byte aligned.  */
	nop
#define EXT_CHUNK(shft) \
L(ext_size_ ## shft):;\
	ext	A_v.16b, C_v.16b, D_v.16b, 16-shft;\
	ext	B_v.16b, D_v.16b, E_v.16b, 16-shft;\
	ext	H_v.16b, E_v.16b, F_v.16b, 16-shft;\
1:;\
	stp	A_q, B_q, [dst], #32;\
	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR];\
	ldp	C_q, D_q, [src], #32;\
	ext	I_v.16b, F_v.16b, G_v.16b, 16-shft;\
	stp	H_q, I_q, [dst], #32;\
	ext	A_v.16b, G_v.16b, C_v.16b, 16-shft;\
	ext	B_v.16b, C_v.16b, D_v.16b, 16-shft;\
	ldp	F_q, G_q, [src], #32;\
	ext	H_v.16b, D_v.16b, F_v.16b, 16-shft;\
	subs	count, count, 64;\
	b.ge	1b;\
2:;\
	ext	I_v.16b, F_v.16b, G_v.16b, 16-shft;\
	b	L(dst_unaligned_tail);

EXT_CHUNK(1)
EXT_CHUNK(2)
EXT_CHUNK(3)
EXT_CHUNK(4)
EXT_CHUNK(5)
EXT_CHUNK(6)
EXT_CHUNK(7)
EXT_CHUNK(8)
EXT_CHUNK(9)
EXT_CHUNK(10)
EXT_CHUNK(11)
EXT_CHUNK(12)
EXT_CHUNK(13)
EXT_CHUNK(14)
EXT_CHUNK(15)

L(move_long):
	.p2align 4
1:
	cbz	tmp1, 3f

	add	srcend, src, count
	add	dstend, dstin, count

	and	tmp1, srcend, 15
	ldr	D_q, [srcend, -16]
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_q, B_q, [srcend, -32]
	str	D_q, [dstend, -16]
	ldp	C_q, D_q, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	2f

	.p2align 4
1:
	subs	count, count, 64
	stp	A_q, B_q, [dstend, -32]
	ldp	A_q, B_q, [srcend, -32]
	stp	C_q, D_q, [dstend, -64]!
	ldp	C_q, D_q, [srcend, -64]!
	b.hi	1b

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the start even if
	   there is just 1 byte left.  */
2:
	ldp	E_q, F_q, [src, 32]
	ldp	G_q, H_q, [src]
	stp	A_q, B_q, [dstend, -32]
	stp	C_q, D_q, [dstend, -64]
	stp	E_q, F_q, [dstin, 32]
	stp	G_q, H_q, [dstin]
3:	ret


END (__memcpy_thunderx2)
	.section .rodata
	.p2align 4

L(ext_table):
	/* The first entry is for the alignment of 0 and is never
	   actually used (could be any value).  */
	.word	0
	.word	L(ext_size_1) -.
	.word	L(ext_size_2) -.
	.word	L(ext_size_3) -.
	.word	L(ext_size_4) -.
	.word	L(ext_size_5) -.
	.word	L(ext_size_6) -.
	.word	L(ext_size_7) -.
	.word	L(ext_size_8) -.
	.word	L(ext_size_9) -.
	.word	L(ext_size_10) -.
	.word	L(ext_size_11) -.
	.word	L(ext_size_12) -.
	.word	L(ext_size_13) -.
	.word	L(ext_size_14) -.
	.word	L(ext_size_15) -.

