/* Optimized memcpy for SVE.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
 *
 */

#define dstin   x0
#define src     x1
#define count   x2
#define dst     x3
#define srcend  x4
#define dstend  x5
#define tmp1    x6
#define vlen    x6
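
/* Note: tmp1 and vlen deliberately alias x6.  vlen is only used on the
   small-copy path and tmp1 only on the large-copy paths, so the two are
   never live at the same time.  */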

#define A_q     q0
#define B_q     q1
#define C_q     q2
#define D_q     q3
#define E_q     q4
#define F_q     q5
#define G_q     q6
#define H_q     q7

/* This implementation supports both memcpy and memmove and shares most code.
   It uses unaligned accesses and branchless sequences to keep the code small
   and simple and to improve performance.

   Copies are split into 3 main cases: small copies of up to two SVE vectors
   (at least 32 bytes), medium copies of up to 128 bytes, and large copies.
   The overhead of the overlap check in memmove is negligible since it is
   only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per
   iteration.  The source pointer is 16-byte aligned to minimize unaligned
   accesses.  The loop tail is handled by always copying 64 bytes from the
   end.  */
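
/* For illustration only: a rough C sketch of the dispatch described above.
   The helper names are hypothetical and simply label the code paths below;
   svcntb () is the ACLE intrinsic returning the SVE vector length in bytes.

     if (n > 128)
       return copy_long (dst, src, n);     // pipelined 64-byte loop
     if (n > 2 * svcntb ())
       return copy32_128 (dst, src, n);    // overlapping 16-byte pairs
     return copy_small (dst, src, n);      // single predicated SVE copy
 */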

#if HAVE_AARCH64_SVE_ASM

        .arch armv8.2-a+sve

ENTRY (__memcpy_sve)
        PTR_ARG (0)
        PTR_ARG (1)
        SIZE_ARG (2)

        cmp     count, 128
        b.hi    L(copy_long)
        cntb    vlen
        cmp     count, vlen, lsl 1
        b.hi    L(copy32_128)
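        /* Small copies: 0..2*VL bytes.  The WHILELO predicates enable only
           byte lanes below count, so the two predicated loads and stores
           copy exactly count bytes without branches; for count == 0 both
           predicates are all-false and nothing is written.  */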
        whilelo p0.b, xzr, count
        whilelo p1.b, vlen, count
        ld1b    z0.b, p0/z, [src, 0, mul vl]
        ld1b    z1.b, p1/z, [src, 1, mul vl]
        st1b    z0.b, p0, [dstin, 0, mul vl]
        st1b    z1.b, p1, [dstin, 1, mul vl]
        ret

        /* Medium copies: 33..128 bytes.  */
L(copy32_128):
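        /* Load the first 32 and the last 32 bytes up front: for 33..64 byte
           copies these two blocks overlap and cover the whole buffer, while
           for larger medium copies they form the head and tail around
           L(copy128).  */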
        add     srcend, src, count
        add     dstend, dstin, count
        ldp     A_q, B_q, [src]
        ldp     C_q, D_q, [srcend, -32]
        cmp     count, 64
        b.hi    L(copy128)
        stp     A_q, B_q, [dstin]
        stp     C_q, D_q, [dstend, -32]
        ret

        /* Copy 65..128 bytes.  */
L(copy128):
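        /* For 65..96 bytes the first 64 bytes (A..F) plus the last 32 (C, D)
           overlap and cover the buffer; for 97..128 bytes G and H also copy
           the last 64 bytes.  */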
        ldp     E_q, F_q, [src, 32]
        cmp     count, 96
        b.ls    L(copy96)
        ldp     G_q, H_q, [srcend, -64]
        stp     G_q, H_q, [dstend, -64]
L(copy96):
        stp     A_q, B_q, [dstin]
        stp     E_q, F_q, [dstin, 32]
        stp     C_q, D_q, [dstend, -32]
        ret

        .p2align 4
        /* Copy more than 128 bytes.  */
L(copy_long):
        add     srcend, src, count
        add     dstend, dstin, count

        /* Copy 16 bytes and then align src to 16-byte alignment.  */
        ldr     D_q, [src]
        and     tmp1, src, 15
        bic     src, src, 15
        sub     dst, dstin, tmp1
        add     count, count, tmp1      /* Count is now 16 too large.  */
        ldp     A_q, B_q, [src, 16]
        str     D_q, [dstin]
        ldp     C_q, D_q, [src, 48]
        subs    count, count, 128 + 16  /* Test and readjust count.  */
        b.ls    L(copy64_from_end)
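        /* Software pipelined loop: A_q..D_q already hold the next 64 bytes,
           so each iteration stores the current 64 bytes while loading the
           following 64, keeping the loads one iteration ahead of the
           stores.  */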
L(loop64):
        stp     A_q, B_q, [dst, 16]
        ldp     A_q, B_q, [src, 80]
        stp     C_q, D_q, [dst, 48]
        ldp     C_q, D_q, [src, 112]
        add     src, src, 64
        add     dst, dst, 64
        subs    count, count, 64
        b.hi    L(loop64)

        /* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
        ldp     E_q, F_q, [srcend, -64]
        stp     A_q, B_q, [dst, 16]
        ldp     A_q, B_q, [srcend, -32]
        stp     C_q, D_q, [dst, 48]
        stp     E_q, F_q, [dstend, -64]
        stp     A_q, B_q, [dstend, -32]
        ret

END (__memcpy_sve)


ENTRY (__memmove_sve)
        PTR_ARG (0)
        PTR_ARG (1)
        SIZE_ARG (2)

        cmp     count, 128
        b.hi    L(move_long)
        cntb    vlen
        cmp     count, vlen, lsl 1
        b.hi    L(copy32_128)
        whilelo p0.b, xzr, count
        whilelo p1.b, vlen, count
        ld1b    z0.b, p0/z, [src, 0, mul vl]
        ld1b    z1.b, p1/z, [src, 1, mul vl]
        st1b    z0.b, p0, [dstin, 0, mul vl]
        st1b    z1.b, p1, [dstin, 1, mul vl]
        ret

        .p2align 4
L(move_long):
        add     srcend, src, count
        add     dstend, dstin, count
        /* Only use backward copy if there is an overlap.  */
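        /* dstin - src is compared as an unsigned value: when the destination
           is below the source the subtraction wraps to a large value, so the
           b.hs branch takes the forwards copy both when dst < src and when
           the regions are at least count bytes apart.  */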
        sub     tmp1, dstin, src
        cbz     tmp1, L(return)
        cmp     tmp1, count
        b.hs    L(copy_long)

        /* Large backwards copy for overlapping copies.
           Copy 16 bytes and then align srcend to 16-byte alignment.  */
        ldr     D_q, [srcend, -16]
        and     tmp1, srcend, 15
        bic     srcend, srcend, 15
        sub     count, count, tmp1
        ldp     A_q, B_q, [srcend, -32]
        str     D_q, [dstend, -16]
        ldp     C_q, D_q, [srcend, -64]
        sub     dstend, dstend, tmp1
        subs    count, count, 128
        b.ls    L(copy64_from_start)

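        /* Backwards pipelined loop, mirroring L(loop64): each iteration
           stores 64 bytes while loading the next 64 from lower addresses;
           the pre-index writeback on the last store decrements dstend
           by 64.  */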
L(loop64_backwards):
        str     B_q, [dstend, -16]
        str     A_q, [dstend, -32]
        ldp     A_q, B_q, [srcend, -96]
        str     D_q, [dstend, -48]
        str     C_q, [dstend, -64]!
        ldp     C_q, D_q, [srcend, -128]
        sub     srcend, srcend, 64
        subs    count, count, 64
        b.hi    L(loop64_backwards)

        /* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
        ldp     E_q, F_q, [src, 32]
        stp     A_q, B_q, [dstend, -32]
        ldp     A_q, B_q, [src]
        stp     C_q, D_q, [dstend, -64]
        stp     E_q, F_q, [dstin, 32]
        stp     A_q, B_q, [dstin]
L(return):
        ret

END (__memmove_sve)
#endif