/* Generic optimized memcpy using SIMD.
   Copyright (C) 2012-2024 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 *
 */

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_lw	w10
#define tmp1	x14

#define A_q	q0
#define B_q	q1
#define C_q	q2
#define D_q	q3
#define E_q	q4
#define F_q	q5
#define G_q	q6
#define H_q	q7

#ifndef MEMMOVE
# define MEMMOVE memmove
#endif
#ifndef MEMCPY
# define MEMCPY memcpy
#endif

/* This implementation supports both memcpy and memmove and shares most code.
   It uses unaligned accesses and branchless sequences to keep the code small
   and simple and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check in memmove is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per
   iteration.  The destination pointer is 16-byte aligned to minimize
   unaligned accesses.  The loop tail is handled by always copying 64 bytes
   from the end.  */
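
/* As a rough illustration only, the size dispatch below behaves like the
   following C model for the non-overlapping (memcpy) case.  The name
   sketch_memcpy and the chunked memcpy calls are stand-ins for the SIMD
   q-register load/store pairs, and the >128-byte branch omits the source
   alignment and software pipelining done by L(copy_long):

     #include <stddef.h>
     #include <string.h>

     static void *
     sketch_memcpy (void *dstin, const void *srcin, size_t count)
     {
       unsigned char *dst = dstin;
       const unsigned char *src = srcin;

       if (count > 128)
         {
           // Large: 64-byte blocks from the front, then the last 64 bytes
           // taken from the end so no tail loop is needed.
           for (size_t i = 0; i + 64 < count; i += 64)
             memcpy (dst + i, src + i, 64);
           memcpy (dst + count - 64, src + count - 64, 64);
         }
       else if (count > 32)
         {
           // Medium: 32 bytes from each end, plus the middle when needed.
           memcpy (dst, src, 32);
           if (count > 64)
             memcpy (dst + 32, src + 32, 32);
           if (count > 96)
             memcpy (dst + count - 64, src + count - 64, 32);
           memcpy (dst + count - 32, src + count - 32, 32);
         }
       else if (count >= 16)
         {
           memcpy (dst, src, 16);
           memcpy (dst + count - 16, src + count - 16, 16);
         }
       else if (count >= 8)
         {
           memcpy (dst, src, 8);
           memcpy (dst + count - 8, src + count - 8, 8);
         }
       else if (count >= 4)
         {
           memcpy (dst, src, 4);
           memcpy (dst + count - 4, src + count - 4, 4);
         }
       else if (count > 0)
         {
           // 0..3 bytes: first, middle and last byte (offsets may coincide).
           dst[0] = src[0];
           dst[count >> 1] = src[count >> 1];
           dst[count - 1] = src[count - 1];
         }
       return dstin;
     }

   The same small and medium paths also serve memmove because every block
   is loaded into registers before any of it is stored.  */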

ENTRY (MEMCPY)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small copies: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
	ldr	A_q, [src]
	ldr	B_q, [srcend, -16]
	str	A_q, [dstin]
	str	B_q, [dstend, -16]
	ret

	/* Copy 8-15 bytes.  */
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	/* Copy 4-7 bytes.  */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence.  */
L(copy4):
	cbz	count, L(copy0)
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret
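
/* The 0..3 byte sequence above needs only one branch (the cbz for a zero
   count): it always moves the first byte, the byte at offset count/2 and
   the last byte, and for counts of 1 or 2 those offsets simply coincide.
   All three bytes are loaded before any store, so the sequence is also
   safe for overlapping memmove.  A hedged C equivalent (copy_0_3 is a
   hypothetical name):

     #include <stddef.h>

     static void
     copy_0_3 (unsigned char *dst, const unsigned char *src, size_t count)
     {
       if (count == 0)
         return;                          // mirrors the cbz
       unsigned char a = src[0];          // first byte
       unsigned char c = src[count - 1];  // last byte
       unsigned char b = src[count >> 1]; // middle byte
       dst[0] = a;
       dst[count >> 1] = b;
       dst[count - 1] = c;
     }  */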

	.p2align 4
	/* Medium copies: 33..128 bytes.  */
L(copy32_128):
	ldp	A_q, B_q, [src]
	ldp	C_q, D_q, [srcend, -32]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_q, B_q, [dstin]
	stp	C_q, D_q, [dstend, -32]
	ret

	.p2align 4
	/* Copy 65..128 bytes.  */
L(copy128):
	ldp	E_q, F_q, [src, 32]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_q, H_q, [srcend, -64]
	stp	G_q, H_q, [dstend, -64]
L(copy96):
	stp	A_q, B_q, [dstin]
	stp	E_q, F_q, [dstin, 32]
	stp	C_q, D_q, [dstend, -32]
	ret
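
/* For 65..128 bytes everything fits in the eight q-registers: 32 bytes
   from each end were already loaded at L(copy32_128), 32 more are loaded
   from offset 32 above, and the remaining pair (G_q/H_q) is only needed
   when the count exceeds 96.  A hedged C outline for the non-overlapping
   case (copy_65_128 is a hypothetical name):

     #include <stddef.h>
     #include <string.h>

     static void
     copy_65_128 (unsigned char *dst, const unsigned char *src, size_t count)
     {
       memcpy (dst, src, 32);                             // A_q/B_q
       memcpy (dst + 32, src + 32, 32);                   // E_q/F_q
       if (count > 96)
         memcpy (dst + count - 64, src + count - 64, 32); // G_q/H_q
       memcpy (dst + count - 32, src + count - 32, 32);   // C_q/D_q
     }  */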

	/* Align loop64 below to 16 bytes.  */
	nop

	/* Copy more than 128 bytes.  */
L(copy_long):
	/* Copy 16 bytes and then align src to 16-byte alignment.  */
	ldr	D_q, [src]
	and	tmp1, src, 15
	bic	src, src, 15
	sub	dst, dstin, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_q, B_q, [src, 16]
	str	D_q, [dstin]
	ldp	C_q, D_q, [src, 48]
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)
L(loop64):
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [src, 80]
	stp	C_q, D_q, [dst, 48]
	ldp	C_q, D_q, [src, 112]
	add	src, src, 64
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
	ldp	E_q, F_q, [srcend, -64]
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [srcend, -32]
	stp	C_q, D_q, [dst, 48]
	stp	E_q, F_q, [dstend, -64]
	stp	A_q, B_q, [dstend, -32]
	ret
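
/* A hedged C model of this >128-byte forward path, assuming the buffers
   do not overlap: copy the first 16 bytes unaligned, round src down to a
   16-byte boundary and move dst down by the same amount so the two stay
   in step, copy aligned 64-byte blocks, and finish with an unconditional
   64-byte copy from the end that absorbs the tail.  The chunked memcpy
   calls stand in for the q-register pairs, and the software pipelining
   (each block is loaded one iteration before it is stored) is not
   modelled:

     #include <stddef.h>
     #include <stdint.h>
     #include <string.h>

     static void
     copy_long_c (unsigned char *dstin, const unsigned char *srcin,
                  size_t count)
     {
       const unsigned char *src = srcin;
       const unsigned char *srcend = src + count;
       unsigned char *dstend = dstin + count;

       memcpy (dstin, src, 16);             // D_q: unaligned head
       size_t tmp1 = (uintptr_t) src & 15;  // misalignment of src
       src -= tmp1;                         // bic src, src, 15
       unsigned char *dst = dstin - tmp1;   // dst + k mirrors src + k
       count += tmp1;                       // count is now 16 too large

       size_t off = 16;                     // first aligned 64-byte block
       while (off + 64 < count)
         {
           memcpy (dst + off, src + off, 64);
           off += 64;
         }
       memcpy (dstend - 64, srcend - 64, 64); // always 64 bytes from the end
     }

   The register version also serves forward memmove (reached from
   L(move_long)) because each 64-byte block is fully loaded before any of
   it is stored.  */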

END (MEMCPY)
libc_hidden_builtin_def (MEMCPY)


ENTRY (MEMMOVE)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(move_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small moves: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
	ldr	A_q, [src]
	ldr	B_q, [srcend, -16]
	str	A_q, [dstin]
	str	B_q, [dstend, -16]
	ret

L(move_long):
	/* Only use backwards copy if there is an overlap.  */
	sub	tmp1, dstin, src
	cbz	tmp1, L(move0)
	cmp	tmp1, count
	b.hs	L(copy_long)
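
/* The three instructions above fold the whole overlap analysis into one
   unsigned comparison: dstin - src only lands in [1, count) when the
   destination starts above the source by less than count bytes, which is
   the only case where a forward copy would overwrite source bytes before
   reading them.  If dstin is below src the subtraction wraps to a huge
   unsigned value and the forward path is taken.  A hedged C rendering
   (needs_backwards_copy is a hypothetical name):

     #include <stddef.h>
     #include <stdint.h>

     static int
     needs_backwards_copy (const void *dstin, const void *src, size_t count)
     {
       uintptr_t diff = (uintptr_t) dstin - (uintptr_t) src;
       if (diff == 0)
         return 0;           // dst == src: nothing to do (L(move0))
       return diff < count;  // wraps when dst < src, so this stays false
     }  */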

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align srcend to 16-byte alignment.  */
L(copy_long_backwards):
	ldr	D_q, [srcend, -16]
	and	tmp1, srcend, 15
	bic	srcend, srcend, 15
	sub	count, count, tmp1
	ldp	A_q, B_q, [srcend, -32]
	str	D_q, [dstend, -16]
	ldp	C_q, D_q, [srcend, -64]
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

L(loop64_backwards):
	str	B_q, [dstend, -16]
	str	A_q, [dstend, -32]
	ldp	A_q, B_q, [srcend, -96]
	str	D_q, [dstend, -48]
	str	C_q, [dstend, -64]!
	ldp	C_q, D_q, [srcend, -128]
	sub	srcend, srcend, 64
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
	ldp	E_q, F_q, [src, 32]
	stp	A_q, B_q, [dstend, -32]
	ldp	A_q, B_q, [src]
	stp	C_q, D_q, [dstend, -64]
	stp	E_q, F_q, [dstin, 32]
	stp	A_q, B_q, [dstin]
L(move0):
	ret
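
/* A hedged C model of this backwards path.  The assembly keeps the data
   it is about to overwrite safe by loading each block one iteration ahead
   and by the load/store ordering in L(copy64_from_start); the model below
   instead captures the unaligned last 16 bytes and the first 64 bytes up
   front, which is simpler to read but not how the code above is
   scheduled.  chunk_move and move_long_backwards_c are hypothetical
   names; chunk_move loads a whole block before storing it, as the
   q-registers do, so a block may overlap its own destination:

     #include <stddef.h>
     #include <stdint.h>
     #include <string.h>

     static void
     chunk_move (unsigned char *d, const unsigned char *s, size_t n)
     {
       unsigned char t[64];
       memcpy (t, s, n);  // read the whole block first
       memcpy (d, t, n);  // then write it, so the block may overlap itself
     }

     static void
     move_long_backwards_c (unsigned char *dstin, const unsigned char *src,
                            size_t count)
     {
       unsigned char *dstend = dstin + count;
       const unsigned char *srcend = src + count;
       unsigned char head[64], tail[16];

       memcpy (head, src, 64);          // first 64 source bytes, written last
       memcpy (tail, srcend - 16, 16);  // unaligned last 16 source bytes

       size_t tmp1 = (uintptr_t) srcend & 15;
       const unsigned char *srcend_a = srcend - tmp1;  // aligned source end
       unsigned char *dstend_a = dstend - tmp1;
       size_t n = count - tmp1;

       // Aligned 64-byte blocks, walking down from the end so stores never
       // clobber source bytes that a later block still has to read.
       size_t off = 0;
       while (off + 64 < n)
         {
           chunk_move (dstend_a - off - 64, srcend_a - off - 64, 64);
           off += 64;
         }

       memcpy (dstend - 16, tail, 16);  // last 16 bytes
       memcpy (dstin, head, 64);        // always 64 bytes from the start
     }  */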

END (MEMMOVE)
libc_hidden_builtin_def (MEMMOVE)