/*
 * memcpy - copy memory area
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 *
 */

#include "../asmdefs.h"

#define dstin   x0
#define src     x1
#define count   x2
#define dst     x3
#define srcend  x4
#define dstend  x5
#define A_l     x6
#define A_lw    w6
#define A_h     x7
#define B_l     x8
#define B_lw    w8
#define B_h     x9
#define C_lw    w10
#define tmp1    x14

#define A_q     q0
#define B_q     q1
#define C_q     q2
#define D_q     q3
#define E_q     q4
#define F_q     q5
#define G_q     q6
#define H_q     q7

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small and simple and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software-pipelined loop processing 64 bytes per
   iteration.  The source pointer is 16-byte aligned to minimize unaligned
   accesses.  The loop tail is handled by always copying 64 bytes from the
   end.
*/
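
/* Rough shape of the size dispatch below (informal summary; the labelled
   blocks are the definitive code):

     count <= 32:   two overlapping accesses of the widest size that fits
                    (16/8/4 bytes), one from each end; 0..3 bytes use a
                    branchless byte sequence.
     count <= 128:  32 bytes from each end, plus one or two extra 32-byte
                    blocks for copies longer than 64 bytes.
     count >  128:  overlap check, then a 16-byte-aligned, software-pipelined
                    64-byte loop, finished by copying 64 bytes from the far
                    end (or from the start in the backwards case).
*/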

ENTRY (__memcpy_aarch64_simd)
ENTRY_ALIAS (__memmove_aarch64_simd)
        add     srcend, src, count
        add     dstend, dstin, count
        cmp     count, 128
        b.hi    L(copy_long)
        cmp     count, 32
        b.hi    L(copy32_128)

        /* Small copies: 0..32 bytes. */
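        /* 16..32 bytes: load 16 bytes from the start and 16 bytes from the
           end; the two stores overlap by 32 - count bytes, which is harmless
           because both loads complete before either store.  Shorter copies
           branch to L(copy16).  */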
        cmp     count, 16
        b.lo    L(copy16)
        ldr     A_q, [src]
        ldr     B_q, [srcend, -16]
        str     A_q, [dstin]
        str     B_q, [dstend, -16]
        ret

        /* Copy 8-15 bytes. */
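        /* Bit 3 of count separates 8..15 from 0..7: if it is clear the copy
           branches to L(copy8), otherwise two possibly overlapping 8-byte
           accesses, one from each end, cover the whole range.  */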
L(copy16):
        tbz     count, 3, L(copy8)
        ldr     A_l, [src]
        ldr     A_h, [srcend, -8]
        str     A_l, [dstin]
        str     A_h, [dstend, -8]
        ret

        .p2align 3
        /* Copy 4-7 bytes. */
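        /* Same scheme one size down: bit 2 selects between 4..7 bytes (two
           overlapping 4-byte word accesses) and 0..3 bytes.  */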
L(copy8):
        tbz     count, 2, L(copy4)
        ldr     A_lw, [src]
        ldr     B_lw, [srcend, -4]
        str     A_lw, [dstin]
        str     B_lw, [dstend, -4]
        ret

        /* Copy 0..3 bytes using a branchless sequence. */
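        /* With tmp1 = count >> 1, the three byte copies below touch offsets:
             count == 1: 0, 0, 0   (all the same byte)
             count == 2: 0, 1, 1
             count == 3: 0, 1, 2
           so every byte is written without a branch per length.  */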
L(copy4):
        cbz     count, L(copy0)
        lsr     tmp1, count, 1
        ldrb    A_lw, [src]
        ldrb    C_lw, [srcend, -1]
        ldrb    B_lw, [src, tmp1]
        strb    A_lw, [dstin]
        strb    B_lw, [dstin, tmp1]
        strb    C_lw, [dstend, -1]
L(copy0):
        ret

        .p2align 4
        /* Medium copies: 33..128 bytes. */
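        /* 32 bytes are loaded from each end up front.  For 33..64 bytes the
           two possibly overlapping 32-byte stores are all that is needed;
           longer copies continue at L(copy128).  */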
L(copy32_128):
        ldp     A_q, B_q, [src]
        ldp     C_q, D_q, [srcend, -32]
        cmp     count, 64
        b.hi    L(copy128)
        stp     A_q, B_q, [dstin]
        stp     C_q, D_q, [dstend, -32]
        ret

        .p2align 4
        /* Copy 65..128 bytes. */
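        /* A/B and C/D (loaded above) cover the first and last 32 bytes; E/F
           add bytes 32..63, which is enough for up to 96 bytes.  Copies of
           97..128 bytes also write G/H, the 32 bytes starting 64 bytes before
           the end.  */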
L(copy128):
        ldp     E_q, F_q, [src, 32]
        cmp     count, 96
        b.ls    L(copy96)
        ldp     G_q, H_q, [srcend, -64]
        stp     G_q, H_q, [dstend, -64]
L(copy96):
        stp     A_q, B_q, [dstin]
        stp     E_q, F_q, [dstin, 32]
        stp     C_q, D_q, [dstend, -32]
        ret

        /* Copy more than 128 bytes. */
L(copy_long):
        /* Use backwards copy if there is an overlap. */
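        /* A single unsigned compare does the overlap test: dstin - src < count
           holds exactly when dstin lies inside [src, src + count), i.e. when a
           forward copy would overwrite source bytes before reading them.  When
           dstin is below src the subtraction wraps to a large value, so the
           forward path is taken, which is safe for that direction of overlap. */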
        sub     tmp1, dstin, src
        cmp     tmp1, count
        b.lo    L(copy_long_backwards)

        /* Copy 16 bytes and then align src to 16-byte alignment. */
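        /* D_q holds the first 16 (possibly unaligned) bytes and is stored to
           dstin below.  src is rounded down to a 16-byte boundary and dst is
           moved back by the same amount (tmp1), so the fixed offsets used by
           the loop stay in step on both sides; count is increased by tmp1 to
           compensate before being tested against the loop threshold.  */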
        ldr     D_q, [src]
        and     tmp1, src, 15
        bic     src, src, 15
        sub     dst, dstin, tmp1
        add     count, count, tmp1      /* Count is now 16 too large. */
        ldp     A_q, B_q, [src, 16]
        str     D_q, [dstin]
        ldp     C_q, D_q, [src, 48]
        subs    count, count, 128 + 16  /* Test and readjust count. */
        b.ls    L(copy64_from_end)
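        /* Software-pipelined loop: each iteration stores the 64 bytes loaded
           previously (A/B at dst+16, C/D at dst+48) while fetching the next
           64 bytes from src+80 and src+112, overlapping loads with stores.  */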
L(loop64):
        stp     A_q, B_q, [dst, 16]
        ldp     A_q, B_q, [src, 80]
        stp     C_q, D_q, [dst, 48]
        ldp     C_q, D_q, [src, 112]
        add     src, src, 64
        add     dst, dst, 64
        subs    count, count, 64
        b.hi    L(loop64)

        /* Write the last iteration and copy 64 bytes from the end. */
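        /* A/B and C/D still hold the last 64 bytes loaded above; after storing
           them, an unconditional 64-byte copy from srcend-64 finishes the tail
           without needing its exact length.  */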
L(copy64_from_end):
        ldp     E_q, F_q, [srcend, -64]
        stp     A_q, B_q, [dst, 16]
        ldp     A_q, B_q, [srcend, -32]
        stp     C_q, D_q, [dst, 48]
        stp     E_q, F_q, [dstend, -64]
        stp     A_q, B_q, [dstend, -32]
        ret

        /* Large backwards copy for overlapping copies.
           Copy 16 bytes and then align srcend to 16-byte alignment. */
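        /* Mirror of the forward path.  tmp1 == 0 here means dstin == src, so
           there is nothing to copy.  The last 16 bytes go via D_q, srcend is
           rounded down to a 16-byte boundary, and count and dstend are reduced
           by the same amount before the downward loop.  */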
L(copy_long_backwards):
        cbz     tmp1, L(copy0)
        ldr     D_q, [srcend, -16]
        and     tmp1, srcend, 15
        bic     srcend, srcend, 15
        sub     count, count, tmp1
        ldp     A_q, B_q, [srcend, -32]
        str     D_q, [dstend, -16]
        ldp     C_q, D_q, [srcend, -64]
        sub     dstend, dstend, tmp1
        subs    count, count, 128
        b.ls    L(copy64_from_start)

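        /* Same software pipelining as L(loop64), but walking down from the
           end: stores use the data loaded on the previous pass while the next
           64 bytes are fetched from srcend-96 and srcend-128.  */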
L(loop64_backwards):
        stp     A_q, B_q, [dstend, -32]
        ldp     A_q, B_q, [srcend, -96]
        stp     C_q, D_q, [dstend, -64]
        ldp     C_q, D_q, [srcend, -128]
        sub     srcend, srcend, 64
        sub     dstend, dstend, 64
        subs    count, count, 64
        b.hi    L(loop64_backwards)

        /* Write the last iteration and copy 64 bytes from the start. */
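        /* Counterpart of L(copy64_from_end): after the final stores, the first
           64 bytes are copied unconditionally from src to dstin, covering
           whatever the downward loop did not reach.  */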
L(copy64_from_start):
        ldp     E_q, F_q, [src, 32]
        stp     A_q, B_q, [dstend, -32]
        ldp     A_q, B_q, [src]
        stp     C_q, D_q, [dstend, -64]
        stp     E_q, F_q, [dstin, 32]
        stp     A_q, B_q, [dstin]
        ret

END (__memcpy_aarch64_simd)


/* Source: libc/AOR_v20.02/string/aarch64/memcpy-advsimd.S */