//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains assembly-optimized implementations of Scalable Matrix
/// Extension (SME) compatible memcpy and memmove functions.
///
/// These implementations depend on unaligned access support.
///
/// Routines taken from libc/AOR_v20.02/string/aarch64.
///
//===----------------------------------------------------------------------===//

#include "../assembly.h"

//
// __arm_sc_memcpy / __arm_sc_memmove
//
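// Note: the code below uses only general-purpose loads and stores, so it
// should remain usable from SME streaming mode, which appears to be the
// reason a separate __arm_sc_* entry point is provided.
//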

#define dstin x0
#define src x1
#define count x2
#define dst x3
#define srcend1 x4
#define dstend1 x5
#define A_l x6
#define A_lw w6
#define A_h x7
#define B_l x8
#define B_lw w8
#define B_h x9
#define C_l x10
#define C_lw w10
#define C_h x11
#define D_l x12
#define D_h x13
#define E_l x14
#define E_h x15
#define F_l x16
#define F_h x17
#define G_l count
#define G_h dst
#define H_l src
#define H_h srcend1
#define tmp1 x14
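
/* G_l/G_h, H_l/H_h and tmp1 intentionally alias registers defined above
   (count, dst, src, srcend1 and x14/E_l); they are only used at points
   where the aliased value is not live. */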

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point. It uses unaligned accesses and branchless
   sequences to keep the code small and simple, and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies. The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per
   iteration. The destination pointer is 16-byte aligned to minimize
   unaligned accesses. The loop tail is handled by always copying 64 bytes
   from the end.
*/
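
/* Rough shape of the dispatch below (informal sketch, comment only):

     if (count > 128)       -> copy_long (label 7), with an overlap check:
                               if (dstin - src) < count, copy backwards
                               (label 10)
     else if (count > 32)   -> copy32_128 (label 4)
     else if (count >= 16)  -> copy first and last 16 bytes (may overlap)
     else                   -> copy16/copy8/copy4 small cases (labels 0-3)
*/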

DEFINE_COMPILERRT_FUNCTION(__arm_sc_memcpy)
  add srcend1, src, count
  add dstend1, dstin, count
  cmp count, 128
  b.hi 7f // copy_long
  cmp count, 32
  b.hi 4f // copy32_128

  /* Small copies: 0..32 bytes. */
  cmp count, 16
  b.lo 0f // copy16
  ldp A_l, A_h, [src]
  ldp D_l, D_h, [srcend1, -16]
  stp A_l, A_h, [dstin]
  stp D_l, D_h, [dstend1, -16]
  ret

  /* Copy 8-15 bytes. */
0: // copy16
  tbz count, 3, 1f // copy8
  ldr A_l, [src]
  ldr A_h, [srcend1, -8]
  str A_l, [dstin]
  str A_h, [dstend1, -8]
  ret

  .p2align 3
  /* Copy 4-7 bytes. */
1: // copy8
  tbz count, 2, 2f // copy4
  ldr A_lw, [src]
  ldr B_lw, [srcend1, -4]
  str A_lw, [dstin]
  str B_lw, [dstend1, -4]
  ret

  /* Copy 0..3 bytes using a branchless sequence. */
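  /* The three byte stores (first byte, byte at count/2, last byte) may
     overlap; together they cover every count in 1..3 without a branch. */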
2: // copy4
  cbz count, 3f // copy0
  lsr tmp1, count, 1
  ldrb A_lw, [src]
  ldrb C_lw, [srcend1, -1]
  ldrb B_lw, [src, tmp1]
  strb A_lw, [dstin]
  strb B_lw, [dstin, tmp1]
  strb C_lw, [dstend1, -1]
3: // copy0
  ret

  .p2align 4
  /* Medium copies: 33..128 bytes. */
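  /* The first 32 and last 32 bytes are loaded up front; for 33..64 bytes the
     two halves overlap as needed, so four stores cover the whole copy. */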
4: // copy32_128
  ldp A_l, A_h, [src]
  ldp B_l, B_h, [src, 16]
  ldp C_l, C_h, [srcend1, -32]
  ldp D_l, D_h, [srcend1, -16]
  cmp count, 64
  b.hi 5f // copy128
  stp A_l, A_h, [dstin]
  stp B_l, B_h, [dstin, 16]
  stp C_l, C_h, [dstend1, -32]
  stp D_l, D_h, [dstend1, -16]
  ret

  .p2align 4
  /* Copy 65..128 bytes. */
5: // copy128
  ldp E_l, E_h, [src, 32]
  ldp F_l, F_h, [src, 48]
  cmp count, 96
  b.ls 6f // copy96
  ldp G_l, G_h, [srcend1, -64]
  ldp H_l, H_h, [srcend1, -48]
  stp G_l, G_h, [dstend1, -64]
  stp H_l, H_h, [dstend1, -48]
6: // copy96
  stp A_l, A_h, [dstin]
  stp B_l, B_h, [dstin, 16]
  stp E_l, E_h, [dstin, 32]
  stp F_l, F_h, [dstin, 48]
  stp C_l, C_h, [dstend1, -32]
  stp D_l, D_h, [dstend1, -16]
  ret

  .p2align 4
  /* Copy more than 128 bytes. */
7: // copy_long
  /* Use backwards copy if there is an overlap. */
  sub tmp1, dstin, src
  cbz tmp1, 3b // copy0
  cmp tmp1, count
  b.lo 10f // copy_long_backwards

  /* Copy 16 bytes and then align dst to 16-byte alignment. */
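  /* dst is rounded down to a 16-byte boundary and src is moved back by the
     same amount, so the [dst, #16]/[src, #16] accesses below stay in
     lockstep; the first 16 (possibly unaligned) destination bytes are
     written from D. */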

  ldp D_l, D_h, [src]
  and tmp1, dstin, 15
  bic dst, dstin, 15
  sub src, src, tmp1
  add count, count, tmp1 /* Count is now 16 too large. */
  ldp A_l, A_h, [src, 16]
  stp D_l, D_h, [dstin]
  ldp B_l, B_h, [src, 32]
  ldp C_l, C_h, [src, 48]
  ldp D_l, D_h, [src, 64]!
  subs count, count, 128 + 16 /* Test and readjust count. */
  b.ls 9f // copy64_from_end
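  /* Software-pipelined loop: each iteration stores the 64 bytes loaded by
     the previous iteration while loading the next 64. */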
8: // loop64
  stp A_l, A_h, [dst, 16]
  ldp A_l, A_h, [src, 16]
  stp B_l, B_h, [dst, 32]
  ldp B_l, B_h, [src, 32]
  stp C_l, C_h, [dst, 48]
  ldp C_l, C_h, [src, 48]
  stp D_l, D_h, [dst, 64]!
  ldp D_l, D_h, [src, 64]!
  subs count, count, 64
  b.hi 8b // loop64

  /* Write the last iteration and copy 64 bytes from the end. */
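  /* The stores through dstend1 may overlap bytes already written via dst;
     rewriting them is harmless and avoids a variable-length tail. */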
9: // copy64_from_end
  ldp E_l, E_h, [srcend1, -64]
  stp A_l, A_h, [dst, 16]
  ldp A_l, A_h, [srcend1, -48]
  stp B_l, B_h, [dst, 32]
  ldp B_l, B_h, [srcend1, -32]
  stp C_l, C_h, [dst, 48]
  ldp C_l, C_h, [srcend1, -16]
  stp D_l, D_h, [dst, 64]
  stp E_l, E_h, [dstend1, -64]
  stp A_l, A_h, [dstend1, -48]
  stp B_l, B_h, [dstend1, -32]
  stp C_l, C_h, [dstend1, -16]
  ret

  .p2align 4

  /* Large backwards copy for overlapping copies.
     Copy 16 bytes and then align dst to 16-byte alignment. */
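  /* Mirror image of the forward case: the last 16 destination bytes are
     written from D first, then srcend1/count/dstend1 are pulled back by the
     same amount so that dstend1 is 16-byte aligned for the loop. */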
10: // copy_long_backwards
  ldp D_l, D_h, [srcend1, -16]
  and tmp1, dstend1, 15
  sub srcend1, srcend1, tmp1
  sub count, count, tmp1
  ldp A_l, A_h, [srcend1, -16]
  stp D_l, D_h, [dstend1, -16]
  ldp B_l, B_h, [srcend1, -32]
  ldp C_l, C_h, [srcend1, -48]
  ldp D_l, D_h, [srcend1, -64]!
  sub dstend1, dstend1, tmp1
  subs count, count, 128
  b.ls 12f // copy64_from_start

11: // loop64_backwards
  stp A_l, A_h, [dstend1, -16]
  ldp A_l, A_h, [srcend1, -16]
  stp B_l, B_h, [dstend1, -32]
  ldp B_l, B_h, [srcend1, -32]
  stp C_l, C_h, [dstend1, -48]
  ldp C_l, C_h, [srcend1, -48]
  stp D_l, D_h, [dstend1, -64]!
  ldp D_l, D_h, [srcend1, -64]!
  subs count, count, 64
  b.hi 11b // loop64_backwards

  /* Write the last iteration and copy 64 bytes from the start. */
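  /* As in the forward path, the first 64 bytes are always copied from the
     start (src to dstin), possibly overlapping stores already made via
     dstend1. */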
12: // copy64_from_start
  ldp G_l, G_h, [src, 48]
  stp A_l, A_h, [dstend1, -16]
  ldp A_l, A_h, [src, 32]
  stp B_l, B_h, [dstend1, -32]
  ldp B_l, B_h, [src, 16]
  stp C_l, C_h, [dstend1, -48]
  ldp C_l, C_h, [src]
  stp D_l, D_h, [dstend1, -64]
  stp G_l, G_h, [dstin, 48]
  stp A_l, A_h, [dstin, 32]
  stp B_l, B_h, [dstin, 16]
  stp C_l, C_h, [dstin]
  ret
END_COMPILERRT_FUNCTION(__arm_sc_memcpy)

DEFINE_COMPILERRT_FUNCTION_ALIAS(__arm_sc_memmove, __arm_sc_memcpy)