/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2012-2021, Arm Limited.
 *
 * Adapted from the original at:
 * https://github.com/ARM-software/optimized-routines/blob/afd6244a1f8d9229/string/aarch64/memcpy.S
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#define L(label) .L ## label

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_lw	w10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	x14
#define E_h	x15
#define F_l	x16
#define F_h	x17
#define G_l	count
#define G_h	dst
#define H_l	src
#define H_h	srcend
#define tmp1	x14

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point. It uses unaligned accesses and branchless
   sequences to keep the code small and simple and to improve performance.

   Copies are split into three main cases: small copies of up to 32 bytes,
   medium copies of up to 128 bytes, and large copies. The overhead of the
   overlap check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per
   iteration. The destination pointer is 16-byte aligned to minimize
   unaligned accesses. The loop tail is handled by always copying 64 bytes
   from the end.
*/
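
/*
 * Rough C-level outline of the size dispatch below, for illustration only.
 * The copy_small/copy_medium/copy_long helpers are hypothetical names for
 * the 0..32 byte, L(copy32_128) and L(copy_long) paths; the assembly falls
 * through between cases and keeps everything in registers instead:
 *
 *	void *memcpy(void *dstin, const void *src, size_t count)
 *	{
 *		if (count > 128)
 *			copy_long(dstin, src, count);
 *		else if (count > 32)
 *			copy_medium(dstin, src, count);
 *		else
 *			copy_small(dstin, src, count);
 *		return dstin;
 *	}
 */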

SYM_FUNC_START(__pi_memcpy)
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small copies: 0..32 bytes. */
	cmp	count, 16
	b.lo	L(copy16)
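	/*
	 * 16..32 bytes: copy the first and the last 16 bytes; for counts
	 * below 32 the two stores overlap in the middle, which is harmless
	 * since the overlapping bytes are identical.
	 */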
	ldp	A_l, A_h, [src]
	ldp	D_l, D_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	/* Copy 8-15 bytes. */
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	.p2align 3
	/* Copy 4-7 bytes. */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence. */
L(copy4):
	cbz	count, L(copy0)
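	/*
	 * 1..3 bytes: tmp1 = count >> 1 indexes the "middle" byte. For
	 * count 1 all three accesses hit byte 0, for count 2 they hit
	 * bytes 0/1/1 and for count 3 bytes 0/1/2, so every byte is
	 * covered without a branch on the exact count.
	 */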
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes. */
L(copy32_128):
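	/*
	 * Load the first 32 and the last 32 bytes up front; for 33..64
	 * bytes the four stores below cover the buffer with harmless
	 * overlap, otherwise fall through to L(copy128).
	 */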
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	ldp	D_l, D_h, [srcend, -16]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy 65..128 bytes. */
L(copy128):
	ldp	E_l, E_h, [src, 32]
	ldp	F_l, F_h, [src, 48]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_l, G_h, [srcend, -64]
	ldp	H_l, H_h, [srcend, -48]
	stp	G_l, G_h, [dstend, -64]
	stp	H_l, H_h, [dstend, -48]
L(copy96):
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	E_l, E_h, [dstin, 32]
	stp	F_l, F_h, [dstin, 48]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy more than 128 bytes. */
L(copy_long):
	/* Use backwards copy if there is an overlap. */
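	/*
	 * tmp1 = dstin - src; an unsigned compare against count catches
	 * dst inside (src, src + count), where a forward copy would
	 * overwrite source bytes before they have been read.
	 */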
	sub	tmp1, dstin, src
	cbz	tmp1, L(copy0)
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align dst to 16-byte alignment. */

	ldp	D_l, D_h, [src]
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large. */
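	/*
	 * dst is rounded down to 16 bytes and src rewound by the same
	 * amount so both advance in lockstep; the unaligned head is
	 * covered by the D_l/D_h copy from the original src below.
	 */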
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count. */
	b.ls	L(copy64_from_end)

L(loop64):
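	/*
	 * Software pipelined: each iteration stores the 64 bytes loaded on
	 * the previous iteration while loading the next 64, keeping the
	 * loads one iteration ahead of the stores.
	 */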
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end. */
L(copy64_from_end):
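	/*
	 * Store the 64 bytes already loaded, then copy the final 64 bytes
	 * from srcend/dstend; these stores may overlap bytes the loop has
	 * already written, which avoids a separate tail loop.
	 */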
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.p2align 4

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align dst to 16-byte alignment. */
L(copy_long_backwards):
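	/*
	 * Mirror image of the forward path: align dstend down to 16 bytes
	 * and walk both pointers backwards so source bytes are read before
	 * the overlapping destination stores can clobber them.
	 */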
	ldp	D_l, D_h, [srcend, -16]
	and	tmp1, dstend, 15
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

L(loop64_backwards):
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start. */
L(copy64_from_start):
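	/*
	 * Store the 64 bytes already loaded, then copy the first 64 bytes
	 * from src/dstin; as in the forward path, these stores may overlap
	 * bytes the loop has already written.
	 */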
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
	ret
SYM_FUNC_END(__pi_memcpy)

SYM_FUNC_ALIAS(__memcpy, __pi_memcpy)
EXPORT_SYMBOL(__memcpy)
SYM_FUNC_ALIAS_WEAK(memcpy, __memcpy)
EXPORT_SYMBOL(memcpy)

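/*
 * The overlap check in L(copy_long) makes the routine safe for overlapping
 * buffers, so memmove can simply alias memcpy.
 */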
SYM_FUNC_ALIAS(__pi_memmove, __pi_memcpy)

SYM_FUNC_ALIAS(__memmove, __pi_memmove)
EXPORT_SYMBOL(__memmove)
SYM_FUNC_ALIAS_WEAK(memmove, __memmove)
EXPORT_SYMBOL(memmove)