/* Generic optimized memcpy using SIMD.
   Copyright (C) 2012-2024 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 *
 */

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_lw	w10
#define tmp1	x14

#define A_q	q0
#define B_q	q1
#define C_q	q2
#define D_q	q3
#define E_q	q4
#define F_q	q5
#define G_q	q6
#define H_q	q7

#ifndef MEMMOVE
# define MEMMOVE memmove
#endif
#ifndef MEMCPY
# define MEMCPY memcpy
#endif
/* This implementation supports both memcpy and memmove and shares most code.
   It uses unaligned accesses and branchless sequences to keep the code small
   and simple, and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check in memmove is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per
   iteration.  The destination pointer is 16-byte aligned to minimize
   unaligned accesses.  The loop tail is handled by always copying 64 bytes
   from the end.  */
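
/* Rough C sketch of the size dispatch below (illustration only; the
   helper names are hypothetical, the real paths are implemented inline):

     if (count <= 32)
       copy_small (dstin, src, count);     // 0..32 bytes
     else if (count <= 128)
       copy_medium (dstin, src, count);    // 33..128 bytes
     else
       copy_large (dstin, src, count);     // pipelined 64-byte loop
*/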

ENTRY (MEMCPY)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small copies: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
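	/* 16..32 bytes: copy 16 bytes from the start and 16 bytes from the
	   end; the two accesses overlap when count < 32, which is harmless.  */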
	ldr	A_q, [src]
	ldr	B_q, [srcend, -16]
	str	A_q, [dstin]
	str	B_q, [dstend, -16]
	ret

	/* Copy 8-15 bytes.  */
L(copy16):
	tbz	count, 3, L(copy8)
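	/* count is 0..15 here, so bit 3 distinguishes 8..15 from 0..7.  The
	   two 8-byte accesses below always overlap on this path.  */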
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	/* Copy 4-7 bytes.  */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence.  */
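	/* The three byte stores below write bytes 0, count/2 and count-1:
	   for count = 1 that is 0,0,0; for count = 2 it is 0,1,1; for
	   count = 3 it is 0,1,2 - exactly bytes 0..count-1 in each case.  */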
L(copy4):
	cbz	count, L(copy0)
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes.  */
L(copy32_128):
	ldp	A_q, B_q, [src]
	ldp	C_q, D_q, [srcend, -32]
	cmp	count, 64
	b.hi	L(copy128)
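	/* 33..64 bytes: the first and last 32-byte blocks overlap when
	   count < 64, so together they cover the whole buffer.  */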
	stp	A_q, B_q, [dstin]
	stp	C_q, D_q, [dstend, -32]
	ret

	.p2align 4
	/* Copy 65..128 bytes.  */
L(copy128):
	ldp	E_q, F_q, [src, 32]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_q, H_q, [srcend, -64]
	stp	G_q, H_q, [dstend, -64]
L(copy96):
	stp	A_q, B_q, [dstin]
	stp	E_q, F_q, [dstin, 32]
	stp	C_q, D_q, [dstend, -32]
	ret

	/* Align loop64 below to 16 bytes.  */
	nop

	/* Copy more than 128 bytes.  */
L(copy_long):
	/* Copy 16 bytes and then align src to 16-byte alignment.  */
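	/* dst is pulled back by the same amount as src so that both advance
	   in lockstep; the gap this opens at dstin is already covered by the
	   unaligned D_q store, which the loop's first store overlaps.  */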
	ldr	D_q, [src]
	and	tmp1, src, 15
	bic	src, src, 15
	sub	dst, dstin, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_q, B_q, [src, 16]
	str	D_q, [dstin]
	ldp	C_q, D_q, [src, 48]
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)
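	/* Software-pipelined: each iteration stores the 64 bytes loaded on
	   the previous one while fetching the next 64 bytes.  */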
L(loop64):
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [src, 80]
	stp	C_q, D_q, [dst, 48]
	ldp	C_q, D_q, [src, 112]
	add	src, src, 64
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  */
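	/* The 64 bytes loaded from srcend may overlap the stores of the last
	   loop iteration, so the remaining tail is covered without a branch.  */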
L(copy64_from_end):
	ldp	E_q, F_q, [srcend, -64]
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [srcend, -32]
	stp	C_q, D_q, [dst, 48]
	stp	E_q, F_q, [dstend, -64]
	stp	A_q, B_q, [dstend, -32]
	ret

END (MEMCPY)
libc_hidden_builtin_def (MEMCPY)


ENTRY (MEMMOVE)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(move_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small moves: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
	ldr	A_q, [src]
	ldr	B_q, [srcend, -16]
	str	A_q, [dstin]
	str	B_q, [dstend, -16]
	ret

L(move_long):
	/* Only use backward copy if there is an overlap.  */
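	/* A single unsigned compare handles both non-overlap cases: if
	   dst - src (mod 2^64) >= count, either dst < src or
	   dst >= src + count, so a forward copy cannot clobber unread
	   source bytes and the memcpy path is safe.  */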
	sub	tmp1, dstin, src
	cbz	tmp1, L(move0)
	cmp	tmp1, count
	b.hs	L(copy_long)

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align srcend to 16-byte alignment.  */
L(copy_long_backwards):
	ldr	D_q, [srcend, -16]
	and	tmp1, srcend, 15
	bic	srcend, srcend, 15
	sub	count, count, tmp1
	ldp	A_q, B_q, [srcend, -32]
	str	D_q, [dstend, -16]
	ldp	C_q, D_q, [srcend, -64]
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

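	/* Mirror of loop64, running downwards; the pre-indexed store to
	   [dstend, -64]! also steps dstend down by 64 each iteration.  */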
L(loop64_backwards):
	str	B_q, [dstend, -16]
	str	A_q, [dstend, -32]
	ldp	A_q, B_q, [srcend, -96]
	str	D_q, [dstend, -48]
	str	C_q, [dstend, -64]!
	ldp	C_q, D_q, [srcend, -128]
	sub	srcend, srcend, 64
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
	ldp	E_q, F_q, [src, 32]
	stp	A_q, B_q, [dstend, -32]
	ldp	A_q, B_q, [src]
	stp	C_q, D_q, [dstend, -64]
	stp	E_q, F_q, [dstin, 32]
	stp	A_q, B_q, [dstin]
L(move0):
	ret

END (MEMMOVE)
libc_hidden_builtin_def (MEMMOVE)