/* Optimized memcpy for SVE.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
 *
 */

#define dstin   x0
#define src     x1
#define count   x2
#define dst     x3
#define srcend  x4
#define dstend  x5
#define tmp1    x6
#define vlen    x6
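
/* Note: tmp1 and vlen deliberately alias x6.  vlen is only used on the
   small-copy path and tmp1 only on the large-copy paths, so the two are
   never live at the same time.  */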

#define A_q     q0
#define B_q     q1
#define C_q     q2
#define D_q     q3
#define E_q     q4
#define F_q     q5
#define G_q     q6
#define H_q     q7

/* This implementation supports both memcpy and memmove and shares most code.
   It uses unaligned accesses and branchless sequences to keep the code small
   and simple and to improve performance.

   Copies are split into 3 main cases: small copies of up to two SVE vectors
   (at least 32 bytes), medium copies of up to 128 bytes, and large copies.
   The overhead of the overlap check in memmove is negligible since it is
   only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per
   iteration.  The source pointer is 16-byte aligned to minimize unaligned
   accesses.  The loop tail is handled by always copying 64 bytes from the
   end.  */
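
/* For illustration only: a rough C sketch of the dispatch described above.
   The helper names are hypothetical and simply label the code paths below;
   svcntb () is the ACLE intrinsic returning the SVE vector length in bytes.

     if (n > 128)
       return copy_long (dst, src, n);     // pipelined 64-byte loop
     if (n > 2 * svcntb ())
       return copy32_128 (dst, src, n);    // overlapping 16-byte pairs
     return copy_small (dst, src, n);      // single predicated SVE copy
 */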

#if HAVE_AARCH64_SVE_ASM

        .arch armv8.2-a+sve

ENTRY (__memcpy_sve)
        PTR_ARG (0)
        PTR_ARG (1)
        SIZE_ARG (2)

        cmp     count, 128
        b.hi    L(copy_long)
        cntb    vlen
        cmp     count, vlen, lsl 1
        b.hi    L(copy32_128)
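        /* Small copies: 0..2*VL bytes.  The WHILELO predicates enable only
           byte lanes below count, so the two predicated loads and stores
           copy exactly count bytes without branches; for count == 0 both
           predicates are all-false and nothing is written.  */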
        whilelo p0.b, xzr, count
        whilelo p1.b, vlen, count
        ld1b    z0.b, p0/z, [src, 0, mul vl]
        ld1b    z1.b, p1/z, [src, 1, mul vl]
        st1b    z0.b, p0, [dstin, 0, mul vl]
        st1b    z1.b, p1, [dstin, 1, mul vl]
        ret

        /* Medium copies: 33..128 bytes.  */
L(copy32_128):
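        /* Load the first 32 and the last 32 bytes up front: for 33..64 byte
           copies these two blocks overlap and cover the whole buffer, while
           for larger medium copies they form the head and tail around
           L(copy128).  */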
        add     srcend, src, count
        add     dstend, dstin, count
        ldp     A_q, B_q, [src]
        ldp     C_q, D_q, [srcend, -32]
        cmp     count, 64
        b.hi    L(copy128)
        stp     A_q, B_q, [dstin]
        stp     C_q, D_q, [dstend, -32]
        ret

        /* Copy 65..128 bytes.  */
L(copy128):
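        /* For 65..96 bytes the first 64 bytes (A..F) plus the last 32 (C, D)
           overlap and cover the buffer; for 97..128 bytes G and H also copy
           the last 64 bytes.  */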
        ldp     E_q, F_q, [src, 32]
        cmp     count, 96
        b.ls    L(copy96)
        ldp     G_q, H_q, [srcend, -64]
        stp     G_q, H_q, [dstend, -64]
L(copy96):
        stp     A_q, B_q, [dstin]
        stp     E_q, F_q, [dstin, 32]
        stp     C_q, D_q, [dstend, -32]
        ret

        .p2align 4
        /* Copy more than 128 bytes.  */
L(copy_long):
        add     srcend, src, count
        add     dstend, dstin, count

        /* Copy 16 bytes and then align src to 16-byte alignment.  */
        ldr     D_q, [src]
        and     tmp1, src, 15
        bic     src, src, 15
        sub     dst, dstin, tmp1
        add     count, count, tmp1      /* Count is now 16 too large.  */
        ldp     A_q, B_q, [src, 16]
        str     D_q, [dstin]
        ldp     C_q, D_q, [src, 48]
        subs    count, count, 128 + 16  /* Test and readjust count.  */
        b.ls    L(copy64_from_end)
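        /* Software pipelined loop: A_q..D_q already hold the next 64 bytes,
           so each iteration stores the current 64 bytes while loading the
           following 64, keeping the loads one iteration ahead of the
           stores.  */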
L(loop64):
        stp     A_q, B_q, [dst, 16]
        ldp     A_q, B_q, [src, 80]
        stp     C_q, D_q, [dst, 48]
        ldp     C_q, D_q, [src, 112]
        add     src, src, 64
        add     dst, dst, 64
        subs    count, count, 64
        b.hi    L(loop64)

        /* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
        ldp     E_q, F_q, [srcend, -64]
        stp     A_q, B_q, [dst, 16]
        ldp     A_q, B_q, [srcend, -32]
        stp     C_q, D_q, [dst, 48]
        stp     E_q, F_q, [dstend, -64]
        stp     A_q, B_q, [dstend, -32]
        ret

END (__memcpy_sve)


ENTRY (__memmove_sve)
        PTR_ARG (0)
        PTR_ARG (1)
        SIZE_ARG (2)

        cmp     count, 128
        b.hi    L(move_long)
        cntb    vlen
        cmp     count, vlen, lsl 1
        b.hi    L(copy32_128)
        whilelo p0.b, xzr, count
        whilelo p1.b, vlen, count
        ld1b    z0.b, p0/z, [src, 0, mul vl]
        ld1b    z1.b, p1/z, [src, 1, mul vl]
        st1b    z0.b, p0, [dstin, 0, mul vl]
        st1b    z1.b, p1, [dstin, 1, mul vl]
        ret

        .p2align 4
L(move_long):
        add     srcend, src, count
        add     dstend, dstin, count
        /* Only use backward copy if there is an overlap.  */
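        /* dstin - src is compared as an unsigned value: when the destination
           is below the source the subtraction wraps to a large value, so the
           b.hs branch takes the forwards copy both when dst < src and when
           the regions are at least count bytes apart.  */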
        sub     tmp1, dstin, src
        cbz     tmp1, L(return)
        cmp     tmp1, count
        b.hs    L(copy_long)

        /* Large backwards copy for overlapping copies.
           Copy 16 bytes and then align srcend to 16-byte alignment.  */
        ldr     D_q, [srcend, -16]
        and     tmp1, srcend, 15
        bic     srcend, srcend, 15
        sub     count, count, tmp1
        ldp     A_q, B_q, [srcend, -32]
        str     D_q, [dstend, -16]
        ldp     C_q, D_q, [srcend, -64]
        sub     dstend, dstend, tmp1
        subs    count, count, 128
        b.ls    L(copy64_from_start)

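        /* Backwards pipelined loop, mirroring L(loop64): each iteration
           stores 64 bytes while loading the next 64 from lower addresses;
           the pre-index writeback on the last store decrements dstend
           by 64.  */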
L(loop64_backwards):
        str     B_q, [dstend, -16]
        str     A_q, [dstend, -32]
        ldp     A_q, B_q, [srcend, -96]
        str     D_q, [dstend, -48]
        str     C_q, [dstend, -64]!
        ldp     C_q, D_q, [srcend, -128]
        sub     srcend, srcend, 64
        subs    count, count, 64
        b.hi    L(loop64_backwards)

        /* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
        ldp     E_q, F_q, [src, 32]
        stp     A_q, B_q, [dstend, -32]
        ldp     A_q, B_q, [src]
        stp     C_q, D_q, [dstend, -64]
        stp     E_q, F_q, [dstin, 32]
        stp     A_q, B_q, [dstin]
L(return):
        ret

END (__memmove_sve)
#endif