/* Optimized memcpy for SVE.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
 *
 */

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define tmp1	x6
#define vlen	x6

#define A_q	q0
#define B_q	q1
#define C_q	q2
#define D_q	q3
#define E_q	q4
#define F_q	q5
#define G_q	q6
#define H_q	q7

/* This implementation supports both memcpy and memmove and shares most code.
   It uses unaligned accesses and branchless sequences to keep the code
   small and simple and to improve performance.

   Copies are split into 3 main cases: small copies of up to two vector
   lengths (32 bytes with 128-bit vectors), medium copies of up to 128 bytes,
   and large copies.  The overhead of the overlap check in memmove is
   negligible since it is only required for large copies.

   Large copies use a software-pipelined loop processing 64 bytes per
   iteration.  The source pointer is 16-byte aligned to minimize unaligned
   accesses.  The loop tail is handled by always copying 64 bytes from the
   end.  */
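
/* For reference, the size dispatch is roughly equivalent to the C sketch
   below.  This is illustrative only: copy_small, copy_medium and copy_large
   are hypothetical helpers, and svcntb () (an ACLE intrinsic) returns the
   SVE vector length in bytes.

     void *memcpy_sve (void *dst, const void *src, size_t n)
     {
       if (n > 128)
	 return copy_large (dst, src, n);	// L(copy_long)
       if (n > 2 * svcntb ())
	 return copy_medium (dst, src, n);	// L(copy32_128)
       return copy_small (dst, src, n);		// predicated SVE copy
     }  */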

#if HAVE_AARCH64_SVE_ASM

	.arch armv8.2-a+sve

ENTRY (__memcpy_sve)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	cmp	count, 128
	b.hi	L(copy_long)
	cntb	vlen
	cmp	count, vlen, lsl 1
	b.hi	L(copy32_128)
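	/* Predicated copy of up to two vector lengths: p0 covers the first
	   min(count, VL) bytes and p1 the rest; inactive lanes load zero
	   and store nothing, so no byte outside the buffer is accessed.  */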
	whilelo	p0.b, xzr, count
	whilelo	p1.b, vlen, count
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p1/z, [src, 1, mul vl]
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z1.b, p1, [dstin, 1, mul vl]
	ret

	/* Medium copies: more than two vector lengths and up to 128 bytes
	   (33..128 bytes with 128-bit vectors).  */
L(copy32_128):
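	/* All loads are issued before any store, so this sequence also
	   handles overlapping memmove calls; where the loaded ranges
	   overlap, the stores simply rewrite identical data.  */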
	add	srcend, src, count
	add	dstend, dstin, count
	ldp	A_q, B_q, [src]
	ldp	C_q, D_q, [srcend, -32]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_q, B_q, [dstin]
	stp	C_q, D_q, [dstend, -32]
	ret

	/* Copy 65..128 bytes.  */
L(copy128):
	ldp	E_q, F_q, [src, 32]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_q, H_q, [srcend, -64]
	stp	G_q, H_q, [dstend, -64]
L(copy96):
	stp	A_q, B_q, [dstin]
	stp	E_q, F_q, [dstin, 32]
	stp	C_q, D_q, [dstend, -32]
	ret

	.p2align 4
	/* Copy more than 128 bytes.  */
L(copy_long):
	add	srcend, src, count
	add	dstend, dstin, count

	/* Copy 16 bytes, then align src down to a 16-byte boundary and
	   rewind dst by the same amount so the two stay in lockstep.  */
	ldr	D_q, [src]
	and	tmp1, src, 15
	bic	src, src, 15
	sub	dst, dstin, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_q, B_q, [src, 16]
	str	D_q, [dstin]
	ldp	C_q, D_q, [src, 48]
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)
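	/* Software-pipelined: each iteration stores the 64 bytes loaded by
	   the previous one while loading the next 64 bytes, so loads run
	   one iteration ahead of stores.  */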
L(loop64):
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [src, 80]
	stp	C_q, D_q, [dst, 48]
	ldp	C_q, D_q, [src, 112]
	add	src, src, 64
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
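	/* Store the 64 bytes already in A_q-D_q, then copy the final 64
	   bytes from the end of the buffer; any overlap with earlier
	   stores rewrites identical data.  */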
	ldp	E_q, F_q, [srcend, -64]
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [srcend, -32]
	stp	C_q, D_q, [dst, 48]
	stp	E_q, F_q, [dstend, -64]
	stp	A_q, B_q, [dstend, -32]
	ret

END (__memcpy_sve)


ENTRY (__memmove_sve)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	cmp	count, 128
	b.hi	L(move_long)
	cntb	vlen
	cmp	count, vlen, lsl 1
	b.hi	L(copy32_128)
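	/* Same predicated copy as in memcpy: both loads complete before
	   either store, so overlapping buffers of up to two vector lengths
	   are handled correctly without an overlap check.  */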
	whilelo	p0.b, xzr, count
	whilelo	p1.b, vlen, count
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p1/z, [src, 1, mul vl]
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z1.b, p1, [dstin, 1, mul vl]
	ret

	.p2align 4
L(move_long):
	add	srcend, src, count
	add	dstend, dstin, count
	/* Only use backward copy if there is an overlap.  The unsigned
	   comparison also holds when dst precedes src (the subtraction
	   wraps), in which case a forward copy is safe even if the
	   buffers overlap.  */
	sub	tmp1, dstin, src
	cbz	tmp1, L(return)
	cmp	tmp1, count
	b.hs	L(copy_long)

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes, then align srcend down to a 16-byte boundary and
	   rewind dstend by the same amount so the two stay in lockstep.  */
	ldr	D_q, [srcend, -16]
	and	tmp1, srcend, 15
	bic	srcend, srcend, 15
	sub	count, count, tmp1
	ldp	A_q, B_q, [srcend, -32]
	str	D_q, [dstend, -16]
	ldp	C_q, D_q, [srcend, -64]
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

L(loop64_backwards):
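	/* Mirror of L(loop64): stores retire the previous iteration's
	   loads while the next 64 bytes are loaded.  The pre-indexed
	   store to [dstend, -64]! also decrements dstend for the next
	   iteration.  */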
	str	B_q, [dstend, -16]
	str	A_q, [dstend, -32]
	ldp	A_q, B_q, [srcend, -96]
	str	D_q, [dstend, -48]
	str	C_q, [dstend, -64]!
	ldp	C_q, D_q, [srcend, -128]
	sub	srcend, srcend, 64
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
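	/* Store the 64 bytes already in A_q-D_q, then copy the first 64
	   bytes of the buffer; any overlap with earlier stores rewrites
	   identical data.  */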
	ldp	E_q, F_q, [src, 32]
	stp	A_q, B_q, [dstend, -32]
	ldp	A_q, B_q, [src]
	stp	C_q, D_q, [dstend, -64]
	stp	E_q, F_q, [dstin, 32]
	stp	A_q, B_q, [dstin]
L(return):
	ret

END (__memmove_sve)
#endif