/* Generic optimized memcpy using SIMD.
   Copyright (C) 2012-2024 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 *
 */

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_lw	w10
#define tmp1	x14

#define A_q	q0
#define B_q	q1
#define C_q	q2
#define D_q	q3
#define E_q	q4
#define F_q	q5
#define G_q	q6
#define H_q	q7

#ifndef MEMMOVE
# define MEMMOVE memmove
#endif
#ifndef MEMCPY
# define MEMCPY memcpy
#endif

/* This implementation supports both memcpy and memmove and shares most code.
   It uses unaligned accesses and branchless sequences to keep the code small
   and simple and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check in memmove is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per
   iteration.  The destination pointer is 16-byte aligned to minimize
   unaligned accesses.  The loop tail is handled by always copying 64 bytes
   from the end.  */
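
/* As a rough illustration only, the size dispatch below behaves like the
   following C model for the non-overlapping (memcpy) case.  The name
   sketch_memcpy and the chunked memcpy calls are stand-ins for the SIMD
   q-register load/store pairs, and the >128-byte branch omits the source
   alignment and software pipelining done by L(copy_long):

     #include <stddef.h>
     #include <string.h>

     static void *
     sketch_memcpy (void *dstin, const void *srcin, size_t count)
     {
       unsigned char *dst = dstin;
       const unsigned char *src = srcin;

       if (count > 128)
         {
           // Large: 64-byte blocks from the front, then the last 64 bytes
           // taken from the end so no tail loop is needed.
           for (size_t i = 0; i + 64 < count; i += 64)
             memcpy (dst + i, src + i, 64);
           memcpy (dst + count - 64, src + count - 64, 64);
         }
       else if (count > 32)
         {
           // Medium: 32 bytes from each end, plus the middle when needed.
           memcpy (dst, src, 32);
           if (count > 64)
             memcpy (dst + 32, src + 32, 32);
           if (count > 96)
             memcpy (dst + count - 64, src + count - 64, 32);
           memcpy (dst + count - 32, src + count - 32, 32);
         }
       else if (count >= 16)
         {
           memcpy (dst, src, 16);
           memcpy (dst + count - 16, src + count - 16, 16);
         }
       else if (count >= 8)
         {
           memcpy (dst, src, 8);
           memcpy (dst + count - 8, src + count - 8, 8);
         }
       else if (count >= 4)
         {
           memcpy (dst, src, 4);
           memcpy (dst + count - 4, src + count - 4, 4);
         }
       else if (count > 0)
         {
           // 0..3 bytes: first, middle and last byte (offsets may coincide).
           dst[0] = src[0];
           dst[count >> 1] = src[count >> 1];
           dst[count - 1] = src[count - 1];
         }
       return dstin;
     }

   The same small and medium paths also serve memmove because every block
   is loaded into registers before any of it is stored.  */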

ENTRY (MEMCPY)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small copies: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
	ldr	A_q, [src]
	ldr	B_q, [srcend, -16]
	str	A_q, [dstin]
	str	B_q, [dstend, -16]
	ret

	/* Copy 8-15 bytes.  */
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	/* Copy 4-7 bytes.  */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence.  */
L(copy4):
	cbz	count, L(copy0)
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret
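
/* The 0..3 byte sequence above needs only one branch (the cbz for a zero
   count): it always moves the first byte, the byte at offset count/2 and
   the last byte, and for counts of 1 or 2 those offsets simply coincide.
   All three bytes are loaded before any store, so the sequence is also
   safe for overlapping memmove.  A hedged C equivalent (copy_0_3 is a
   hypothetical name):

     #include <stddef.h>

     static void
     copy_0_3 (unsigned char *dst, const unsigned char *src, size_t count)
     {
       if (count == 0)
         return;                          // mirrors the cbz
       unsigned char a = src[0];          // first byte
       unsigned char c = src[count - 1];  // last byte
       unsigned char b = src[count >> 1]; // middle byte
       dst[0] = a;
       dst[count >> 1] = b;
       dst[count - 1] = c;
     }  */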

	.p2align 4
	/* Medium copies: 33..128 bytes.  */
L(copy32_128):
	ldp	A_q, B_q, [src]
	ldp	C_q, D_q, [srcend, -32]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_q, B_q, [dstin]
	stp	C_q, D_q, [dstend, -32]
	ret

	.p2align 4
	/* Copy 65..128 bytes.  */
L(copy128):
	ldp	E_q, F_q, [src, 32]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_q, H_q, [srcend, -64]
	stp	G_q, H_q, [dstend, -64]
L(copy96):
	stp	A_q, B_q, [dstin]
	stp	E_q, F_q, [dstin, 32]
	stp	C_q, D_q, [dstend, -32]
	ret
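
/* For 65..128 bytes everything fits in the eight q-registers: 32 bytes
   from each end were already loaded at L(copy32_128), 32 more are loaded
   from offset 32 above, and the remaining pair (G_q/H_q) is only needed
   when the count exceeds 96.  A hedged C outline for the non-overlapping
   case (copy_65_128 is a hypothetical name):

     #include <stddef.h>
     #include <string.h>

     static void
     copy_65_128 (unsigned char *dst, const unsigned char *src, size_t count)
     {
       memcpy (dst, src, 32);                             // A_q/B_q
       memcpy (dst + 32, src + 32, 32);                   // E_q/F_q
       if (count > 96)
         memcpy (dst + count - 64, src + count - 64, 32); // G_q/H_q
       memcpy (dst + count - 32, src + count - 32, 32);   // C_q/D_q
     }  */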

	/* Align loop64 below to 16 bytes.  */
	nop

	/* Copy more than 128 bytes.  */
L(copy_long):
	/* Copy 16 bytes and then align src to 16-byte alignment.  */
	ldr	D_q, [src]
	and	tmp1, src, 15
	bic	src, src, 15
	sub	dst, dstin, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_q, B_q, [src, 16]
	str	D_q, [dstin]
	ldp	C_q, D_q, [src, 48]
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)
L(loop64):
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [src, 80]
	stp	C_q, D_q, [dst, 48]
	ldp	C_q, D_q, [src, 112]
	add	src, src, 64
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
	ldp	E_q, F_q, [srcend, -64]
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [srcend, -32]
	stp	C_q, D_q, [dst, 48]
	stp	E_q, F_q, [dstend, -64]
	stp	A_q, B_q, [dstend, -32]
	ret
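
/* A hedged C model of this >128-byte forward path, assuming the buffers
   do not overlap: copy the first 16 bytes unaligned, round src down to a
   16-byte boundary and move dst down by the same amount so the two stay
   in step, copy aligned 64-byte blocks, and finish with an unconditional
   64-byte copy from the end that absorbs the tail.  The chunked memcpy
   calls stand in for the q-register pairs, and the software pipelining
   (each block is loaded one iteration before it is stored) is not
   modelled:

     #include <stddef.h>
     #include <stdint.h>
     #include <string.h>

     static void
     copy_long_c (unsigned char *dstin, const unsigned char *srcin,
                  size_t count)
     {
       const unsigned char *src = srcin;
       const unsigned char *srcend = src + count;
       unsigned char *dstend = dstin + count;

       memcpy (dstin, src, 16);             // D_q: unaligned head
       size_t tmp1 = (uintptr_t) src & 15;  // misalignment of src
       src -= tmp1;                         // bic src, src, 15
       unsigned char *dst = dstin - tmp1;   // dst + k mirrors src + k
       count += tmp1;                       // count is now 16 too large

       size_t off = 16;                     // first aligned 64-byte block
       while (off + 64 < count)
         {
           memcpy (dst + off, src + off, 64);
           off += 64;
         }
       memcpy (dstend - 64, srcend - 64, 64); // always 64 bytes from the end
     }

   The register version also serves forward memmove (reached from
   L(move_long)) because each 64-byte block is fully loaded before any of
   it is stored.  */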

END (MEMCPY)
libc_hidden_builtin_def (MEMCPY)


ENTRY (MEMMOVE)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(move_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small moves: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
	ldr	A_q, [src]
	ldr	B_q, [srcend, -16]
	str	A_q, [dstin]
	str	B_q, [dstend, -16]
	ret

L(move_long):
	/* Only use backwards copy if there is an overlap.  */
	sub	tmp1, dstin, src
	cbz	tmp1, L(move0)
	cmp	tmp1, count
	b.hs	L(copy_long)
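
/* The three instructions above fold the whole overlap analysis into one
   unsigned comparison: dstin - src only lands in [1, count) when the
   destination starts above the source by less than count bytes, which is
   the only case where a forward copy would overwrite source bytes before
   reading them.  If dstin is below src the subtraction wraps to a huge
   unsigned value and the forward path is taken.  A hedged C rendering
   (needs_backwards_copy is a hypothetical name):

     #include <stddef.h>
     #include <stdint.h>

     static int
     needs_backwards_copy (const void *dstin, const void *src, size_t count)
     {
       uintptr_t diff = (uintptr_t) dstin - (uintptr_t) src;
       if (diff == 0)
         return 0;           // dst == src: nothing to do (L(move0))
       return diff < count;  // wraps when dst < src, so this stays false
     }  */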

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align srcend to 16-byte alignment.  */
L(copy_long_backwards):
	ldr	D_q, [srcend, -16]
	and	tmp1, srcend, 15
	bic	srcend, srcend, 15
	sub	count, count, tmp1
	ldp	A_q, B_q, [srcend, -32]
	str	D_q, [dstend, -16]
	ldp	C_q, D_q, [srcend, -64]
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

L(loop64_backwards):
	str	B_q, [dstend, -16]
	str	A_q, [dstend, -32]
	ldp	A_q, B_q, [srcend, -96]
	str	D_q, [dstend, -48]
	str	C_q, [dstend, -64]!
	ldp	C_q, D_q, [srcend, -128]
	sub	srcend, srcend, 64
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
	ldp	E_q, F_q, [src, 32]
	stp	A_q, B_q, [dstend, -32]
	ldp	A_q, B_q, [src]
	stp	C_q, D_q, [dstend, -64]
	stp	E_q, F_q, [dstin, 32]
	stp	A_q, B_q, [dstin]
L(move0):
	ret
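
/* A hedged C model of this backwards path.  The assembly keeps the data
   it is about to overwrite safe by loading each block one iteration ahead
   and by the load/store ordering in L(copy64_from_start); the model below
   instead captures the unaligned last 16 bytes and the first 64 bytes up
   front, which is simpler to read but not how the code above is
   scheduled.  chunk_move and move_long_backwards_c are hypothetical
   names; chunk_move loads a whole block before storing it, as the
   q-registers do, so a block may overlap its own destination:

     #include <stddef.h>
     #include <stdint.h>
     #include <string.h>

     static void
     chunk_move (unsigned char *d, const unsigned char *s, size_t n)
     {
       unsigned char t[64];
       memcpy (t, s, n);  // read the whole block first
       memcpy (d, t, n);  // then write it, so the block may overlap itself
     }

     static void
     move_long_backwards_c (unsigned char *dstin, const unsigned char *src,
                            size_t count)
     {
       unsigned char *dstend = dstin + count;
       const unsigned char *srcend = src + count;
       unsigned char head[64], tail[16];

       memcpy (head, src, 64);          // first 64 source bytes, written last
       memcpy (tail, srcend - 16, 16);  // unaligned last 16 source bytes

       size_t tmp1 = (uintptr_t) srcend & 15;
       const unsigned char *srcend_a = srcend - tmp1;  // aligned source end
       unsigned char *dstend_a = dstend - tmp1;
       size_t n = count - tmp1;

       // Aligned 64-byte blocks, walking down from the end so stores never
       // clobber source bytes that a later block still has to read.
       size_t off = 0;
       while (off + 64 < n)
         {
           chunk_move (dstend_a - off - 64, srcend_a - off - 64, 64);
           off += 64;
         }

       memcpy (dstend - 16, tail, 16);  // last 16 bytes
       memcpy (dstin, head, 64);        // always 64 bytes from the start
     }  */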

END (MEMMOVE)
libc_hidden_builtin_def (MEMMOVE)