/*
 * memcpy - copy memory area
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#include "../asmdefs.h"

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_lw	w10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	x14
#define E_h	x15
#define F_l	x16
#define F_h	x17
#define G_l	count
#define G_h	dst
#define H_l	src
#define H_h	srcend
#define tmp1	x14

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point. It uses unaligned accesses and branchless
   sequences to keep the code small and simple and to improve performance.

   Copies are split into three main cases: small copies of up to 32 bytes,
   medium copies of up to 128 bytes, and large copies. The overhead of the
   overlap check is negligible since it is only required for large copies.

   Large copies use a software-pipelined loop processing 64 bytes per
   iteration. The destination pointer is 16-byte aligned to minimize
   unaligned accesses. The loop tail is handled by always copying 64 bytes
   from the end.
*/
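
/* As a reference for the overlap check mentioned above, a minimal C sketch
   (illustrative only, not part of the build; the function name is made up):

     #include <stddef.h>
     #include <stdint.h>

     // True when dst lies inside [src, src + count), i.e. when a forwards
     // copy would overwrite source bytes before they have been read.
     static inline int needs_backwards_copy (const char *dst, const char *src,
                                             size_t count)
     {
       return (uintptr_t) dst - (uintptr_t) src < count;
     }
*/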

ENTRY (__memcpy_aarch64)
ENTRY_ALIAS (__memmove_aarch64)
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small copies: 0..32 bytes. */
	cmp	count, 16
	b.lo	L(copy16)
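	/* 16..32 bytes: copy 16 bytes from the start and 16 bytes from the
	   end; the two ranges may overlap. */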
	ldp	A_l, A_h, [src]
	ldp	D_l, D_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	/* Copy 8-15 bytes. */
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	.p2align 3
	/* Copy 4-7 bytes. */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence. */
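	/* With 1 <= count <= 3, bytes 0, count/2 and count - 1 cover the
	   buffer: count == 1 writes the same byte three times, count == 2
	   writes bytes 0, 1, 1 and count == 3 writes bytes 0, 1, 2. */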
L(copy4):
	cbz	count, L(copy0)
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes. */
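	/* The first 32 and the last 32 bytes are always copied; the two
	   ranges may overlap, and counts above 64 bytes fill in the middle
	   via L(copy128). */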
L(copy32_128):
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	ldp	D_l, D_h, [srcend, -16]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy 65..128 bytes. */
L(copy128):
	ldp	E_l, E_h, [src, 32]
	ldp	F_l, F_h, [src, 48]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_l, G_h, [srcend, -64]
	ldp	H_l, H_h, [srcend, -48]
	stp	G_l, G_h, [dstend, -64]
	stp	H_l, H_h, [dstend, -48]
L(copy96):
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	E_l, E_h, [dstin, 32]
	stp	F_l, F_h, [dstin, 48]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy more than 128 bytes. */
L(copy_long):
	/* Use backwards copy if there is an overlap. */
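	/* dstin - src, viewed as an unsigned value, is below count exactly
	   when dstin lies inside [src, src + count), i.e. when a forwards
	   copy would overwrite source bytes before they are read; equal
	   pointers return immediately. */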
	sub	tmp1, dstin, src
	cbz	tmp1, L(copy0)
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align dst to 16-byte alignment. */
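	/* dst is rounded down to a 16-byte boundary and src is rewound by
	   the same amount, so [src, x] keeps corresponding to [dst, x] and
	   every loop store is 16-byte aligned. The store at [dst, 16] may
	   partly re-copy bytes already written at dstin, which is harmless. */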

	ldp	D_l, D_h, [src]
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large. */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count. */
	b.ls	L(copy64_from_end)

L(loop64):
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end. */
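	/* The pending A..D stores and the four pairs copied from the last 64
	   source bytes may overlap; handling the tail this way avoids
	   computing an exact remainder. */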
L(copy64_from_end):
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.p2align 4

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align dst to 16-byte alignment. */
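	/* dstend is rounded down to a 16-byte boundary and srcend is moved
	   down by the same amount, so the loop stores stay 16-byte aligned;
	   the tmp1 bytes dropped from count were already covered by the
	   16-byte copy from the end. */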
L(copy_long_backwards):
	ldp	D_l, D_h, [srcend, -16]
	and	tmp1, dstend, 15
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

L(loop64_backwards):
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start. */
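	/* As above, the pending A..D stores and the four pairs copied from
	   the first 64 source bytes may overlap; src and dstin still hold
	   their original values here. */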
L(copy64_from_start):
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
	ret

END (__memcpy_aarch64)