//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains assembly-optimized implementations of Scalable Matrix
/// Extension (SME) compatible memcpy and memmove functions.
///
/// These implementations depend on unaligned access support.
///
/// Routines taken from libc/AOR_v20.02/string/aarch64.
///
//===----------------------------------------------------------------------===//

#include "../assembly.h"

//
// __arm_sc_memcpy / __arm_sc_memmove
//
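// Note: the code below uses only general-purpose loads and stores, so it
// should remain usable from SME streaming mode, which appears to be the
// reason a separate __arm_sc_* entry point is provided.
//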

#define dstin x0
#define src x1
#define count x2
#define dst x3
#define srcend1 x4
#define dstend1 x5
#define A_l x6
#define A_lw w6
#define A_h x7
#define B_l x8
#define B_lw w8
#define B_h x9
#define C_l x10
#define C_lw w10
#define C_h x11
#define D_l x12
#define D_h x13
#define E_l x14
#define E_h x15
#define F_l x16
#define F_h x17
#define G_l count
#define G_h dst
#define H_l src
#define H_h srcend1
#define tmp1 x14
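
/* G_l/G_h, H_l/H_h and tmp1 intentionally alias registers defined above
   (count, dst, src, srcend1 and x14/E_l); they are only used at points
   where the aliased value is not live. */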

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point. It uses unaligned accesses and branchless
   sequences to keep the code small and simple, and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies. The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per
   iteration. The destination pointer is 16-byte aligned to minimize
   unaligned accesses. The loop tail is handled by always copying 64 bytes
   from the end.
*/
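
/* Rough shape of the dispatch below (informal sketch, comment only):

     if (count > 128)       -> copy_long (label 7), with an overlap check:
                               if (dstin - src) < count, copy backwards
                               (label 10)
     else if (count > 32)   -> copy32_128 (label 4)
     else if (count >= 16)  -> copy first and last 16 bytes (may overlap)
     else                   -> copy16/copy8/copy4 small cases (labels 0-3)
*/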

DEFINE_COMPILERRT_FUNCTION(__arm_sc_memcpy)
  add srcend1, src, count
  add dstend1, dstin, count
  cmp count, 128
  b.hi 7f // copy_long
  cmp count, 32
  b.hi 4f // copy32_128

  /* Small copies: 0..32 bytes. */
  cmp count, 16
  b.lo 0f // copy16
  ldp A_l, A_h, [src]
  ldp D_l, D_h, [srcend1, -16]
  stp A_l, A_h, [dstin]
  stp D_l, D_h, [dstend1, -16]
  ret

  /* Copy 8-15 bytes. */
0: // copy16
  tbz count, 3, 1f // copy8
  ldr A_l, [src]
  ldr A_h, [srcend1, -8]
  str A_l, [dstin]
  str A_h, [dstend1, -8]
  ret

  .p2align 3
  /* Copy 4-7 bytes. */
1: // copy8
  tbz count, 2, 2f // copy4
  ldr A_lw, [src]
  ldr B_lw, [srcend1, -4]
  str A_lw, [dstin]
  str B_lw, [dstend1, -4]
  ret

  /* Copy 0..3 bytes using a branchless sequence. */
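  /* The three byte stores (first byte, byte at count/2, last byte) may
     overlap; together they cover every count in 1..3 without a branch. */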
2: // copy4
  cbz count, 3f // copy0
  lsr tmp1, count, 1
  ldrb A_lw, [src]
  ldrb C_lw, [srcend1, -1]
  ldrb B_lw, [src, tmp1]
  strb A_lw, [dstin]
  strb B_lw, [dstin, tmp1]
  strb C_lw, [dstend1, -1]
3: // copy0
  ret

  .p2align 4
  /* Medium copies: 33..128 bytes. */
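  /* The first 32 and last 32 bytes are loaded up front; for 33..64 bytes the
     two halves overlap as needed, so four stores cover the whole copy. */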
4: // copy32_128
  ldp A_l, A_h, [src]
  ldp B_l, B_h, [src, 16]
  ldp C_l, C_h, [srcend1, -32]
  ldp D_l, D_h, [srcend1, -16]
  cmp count, 64
  b.hi 5f // copy128
  stp A_l, A_h, [dstin]
  stp B_l, B_h, [dstin, 16]
  stp C_l, C_h, [dstend1, -32]
  stp D_l, D_h, [dstend1, -16]
  ret

  .p2align 4
  /* Copy 65..128 bytes. */
5: // copy128
  ldp E_l, E_h, [src, 32]
  ldp F_l, F_h, [src, 48]
  cmp count, 96
  b.ls 6f // copy96
  ldp G_l, G_h, [srcend1, -64]
  ldp H_l, H_h, [srcend1, -48]
  stp G_l, G_h, [dstend1, -64]
  stp H_l, H_h, [dstend1, -48]
6: // copy96
  stp A_l, A_h, [dstin]
  stp B_l, B_h, [dstin, 16]
  stp E_l, E_h, [dstin, 32]
  stp F_l, F_h, [dstin, 48]
  stp C_l, C_h, [dstend1, -32]
  stp D_l, D_h, [dstend1, -16]
  ret

  .p2align 4
  /* Copy more than 128 bytes. */
7: // copy_long
  /* Use backwards copy if there is an overlap. */
  sub tmp1, dstin, src
  cbz tmp1, 3b // copy0
  cmp tmp1, count
  b.lo 10f // copy_long_backwards

  /* Copy 16 bytes and then align dst to 16-byte alignment. */
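  /* dst is rounded down to a 16-byte boundary and src is moved back by the
     same amount, so the [dst, #16]/[src, #16] accesses below stay in
     lockstep; the first 16 (possibly unaligned) destination bytes are
     written from D. */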

  ldp D_l, D_h, [src]
  and tmp1, dstin, 15
  bic dst, dstin, 15
  sub src, src, tmp1
  add count, count, tmp1 /* Count is now 16 too large. */
  ldp A_l, A_h, [src, 16]
  stp D_l, D_h, [dstin]
  ldp B_l, B_h, [src, 32]
  ldp C_l, C_h, [src, 48]
  ldp D_l, D_h, [src, 64]!
  subs count, count, 128 + 16 /* Test and readjust count. */
  b.ls 9f // copy64_from_end
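  /* Software-pipelined loop: each iteration stores the 64 bytes loaded by
     the previous iteration while loading the next 64. */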
8: // loop64
  stp A_l, A_h, [dst, 16]
  ldp A_l, A_h, [src, 16]
  stp B_l, B_h, [dst, 32]
  ldp B_l, B_h, [src, 32]
  stp C_l, C_h, [dst, 48]
  ldp C_l, C_h, [src, 48]
  stp D_l, D_h, [dst, 64]!
  ldp D_l, D_h, [src, 64]!
  subs count, count, 64
  b.hi 8b // loop64

  /* Write the last iteration and copy 64 bytes from the end. */
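  /* The stores through dstend1 may overlap bytes already written via dst;
     rewriting them is harmless and avoids a variable-length tail. */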
9: // copy64_from_end
  ldp E_l, E_h, [srcend1, -64]
  stp A_l, A_h, [dst, 16]
  ldp A_l, A_h, [srcend1, -48]
  stp B_l, B_h, [dst, 32]
  ldp B_l, B_h, [srcend1, -32]
  stp C_l, C_h, [dst, 48]
  ldp C_l, C_h, [srcend1, -16]
  stp D_l, D_h, [dst, 64]
  stp E_l, E_h, [dstend1, -64]
  stp A_l, A_h, [dstend1, -48]
  stp B_l, B_h, [dstend1, -32]
  stp C_l, C_h, [dstend1, -16]
  ret

  .p2align 4

  /* Large backwards copy for overlapping copies.
     Copy 16 bytes and then align dst to 16-byte alignment. */
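  /* Mirror image of the forward case: the last 16 destination bytes are
     written from D first, then srcend1/count/dstend1 are pulled back by the
     same amount so that dstend1 is 16-byte aligned for the loop. */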
10: // copy_long_backwards
  ldp D_l, D_h, [srcend1, -16]
  and tmp1, dstend1, 15
  sub srcend1, srcend1, tmp1
  sub count, count, tmp1
  ldp A_l, A_h, [srcend1, -16]
  stp D_l, D_h, [dstend1, -16]
  ldp B_l, B_h, [srcend1, -32]
  ldp C_l, C_h, [srcend1, -48]
  ldp D_l, D_h, [srcend1, -64]!
  sub dstend1, dstend1, tmp1
  subs count, count, 128
  b.ls 12f // copy64_from_start

11: // loop64_backwards
  stp A_l, A_h, [dstend1, -16]
  ldp A_l, A_h, [srcend1, -16]
  stp B_l, B_h, [dstend1, -32]
  ldp B_l, B_h, [srcend1, -32]
  stp C_l, C_h, [dstend1, -48]
  ldp C_l, C_h, [srcend1, -48]
  stp D_l, D_h, [dstend1, -64]!
  ldp D_l, D_h, [srcend1, -64]!
  subs count, count, 64
  b.hi 11b // loop64_backwards

  /* Write the last iteration and copy 64 bytes from the start. */
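  /* As in the forward path, the first 64 bytes are always copied from the
     start (src to dstin), possibly overlapping stores already made via
     dstend1. */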
12: // copy64_from_start
  ldp G_l, G_h, [src, 48]
  stp A_l, A_h, [dstend1, -16]
  ldp A_l, A_h, [src, 32]
  stp B_l, B_h, [dstend1, -32]
  ldp B_l, B_h, [src, 16]
  stp C_l, C_h, [dstend1, -48]
  ldp C_l, C_h, [src]
  stp D_l, D_h, [dstend1, -64]
  stp G_l, G_h, [dstin, 48]
  stp A_l, A_h, [dstin, 32]
  stp B_l, B_h, [dstin, 16]
  stp C_l, C_h, [dstin]
  ret
END_COMPILERRT_FUNCTION(__arm_sc_memcpy)

DEFINE_COMPILERRT_FUNCTION_ALIAS(__arm_sc_memmove, __arm_sc_memcpy)