//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains assembly-optimized implementations of Scalable Matrix
/// Extension (SME) compatible memcpy and memmove functions.
///
/// These implementations depend on unaligned access support.
///
/// Routines taken from libc/AOR_v20.02/string/aarch64.
///
//===----------------------------------------------------------------------===//

#include "../assembly.h"

//
// __arm_sc_memcpy / __arm_sc_memmove
//

#define dstin x0
#define src x1
#define count x2
#define dst x3
#define srcend1 x4
#define dstend1 x5
#define A_l x6
#define A_lw w6
#define A_h x7
#define B_l x8
#define B_lw w8
#define B_h x9
#define C_l x10
#define C_lw w10
#define C_h x11
#define D_l x12
#define D_h x13
#define E_l x14
#define E_h x15
#define F_l x16
#define F_h x17
#define G_l count
#define G_h dst
#define H_l src
#define H_h srcend1
#define tmp1 x14
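/* Note: G_l/G_h, H_l/H_h and tmp1 reuse the registers behind count, dst, src,
   srcend1 and E_l; they are only used at points where those earlier values are
   no longer needed. */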

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point. It uses unaligned accesses and branchless
   sequences to keep the code small and simple, and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies. The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The destination pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/

DEFINE_COMPILERRT_FUNCTION(__arm_sc_memcpy)
        add     srcend1, src, count
        add     dstend1, dstin, count
        cmp     count, 128
        b.hi    7f // copy_long
        cmp     count, 32
        b.hi    4f // copy32_128

        /* Small copies: 0..32 bytes. */
        cmp     count, 16
        b.lo    0f // copy16
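        /* 16..32 bytes: copy 16 bytes from the start and 16 bytes from the
           end; the two stores may overlap in the middle. */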
        ldp     A_l, A_h, [src]
        ldp     D_l, D_h, [srcend1, -16]
        stp     A_l, A_h, [dstin]
        stp     D_l, D_h, [dstend1, -16]
        ret

        /* Copy 8-15 bytes. */
0: // copy16
        tbz     count, 3, 1f // copy8
        ldr     A_l, [src]
        ldr     A_h, [srcend1, -8]
        str     A_l, [dstin]
        str     A_h, [dstend1, -8]
        ret

        .p2align 3
        /* Copy 4-7 bytes. */
1: // copy8
        tbz     count, 2, 2f // copy4
        ldr     A_lw, [src]
        ldr     B_lw, [srcend1, -4]
        str     A_lw, [dstin]
        str     B_lw, [dstend1, -4]
        ret

        /* Copy 0..3 bytes using a branchless sequence. */
2: // copy4
        cbz     count, 3f // copy0
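        /* 1..3 bytes: copy the first byte, the byte at count/2 and the last
           byte; for counts below 3 these accesses simply overlap. */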
        lsr     tmp1, count, 1
        ldrb    A_lw, [src]
        ldrb    C_lw, [srcend1, -1]
        ldrb    B_lw, [src, tmp1]
        strb    A_lw, [dstin]
        strb    B_lw, [dstin, tmp1]
        strb    C_lw, [dstend1, -1]
3: // copy0
        ret

        .p2align 4
        /* Medium copies: 33..128 bytes. */
4: // copy32_128
        ldp     A_l, A_h, [src]
        ldp     B_l, B_h, [src, 16]
        ldp     C_l, C_h, [srcend1, -32]
        ldp     D_l, D_h, [srcend1, -16]
        cmp     count, 64
        b.hi    5f // copy128
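        /* 33..64 bytes: store 32 bytes from the start and 32 bytes from the
           end; the two halves may overlap. */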
        stp     A_l, A_h, [dstin]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstend1, -32]
        stp     D_l, D_h, [dstend1, -16]
        ret

        .p2align 4
        /* Copy 65..128 bytes. */
5: // copy128
        ldp     E_l, E_h, [src, 32]
        ldp     F_l, F_h, [src, 48]
        cmp     count, 96
        b.ls    6f // copy96
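        /* 97..128 bytes: also copy the 32 bytes at srcend1 - 64, covering the
           gap between the first 64 bytes and the last 32. */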
        ldp     G_l, G_h, [srcend1, -64]
        ldp     H_l, H_h, [srcend1, -48]
        stp     G_l, G_h, [dstend1, -64]
        stp     H_l, H_h, [dstend1, -48]
6: // copy96
        stp     A_l, A_h, [dstin]
        stp     B_l, B_h, [dstin, 16]
        stp     E_l, E_h, [dstin, 32]
        stp     F_l, F_h, [dstin, 48]
        stp     C_l, C_h, [dstend1, -32]
        stp     D_l, D_h, [dstend1, -16]
        ret

        .p2align 4
        /* Copy more than 128 bytes. */
7: // copy_long
        /* Use backwards copy if there is an overlap. */
        sub     tmp1, dstin, src
        cbz     tmp1, 3b // copy0
        cmp     tmp1, count
        b.lo    10f // copy_long_backwards
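        /* dstin - src compared unsigned against count detects a destination
           starting inside [src, src + count): a forward copy would overwrite
           source bytes before they are read, so copy backwards instead. */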

        /* Copy 16 bytes and then align dst to 16-byte alignment. */

        ldp     D_l, D_h, [src]
        and     tmp1, dstin, 15
        bic     dst, dstin, 15
        sub     src, src, tmp1
        add     count, count, tmp1 /* Count is now 16 too large. */
        ldp     A_l, A_h, [src, 16]
        stp     D_l, D_h, [dstin]
        ldp     B_l, B_h, [src, 32]
        ldp     C_l, C_h, [src, 48]
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 128 + 16 /* Test and readjust count. */
        b.ls    9f // copy64_from_end
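        /* Software-pipelined loop: each iteration stores the 64 bytes loaded
           by the previous one while loading the next 64. count is biased so
           the loop exits once the final 64 bytes (copied from the end below)
           cover whatever remains. */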
8: // loop64
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [src, 16]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [src, 32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [src, 48]
        stp     D_l, D_h, [dst, 64]!
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 64
        b.hi    8b // loop64

        /* Write the last iteration and copy 64 bytes from the end. */
9: // copy64_from_end
        ldp     E_l, E_h, [srcend1, -64]
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [srcend1, -48]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [srcend1, -32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [srcend1, -16]
        stp     D_l, D_h, [dst, 64]
        stp     E_l, E_h, [dstend1, -64]
        stp     A_l, A_h, [dstend1, -48]
        stp     B_l, B_h, [dstend1, -32]
        stp     C_l, C_h, [dstend1, -16]
        ret

        .p2align 4

        /* Large backwards copy for overlapping copies.
           Copy 16 bytes and then align dstend1 to 16-byte alignment. */
10: // copy_long_backwards
        ldp     D_l, D_h, [srcend1, -16]
        and     tmp1, dstend1, 15
        sub     srcend1, srcend1, tmp1
        sub     count, count, tmp1
        ldp     A_l, A_h, [srcend1, -16]
        stp     D_l, D_h, [dstend1, -16]
        ldp     B_l, B_h, [srcend1, -32]
        ldp     C_l, C_h, [srcend1, -48]
        ldp     D_l, D_h, [srcend1, -64]!
        sub     dstend1, dstend1, tmp1
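        /* dstend1 is now 16-byte aligned; the final (possibly unaligned) 16
           bytes were already stored from D above. */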
        subs    count, count, 128
        b.ls    12f // copy64_from_start

11: // loop64_backwards
        stp     A_l, A_h, [dstend1, -16]
        ldp     A_l, A_h, [srcend1, -16]
        stp     B_l, B_h, [dstend1, -32]
        ldp     B_l, B_h, [srcend1, -32]
        stp     C_l, C_h, [dstend1, -48]
        ldp     C_l, C_h, [srcend1, -48]
        stp     D_l, D_h, [dstend1, -64]!
        ldp     D_l, D_h, [srcend1, -64]!
        subs    count, count, 64
        b.hi    11b // loop64_backwards

        /* Write the last iteration and copy 64 bytes from the start. */
12: // copy64_from_start
        ldp     G_l, G_h, [src, 48]
        stp     A_l, A_h, [dstend1, -16]
        ldp     A_l, A_h, [src, 32]
        stp     B_l, B_h, [dstend1, -32]
        ldp     B_l, B_h, [src, 16]
        stp     C_l, C_h, [dstend1, -48]
        ldp     C_l, C_h, [src]
        stp     D_l, D_h, [dstend1, -64]
        stp     G_l, G_h, [dstin, 48]
        stp     A_l, A_h, [dstin, 32]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstin]
        ret
END_COMPILERRT_FUNCTION(__arm_sc_memcpy)

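// __arm_sc_memmove shares this implementation: the overlap check above
// already selects a forward or backward copy as needed.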
DEFINE_COMPILERRT_FUNCTION_ALIAS(__arm_sc_memmove, __arm_sc_memcpy)