/*
 * memcpy - copy memory area
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 *
 */

#include "../asmdefs.h"

#define dstin   x0
#define src     x1
#define count   x2
#define dst     x3
#define srcend  x4
#define dstend  x5
#define A_l     x6
#define A_lw    w6
#define A_h     x7
#define B_l     x8
#define B_lw    w8
#define B_h     x9
#define C_lw    w10
#define tmp1    x14

#define A_q     q0
#define B_q     q1
#define C_q     q2
#define D_q     q3
#define E_q     q4
#define F_q     q5
#define G_q     q6
#define H_q     q7

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small and simple and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software-pipelined loop processing 64 bytes per
   iteration.  The source pointer is 16-byte aligned to minimize unaligned
   accesses.  The loop tail is handled by always copying 64 bytes from the
   end.
*/
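
/* Rough shape of the size dispatch below (informal summary; the labelled
   blocks are the definitive code):

     count <= 32:   two overlapping accesses of the widest size that fits
                    (16/8/4 bytes), one from each end; 0..3 bytes use a
                    branchless byte sequence.
     count <= 128:  32 bytes from each end, plus one or two extra 32-byte
                    blocks for copies longer than 64 bytes.
     count >  128:  overlap check, then a 16-byte-aligned, software-pipelined
                    64-byte loop, finished by copying 64 bytes from the far
                    end (or from the start in the backwards case).
*/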

ENTRY (__memcpy_aarch64_simd)
ENTRY_ALIAS (__memmove_aarch64_simd)
        add     srcend, src, count
        add     dstend, dstin, count
        cmp     count, 128
        b.hi    L(copy_long)
        cmp     count, 32
        b.hi    L(copy32_128)

        /* Small copies: 0..32 bytes. */
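        /* 16..32 bytes: load 16 bytes from the start and 16 bytes from the
           end; the two stores overlap by 32 - count bytes, which is harmless
           because both loads complete before either store.  Shorter copies
           branch to L(copy16).  */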
        cmp     count, 16
        b.lo    L(copy16)
        ldr     A_q, [src]
        ldr     B_q, [srcend, -16]
        str     A_q, [dstin]
        str     B_q, [dstend, -16]
        ret

        /* Copy 8-15 bytes. */
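        /* Bit 3 of count separates 8..15 from 0..7: if it is clear the copy
           branches to L(copy8), otherwise two possibly overlapping 8-byte
           accesses, one from each end, cover the whole range.  */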
L(copy16):
        tbz     count, 3, L(copy8)
        ldr     A_l, [src]
        ldr     A_h, [srcend, -8]
        str     A_l, [dstin]
        str     A_h, [dstend, -8]
        ret

        .p2align 3
        /* Copy 4-7 bytes. */
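        /* Same scheme one size down: bit 2 selects between 4..7 bytes (two
           overlapping 4-byte word accesses) and 0..3 bytes.  */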
L(copy8):
        tbz     count, 2, L(copy4)
        ldr     A_lw, [src]
        ldr     B_lw, [srcend, -4]
        str     A_lw, [dstin]
        str     B_lw, [dstend, -4]
        ret

        /* Copy 0..3 bytes using a branchless sequence. */
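        /* With tmp1 = count >> 1, the three byte copies below touch offsets:
             count == 1: 0, 0, 0   (all the same byte)
             count == 2: 0, 1, 1
             count == 3: 0, 1, 2
           so every byte is written without a branch per length.  */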
L(copy4):
        cbz     count, L(copy0)
        lsr     tmp1, count, 1
        ldrb    A_lw, [src]
        ldrb    C_lw, [srcend, -1]
        ldrb    B_lw, [src, tmp1]
        strb    A_lw, [dstin]
        strb    B_lw, [dstin, tmp1]
        strb    C_lw, [dstend, -1]
L(copy0):
        ret

        .p2align 4
        /* Medium copies: 33..128 bytes. */
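        /* 32 bytes are loaded from each end up front.  For 33..64 bytes the
           two possibly overlapping 32-byte stores are all that is needed;
           longer copies continue at L(copy128).  */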
L(copy32_128):
        ldp     A_q, B_q, [src]
        ldp     C_q, D_q, [srcend, -32]
        cmp     count, 64
        b.hi    L(copy128)
        stp     A_q, B_q, [dstin]
        stp     C_q, D_q, [dstend, -32]
        ret

        .p2align 4
        /* Copy 65..128 bytes. */
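        /* A/B and C/D (loaded above) cover the first and last 32 bytes; E/F
           add bytes 32..63, which is enough for up to 96 bytes.  Copies of
           97..128 bytes also write G/H, the 32 bytes starting 64 bytes before
           the end.  */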
L(copy128):
        ldp     E_q, F_q, [src, 32]
        cmp     count, 96
        b.ls    L(copy96)
        ldp     G_q, H_q, [srcend, -64]
        stp     G_q, H_q, [dstend, -64]
L(copy96):
        stp     A_q, B_q, [dstin]
        stp     E_q, F_q, [dstin, 32]
        stp     C_q, D_q, [dstend, -32]
        ret

        /* Copy more than 128 bytes. */
L(copy_long):
        /* Use backwards copy if there is an overlap. */
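        /* A single unsigned compare does the overlap test: dstin - src < count
           holds exactly when dstin lies inside [src, src + count), i.e. when a
           forward copy would overwrite source bytes before reading them.  When
           dstin is below src the subtraction wraps to a large value, so the
           forward path is taken, which is safe for that direction of overlap. */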
        sub     tmp1, dstin, src
        cmp     tmp1, count
        b.lo    L(copy_long_backwards)

        /* Copy 16 bytes and then align src to 16-byte alignment. */
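        /* D_q holds the first 16 (possibly unaligned) bytes and is stored to
           dstin below.  src is rounded down to a 16-byte boundary and dst is
           moved back by the same amount (tmp1), so the fixed offsets used by
           the loop stay in step on both sides; count is increased by tmp1 to
           compensate before being tested against the loop threshold.  */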
        ldr     D_q, [src]
        and     tmp1, src, 15
        bic     src, src, 15
        sub     dst, dstin, tmp1
        add     count, count, tmp1      /* Count is now 16 too large. */
        ldp     A_q, B_q, [src, 16]
        str     D_q, [dstin]
        ldp     C_q, D_q, [src, 48]
        subs    count, count, 128 + 16  /* Test and readjust count. */
        b.ls    L(copy64_from_end)
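        /* Software-pipelined loop: each iteration stores the 64 bytes loaded
           previously (A/B at dst+16, C/D at dst+48) while fetching the next
           64 bytes from src+80 and src+112, overlapping loads with stores.  */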
L(loop64):
        stp     A_q, B_q, [dst, 16]
        ldp     A_q, B_q, [src, 80]
        stp     C_q, D_q, [dst, 48]
        ldp     C_q, D_q, [src, 112]
        add     src, src, 64
        add     dst, dst, 64
        subs    count, count, 64
        b.hi    L(loop64)

        /* Write the last iteration and copy 64 bytes from the end. */
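        /* A/B and C/D still hold the last 64 bytes loaded above; after storing
           them, an unconditional 64-byte copy from srcend-64 finishes the tail
           without needing its exact length.  */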
L(copy64_from_end):
        ldp     E_q, F_q, [srcend, -64]
        stp     A_q, B_q, [dst, 16]
        ldp     A_q, B_q, [srcend, -32]
        stp     C_q, D_q, [dst, 48]
        stp     E_q, F_q, [dstend, -64]
        stp     A_q, B_q, [dstend, -32]
        ret

        /* Large backwards copy for overlapping copies.
           Copy 16 bytes and then align srcend to 16-byte alignment. */
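        /* Mirror of the forward path.  tmp1 == 0 here means dstin == src, so
           there is nothing to copy.  The last 16 bytes go via D_q, srcend is
           rounded down to a 16-byte boundary, and count and dstend are reduced
           by the same amount before the downward loop.  */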
L(copy_long_backwards):
        cbz     tmp1, L(copy0)
        ldr     D_q, [srcend, -16]
        and     tmp1, srcend, 15
        bic     srcend, srcend, 15
        sub     count, count, tmp1
        ldp     A_q, B_q, [srcend, -32]
        str     D_q, [dstend, -16]
        ldp     C_q, D_q, [srcend, -64]
        sub     dstend, dstend, tmp1
        subs    count, count, 128
        b.ls    L(copy64_from_start)

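        /* Same software pipelining as L(loop64), but walking down from the
           end: stores use the data loaded on the previous pass while the next
           64 bytes are fetched from srcend-96 and srcend-128.  */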
L(loop64_backwards):
        stp     A_q, B_q, [dstend, -32]
        ldp     A_q, B_q, [srcend, -96]
        stp     C_q, D_q, [dstend, -64]
        ldp     C_q, D_q, [srcend, -128]
        sub     srcend, srcend, 64
        sub     dstend, dstend, 64
        subs    count, count, 64
        b.hi    L(loop64_backwards)

        /* Write the last iteration and copy 64 bytes from the start. */
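        /* Counterpart of L(copy64_from_end): after the final stores, the first
           64 bytes are copied unconditionally from src to dstin, covering
           whatever the downward loop did not reach.  */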
L(copy64_from_start):
        ldp     E_q, F_q, [src, 32]
        stp     A_q, B_q, [dstend, -32]
        ldp     A_q, B_q, [src]
        stp     C_q, D_q, [dstend, -64]
        stp     E_q, F_q, [dstin, 32]
        stp     A_q, B_q, [dstin]
        ret

END (__memcpy_aarch64_simd)


/* Source: libc/AOR_v20.02/string/aarch64/memcpy-advsimd.S */