/*
 * memcpy - copy memory area
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 *
 */

#include "../asmdefs.h"

#define dstin x0
#define src x1
#define count x2
#define dst x3
#define srcend x4
#define dstend x5
#define A_l x6
#define A_lw w6
#define A_h x7
#define B_l x8
#define B_lw w8
#define B_h x9
#define C_lw w10
#define tmp1 x14

#define A_q q0
#define B_q q1
#define C_q q2
#define D_q q3
#define E_q q4
#define F_q q5
#define G_q q6
#define H_q q7

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point. It uses unaligned accesses and branchless
   sequences to keep the code small and simple and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies. The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software-pipelined loop processing 64 bytes per
   iteration. The source pointer is 16-byte aligned to minimize unaligned
   accesses. The loop tail is handled by always copying 64 bytes from the
   end.
*/
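
/* For reference, a rough C-level sketch of the dispatch implemented below.
   This is illustrative only and not part of the build; copy_small,
   copy_medium, copy_large_forwards and copy_large_backwards are hypothetical
   helpers that do not exist in this file:

     void *memmove_sketch (void *dstin, const void *src, size_t count)
     {
       if (count <= 32)
         return copy_small (dstin, src, count);
       if (count <= 128)
         return copy_medium (dstin, src, count);
       // Unsigned compare: go backwards only if a forward copy would
       // overwrite source bytes that have not been read yet.
       if ((uintptr_t) dstin - (uintptr_t) src < count)
         return copy_large_backwards (dstin, src, count);
       return copy_large_forwards (dstin, src, count);
     }
*/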

ENTRY (__memcpy_aarch64_simd)
ENTRY_ALIAS (__memmove_aarch64_simd)
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small copies: 0..32 bytes. */
	cmp	count, 16
	b.lo	L(copy16)
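	/* 16..32 bytes: copy the first and last 16 bytes. The two stores
	   overlap in the middle for counts below 32; since both loads are
	   issued before either store, overlapping src/dst buffers are also
	   handled correctly. */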
	ldr	A_q, [src]
	ldr	B_q, [srcend, -16]
	str	A_q, [dstin]
	str	B_q, [dstend, -16]
	ret

	/* Copy 8-15 bytes. */
L(copy16):
	tbz	count, 3, L(copy8)
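	/* Count is 8..15 here (bit 3 of count is set): copy the first and
	   last 8 bytes; the two stores overlap in the middle. */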
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	.p2align 3
	/* Copy 4-7 bytes. */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence. */
L(copy4):
	cbz	count, L(copy0)
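	/* Count is 1..3: copy the first byte, the byte at count/2 and the
	   last byte. For counts 1 and 2 some bytes are written more than
	   once, which avoids any further branching. */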
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes. */
L(copy32_128):
	ldp	A_q, B_q, [src]
	ldp	C_q, D_q, [srcend, -32]
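	/* The first 32 and last 32 bytes are loaded before the size check.
	   In the whole 33..128 byte range every load is issued before any
	   store, so overlapping src/dst buffers are handled correctly; for
	   counts below 64 the two 32-byte stores overlap in the middle. */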
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_q, B_q, [dstin]
	stp	C_q, D_q, [dstend, -32]
	ret

	.p2align 4
	/* Copy 65..128 bytes. */
L(copy128):
	ldp	E_q, F_q, [src, 32]
	cmp	count, 96
	b.ls	L(copy96)
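	/* 97..128 bytes: additionally copy the 32 bytes starting 64 bytes
	   from the end; together with the remaining four stores this covers
	   the whole buffer. */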
	ldp	G_q, H_q, [srcend, -64]
	stp	G_q, H_q, [dstend, -64]
L(copy96):
	stp	A_q, B_q, [dstin]
	stp	E_q, F_q, [dstin, 32]
	stp	C_q, D_q, [dstend, -32]
	ret

	/* Copy more than 128 bytes. */
L(copy_long):
	/* Use backwards copy if there is an overlap. */
	sub	tmp1, dstin, src
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)
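	/* The unsigned comparison of dst - src against count takes the
	   backwards path only when dst lies within [src, src + count).
	   When dst is below src the subtraction wraps to a large value, so
	   the forward copy is used even for overlapping buffers. */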

	/* Copy 16 bytes and then align src to 16-byte alignment. */
	ldr	D_q, [src]
	and	tmp1, src, 15
	bic	src, src, 15
	sub	dst, dstin, tmp1
	add	count, count, tmp1	/* Count is now 16 too large. */
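	/* After rounding src down, dst is offset by the same amount so that
	   dst + x and src + x still address corresponding bytes. The first
	   16 bytes are copied separately via D_q, so the loop below can
	   start storing at offset 16. */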
	ldp	A_q, B_q, [src, 16]
	str	D_q, [dstin]
	ldp	C_q, D_q, [src, 48]
	subs	count, count, 128 + 16	/* Test and readjust count. */
	b.ls	L(copy64_from_end)
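	/* Software-pipelined loop: each iteration stores the 64 bytes loaded
	   by the previous iteration while loading the next 64 bytes, keeping
	   the loads well ahead of the stores. */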
L(loop64):
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [src, 80]
	stp	C_q, D_q, [dst, 48]
	ldp	C_q, D_q, [src, 112]
	add	src, src, 64
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end. */
L(copy64_from_end):
	ldp	E_q, F_q, [srcend, -64]
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [srcend, -32]
	stp	C_q, D_q, [dst, 48]
	stp	E_q, F_q, [dstend, -64]
	stp	A_q, B_q, [dstend, -32]
	ret

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align srcend to 16-byte alignment. */
L(copy_long_backwards):
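	/* tmp1 is dst - src; if it is zero, src and dst are identical and
	   there is nothing left to copy. */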
	cbz	tmp1, L(copy0)
	ldr	D_q, [srcend, -16]
	and	tmp1, srcend, 15
	bic	srcend, srcend, 15
	sub	count, count, tmp1
	ldp	A_q, B_q, [srcend, -32]
	str	D_q, [dstend, -16]
	ldp	C_q, D_q, [srcend, -64]
	sub	dstend, dstend, tmp1
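	/* srcend was rounded down, so move dstend down by the same amount to
	   keep srcend - x and dstend - x addressing corresponding bytes.
	   The last 16 bytes are copied separately via D_q. */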
	subs	count, count, 128
	b.ls	L(copy64_from_start)

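	/* Pipelined backwards loop, mirroring L(loop64): store the 64 bytes
	   loaded in the previous iteration while loading the next 64 bytes,
	   working down from the end of the buffer. */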
L(loop64_backwards):
	stp	A_q, B_q, [dstend, -32]
	ldp	A_q, B_q, [srcend, -96]
	stp	C_q, D_q, [dstend, -64]
	ldp	C_q, D_q, [srcend, -128]
	sub	srcend, srcend, 64
	sub	dstend, dstend, 64
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start. */
L(copy64_from_start):
	ldp	E_q, F_q, [src, 32]
	stp	A_q, B_q, [dstend, -32]
	ldp	A_q, B_q, [src]
	stp	C_q, D_q, [dstend, -64]
	stp	E_q, F_q, [dstin, 32]
	stp	A_q, B_q, [dstin]
	ret

END (__memcpy_aarch64_simd)
