/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2012-2021, Arm Limited.
 *
 * Adapted from the original at:
 * https://github.com/ARM-software/optimized-routines/blob/afd6244a1f8d9229/string/aarch64/memcpy.S
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#define L(label) .L ## label

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_lw	w10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	x14
#define E_h	x15
#define F_l	x16
#define F_h	x17
#define G_l	count
#define G_h	dst
#define H_l	src
#define H_h	srcend
#define tmp1	x14

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point. It uses unaligned accesses and branchless
   sequences to keep the code small and simple and to improve performance.

   Copies are split into three main cases: small copies of up to 32 bytes,
   medium copies of up to 128 bytes, and large copies. The overhead of the
   overlap check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per
   iteration. The destination pointer is 16-byte aligned to minimize
   unaligned accesses. The loop tail is handled by always copying 64 bytes
   from the end.
*/
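
/*
 * Rough C-level outline of the size dispatch below, for illustration only.
 * The copy_small/copy_medium/copy_long helpers are hypothetical names for
 * the 0..32 byte, L(copy32_128) and L(copy_long) paths; the assembly falls
 * through between cases and keeps everything in registers instead:
 *
 *	void *memcpy(void *dstin, const void *src, size_t count)
 *	{
 *		if (count > 128)
 *			copy_long(dstin, src, count);
 *		else if (count > 32)
 *			copy_medium(dstin, src, count);
 *		else
 *			copy_small(dstin, src, count);
 *		return dstin;
 *	}
 */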

SYM_FUNC_START(__pi_memcpy)
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small copies: 0..32 bytes. */
	cmp	count, 16
	b.lo	L(copy16)
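	/*
	 * 16..32 bytes: copy the first and the last 16 bytes; for counts
	 * below 32 the two stores overlap in the middle, which is harmless
	 * since the overlapping bytes are identical.
	 */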
	ldp	A_l, A_h, [src]
	ldp	D_l, D_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	/* Copy 8-15 bytes. */
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	.p2align 3
	/* Copy 4-7 bytes. */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence. */
L(copy4):
	cbz	count, L(copy0)
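	/*
	 * 1..3 bytes: tmp1 = count >> 1 indexes the "middle" byte. For
	 * count 1 all three accesses hit byte 0, for count 2 they hit
	 * bytes 0/1/1 and for count 3 bytes 0/1/2, so every byte is
	 * covered without a branch on the exact count.
	 */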
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes. */
L(copy32_128):
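	/*
	 * Load the first 32 and the last 32 bytes up front; for 33..64
	 * bytes the four stores below cover the buffer with harmless
	 * overlap, otherwise fall through to L(copy128).
	 */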
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	ldp	D_l, D_h, [srcend, -16]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy 65..128 bytes. */
L(copy128):
	ldp	E_l, E_h, [src, 32]
	ldp	F_l, F_h, [src, 48]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_l, G_h, [srcend, -64]
	ldp	H_l, H_h, [srcend, -48]
	stp	G_l, G_h, [dstend, -64]
	stp	H_l, H_h, [dstend, -48]
L(copy96):
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	E_l, E_h, [dstin, 32]
	stp	F_l, F_h, [dstin, 48]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy more than 128 bytes. */
L(copy_long):
	/* Use backwards copy if there is an overlap. */
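	/*
	 * tmp1 = dstin - src; an unsigned compare against count catches
	 * dst inside (src, src + count), where a forward copy would
	 * overwrite source bytes before they have been read.
	 */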
	sub	tmp1, dstin, src
	cbz	tmp1, L(copy0)
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align dst to 16-byte alignment. */

	ldp	D_l, D_h, [src]
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large. */
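	/*
	 * dst is rounded down to 16 bytes and src rewound by the same
	 * amount so both advance in lockstep; the unaligned head is
	 * covered by the D_l/D_h copy from the original src below.
	 */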
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count. */
	b.ls	L(copy64_from_end)

L(loop64):
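	/*
	 * Software pipelined: each iteration stores the 64 bytes loaded on
	 * the previous iteration while loading the next 64, keeping the
	 * loads one iteration ahead of the stores.
	 */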
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end. */
L(copy64_from_end):
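	/*
	 * Store the 64 bytes already loaded, then copy the final 64 bytes
	 * from srcend/dstend; these stores may overlap bytes the loop has
	 * already written, which avoids a separate tail loop.
	 */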
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.p2align 4

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align dst to 16-byte alignment. */
L(copy_long_backwards):
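	/*
	 * Mirror image of the forward path: align dstend down to 16 bytes
	 * and walk both pointers backwards so source bytes are read before
	 * the overlapping destination stores can clobber them.
	 */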
	ldp	D_l, D_h, [srcend, -16]
	and	tmp1, dstend, 15
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

L(loop64_backwards):
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start. */
L(copy64_from_start):
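	/*
	 * Store the 64 bytes already loaded, then copy the first 64 bytes
	 * from src/dstin; as in the forward path, these stores may overlap
	 * bytes the loop has already written.
	 */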
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
	ret
SYM_FUNC_END(__pi_memcpy)

SYM_FUNC_ALIAS(__memcpy, __pi_memcpy)
EXPORT_SYMBOL(__memcpy)
SYM_FUNC_ALIAS_WEAK(memcpy, __memcpy)
EXPORT_SYMBOL(memcpy)

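/*
 * The overlap check in L(copy_long) makes the routine safe for overlapping
 * buffers, so memmove can simply alias memcpy.
 */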
SYM_FUNC_ALIAS(__pi_memmove, __pi_memcpy)

SYM_FUNC_ALIAS(__memmove, __pi_memmove)
EXPORT_SYMBOL(__memmove)
SYM_FUNC_ALIAS_WEAK(memmove, __memmove)
EXPORT_SYMBOL(memmove)