/*
 * memcpy - copy memory area
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

/*
   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using Neon)
    ARM state
    Unaligned accesses

 */

#include "../asmdefs.h"

        .syntax unified
        /* This implementation requires ARM state.  */
        .arm

#ifdef __ARM_NEON__

        .fpu neon
        .arch armv7-a
# define FRAME_SIZE 4
# define USE_VFP
# define USE_NEON

#elif !defined (__SOFTFP__)

        .arch armv6
        .fpu vfpv2
# define FRAME_SIZE 32
# define USE_VFP

#else
        .arch armv6
# define FRAME_SIZE 32

#endif

/* Old versions of GAS incorrectly implement the NEON align semantics.  */
#ifdef BROKEN_ASM_NEON_ALIGN
#define ALIGN(addr, align) addr,:align
#else
#define ALIGN(addr, align) addr:align
#endif

#define PC_OFFSET 8 /* PC pipeline compensation.  */
#define INSN_SIZE 4
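/* PC_OFFSET and INSN_SIZE feed the computed jumps below: in ARM state a
   read of PC returns the address of the current instruction plus 8, so
   the value added to PC is the byte count to skip plus
   INSN_SIZE - PC_OFFSET, making a zero skip land on the instruction
   immediately after the ADD.  */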

/* Call parameters.  */
#define dstin r0
#define src r1
#define count r2

/* Locals.  */
#define tmp1 r3
#define dst ip
#define tmp2 r10

#ifndef USE_NEON
/* For bulk copies using GP registers.  */
#define A_l r2 /* Call-clobbered.  */
#define A_h r3 /* Call-clobbered.  */
#define B_l r4
#define B_h r5
#define C_l r6
#define C_h r7
#define D_l r8
#define D_h r9
#endif

/* Number of lines ahead to pre-fetch data.  If you change this, the code
   below will need adjustment to compensate.  */

#define prefetch_lines 5
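/* The VFP long-copy loop below moves prefetch_lines * 64 bytes per
   iteration and keeps one doubleword from each of the next prefetch_lines
   64-byte blocks live in d3-d7, so changing this value also means changing
   the number of lookahead registers and the cpy_line_vfp offsets.  */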

#ifdef USE_VFP
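/* Copy one 64-byte block from src to dst.  d0-d2 roll through the block:
   each vstr writes data fetched by the matching vldr on an earlier round,
   so loads always run ahead of stores.  \vreg enters holding the first
   doubleword of the block (fetched a whole prefetch block early) and is
   refilled from prefetch_lines * 64 - 32 bytes past \base, acting as the
   prefetch for a later block.  cpy_tail_vfp is identical except that it
   omits that far lookahead load.  */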
        .macro cpy_line_vfp vreg, base
        vstr \vreg, [dst, #\base]
        vldr \vreg, [src, #\base]
        vstr d0, [dst, #\base + 8]
        vldr d0, [src, #\base + 8]
        vstr d1, [dst, #\base + 16]
        vldr d1, [src, #\base + 16]
        vstr d2, [dst, #\base + 24]
        vldr d2, [src, #\base + 24]
        vstr \vreg, [dst, #\base + 32]
        vldr \vreg, [src, #\base + prefetch_lines * 64 - 32]
        vstr d0, [dst, #\base + 40]
        vldr d0, [src, #\base + 40]
        vstr d1, [dst, #\base + 48]
        vldr d1, [src, #\base + 48]
        vstr d2, [dst, #\base + 56]
        vldr d2, [src, #\base + 56]
        .endm

        .macro cpy_tail_vfp vreg, base
        vstr \vreg, [dst, #\base]
        vldr \vreg, [src, #\base]
        vstr d0, [dst, #\base + 8]
        vldr d0, [src, #\base + 8]
        vstr d1, [dst, #\base + 16]
        vldr d1, [src, #\base + 16]
        vstr d2, [dst, #\base + 24]
        vldr d2, [src, #\base + 24]
        vstr \vreg, [dst, #\base + 32]
        vstr d0, [dst, #\base + 40]
        vldr d0, [src, #\base + 40]
        vstr d1, [dst, #\base + 48]
        vldr d1, [src, #\base + 48]
        vstr d2, [dst, #\base + 56]
        vldr d2, [src, #\base + 56]
        .endm
#endif

ENTRY (__memcpy_arm)

        mov dst, dstin /* Preserve dstin, we need to return it.  */
        cmp count, #64
        bge L(cpy_not_short)
        /* Deal with small copies quickly by dropping straight into the
           exit block.  */

L(tail63unaligned):
#ifdef USE_NEON
        and tmp1, count, #0x38
        rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add pc, pc, tmp1
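        /* Computed jump into the table below.  Reading PC here yields the
           address of this ADD plus PC_OFFSET (8); each table entry (a
           vld1/vst1 pair) is 8 bytes, so the bias applied to tmp1 above
           skips exactly the entries that are not needed for the
           (count & 0x38) bytes still to be moved.  */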
        vld1.8 {d0}, [src]!  /* 14 words to go.  */
        vst1.8 {d0}, [dst]!
        vld1.8 {d0}, [src]!  /* 12 words to go.  */
        vst1.8 {d0}, [dst]!
        vld1.8 {d0}, [src]!  /* 10 words to go.  */
        vst1.8 {d0}, [dst]!
        vld1.8 {d0}, [src]!  /* 8 words to go.  */
        vst1.8 {d0}, [dst]!
        vld1.8 {d0}, [src]!  /* 6 words to go.  */
        vst1.8 {d0}, [dst]!
        vld1.8 {d0}, [src]!  /* 4 words to go.  */
        vst1.8 {d0}, [dst]!
        vld1.8 {d0}, [src]!  /* 2 words to go.  */
        vst1.8 {d0}, [dst]!

        tst count, #4
        ldrne tmp1, [src], #4
        strne tmp1, [dst], #4
#else
        /* Copy up to 15 full words of data.  May not be aligned.  */
        /* Cannot use VFP for unaligned data.  */
        and tmp1, count, #0x3c
        add dst, dst, tmp1
        add src, src, tmp1
        rsb tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
        /* Jump directly into the sequence below at the correct offset.  */
        add pc, pc, tmp1, lsl #1
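        /* As in the NEON case, but each table entry here is an LDR/STR
           pair (8 bytes of code) per word rather than per doubleword, so
           the offset derived from the word count is doubled with LSL #1.
           src and dst were advanced above, hence the negative offsets.  */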

        ldr tmp1, [src, #-60] /* 15 words to go.  */
        str tmp1, [dst, #-60]

        ldr tmp1, [src, #-56] /* 14 words to go.  */
        str tmp1, [dst, #-56]
        ldr tmp1, [src, #-52]
        str tmp1, [dst, #-52]

        ldr tmp1, [src, #-48] /* 12 words to go.  */
        str tmp1, [dst, #-48]
        ldr tmp1, [src, #-44]
        str tmp1, [dst, #-44]

        ldr tmp1, [src, #-40] /* 10 words to go.  */
        str tmp1, [dst, #-40]
        ldr tmp1, [src, #-36]
        str tmp1, [dst, #-36]

        ldr tmp1, [src, #-32] /* 8 words to go.  */
        str tmp1, [dst, #-32]
        ldr tmp1, [src, #-28]
        str tmp1, [dst, #-28]

        ldr tmp1, [src, #-24] /* 6 words to go.  */
        str tmp1, [dst, #-24]
        ldr tmp1, [src, #-20]
        str tmp1, [dst, #-20]

        ldr tmp1, [src, #-16] /* 4 words to go.  */
        str tmp1, [dst, #-16]
        ldr tmp1, [src, #-12]
        str tmp1, [dst, #-12]

        ldr tmp1, [src, #-8] /* 2 words to go.  */
        str tmp1, [dst, #-8]
        ldr tmp1, [src, #-4]
        str tmp1, [dst, #-4]
#endif

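        /* At most 3 bytes remain.  Shifting count left by 31 moves bit 1
           into the carry flag and leaves a non-zero result iff bit 0 was
           set, so the CS/NE conditional accesses below copy the final
           halfword and/or byte.  */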
        lsls count, count, #31
        ldrhcs tmp1, [src], #2
        ldrbne src, [src] /* Src is dead, use as a scratch.  */
        strhcs tmp1, [dst], #2
        strbne src, [dst]
        bx lr

L(cpy_not_short):
        /* At least 64 bytes to copy, but don't know the alignment yet.  */
        str tmp2, [sp, #-FRAME_SIZE]!
        and tmp2, src, #7
        and tmp1, dst, #7
        cmp tmp1, tmp2
        bne L(cpy_notaligned)

#ifdef USE_VFP
        /* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
           that the FP pipeline is much better at streaming loads and
           stores.  This is outside the critical loop.  */
        vmov.f32 s0, s0
#endif

        /* SRC and DST have the same mutual 64-bit alignment, but we may
           still need to pre-copy some bytes to get to natural alignment.
           We bring SRC and DST into full 64-bit alignment.  */
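        /* The pre-copy length (0-7 bytes) comes from the low bits of DST:
           LSLS #29 moves DST's bottom three bits to the top of tmp2, RSBS
           negates them to give the distance to the next 8-byte boundary,
           and the flags from RSBS and the following LSLS #2 select the
           conditional word, halfword and byte copies.  */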
        lsls tmp2, dst, #29
        beq 1f
        rsbs tmp2, tmp2, #0
        sub count, count, tmp2, lsr #29
        ldrmi tmp1, [src], #4
        strmi tmp1, [dst], #4
        lsls tmp2, tmp2, #2
        ldrhcs tmp1, [src], #2
        ldrbne tmp2, [src], #1
        strhcs tmp1, [dst], #2
        strbne tmp2, [dst], #1

1:
        subs tmp2, count, #64 /* Use tmp2 for count.  */
        blt L(tail63aligned)

        cmp tmp2, #512
        bge L(cpy_body_long)

L(cpy_body_medium): /* Count in tmp2.  */
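        /* Aligned copy of a moderate amount of data: move 64 bytes per
           iteration, then fall into the aligned tail for whatever is
           left.  */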
#ifdef USE_VFP
1:
        vldr d0, [src, #0]
        subs tmp2, tmp2, #64
        vldr d1, [src, #8]
        vstr d0, [dst, #0]
        vldr d0, [src, #16]
        vstr d1, [dst, #8]
        vldr d1, [src, #24]
        vstr d0, [dst, #16]
        vldr d0, [src, #32]
        vstr d1, [dst, #24]
        vldr d1, [src, #40]
        vstr d0, [dst, #32]
        vldr d0, [src, #48]
        vstr d1, [dst, #40]
        vldr d1, [src, #56]
        vstr d0, [dst, #48]
        add src, src, #64
        vstr d1, [dst, #56]
        add dst, dst, #64
        bge 1b
        tst tmp2, #0x3f
        beq L(done)

L(tail63aligned): /* Count in tmp2.  */
        and tmp1, tmp2, #0x38
        add dst, dst, tmp1
        add src, src, tmp1
        rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add pc, pc, tmp1
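        /* Same computed-jump scheme as L(tail63unaligned): one vldr/vstr
           pair (8 bytes of code) per remaining doubleword, with src and
           dst already advanced so the entries use negative offsets.  */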

        vldr d0, [src, #-56] /* 14 words to go.  */
        vstr d0, [dst, #-56]
        vldr d0, [src, #-48] /* 12 words to go.  */
        vstr d0, [dst, #-48]
        vldr d0, [src, #-40] /* 10 words to go.  */
        vstr d0, [dst, #-40]
        vldr d0, [src, #-32] /* 8 words to go.  */
        vstr d0, [dst, #-32]
        vldr d0, [src, #-24] /* 6 words to go.  */
        vstr d0, [dst, #-24]
        vldr d0, [src, #-16] /* 4 words to go.  */
        vstr d0, [dst, #-16]
        vldr d0, [src, #-8] /* 2 words to go.  */
        vstr d0, [dst, #-8]
#else
        sub src, src, #8
        sub dst, dst, #8
1:
        ldrd A_l, A_h, [src, #8]
        strd A_l, A_h, [dst, #8]
        ldrd A_l, A_h, [src, #16]
        strd A_l, A_h, [dst, #16]
        ldrd A_l, A_h, [src, #24]
        strd A_l, A_h, [dst, #24]
        ldrd A_l, A_h, [src, #32]
        strd A_l, A_h, [dst, #32]
        ldrd A_l, A_h, [src, #40]
        strd A_l, A_h, [dst, #40]
        ldrd A_l, A_h, [src, #48]
        strd A_l, A_h, [dst, #48]
        ldrd A_l, A_h, [src, #56]
        strd A_l, A_h, [dst, #56]
        ldrd A_l, A_h, [src, #64]!
        strd A_l, A_h, [dst, #64]!
        subs tmp2, tmp2, #64
        bge 1b
        tst tmp2, #0x3f
        bne 1f
        ldr tmp2, [sp], #FRAME_SIZE
        bx lr
1:
        add src, src, #8
        add dst, dst, #8

L(tail63aligned): /* Count in tmp2.  */
        /* Copy up to 7 d-words of data.  Similar to L(tail63unaligned),
           but we know that the src and dest are 64-bit aligned so we can
           use LDRD/STRD to improve efficiency.  */
        /* TMP2 is now negative, but we don't care about that.  The bottom
           six bits still tell us how many bytes are left to copy.  */

        and tmp1, tmp2, #0x38
        add dst, dst, tmp1
        add src, src, tmp1
        rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add pc, pc, tmp1
        ldrd A_l, A_h, [src, #-56] /* 14 words to go.  */
        strd A_l, A_h, [dst, #-56]
        ldrd A_l, A_h, [src, #-48] /* 12 words to go.  */
        strd A_l, A_h, [dst, #-48]
        ldrd A_l, A_h, [src, #-40] /* 10 words to go.  */
        strd A_l, A_h, [dst, #-40]
        ldrd A_l, A_h, [src, #-32] /* 8 words to go.  */
        strd A_l, A_h, [dst, #-32]
        ldrd A_l, A_h, [src, #-24] /* 6 words to go.  */
        strd A_l, A_h, [dst, #-24]
        ldrd A_l, A_h, [src, #-16] /* 4 words to go.  */
        strd A_l, A_h, [dst, #-16]
        ldrd A_l, A_h, [src, #-8] /* 2 words to go.  */
        strd A_l, A_h, [dst, #-8]

#endif
        tst tmp2, #4
        ldrne tmp1, [src], #4
        strne tmp1, [dst], #4
        lsls tmp2, tmp2, #31 /* Count (tmp2) now dead.  */
        ldrhcs tmp1, [src], #2
        ldrbne tmp2, [src]
        strhcs tmp1, [dst], #2
        strbne tmp2, [dst]

L(done):
        ldr tmp2, [sp], #FRAME_SIZE
        bx lr

L(cpy_body_long): /* Count in tmp2.  */

        /* Long copy.  We know that there's at least (prefetch_lines * 64)
           bytes to go.  */
#ifdef USE_VFP
        /* Don't use PLD.  Instead, read some data in advance of the current
           copy position into a register.  This should act like a PLD
           operation but we won't have to repeat the transfer.  */
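        /* d3-d7 each hold the first doubleword of one of the next five
           (prefetch_lines) 64-byte blocks; d0-d2 hold the next three
           doublewords of the first block.  Each cpy_line_vfp drains one
           of these registers and refills it from further ahead.  */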

        vldr d3, [src, #0]
        vldr d4, [src, #64]
        vldr d5, [src, #128]
        vldr d6, [src, #192]
        vldr d7, [src, #256]

        vldr d0, [src, #8]
        vldr d1, [src, #16]
        vldr d2, [src, #24]
        add src, src, #32

        subs tmp2, tmp2, #prefetch_lines * 64 * 2
        blt 2f
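        /* tmp2 is biased down by two whole blocks (2 * prefetch_lines * 64):
           the loop below always reads about one block ahead of what it
           stores, and the drain sequence at 2: consumes the other; that
           second block is added back before falling into
           L(cpy_body_medium).  */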
1:
        cpy_line_vfp d3, 0
        cpy_line_vfp d4, 64
        cpy_line_vfp d5, 128
        add dst, dst, #3 * 64
        add src, src, #3 * 64
        cpy_line_vfp d6, 0
        cpy_line_vfp d7, 64
        add dst, dst, #2 * 64
        add src, src, #2 * 64
        subs tmp2, tmp2, #prefetch_lines * 64
        bge 1b

2:
        cpy_tail_vfp d3, 0
        cpy_tail_vfp d4, 64
        cpy_tail_vfp d5, 128
        add src, src, #3 * 64
        add dst, dst, #3 * 64
        cpy_tail_vfp d6, 0
        vstr d7, [dst, #64]
        vldr d7, [src, #64]
        vstr d0, [dst, #64 + 8]
        vldr d0, [src, #64 + 8]
        vstr d1, [dst, #64 + 16]
        vldr d1, [src, #64 + 16]
        vstr d2, [dst, #64 + 24]
        vldr d2, [src, #64 + 24]
        vstr d7, [dst, #64 + 32]
        add src, src, #96
        vstr d0, [dst, #64 + 40]
        vstr d1, [dst, #64 + 48]
        vstr d2, [dst, #64 + 56]
        add dst, dst, #128
        add tmp2, tmp2, #prefetch_lines * 64
        b L(cpy_body_medium)
#else
        /* Long copy.  Use an SMS style loop to maximize the I/O
           bandwidth of the core.  We don't have enough spare registers
           to synthesise prefetching, so use PLD operations.  */
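        /* A-D always hold the eight words loaded on the previous
           iteration, so every store writes data that is already resident
           and the loads run one full 64-byte iteration ahead of the
           stores.  B, C and D live in callee-saved registers (r4-r9),
           hence the spills to the FRAME_SIZE stack frame around the
           loop.  */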
        /* Pre-bias src and dst.  */
        sub src, src, #8
        sub dst, dst, #8
        pld [src, #8]
        pld [src, #72]
        subs tmp2, tmp2, #64
        pld [src, #136]
        ldrd A_l, A_h, [src, #8]
        strd B_l, B_h, [sp, #8]
        ldrd B_l, B_h, [src, #16]
        strd C_l, C_h, [sp, #16]
        ldrd C_l, C_h, [src, #24]
        strd D_l, D_h, [sp, #24]
        pld [src, #200]
        ldrd D_l, D_h, [src, #32]!
        b 1f
        .p2align 6
2:
        pld [src, #232]
        strd A_l, A_h, [dst, #40]
        ldrd A_l, A_h, [src, #40]
        strd B_l, B_h, [dst, #48]
        ldrd B_l, B_h, [src, #48]
        strd C_l, C_h, [dst, #56]
        ldrd C_l, C_h, [src, #56]
        strd D_l, D_h, [dst, #64]!
        ldrd D_l, D_h, [src, #64]!
        subs tmp2, tmp2, #64
1:
        strd A_l, A_h, [dst, #8]
        ldrd A_l, A_h, [src, #8]
        strd B_l, B_h, [dst, #16]
        ldrd B_l, B_h, [src, #16]
        strd C_l, C_h, [dst, #24]
        ldrd C_l, C_h, [src, #24]
        strd D_l, D_h, [dst, #32]
        ldrd D_l, D_h, [src, #32]
        bcs 2b
        /* Save the remaining bytes and restore the callee-saved regs.  */
        strd A_l, A_h, [dst, #40]
        add src, src, #40
        strd B_l, B_h, [dst, #48]
        ldrd B_l, B_h, [sp, #8]
        strd C_l, C_h, [dst, #56]
        ldrd C_l, C_h, [sp, #16]
        strd D_l, D_h, [dst, #64]
        ldrd D_l, D_h, [sp, #24]
        add dst, dst, #72
        tst tmp2, #0x3f
        bne L(tail63aligned)
        ldr tmp2, [sp], #FRAME_SIZE
        bx lr
#endif

L(cpy_notaligned):
        pld [src]
        pld [src, #64]
        /* There's at least 64 bytes to copy, but there is no mutual
           alignment.  */
        /* Bring DST to 64-bit alignment.  */
        lsls tmp2, dst, #29
        pld [src, #(2 * 64)]
        beq 1f
        rsbs tmp2, tmp2, #0
        sub count, count, tmp2, lsr #29
        ldrmi tmp1, [src], #4
        strmi tmp1, [dst], #4
        lsls tmp2, tmp2, #2
        ldrbne tmp1, [src], #1
        ldrhcs tmp2, [src], #2
        strbne tmp1, [dst], #1
        strhcs tmp2, [dst], #2
1:
        pld [src, #(3 * 64)]
        subs count, count, #64
        ldrmi tmp2, [sp], #FRAME_SIZE
        bmi L(tail63unaligned)
        pld [src, #(4 * 64)]

#ifdef USE_NEON
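        /* The source may have any alignment: VLD1.8 tolerates unaligned
           addresses, while the stores carry a :64 (64-bit, i.e. 8-byte)
           alignment hint on DST, which was aligned above.  The register
           groups d0-d3 and d4-d7 are double-buffered so the loads of one
           group overlap the stores of the other.  */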
        vld1.8 {d0-d3}, [src]!
        vld1.8 {d4-d7}, [src]!
        subs count, count, #64
        bmi 2f
1:
        pld [src, #(4 * 64)]
        vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
        vld1.8 {d0-d3}, [src]!
        vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
        vld1.8 {d4-d7}, [src]!
        subs count, count, #64
        bpl 1b
2:
        vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
        vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
        ands count, count, #0x3f
#else
        /* Use an SMS style loop to maximize the I/O bandwidth.  */
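        /* DST is now 64-bit aligned, so STRD can be used for the stores,
           but SRC is not, so the loads are single LDRs (covered by the
           unaligned-access assumption above).  Otherwise this mirrors the
           aligned SMS loop: A-D carry the previous iteration's data and
           B-D are spilled to the stack frame.  */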
        sub src, src, #4
        sub dst, dst, #8
        subs tmp2, count, #64 /* Use tmp2 for count.  */
        ldr A_l, [src, #4]
        ldr A_h, [src, #8]
        strd B_l, B_h, [sp, #8]
        ldr B_l, [src, #12]
        ldr B_h, [src, #16]
        strd C_l, C_h, [sp, #16]
        ldr C_l, [src, #20]
        ldr C_h, [src, #24]
        strd D_l, D_h, [sp, #24]
        ldr D_l, [src, #28]
        ldr D_h, [src, #32]!
        b 1f
        .p2align 6
2:
        pld [src, #(5 * 64) - (32 - 4)]
        strd A_l, A_h, [dst, #40]
        ldr A_l, [src, #36]
        ldr A_h, [src, #40]
        strd B_l, B_h, [dst, #48]
        ldr B_l, [src, #44]
        ldr B_h, [src, #48]
        strd C_l, C_h, [dst, #56]
        ldr C_l, [src, #52]
        ldr C_h, [src, #56]
        strd D_l, D_h, [dst, #64]!
        ldr D_l, [src, #60]
        ldr D_h, [src, #64]!
        subs tmp2, tmp2, #64
1:
        strd A_l, A_h, [dst, #8]
        ldr A_l, [src, #4]
        ldr A_h, [src, #8]
        strd B_l, B_h, [dst, #16]
        ldr B_l, [src, #12]
        ldr B_h, [src, #16]
        strd C_l, C_h, [dst, #24]
        ldr C_l, [src, #20]
        ldr C_h, [src, #24]
        strd D_l, D_h, [dst, #32]
        ldr D_l, [src, #28]
        ldr D_h, [src, #32]
        bcs 2b

        /* Save the remaining bytes and restore the callee-saved regs.  */
        strd A_l, A_h, [dst, #40]
        add src, src, #36
        strd B_l, B_h, [dst, #48]
        ldrd B_l, B_h, [sp, #8]
        strd C_l, C_h, [dst, #56]
        ldrd C_l, C_h, [sp, #16]
        strd D_l, D_h, [dst, #64]
        ldrd D_l, D_h, [sp, #24]
        add dst, dst, #72
        ands count, tmp2, #0x3f
#endif
        ldr tmp2, [sp], #FRAME_SIZE
        bne L(tail63unaligned)
        bx lr

END (__memcpy_arm)