/*
 * memcpy - copy memory area
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

/*
   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using Neon)
    ARM state
    Unaligned accesses

 */

#include "../asmdefs.h"

        .syntax unified
        /* This implementation requires ARM state.  */
        .arm

#ifdef __ARM_NEON__

        .fpu    neon
        .arch   armv7-a
# define FRAME_SIZE 4
# define USE_VFP
# define USE_NEON

#elif !defined (__SOFTFP__)

        .arch   armv6
        .fpu    vfpv2
# define FRAME_SIZE 32
# define USE_VFP

#else
        .arch   armv6
# define FRAME_SIZE 32

#endif

/* Old versions of GAS incorrectly implement the NEON align semantics.  */
#ifdef BROKEN_ASM_NEON_ALIGN
#define ALIGN(addr, align) addr,:align
#else
#define ALIGN(addr, align) addr:align
#endif

#define PC_OFFSET 8     /* PC pipeline compensation.  */
#define INSN_SIZE 4
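
/* Note: several copy sequences below jump part-way into an unrolled run of
   load/store instructions with a computed "add pc, pc, ...".  In ARM state
   a read of PC yields the address of the current instruction plus 8
   (PC_OFFSET), and each instruction is INSN_SIZE bytes; the offset
   calculations at those jumps compensate for both.  */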

/* Call parameters.  */
#define dstin   r0
#define src     r1
#define count   r2

/* Locals.  */
#define tmp1    r3
#define dst     ip
#define tmp2    r10

#ifndef USE_NEON
/* For bulk copies using GP registers.  */
#define A_l     r2      /* Call-clobbered.  */
#define A_h     r3      /* Call-clobbered.  */
#define B_l     r4
#define B_h     r5
#define C_l     r6
#define C_h     r7
#define D_l     r8
#define D_h     r9
#endif

/* Number of lines ahead to pre-fetch data.  If you change this the code
   below will need adjustment to compensate.  */

#define prefetch_lines  5
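
/* With prefetch_lines == 5 the main loop in L(cpy_body_long) copies
   5 * 64 == 320 bytes per iteration and, on the VFP path, each
   cpy_line_vfp refills its register from the line 320 bytes ahead of
   the one being copied, acting as a software prefetch.  */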

#ifdef USE_VFP
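        /* Copy one 64-byte line.  \vreg holds the first doubleword of this
           line (loaded prefetch_lines invocations earlier) and is refilled
           from the line prefetch_lines * 64 bytes ahead; d0-d2 carry the
           doublewords at offsets 8-24 of the following line between
           invocations.  */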
        .macro cpy_line_vfp vreg, base
        vstr \vreg, [dst, #\base]
        vldr \vreg, [src, #\base]
        vstr d0, [dst, #\base + 8]
        vldr d0, [src, #\base + 8]
        vstr d1, [dst, #\base + 16]
        vldr d1, [src, #\base + 16]
        vstr d2, [dst, #\base + 24]
        vldr d2, [src, #\base + 24]
        vstr \vreg, [dst, #\base + 32]
        vldr \vreg, [src, #\base + prefetch_lines * 64 - 32]
        vstr d0, [dst, #\base + 40]
        vldr d0, [src, #\base + 40]
        vstr d1, [dst, #\base + 48]
        vldr d1, [src, #\base + 48]
        vstr d2, [dst, #\base + 56]
        vldr d2, [src, #\base + 56]
        .endm

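        /* As cpy_line_vfp, but without the read-ahead load of \vreg from
           prefetch_lines * 64 bytes ahead.  */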
        .macro cpy_tail_vfp vreg, base
        vstr \vreg, [dst, #\base]
        vldr \vreg, [src, #\base]
        vstr d0, [dst, #\base + 8]
        vldr d0, [src, #\base + 8]
        vstr d1, [dst, #\base + 16]
        vldr d1, [src, #\base + 16]
        vstr d2, [dst, #\base + 24]
        vldr d2, [src, #\base + 24]
        vstr \vreg, [dst, #\base + 32]
        vstr d0, [dst, #\base + 40]
        vldr d0, [src, #\base + 40]
        vstr d1, [dst, #\base + 48]
        vldr d1, [src, #\base + 48]
        vstr d2, [dst, #\base + 56]
        vldr d2, [src, #\base + 56]
        .endm
#endif

ENTRY (__memcpy_arm)

        mov dst, dstin  /* Preserve dstin, we need to return it.  */
        cmp count, #64
        bge L(cpy_not_short)
        /* Deal with small copies quickly by dropping straight into the
           exit block.  */

L(tail63unaligned):
#ifdef USE_NEON
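        /* count & 0x38 is the number of bytes (0-56) that can be copied in
           whole doublewords.  The RSB/ADD PC below jumps into the unrolled
           sequence so that exactly that many load/store pairs execute;
           each pair is 2 * INSN_SIZE bytes of code.  */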
        and tmp1, count, #0x38
        rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add pc, pc, tmp1
        vld1.8 {d0}, [src]!     /* 14 words to go.  */
        vst1.8 {d0}, [dst]!
        vld1.8 {d0}, [src]!     /* 12 words to go.  */
        vst1.8 {d0}, [dst]!
        vld1.8 {d0}, [src]!     /* 10 words to go.  */
        vst1.8 {d0}, [dst]!
        vld1.8 {d0}, [src]!     /* 8 words to go.  */
        vst1.8 {d0}, [dst]!
        vld1.8 {d0}, [src]!     /* 6 words to go.  */
        vst1.8 {d0}, [dst]!
        vld1.8 {d0}, [src]!     /* 4 words to go.  */
        vst1.8 {d0}, [dst]!
        vld1.8 {d0}, [src]!     /* 2 words to go.  */
        vst1.8 {d0}, [dst]!

        tst count, #4
        ldrne tmp1, [src], #4
        strne tmp1, [dst], #4
#else
        /* Copy up to 15 full words of data.  May not be aligned.  */
        /* Cannot use VFP for unaligned data.  */
        and tmp1, count, #0x3c
        add dst, dst, tmp1
        add src, src, tmp1
        rsb tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
        /* Jump directly into the sequence below at the correct offset.  */
        add pc, pc, tmp1, lsl #1
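        /* The LSL #1 above doubles the skip because each remaining word
           takes one LDR/STR pair, i.e. 2 * INSN_SIZE bytes of code.  */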

        ldr tmp1, [src, #-60]   /* 15 words to go.  */
        str tmp1, [dst, #-60]

        ldr tmp1, [src, #-56]   /* 14 words to go.  */
        str tmp1, [dst, #-56]
        ldr tmp1, [src, #-52]
        str tmp1, [dst, #-52]

        ldr tmp1, [src, #-48]   /* 12 words to go.  */
        str tmp1, [dst, #-48]
        ldr tmp1, [src, #-44]
        str tmp1, [dst, #-44]

        ldr tmp1, [src, #-40]   /* 10 words to go.  */
        str tmp1, [dst, #-40]
        ldr tmp1, [src, #-36]
        str tmp1, [dst, #-36]

        ldr tmp1, [src, #-32]   /* 8 words to go.  */
        str tmp1, [dst, #-32]
        ldr tmp1, [src, #-28]
        str tmp1, [dst, #-28]

        ldr tmp1, [src, #-24]   /* 6 words to go.  */
        str tmp1, [dst, #-24]
        ldr tmp1, [src, #-20]
        str tmp1, [dst, #-20]

        ldr tmp1, [src, #-16]   /* 4 words to go.  */
        str tmp1, [dst, #-16]
        ldr tmp1, [src, #-12]
        str tmp1, [dst, #-12]

        ldr tmp1, [src, #-8]    /* 2 words to go.  */
        str tmp1, [dst, #-8]
        ldr tmp1, [src, #-4]
        str tmp1, [dst, #-4]
#endif

        lsls count, count, #31
        ldrhcs tmp1, [src], #2
        ldrbne src, [src]       /* Src is dead, use as a scratch.  */
        strhcs tmp1, [dst], #2
        strbne src, [dst]
        bx lr

L(cpy_not_short):
        /* At least 64 bytes to copy, but don't know the alignment yet.  */
        str tmp2, [sp, #-FRAME_SIZE]!
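        /* FRAME_SIZE is 4 when only tmp2 (r10) needs saving (NEON build);
           otherwise it is 32 so the bulk-copy loops can also spill the
           callee-saved registers r4-r9 into this frame.  */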
        and tmp2, src, #7
        and tmp1, dst, #7
        cmp tmp1, tmp2
        bne L(cpy_notaligned)

#ifdef USE_VFP
        /* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
           that the FP pipeline is much better at streaming loads and
           stores.  This is outside the critical loop.  */
        vmov.f32 s0, s0
#endif

        /* SRC and DST have the same mutual 64-bit alignment, but we may
           still need to pre-copy some bytes to get to natural alignment.
           We bring SRC and DST into full 64-bit alignment.  */
        lsls tmp2, dst, #29
        beq 1f
        rsbs tmp2, tmp2, #0
        sub count, count, tmp2, lsr #29
        ldrmi tmp1, [src], #4
        strmi tmp1, [dst], #4
        lsls tmp2, tmp2, #2
        ldrhcs tmp1, [src], #2
        ldrbne tmp2, [src], #1
        strhcs tmp1, [dst], #2
        strbne tmp2, [dst], #1
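        /* After the RSBS, tmp2 holds (8 - (dst & 7)) << 29, the byte count
           needed for alignment shifted to the top of the register: MI
           selects the word copy, and after the LSLS #2 CS and NE select
           the halfword and byte copies respectively.  */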

1:
        subs tmp2, count, #64   /* Use tmp2 for count.  */
        blt L(tail63aligned)

        cmp tmp2, #512
        bge L(cpy_body_long)

L(cpy_body_medium):             /* Count in tmp2.  */
#ifdef USE_VFP
1:
        vldr d0, [src, #0]
        subs tmp2, tmp2, #64
        vldr d1, [src, #8]
        vstr d0, [dst, #0]
        vldr d0, [src, #16]
        vstr d1, [dst, #8]
        vldr d1, [src, #24]
        vstr d0, [dst, #16]
        vldr d0, [src, #32]
        vstr d1, [dst, #24]
        vldr d1, [src, #40]
        vstr d0, [dst, #32]
        vldr d0, [src, #48]
        vstr d1, [dst, #40]
        vldr d1, [src, #56]
        vstr d0, [dst, #48]
        add src, src, #64
        vstr d1, [dst, #56]
        add dst, dst, #64
        bge 1b
        tst tmp2, #0x3f
        beq L(done)

L(tail63aligned):               /* Count in tmp2.  */
        and tmp1, tmp2, #0x38
        add dst, dst, tmp1
        add src, src, tmp1
        rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add pc, pc, tmp1

        vldr d0, [src, #-56]    /* 14 words to go.  */
        vstr d0, [dst, #-56]
        vldr d0, [src, #-48]    /* 12 words to go.  */
        vstr d0, [dst, #-48]
        vldr d0, [src, #-40]    /* 10 words to go.  */
        vstr d0, [dst, #-40]
        vldr d0, [src, #-32]    /* 8 words to go.  */
        vstr d0, [dst, #-32]
        vldr d0, [src, #-24]    /* 6 words to go.  */
        vstr d0, [dst, #-24]
        vldr d0, [src, #-16]    /* 4 words to go.  */
        vstr d0, [dst, #-16]
        vldr d0, [src, #-8]     /* 2 words to go.  */
        vstr d0, [dst, #-8]
#else
        sub src, src, #8
        sub dst, dst, #8
1:
        ldrd A_l, A_h, [src, #8]
        strd A_l, A_h, [dst, #8]
        ldrd A_l, A_h, [src, #16]
        strd A_l, A_h, [dst, #16]
        ldrd A_l, A_h, [src, #24]
        strd A_l, A_h, [dst, #24]
        ldrd A_l, A_h, [src, #32]
        strd A_l, A_h, [dst, #32]
        ldrd A_l, A_h, [src, #40]
        strd A_l, A_h, [dst, #40]
        ldrd A_l, A_h, [src, #48]
        strd A_l, A_h, [dst, #48]
        ldrd A_l, A_h, [src, #56]
        strd A_l, A_h, [dst, #56]
        ldrd A_l, A_h, [src, #64]!
        strd A_l, A_h, [dst, #64]!
        subs tmp2, tmp2, #64
        bge 1b
        tst tmp2, #0x3f
        bne 1f
        ldr tmp2, [sp], #FRAME_SIZE
        bx lr
1:
        add src, src, #8
        add dst, dst, #8

L(tail63aligned):               /* Count in tmp2.  */
        /* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
           we know that the src and dest are 64-bit aligned so we can use
           LDRD/STRD to improve efficiency.  */
        /* TMP2 is now negative, but we don't care about that.  The bottom
           six bits still tell us how many bytes are left to copy.  */

        and tmp1, tmp2, #0x38
        add dst, dst, tmp1
        add src, src, tmp1
        rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add pc, pc, tmp1
        ldrd A_l, A_h, [src, #-56]      /* 14 words to go.  */
        strd A_l, A_h, [dst, #-56]
        ldrd A_l, A_h, [src, #-48]      /* 12 words to go.  */
        strd A_l, A_h, [dst, #-48]
        ldrd A_l, A_h, [src, #-40]      /* 10 words to go.  */
        strd A_l, A_h, [dst, #-40]
        ldrd A_l, A_h, [src, #-32]      /* 8 words to go.  */
        strd A_l, A_h, [dst, #-32]
        ldrd A_l, A_h, [src, #-24]      /* 6 words to go.  */
        strd A_l, A_h, [dst, #-24]
        ldrd A_l, A_h, [src, #-16]      /* 4 words to go.  */
        strd A_l, A_h, [dst, #-16]
        ldrd A_l, A_h, [src, #-8]       /* 2 words to go.  */
        strd A_l, A_h, [dst, #-8]

#endif
        tst tmp2, #4
        ldrne tmp1, [src], #4
        strne tmp1, [dst], #4
        lsls tmp2, tmp2, #31    /* Count (tmp2) now dead.  */
        ldrhcs tmp1, [src], #2
        ldrbne tmp2, [src]
        strhcs tmp1, [dst], #2
        strbne tmp2, [dst]

L(done):
        ldr tmp2, [sp], #FRAME_SIZE
        bx lr

L(cpy_body_long):               /* Count in tmp2.  */

        /* Long copy.  We know that there's at least (prefetch_lines * 64)
           bytes to go.  */
#ifdef USE_VFP
        /* Don't use PLD.  Instead, read some data in advance of the current
           copy position into a register.  This should act like a PLD
           operation but we won't have to repeat the transfer.  */
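        /* d3-d7 hold the first doubleword of each of the next five 64-byte
           lines; d0-d2 hold the doublewords at offsets 8-24 of the line
           currently being copied.  */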

        vldr d3, [src, #0]
        vldr d4, [src, #64]
        vldr d5, [src, #128]
        vldr d6, [src, #192]
        vldr d7, [src, #256]

        vldr d0, [src, #8]
        vldr d1, [src, #16]
        vldr d2, [src, #24]
        add src, src, #32

        subs tmp2, tmp2, #prefetch_lines * 64 * 2
        blt 2f
1:
        cpy_line_vfp d3, 0
        cpy_line_vfp d4, 64
        cpy_line_vfp d5, 128
        add dst, dst, #3 * 64
        add src, src, #3 * 64
        cpy_line_vfp d6, 0
        cpy_line_vfp d7, 64
        add dst, dst, #2 * 64
        add src, src, #2 * 64
        subs tmp2, tmp2, #prefetch_lines * 64
        bge 1b

2:
        cpy_tail_vfp d3, 0
        cpy_tail_vfp d4, 64
        cpy_tail_vfp d5, 128
        add src, src, #3 * 64
        add dst, dst, #3 * 64
        cpy_tail_vfp d6, 0
        vstr d7, [dst, #64]
        vldr d7, [src, #64]
        vstr d0, [dst, #64 + 8]
        vldr d0, [src, #64 + 8]
        vstr d1, [dst, #64 + 16]
        vldr d1, [src, #64 + 16]
        vstr d2, [dst, #64 + 24]
        vldr d2, [src, #64 + 24]
        vstr d7, [dst, #64 + 32]
        add src, src, #96
        vstr d0, [dst, #64 + 40]
        vstr d1, [dst, #64 + 48]
        vstr d2, [dst, #64 + 56]
        add dst, dst, #128
        add tmp2, tmp2, #prefetch_lines * 64
        b L(cpy_body_medium)
#else
        /* Long copy.  Use an SMS style loop to maximize the I/O
           bandwidth of the core.  We don't have enough spare registers
           to synthesise prefetching, so use PLD operations.  */
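        /* The loop below is software-pipelined: four doublewords are kept
           in flight in A-D, with the loads for the next iteration issued
           alongside the stores for the current one.  r4-r9 are spilled to
           the stack frame reserved at L(cpy_not_short) first.  */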
        /* Pre-bias src and dst.  */
        sub src, src, #8
        sub dst, dst, #8
        pld [src, #8]
        pld [src, #72]
        subs tmp2, tmp2, #64
        pld [src, #136]
        ldrd A_l, A_h, [src, #8]
        strd B_l, B_h, [sp, #8]
        ldrd B_l, B_h, [src, #16]
        strd C_l, C_h, [sp, #16]
        ldrd C_l, C_h, [src, #24]
        strd D_l, D_h, [sp, #24]
        pld [src, #200]
        ldrd D_l, D_h, [src, #32]!
        b 1f
        .p2align 6
2:
        pld [src, #232]
        strd A_l, A_h, [dst, #40]
        ldrd A_l, A_h, [src, #40]
        strd B_l, B_h, [dst, #48]
        ldrd B_l, B_h, [src, #48]
        strd C_l, C_h, [dst, #56]
        ldrd C_l, C_h, [src, #56]
        strd D_l, D_h, [dst, #64]!
        ldrd D_l, D_h, [src, #64]!
        subs tmp2, tmp2, #64
1:
        strd A_l, A_h, [dst, #8]
        ldrd A_l, A_h, [src, #8]
        strd B_l, B_h, [dst, #16]
        ldrd B_l, B_h, [src, #16]
        strd C_l, C_h, [dst, #24]
        ldrd C_l, C_h, [src, #24]
        strd D_l, D_h, [dst, #32]
        ldrd D_l, D_h, [src, #32]
        bcs 2b
        /* Save the remaining bytes and restore the callee-saved regs.  */
        strd A_l, A_h, [dst, #40]
        add src, src, #40
        strd B_l, B_h, [dst, #48]
        ldrd B_l, B_h, [sp, #8]
        strd C_l, C_h, [dst, #56]
        ldrd C_l, C_h, [sp, #16]
        strd D_l, D_h, [dst, #64]
        ldrd D_l, D_h, [sp, #24]
        add dst, dst, #72
        tst tmp2, #0x3f
        bne L(tail63aligned)
        ldr tmp2, [sp], #FRAME_SIZE
        bx lr
#endif

L(cpy_notaligned):
        pld [src]
        pld [src, #64]
        /* There's at least 64 bytes to copy, but there is no mutual
           alignment.  */
        /* Bring DST to 64-bit alignment.  */
        lsls tmp2, dst, #29
        pld [src, #(2 * 64)]
        beq 1f
        rsbs tmp2, tmp2, #0
        sub count, count, tmp2, lsr #29
        ldrmi tmp1, [src], #4
        strmi tmp1, [dst], #4
        lsls tmp2, tmp2, #2
        ldrbne tmp1, [src], #1
        ldrhcs tmp2, [src], #2
        strbne tmp1, [dst], #1
        strhcs tmp2, [dst], #2
1:
        pld [src, #(3 * 64)]
        subs count, count, #64
        ldrmi tmp2, [sp], #FRAME_SIZE
        bmi L(tail63unaligned)
        pld [src, #(4 * 64)]

#ifdef USE_NEON
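        /* src may have any alignment here, so the loads use VLD1.8, which
           has no alignment requirement; the stores use a :64 alignment
           hint because dst was brought to 64-bit alignment above.  */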
        vld1.8 {d0-d3}, [src]!
        vld1.8 {d4-d7}, [src]!
        subs count, count, #64
        bmi 2f
1:
        pld [src, #(4 * 64)]
        vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
        vld1.8 {d0-d3}, [src]!
        vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
        vld1.8 {d4-d7}, [src]!
        subs count, count, #64
        bpl 1b
2:
        vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
        vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
        ands count, count, #0x3f
#else
        /* Use an SMS style loop to maximize the I/O bandwidth.  */
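        /* dst is 64-bit aligned, so STRD can be used for the stores, but
           src may be unaligned: the loads are single LDRs, which (unlike
           LDRD) permit unaligned addresses under the assumptions above.  */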
        sub src, src, #4
        sub dst, dst, #8
        subs tmp2, count, #64   /* Use tmp2 for count.  */
        ldr A_l, [src, #4]
        ldr A_h, [src, #8]
        strd B_l, B_h, [sp, #8]
        ldr B_l, [src, #12]
        ldr B_h, [src, #16]
        strd C_l, C_h, [sp, #16]
        ldr C_l, [src, #20]
        ldr C_h, [src, #24]
        strd D_l, D_h, [sp, #24]
        ldr D_l, [src, #28]
        ldr D_h, [src, #32]!
        b 1f
        .p2align 6
2:
        pld [src, #(5 * 64) - (32 - 4)]
        strd A_l, A_h, [dst, #40]
        ldr A_l, [src, #36]
        ldr A_h, [src, #40]
        strd B_l, B_h, [dst, #48]
        ldr B_l, [src, #44]
        ldr B_h, [src, #48]
        strd C_l, C_h, [dst, #56]
        ldr C_l, [src, #52]
        ldr C_h, [src, #56]
        strd D_l, D_h, [dst, #64]!
        ldr D_l, [src, #60]
        ldr D_h, [src, #64]!
        subs tmp2, tmp2, #64
1:
        strd A_l, A_h, [dst, #8]
        ldr A_l, [src, #4]
        ldr A_h, [src, #8]
        strd B_l, B_h, [dst, #16]
        ldr B_l, [src, #12]
        ldr B_h, [src, #16]
        strd C_l, C_h, [dst, #24]
        ldr C_l, [src, #20]
        ldr C_h, [src, #24]
        strd D_l, D_h, [dst, #32]
        ldr D_l, [src, #28]
        ldr D_h, [src, #32]
        bcs 2b

        /* Save the remaining bytes and restore the callee-saved regs.  */
        strd A_l, A_h, [dst, #40]
        add src, src, #36
        strd B_l, B_h, [dst, #48]
        ldrd B_l, B_h, [sp, #8]
        strd C_l, C_h, [dst, #56]
        ldrd C_l, C_h, [sp, #16]
        strd D_l, D_h, [dst, #64]
        ldrd D_l, D_h, [sp, #24]
        add dst, dst, #72
        ands count, tmp2, #0x3f
#endif
        ldr tmp2, [sp], #FRAME_SIZE
        bne L(tail63unaligned)
        bx lr

END (__memcpy_arm)