1/* SPDX-License-Identifier: GPL-2.0-only */
2/*
3 * Copyright (C) 2013 ARM Ltd.
4 * Copyright (C) 2013 Linaro.
5 *
6 * This code is based on glibc cortex strings work originally authored by
7 * Linaro. The original code can be found @
8 *
9 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
10 * files/head:/src/aarch64/
11 */
12
13
14/*
15 * Copy a buffer from src to dest (alignment handled by the hardware)
16 *
17 * Parameters:
18 * x0 - dest
19 * x1 - src
20 * x2 - n
21 * Returns:
22 * x0 - dest
23 */
/*
 * Symbolic register names used throughout this copy template.
 *
 * dstin, src and count alias the three argument registers x0, x1 and x2.
 * tmp1/tmp1w and tmp2/tmp2w are the 64-bit and 32-bit views of the same
 * two scratch registers (x3/w3 and x4/w4). dst (x6) is the working
 * destination pointer, so that dstin is left alone by the code below.
 */
24dstin .req x0
25src .req x1
26count .req x2
27tmp1 .req x3
28tmp1w .req w3
29tmp2 .req x4
30tmp2w .req w4
31dst .req x6
32
/*
 * Four staging pairs (A..D), each a low/high pair of 64-bit registers,
 * i.e. 16 bytes per pair and 64 bytes for all four. They are only moved
 * with the paired ldp1/stp1 accessors.
 */
33A_l .req x7
34A_h .req x8
35B_l .req x9
36B_h .req x10
37C_l .req x11
38C_h .req x12
39D_l .req x13
40D_h .req x14
41
/*
 * Entry point of the template: set up the working destination pointer,
 * divert short copies, then copy just enough leading bytes to bring src
 * up to a 16-byte boundary.
 *
 * NOTE(review): ldrb1/strb1, ldrh1/strh1, ldr1/str1 and ldp1/stp1 are not
 * defined in this file; they are expected to be supplied by whatever
 * includes the template. The code relies on each of them advancing its
 * pointer operand by the trailing immediate - confirm against the includer.
 */
42 mov dst, dstin
43 cmp count, #16
44 /* With fewer than 16 bytes, no attempt is made to align the accesses. */
45 b.lo .Ltiny15
46
/* tmp2 = (-src) & 15: distance in bytes from src up to a 16-byte boundary. */
47 neg tmp2, src
48 ands tmp2, tmp2, #15/* Bytes to reach alignment. */
/* Z set by the ands above: src is already 16-byte aligned. */
49 b.eq .LSrcAligned
/* The tmp2 leading bytes copied below are deducted from count up front. */
50 sub count, count, tmp2
51 /*
52 * Copy the leading bytes from src to dst in increasing address
53 * order. This eliminates the risk of overwriting unread source
54 * data when the distance between src and dst is less than 16.
55 * The source accesses here are naturally aligned.
56 */
/* Each set bit of tmp2 selects one copy of that size: 1, 2, 4, then 8. */
57 tbz tmp2, #0, 1f
58 ldrb1 tmp1w, src, #1
59 strb1 tmp1w, dst, #1
601:
61 tbz tmp2, #1, 2f
62 ldrh1 tmp1w, src, #2
63 strh1 tmp1w, dst, #2
642:
65 tbz tmp2, #2, 3f
66 ldr1 tmp1w, src, #4
67 str1 tmp1w, dst, #4
683:
/* Last head step: bit 3 clear means nothing more is needed to align src. */
69 tbz tmp2, #3, .LSrcAligned
70 ldr1 tmp1, src, #8
71 str1 tmp1, dst, #8
72
/*
 * .LSrcAligned: reached once the head bytes (if any) have been copied.
 * .Ltail63: copies the whole 16-byte chunks of a remainder below 64 bytes,
 * then falls through to .Ltiny15 for the last 0..15 bytes.
 */
73.LSrcAligned:
/* 64 bytes or more remaining (signed compare) go to the bulk paths. */
74 cmp count, #64
75 b.ge .Lcpy_over64
76 /*
77 * Deal with small copies quickly by dropping straight into the
78 * exit block.
79 */
80.Ltail63:
81 /*
82 * Copy up to 48 bytes of data. At this point we only need the
83 * bottom 6 bits of count to be accurate.
84 */
/* tmp1 = count & 0x30, i.e. 16 times the number of whole 16-byte chunks. */
85 ands tmp1, count, #0x30
/* No whole 16-byte chunk: only the sub-16-byte remainder is left. */
86 b.eq .Ltiny15
/*
 * Dispatch into the fall-through chain below: 0x30 runs all three
 * 16-byte copies, 0x20 enters at 1: (two copies) and 0x10 enters at
 * 2: (one copy).
 */
87 cmp tmp1w, #0x20
88 b.eq 1f
89 b.lt 2f
90 ldp1 A_l, A_h, src, #16
91 stp1 A_l, A_h, dst, #16
921:
93 ldp1 A_l, A_h, src, #16
94 stp1 A_l, A_h, dst, #16
952:
96 ldp1 A_l, A_h, src, #16
97 stp1 A_l, A_h, dst, #16
98.Ltiny15:
99 /*
100 * Prefer to break one ldp/stp into several loads/stores that access
101 * memory in increasing address order, rather than loading/storing 16
102 * bytes from (src-16) to (dst-16) and moving src back to an aligned
103 * address, which is what the original cortex memcpy does. If the
104 * original memcpy process were kept here, memmove would need to satisfy
105 * the precondition that the src address is at least 16 bytes bigger than
106 * the dst address, otherwise some source data would be overwritten when
107 * memmove calls memcpy directly. To make memmove simpler and to decouple
108 * memcpy from memmove, the original process was withdrawn.
109 */
/*
 * Only bits 3..0 of count are examined here: each set bit copies 8, 4, 2
 * or 1 byte(s) respectively, largest first.
 */
110 tbz count, #3, 1f
111 ldr1 tmp1, src, #8
112 str1 tmp1, dst, #8
1131:
114 tbz count, #2, 2f
115 ldr1 tmp1w, src, #4
116 str1 tmp1w, dst, #4
1172:
118 tbz count, #1, 3f
119 ldrh1 tmp1w, src, #2
120 strh1 tmp1w, dst, #2
1213:
/* Bit 0 clear: nothing is left to copy. */
122 tbz count, #0, .Lexitfunc
123 ldrb1 tmp1w, src, #1
124 strb1 tmp1w, dst, #1
125
/* Jump over the bulk-copy code that sits between here and the exit label. */
126 b .Lexitfunc
127
128.Lcpy_over64:
/*
 * Reached from .LSrcAligned with count >= 64. Bias count by -128: a
 * non-negative result means at least 128 bytes remain, which is handled
 * by the pipelined loop at .Lcpy_body_large.
 */
129 subs count, count, #128
130 b.ge .Lcpy_body_large
131 /*
132 * Less than 128 bytes to copy, so handle 64 here and then jump
133 * to the tail.
134 */
135 ldp1 A_l, A_h, src, #16
136 stp1 A_l, A_h, dst, #16
137 ldp1 B_l, B_h, src, #16
138 ldp1 C_l, C_h, src, #16
139 stp1 B_l, B_h, dst, #16
140 stp1 C_l, C_h, dst, #16
141 ldp1 D_l, D_h, src, #16
142 stp1 D_l, D_h, dst, #16
143
/*
 * count is negative here, but 64 and 128 are both multiples of 64, so its
 * low six bits still equal the number of bytes left to copy (0..63).
 */
144 tst count, #0x3f
145 b.ne .Ltail63
146 b .Lexitfunc
147
148 /*
149 * Critical loop. Start at a new cache line boundary. Assuming
150 * 64 bytes per line this ensures the entire loop is in one line.
151 */
/* L1_CACHE_SHIFT is not defined in this file; it must come from outside. */
152 .p2align L1_CACHE_SHIFT
153.Lcpy_body_large:
/*
 * Entered with count already biased by -128 and non-negative, i.e. at
 * least 128 bytes are still to be copied.
 */
154 /* Pre-load the first 64 bytes of data into A..D. */
155 ldp1 A_l, A_h, src, #16
156 ldp1 B_l, B_h, src, #16
157 ldp1 C_l, C_h, src, #16
158 ldp1 D_l, D_h, src, #16
1591:
160 /*
161 * Interleave the load of the next 64-byte block with the store of the
162 * previously loaded 64-byte block.
163 */
164 stp1 A_l, A_h, dst, #16
165 ldp1 A_l, A_h, src, #16
166 stp1 B_l, B_h, dst, #16
167 ldp1 B_l, B_h, src, #16
168 stp1 C_l, C_h, dst, #16
169 ldp1 C_l, C_h, src, #16
170 stp1 D_l, D_h, dst, #16
171 ldp1 D_l, D_h, src, #16
/* Account for the block just loaded; keep looping while count >= 0. */
172 subs count, count, #64
173 b.ge 1b
/*
 * count went negative: fewer than 64 bytes remain beyond the block that
 * is still held in A..D. Drain that last loaded block.
 */
174 stp1 A_l, A_h, dst, #16
175 stp1 B_l, B_h, dst, #16
176 stp1 C_l, C_h, dst, #16
177 stp1 D_l, D_h, dst, #16
178
/* The low six bits of count give the bytes still to copy (0..63). */
179 tst count, #0x3f
180 b.ne .Ltail63
/*
 * Common exit label. No instruction follows it in the lines visible here;
 * presumably the including file supplies the return sequence - confirm.
 */
181.Lexitfunc:
182

source code of linux/arch/arm64/lib/copy_template.S