/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on the glibc cortex-strings work originally authored by
 * Linaro, which can be found at:
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */


/*
 * Copy a buffer from src to dest (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 */
dstin	.req	x0
src	.req	x1
count	.req	x2
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4
dst	.req	x6

A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14

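/*
 * The ldrb1/strb1, ldrh1/strh1, ldr1/str1 and ldp1/stp1 mnemonics used
 * below are assumed to be macros supplied by the file that includes this
 * template, each expanding to the matching post-indexed load/store, e.g.
 * ldp1 A_l, A_h, src, #16 -> ldp A_l, A_h, [src], #16.
 */
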
	mov	dst, dstin
	cmp	count, #16
	/* When the length is less than 16, the accesses are not aligned. */
	b.lo	.Ltiny15

	neg	tmp2, src
	ands	tmp2, tmp2, #15		/* Bytes to reach alignment. */
	b.eq	.LSrcAligned
	sub	count, count, tmp2
	/*
	 * Copy the leading data from src to dst in increasing address
	 * order. This eliminates the risk of overwriting source data
	 * when the distance between src and dst is less than 16. The
	 * memory accesses here are aligned.
	 */
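	/*
	 * Each set bit of tmp2 selects one step below: bit 0 copies one
	 * byte, bit 1 two bytes, bit 2 four bytes and bit 3 eight bytes,
	 * so up to 15 leading bytes are consumed and src becomes 16-byte
	 * aligned.
	 */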
	tbz	tmp2, #0, 1f
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1
1:
	tbz	tmp2, #1, 2f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
2:
	tbz	tmp2, #2, 3f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
3:
	tbz	tmp2, #3, .LSrcAligned
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8

.LSrcAligned:
	cmp	count, #64
	b.ge	.Lcpy_over64
	/*
	 * Deal with small copies quickly by dropping straight into the
	 * exit block.
	 */
.Ltail63:
	/*
	 * Copy up to 48 bytes of data. At this point we only need the
	 * bottom 6 bits of count to be accurate.
	 */
	ands	tmp1, count, #0x30
	b.eq	.Ltiny15
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
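	/*
	 * tmp1 is 0x30, 0x20 or 0x10 here: 0x30 falls through all three
	 * ldp1/stp1 pairs below (48 bytes), 0x20 enters at 1: (32 bytes)
	 * and 0x10 enters at 2: (16 bytes).
	 */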
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
1:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
2:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
.Ltiny15:
	/*
	 * Prefer to break one ldp/stp into several loads/stores so that
	 * memory is accessed in increasing address order, rather than
	 * loading/storing 16 bytes from (src-16) to (dst-16) after moving
	 * src back to an aligned address, as the original cortex-strings
	 * memcpy does. Keeping the original scheme here would force
	 * memmove to satisfy the precondition that src is at least 16
	 * bytes above dst, otherwise some source data would be
	 * overwritten when memmove calls memcpy directly. Dropping that
	 * scheme keeps memmove simpler and decouples memcpy from memmove.
	 */
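	/*
	 * Copy the remaining 0-15 bytes: bit 3 of count selects an
	 * 8-byte copy, bit 2 a 4-byte copy, bit 1 a 2-byte copy and
	 * bit 0 a single byte, again in increasing address order.
	 */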
	tbz	count, #3, 1f
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8
1:
	tbz	count, #2, 2f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
2:
	tbz	count, #1, 3f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
3:
	tbz	count, #0, .Lexitfunc
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1

	b	.Lexitfunc

.Lcpy_over64:
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	 * Less than 128 bytes to copy, so handle 64 here and then jump
	 * to the tail.
	 */
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	ldp1	D_l, D_h, src, #16
	stp1	D_l, D_h, dst, #16
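	/*
	 * The B and C loads above are issued back to back ahead of their
	 * stores, overlapping load latency with the stores.
	 */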

	tst	count, #0x3f
	b.ne	.Ltail63
	b	.Lexitfunc

	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line, this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
.Lcpy_body_large:
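	/*
	 * count was biased by 128 at .Lcpy_over64: 64 bytes for the block
	 * pre-loaded below and 64 for the block loaded ahead of its store
	 * inside the loop, so the loop runs while count stays non-negative.
	 */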
	/* Pre-load the first 64 bytes of data. */
	ldp1	A_l, A_h, src, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	ldp1	D_l, D_h, src, #16
1:
	/*
	 * Interleave the loads of the next 64-byte block with the stores
	 * of the previously loaded 64 bytes.
	 */
	stp1	A_l, A_h, dst, #16
	ldp1	A_l, A_h, src, #16
	stp1	B_l, B_h, dst, #16
	ldp1	B_l, B_h, src, #16
	stp1	C_l, C_h, dst, #16
	ldp1	C_l, C_h, src, #16
	stp1	D_l, D_h, dst, #16
	ldp1	D_l, D_h, src, #16
	subs	count, count, #64
	b.ge	1b
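	/* Store the final 64 bytes loaded by the last loop iteration. */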
	stp1	A_l, A_h, dst, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	stp1	D_l, D_h, dst, #16

	tst	count, #0x3f
	b.ne	.Ltail63
.Lexitfunc: