/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2022 Michael T. Kloos <michael@michaelkloos.com>
 */

#include <linux/linkage.h>
#include <asm/asm.h>

SYM_FUNC_START(__memmove)
	/*
	 * Returns
	 *   a0 - dest
	 *
	 * Parameters
	 *   a0 - Inclusive first byte of dest
	 *   a1 - Inclusive first byte of src
	 *   a2 - Length of copy n
	 *
	 * Because the return matches the parameter register a0,
	 * we will not clobber or modify that register.
	 *
	 * Note: This currently only works on little-endian.
	 * To port to big-endian, reverse the direction of shifts
	 * in the 2 misaligned fixup copy loops.
	 */
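
	/*
	 * For orientation, the overall flow in rough C-like pseudocode
	 * (an illustrative sketch, not a literal translation of the
	 * code below):
	 *
	 * if (dest == src || n == 0)
	 * 	return dest;
	 * if (src > dest)
	 * 	copy forward, lowest address first;
	 * else
	 * 	copy in reverse, highest address first;
	 * return dest;
	 */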

	/* Return if nothing to do */
	beq  a0, a1, .Lreturn_from_memmove
	beqz a2, .Lreturn_from_memmove

	/*
	 * Register Uses
	 *   Forward Copy: a1 - Index counter of src
	 *   Reverse Copy: a4 - Index counter of src
	 *   Forward Copy: t3 - Index counter of dest
	 *   Reverse Copy: t4 - Index counter of dest
	 *   Both Copy Modes: t5 - Inclusive first SZREG-aligned address of dest
	 *   Both Copy Modes: t6 - Non-inclusive last SZREG-aligned address of dest
	 *   Both Copy Modes: t0 - Link / Temporary for load-store
	 *   Both Copy Modes: t1 - Temporary for load-store
	 *   Both Copy Modes: t2 - Temporary for load-store
	 *   Both Copy Modes: a5 - dest to src alignment offset
	 *   Both Copy Modes: a6 - Shift amount
	 *   Both Copy Modes: a7 - Inverse shift amount
	 *   Both Copy Modes: a2 - Alternate breakpoint for unrolled loops
	 */

	/*
	 * Solve for some register values now.
	 * Byte copy does not need t5 or t6.
	 */
	mv   t3, a0
	add  t4, a0, a2
	add  a4, a1, a2
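	/* t4 and a4 are exclusive end pointers: dest + n and src + n. */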

	/*
	 * Byte copy if copying less than (2 * SZREG) bytes. This can
	 * cause problems with the bulk copy implementation and is
	 * small enough not to bother.
	 */
	andi t0, a2, -(2 * SZREG)
	beqz t0, .Lbyte_copy
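	/*
	 * Example (RV64, SZREG = 8): -(2 * SZREG) = ~0xf, so t0 is
	 * zero exactly when n < 16 and the bulk paths are skipped.
	 */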

	/*
	 * Now solve for t5 and t6.
	 */
	andi t5, t3, -SZREG
	andi t6, t4, -SZREG
	/*
	 * If dest (register t3), rounded down to the nearest naturally
	 * aligned SZREG address, does not equal dest, then add SZREG
	 * to find the low-bound of SZREG alignment in the dest memory
	 * region. Note that this could overshoot the dest memory
	 * region if n is less than SZREG. This is one reason why
	 * we always byte copy if n is less than SZREG.
	 * Otherwise, dest is already naturally aligned to SZREG.
	 */
	beq  t5, t3, 1f
	addi t5, t5, SZREG
1:
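
	/*
	 * Example (RV64, SZREG = 8): for dest = 0x1003,
	 * t5 = 0x1003 & ~0x7 = 0x1000 != 0x1003, so t5 is bumped to
	 * 0x1008, the first SZREG-aligned address inside dest.
	 */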

	/*
	 * If the dest and src are co-aligned to SZREG, then there is
	 * no need for the full rigmarole of a misaligned fixup copy.
	 * Instead, do a simpler co-aligned copy.
	 */
	xor  t0, a0, a1
	andi t1, t0, (SZREG - 1)
	beqz t1, .Lcoaligned_copy
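	/*
	 * Example (RV64): dest = 0x2003 and src = 0x500b are both
	 * 3 bytes past an 8-byte boundary, so
	 * (0x2003 ^ 0x500b) & 0x7 == 0 and the co-aligned path is taken.
	 */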
	/* Fall through to misaligned fixup copy */

.Lmisaligned_fixup_copy:
	bltu a1, a0, .Lmisaligned_fixup_copy_reverse

.Lmisaligned_fixup_copy_forward:
	jal  t0, .Lbyte_copy_until_aligned_forward

	andi a5, a1, (SZREG - 1) /* Find the alignment offset of src (a1) */
	slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
	sub  a5, a1, t3 /* Find the difference between src and dest */
	andi a1, a1, -SZREG /* Align the src pointer */
	addi a2, t6, SZREG /* The other breakpoint for the unrolled loop */
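	/*
	 * Note: a5 is computed before a1 is aligned, so the src
	 * pointer can later be recovered as t3 + a5 once t3 has
	 * advanced (see "Restore the src pointer" below).
	 */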

	/*
	 * Compute The Inverse Shift
	 * a7 = XLEN - a6 = XLEN + -a6
	 * Two's complement negation to find the negative: -a6 = ~a6 + 1
	 * Add that to XLEN. XLEN = SZREG * 8.
	 */
	not  a7, a6
	addi a7, a7, (SZREG * 8 + 1)
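	/*
	 * For example, on RV64 with a6 = 24: ~24 = -25, and
	 * -25 + (64 + 1) = 40 = 64 - 24, as intended.
	 */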

	/*
	 * Fix Misalignment Copy Loop - Forward
	 * load_val0 = load_ptr[0];
	 * do {
	 * 	load_val1 = load_ptr[1];
	 * 	store_ptr += 2;
	 * 	store_ptr[0 - 2] = (load_val0 >> {a6}) | (load_val1 << {a7});
	 *
	 * 	if (store_ptr == {a2})
	 * 		break;
	 *
	 * 	load_val0 = load_ptr[2];
	 * 	load_ptr += 2;
	 * 	store_ptr[1 - 2] = (load_val1 >> {a6}) | (load_val0 << {a7});
	 *
	 * } while (store_ptr != store_ptr_end);
	 * store_ptr = store_ptr_end;
	 */
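	/*
	 * Example (RV64, little-endian): if src is 1 byte past
	 * alignment, a6 = 8 and a7 = 56, so each aligned store
	 * combines the upper 7 bytes of one aligned src word with
	 * the lowest byte of the next one.
	 */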

	REG_L t0, (0 * SZREG)(a1)
1:
	REG_L t1, (1 * SZREG)(a1)
	addi  t3, t3, (2 * SZREG)
	srl   t0, t0, a6
	sll   t2, t1, a7
	or    t2, t0, t2
	REG_S t2, ((0 * SZREG) - (2 * SZREG))(t3)

	beq   t3, a2, 2f

	REG_L t0, (2 * SZREG)(a1)
	addi  a1, a1, (2 * SZREG)
	srl   t1, t1, a6
	sll   t2, t0, a7
	or    t2, t1, t2
	REG_S t2, ((1 * SZREG) - (2 * SZREG))(t3)

	bne   t3, t6, 1b
2:
	mv    t3, t6 /* Fix the dest pointer in case the loop was broken */

	add  a1, t3, a5 /* Restore the src pointer */
	j    .Lbyte_copy_forward /* Copy any remaining bytes */

.Lmisaligned_fixup_copy_reverse:
	jal  t0, .Lbyte_copy_until_aligned_reverse

	andi a5, a4, (SZREG - 1) /* Find the alignment offset of src (a4) */
	slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
	sub  a5, a4, t4 /* Find the difference between src and dest */
	andi a4, a4, -SZREG /* Align the src pointer */
	addi a2, t5, -SZREG /* The other breakpoint for the unrolled loop */

	/*
	 * Compute The Inverse Shift
	 * a7 = XLEN - a6 = XLEN + -a6
	 * Two's complement negation to find the negative: -a6 = ~a6 + 1
	 * Add that to XLEN. XLEN = SZREG * 8.
	 */
	not  a7, a6
	addi a7, a7, (SZREG * 8 + 1)

	/*
	 * Fix Misalignment Copy Loop - Reverse
	 * load_val1 = load_ptr[0];
	 * do {
	 * 	load_val0 = load_ptr[-1];
	 * 	store_ptr -= 2;
	 * 	store_ptr[1] = (load_val0 >> {a6}) | (load_val1 << {a7});
	 *
	 * 	if (store_ptr == {a2})
	 * 		break;
	 *
	 * 	load_val1 = load_ptr[-2];
	 * 	load_ptr -= 2;
	 * 	store_ptr[0] = (load_val1 >> {a6}) | (load_val0 << {a7});
	 *
	 * } while (store_ptr != store_ptr_end);
	 * store_ptr = store_ptr_end;
	 */

	REG_L t1, ( 0 * SZREG)(a4)
1:
	REG_L t0, (-1 * SZREG)(a4)
	addi  t4, t4, (-2 * SZREG)
	sll   t1, t1, a7
	srl   t2, t0, a6
	or    t2, t1, t2
	REG_S t2, ( 1 * SZREG)(t4)

	beq   t4, a2, 2f

	REG_L t1, (-2 * SZREG)(a4)
	addi  a4, a4, (-2 * SZREG)
	sll   t0, t0, a7
	srl   t2, t1, a6
	or    t2, t0, t2
	REG_S t2, ( 0 * SZREG)(t4)

	bne   t4, t5, 1b
2:
	mv    t4, t5 /* Fix the dest pointer in case the loop was broken */

	add  a4, t4, a5 /* Restore the src pointer */
	j    .Lbyte_copy_reverse /* Copy any remaining bytes */

	/*
	 * Simple copy loops for SZREG co-aligned memory locations.
	 * These also call out to the byte copy helpers to handle any
	 * unaligned data at either end.
	 */
.Lcoaligned_copy:
	bltu a1, a0, .Lcoaligned_copy_reverse

.Lcoaligned_copy_forward:
	jal  t0, .Lbyte_copy_until_aligned_forward

1:
	REG_L t1, ( 0 * SZREG)(a1)
	addi  a1, a1, SZREG
	addi  t3, t3, SZREG
	REG_S t1, (-1 * SZREG)(t3)
	bne   t3, t6, 1b

	j .Lbyte_copy_forward /* Copy any remaining bytes */

.Lcoaligned_copy_reverse:
	jal  t0, .Lbyte_copy_until_aligned_reverse

1:
	REG_L t1, (-1 * SZREG)(a4)
	addi  a4, a4, -SZREG
	addi  t4, t4, -SZREG
	REG_S t1, ( 0 * SZREG)(t4)
	bne   t4, t5, 1b

	j .Lbyte_copy_reverse /* Copy any remaining bytes */

	/*
	 * These are basically sub-functions within the function. They
	 * are used to byte copy until the dest pointer is in alignment,
	 * at which point a bulk copy method can be used by the
	 * calling code. These work on the same registers as the bulk
	 * copy loops. Therefore, the register values can be picked
	 * up from where they were left and we avoid code duplication
	 * without any overhead except the call in and return jumps.
	 */
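	/*
	 * The linkage is a plain register pair: "jal t0, <label>"
	 * records the return address in t0, and "jalr zero, 0x0(t0)"
	 * jumps back to it, giving a cheap call/return that touches
	 * neither ra nor the stack.
	 */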
.Lbyte_copy_until_aligned_forward:
	beq  t3, t5, 2f
1:
	lb   t1, 0(a1)
	addi a1, a1, 1
	addi t3, t3, 1
	sb   t1, -1(t3)
	bne  t3, t5, 1b
2:
	jalr zero, 0x0(t0) /* Return to multibyte copy loop */

.Lbyte_copy_until_aligned_reverse:
	beq  t4, t6, 2f
1:
	lb   t1, -1(a4)
	addi a4, a4, -1
	addi t4, t4, -1
	sb   t1, 0(t4)
	bne  t4, t6, 1b
2:
	jalr zero, 0x0(t0) /* Return to multibyte copy loop */

	/*
	 * Simple byte copy loops.
	 * These will byte copy until they reach the end of the data to
	 * copy, at which point they return from memmove.
	 */
.Lbyte_copy:
	bltu a1, a0, .Lbyte_copy_reverse

.Lbyte_copy_forward:
	beq  t3, t4, 2f
1:
	lb   t1, 0(a1)
	addi a1, a1, 1
	addi t3, t3, 1
	sb   t1, -1(t3)
	bne  t3, t4, 1b
2:
	ret

.Lbyte_copy_reverse:
	beq  t4, t3, 2f
1:
	lb   t1, -1(a4)
	addi a4, a4, -1
	addi t4, t4, -1
	sb   t1, 0(t4)
	bne  t4, t3, 1b
2:

.Lreturn_from_memmove:
	ret

SYM_FUNC_END(__memmove)
SYM_FUNC_ALIAS_WEAK(memmove, __memmove)
SYM_FUNC_ALIAS(__pi_memmove, __memmove)
SYM_FUNC_ALIAS(__pi___memmove, __memmove)