/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2022 Michael T. Kloos <michael@michaelkloos.com>
 */

#include <linux/linkage.h>
#include <asm/asm.h>

SYM_FUNC_START(__memmove)
	/*
	 * Returns
	 * a0 - dest
	 *
	 * Parameters
	 * a0 - Inclusive first byte of dest
	 * a1 - Inclusive first byte of src
	 * a2 - Length of copy n
	 *
	 * Because the return matches the parameter register a0,
	 * we will not clobber or modify that register.
	 *
	 * Note: This currently only works on little-endian.
	 * To port to big-endian, reverse the direction of shifts
	 * in the 2 misaligned fixup copy loops.
	 */

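	/*
	 * Rough sketch of the strategy in C-like pseudocode (not a
	 * literal translation of the code below):
	 *
	 *	if (dest == src || n == 0)
	 *		return dest;
	 *	if (n < 2 * SZREG)
	 *		byte-by-byte copy;
	 *	else if (((uintptr_t)dest ^ (uintptr_t)src) & (SZREG - 1))
	 *		misaligned fixup copy (SZREG-wide loads/stores plus shifts);
	 *	else
	 *		co-aligned copy (SZREG-wide loads/stores);
	 *	return dest;
	 */
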
	/* Return if nothing to do */
	beq a0, a1, .Lreturn_from_memmove
	beqz a2, .Lreturn_from_memmove

	/*
	 * Register Uses
	 * Forward Copy: a1 - Index counter of src
	 * Reverse Copy: a4 - Index counter of src
	 * Forward Copy: t3 - Index counter of dest
	 * Reverse Copy: t4 - Index counter of dest
	 * Both Copy Modes: t5 - Inclusive first multibyte/aligned address of dest
	 * Both Copy Modes: t6 - Non-inclusive last multibyte/aligned address of dest
	 * Both Copy Modes: t0 - Link register / Temporary for load-store
	 * Both Copy Modes: t1 - Temporary for load-store
	 * Both Copy Modes: t2 - Temporary for load-store
	 * Both Copy Modes: a5 - dest to src alignment offset
	 * Both Copy Modes: a6 - Shift amount
	 * Both Copy Modes: a7 - Inverse shift amount
	 * Both Copy Modes: a2 - Alternate breakpoint for unrolled loops
	 */
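
	/*
	 * Each of the three copy strategies below picks its direction by
	 * comparing src and dest: if src (a1) is below dest (a0), the
	 * copy runs in reverse (from high addresses to low), otherwise
	 * it runs forward, so overlapping regions are never clobbered
	 * before they have been read.
	 */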

	/*
	 * Solve for some register values now.
	 * Byte copy does not need t5 or t6.
	 */
	mv t3, a0		/* t3 - forward dest index, starts at dest */
	add t4, a0, a2		/* t4 - one past the last byte of dest */
	add a4, a1, a2		/* a4 - one past the last byte of src */

	/*
	 * Byte copy if copying less than (2 * SZREG) bytes. This can
	 * cause problems with the bulk copy implementation and is
	 * small enough not to bother.
	 */
	andi t0, a2, -(2 * SZREG)	/* Zero iff n < (2 * SZREG) */
	beqz t0, .Lbyte_copy

	/*
	 * Now solve for t5 and t6.
	 */
	andi t5, t3, -SZREG
	andi t6, t4, -SZREG
	/*
	 * If dest (register t3) rounded down to the nearest naturally
	 * aligned SZREG address does not equal dest, then add SZREG
	 * to find the low-bound of SZREG alignment in the dest memory
	 * region.  Note that this could overshoot the dest memory
	 * region if n is less than SZREG.  This is one reason why
	 * we always byte copy if n is less than SZREG.
	 * Otherwise, dest is already naturally aligned to SZREG.
	 */
	beq t5, t3, 1f
	addi t5, t5, SZREG
	1:

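	/*
	 * For example (assuming RV64, so SZREG == 8): with dest == 0x1003
	 * and n == 0x21, t4 == 0x1024, t5 == 0x1008 and t6 == 0x1020.
	 * Bytes 0x1003-0x1007 and 0x1020-0x1023 are copied byte-by-byte,
	 * while 0x1008-0x101f is handled by the SZREG-at-a-time loops.
	 */
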
	/*
	 * If the dest and src are co-aligned to SZREG, then there is
	 * no need for the rigmarole of a full misaligned fixup copy.
	 * Instead, do a simpler co-aligned copy.
	 */
	xor t0, a0, a1
	andi t1, t0, (SZREG - 1)	/* Zero iff dest and src share the same SZREG offset */
	beqz t1, .Lcoaligned_copy
	/* Fall through to misaligned fixup copy */

.Lmisaligned_fixup_copy:
	bltu a1, a0, .Lmisaligned_fixup_copy_reverse

.Lmisaligned_fixup_copy_forward:
	jal t0, .Lbyte_copy_until_aligned_forward

	andi a5, a1, (SZREG - 1) /* Find the alignment offset of src (a1) */
	slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
	sub a5, a1, t3 /* Find the difference between src and dest */
	andi a1, a1, -SZREG /* Align the src pointer */
	addi a2, t6, SZREG /* The other breakpoint for the unrolled loop */

	/*
	 * Compute The Inverse Shift
	 * a7 = XLEN - a6 = XLEN + -a6
	 * 2s complement negation to find the negative: -a6 = ~a6 + 1
	 * Add that to XLEN. XLEN = SZREG * 8.
	 */
	not a7, a6
	addi a7, a7, (SZREG * 8 + 1)

	/*
	 * Fix Misalignment Copy Loop - Forward
	 * load_val0 = load_ptr[0];
	 * do {
	 *	load_val1 = load_ptr[1];
	 *	store_ptr += 2;
	 *	store_ptr[0 - 2] = (load_val0 >> {a6}) | (load_val1 << {a7});
	 *
	 *	if (store_ptr == {a2})
	 *		break;
	 *
	 *	load_val0 = load_ptr[2];
	 *	load_ptr += 2;
	 *	store_ptr[1 - 2] = (load_val1 >> {a6}) | (load_val0 << {a7});
	 *
	 * } while (store_ptr != store_ptr_end);
	 * store_ptr = store_ptr_end;
	 */
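
	/*
	 * The loop below is unrolled by two.  a2 (= t6 + SZREG) marks
	 * where the dest index lands right after the first half of an
	 * iteration stores the final aligned word, so the mid-loop
	 * break handles an odd number of remaining SZREG-sized words;
	 * the bottom-of-loop test against t6 handles the even case.
	 */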

	REG_L t0, (0 * SZREG)(a1)
	1:
	REG_L t1, (1 * SZREG)(a1)
	addi t3, t3, (2 * SZREG)
	srl t0, t0, a6
	sll t2, t1, a7
	or t2, t0, t2
	REG_S t2, ((0 * SZREG) - (2 * SZREG))(t3)

	beq t3, a2, 2f

	REG_L t0, (2 * SZREG)(a1)
	addi a1, a1, (2 * SZREG)
	srl t1, t1, a6
	sll t2, t0, a7
	or t2, t1, t2
	REG_S t2, ((1 * SZREG) - (2 * SZREG))(t3)

	bne t3, t6, 1b
	2:
	mv t3, t6 /* Fix the dest pointer in case the loop was broken */

	add a1, t3, a5 /* Restore the src pointer */
	j .Lbyte_copy_forward /* Copy any remaining bytes */

.Lmisaligned_fixup_copy_reverse:
	jal t0, .Lbyte_copy_until_aligned_reverse

	andi a5, a4, (SZREG - 1) /* Find the alignment offset of src (a4) */
	slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
	sub a5, a4, t4 /* Find the difference between src and dest */
	andi a4, a4, -SZREG /* Align the src pointer */
	addi a2, t5, -SZREG /* The other breakpoint for the unrolled loop */

	/*
	 * Compute The Inverse Shift
	 * a7 = XLEN - a6 = XLEN + -a6
	 * 2s complement negation to find the negative: -a6 = ~a6 + 1
	 * Add that to XLEN. XLEN = SZREG * 8.
	 */
	not a7, a6
	addi a7, a7, (SZREG * 8 + 1)

	/*
	 * Fix Misalignment Copy Loop - Reverse
	 * load_val1 = load_ptr[0];
	 * do {
	 *	load_val0 = load_ptr[-1];
	 *	store_ptr -= 2;
	 *	store_ptr[1] = (load_val0 >> {a6}) | (load_val1 << {a7});
	 *
	 *	if (store_ptr == {a2})
	 *		break;
	 *
	 *	load_val1 = load_ptr[-2];
	 *	load_ptr -= 2;
	 *	store_ptr[0] = (load_val1 >> {a6}) | (load_val0 << {a7});
	 *
	 * } while (store_ptr != store_ptr_end);
	 * store_ptr = store_ptr_end;
	 */
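
	/*
	 * As in the forward loop, this is unrolled by two.  Here
	 * a2 (= t5 - SZREG) is where the dest index lands right after
	 * the first half of an iteration stores the final (lowest)
	 * aligned word, so the mid-loop break covers an odd word count
	 * and the bottom-of-loop test against t5 covers the even case.
	 */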

	REG_L t1, ( 0 * SZREG)(a4)
	1:
	REG_L t0, (-1 * SZREG)(a4)
	addi t4, t4, (-2 * SZREG)
	sll t1, t1, a7
	srl t2, t0, a6
	or t2, t1, t2
	REG_S t2, ( 1 * SZREG)(t4)

	beq t4, a2, 2f

	REG_L t1, (-2 * SZREG)(a4)
	addi a4, a4, (-2 * SZREG)
	sll t0, t0, a7
	srl t2, t1, a6
	or t2, t0, t2
	REG_S t2, ( 0 * SZREG)(t4)

	bne t4, t5, 1b
	2:
	mv t4, t5 /* Fix the dest pointer in case the loop was broken */

	add a4, t4, a5 /* Restore the src pointer */
	j .Lbyte_copy_reverse /* Copy any remaining bytes */

/*
 * Simple copy loops for SZREG co-aligned memory locations.
 * These also make calls to do byte copies for any unaligned
 * data at the head and tail of the region.
 */
.Lcoaligned_copy:
	bltu a1, a0, .Lcoaligned_copy_reverse

.Lcoaligned_copy_forward:
	jal t0, .Lbyte_copy_until_aligned_forward

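	/*
	 * Co-aligned Copy Loop - Forward
	 * Roughly:
	 * do {
	 *	val = *load_ptr++;
	 *	*store_ptr++ = val;
	 * } while (store_ptr != {t6});
	 */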
	1:
	REG_L t1, ( 0 * SZREG)(a1)
	addi a1, a1, SZREG
	addi t3, t3, SZREG
	REG_S t1, (-1 * SZREG)(t3)
	bne t3, t6, 1b

	j .Lbyte_copy_forward /* Copy any remaining bytes */

.Lcoaligned_copy_reverse:
	jal t0, .Lbyte_copy_until_aligned_reverse

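	/*
	 * Co-aligned Copy Loop - Reverse
	 * Roughly:
	 * do {
	 *	val = *--load_ptr;
	 *	*--store_ptr = val;
	 * } while (store_ptr != {t5});
	 */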
	1:
	REG_L t1, (-1 * SZREG)(a4)
	addi a4, a4, -SZREG
	addi t4, t4, -SZREG
	REG_S t1, ( 0 * SZREG)(t4)
	bne t4, t5, 1b

	j .Lbyte_copy_reverse /* Copy any remaining bytes */

/*
 * These are basically sub-functions within the function. They
 * are used to byte copy until the dest pointer is in alignment,
 * at which point a bulk copy method can be used by the calling
 * code. These work on the same registers as the bulk copy
 * loops. Therefore, the register values can be picked up from
 * where they were left, and we avoid code duplication without
 * any overhead except the call-in and return jumps.
 */
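
/*
 * These helpers are entered with "jal t0, ..." rather than a normal
 * call, so t0 holds the return address and ra is left untouched;
 * "jalr zero, 0x0(t0)" below jumps back to that address.
 */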
.Lbyte_copy_until_aligned_forward:
	beq t3, t5, 2f
	1:
	lb t1, 0(a1)
	addi a1, a1, 1
	addi t3, t3, 1
	sb t1, -1(t3)
	bne t3, t5, 1b
	2:
	jalr zero, 0x0(t0) /* Return to multibyte copy loop */

.Lbyte_copy_until_aligned_reverse:
	beq t4, t6, 2f
	1:
	lb t1, -1(a4)
	addi a4, a4, -1
	addi t4, t4, -1
	sb t1, 0(t4)
	bne t4, t6, 1b
	2:
	jalr zero, 0x0(t0) /* Return to multibyte copy loop */

/*
 * Simple byte copy loops.
 * These will byte copy until they reach the end of the data to
 * copy, at which point they return from memmove.
 */
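/*
 * Roughly: while (store_ptr != store_ptr_end) *store_ptr++ = *load_ptr++;
 * (mirrored for the reverse direction).
 */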
.Lbyte_copy:
	bltu a1, a0, .Lbyte_copy_reverse

.Lbyte_copy_forward:
	beq t3, t4, 2f
	1:
	lb t1, 0(a1)
	addi a1, a1, 1
	addi t3, t3, 1
	sb t1, -1(t3)
	bne t3, t4, 1b
	2:
	ret

.Lbyte_copy_reverse:
	beq t4, t3, 2f
	1:
	lb t1, -1(a4)
	addi a4, a4, -1
	addi t4, t4, -1
	sb t1, 0(t4)
	bne t4, t3, 1b
	2:

.Lreturn_from_memmove:
	ret

SYM_FUNC_END(__memmove)
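
/*
 * memmove is a weak alias so instrumented builds (e.g. KASAN) can
 * provide their own wrapper that still falls back to __memmove.
 * The __pi_* aliases most likely expose the routine to
 * position-independent early boot code under its expected names.
 */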
SYM_FUNC_ALIAS_WEAK(memmove, __memmove)
SYM_FUNC_ALIAS(__pi_memmove, __memmove)
SYM_FUNC_ALIAS(__pi___memmove, __memmove)