/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2022 Michael T. Kloos <michael@michaelkloos.com>
 */

#include <linux/linkage.h>
#include <asm/asm.h>

SYM_FUNC_START(__memmove)
	/*
	 * Returns
	 *   a0 - dest
	 *
	 * Parameters
	 *   a0 - Inclusive first byte of dest
	 *   a1 - Inclusive first byte of src
	 *   a2 - Length of copy n
	 *
	 * Because the return matches the parameter register a0,
	 * we will not clobber or modify that register.
	 *
	 * Note: This currently only works on little-endian.
	 * To port to big-endian, reverse the direction of shifts
	 * in the 2 misaligned fixup copy loops.
	 */
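
	/*
	 * For orientation, the overall flow in rough C-like pseudocode
	 * (an illustrative sketch, not a literal translation of the
	 * code below):
	 *
	 * if (dest == src || n == 0)
	 * 	return dest;
	 * if (src > dest)
	 * 	copy forward, lowest address first;
	 * else
	 * 	copy in reverse, highest address first;
	 * return dest;
	 */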

	/* Return if nothing to do */
	beq  a0, a1, .Lreturn_from_memmove
	beqz a2, .Lreturn_from_memmove

	/*
	 * Register Uses
	 *   Forward Copy: a1 - Index counter of src
	 *   Reverse Copy: a4 - Index counter of src
	 *   Forward Copy: t3 - Index counter of dest
	 *   Reverse Copy: t4 - Index counter of dest
	 *   Both Copy Modes: t5 - Inclusive first SZREG-aligned address of dest
	 *   Both Copy Modes: t6 - Non-inclusive last SZREG-aligned address of dest
	 *   Both Copy Modes: t0 - Link / Temporary for load-store
	 *   Both Copy Modes: t1 - Temporary for load-store
	 *   Both Copy Modes: t2 - Temporary for load-store
	 *   Both Copy Modes: a5 - dest to src alignment offset
	 *   Both Copy Modes: a6 - Shift amount
	 *   Both Copy Modes: a7 - Inverse shift amount
	 *   Both Copy Modes: a2 - Alternate breakpoint for unrolled loops
	 */

	/*
	 * Solve for some register values now.
	 * Byte copy does not need t5 or t6.
	 */
	mv   t3, a0
	add  t4, a0, a2
	add  a4, a1, a2
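	/* t4 and a4 are exclusive end pointers: dest + n and src + n. */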

	/*
	 * Byte copy if copying less than (2 * SZREG) bytes. This can
	 * cause problems with the bulk copy implementation and is
	 * small enough not to bother.
	 */
	andi t0, a2, -(2 * SZREG)
	beqz t0, .Lbyte_copy
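	/*
	 * Example (RV64, SZREG = 8): -(2 * SZREG) = ~0xf, so t0 is
	 * zero exactly when n < 16 and the bulk paths are skipped.
	 */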

	/*
	 * Now solve for t5 and t6.
	 */
	andi t5, t3, -SZREG
	andi t6, t4, -SZREG
	/*
	 * If dest (register t3), rounded down to the nearest naturally
	 * aligned SZREG address, does not equal dest, then add SZREG
	 * to find the low-bound of SZREG alignment in the dest memory
	 * region. Note that this could overshoot the dest memory
	 * region if n is less than SZREG. This is one reason why
	 * we always byte copy if n is less than SZREG.
	 * Otherwise, dest is already naturally aligned to SZREG.
	 */
	beq  t5, t3, 1f
	addi t5, t5, SZREG
1:
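
	/*
	 * Example (RV64, SZREG = 8): for dest = 0x1003,
	 * t5 = 0x1003 & ~0x7 = 0x1000 != 0x1003, so t5 is bumped to
	 * 0x1008, the first SZREG-aligned address inside dest.
	 */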

	/*
	 * If the dest and src are co-aligned to SZREG, then there is
	 * no need for the full rigmarole of a misaligned fixup copy.
	 * Instead, do a simpler co-aligned copy.
	 */
	xor  t0, a0, a1
	andi t1, t0, (SZREG - 1)
	beqz t1, .Lcoaligned_copy
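	/*
	 * Example (RV64): dest = 0x2003 and src = 0x500b are both
	 * 3 bytes past an 8-byte boundary, so
	 * (0x2003 ^ 0x500b) & 0x7 == 0 and the co-aligned path is taken.
	 */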
	/* Fall through to misaligned fixup copy */

.Lmisaligned_fixup_copy:
	bltu a1, a0, .Lmisaligned_fixup_copy_reverse

.Lmisaligned_fixup_copy_forward:
	jal  t0, .Lbyte_copy_until_aligned_forward

	andi a5, a1, (SZREG - 1) /* Find the alignment offset of src (a1) */
	slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
	sub  a5, a1, t3 /* Find the difference between src and dest */
	andi a1, a1, -SZREG /* Align the src pointer */
	addi a2, t6, SZREG /* The other breakpoint for the unrolled loop */
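	/*
	 * Note: a5 is computed before a1 is aligned, so the src
	 * pointer can later be recovered as t3 + a5 once t3 has
	 * advanced (see "Restore the src pointer" below).
	 */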

	/*
	 * Compute The Inverse Shift
	 * a7 = XLEN - a6 = XLEN + -a6
	 * Two's complement negation to find the negative: -a6 = ~a6 + 1
	 * Add that to XLEN. XLEN = SZREG * 8.
	 */
	not  a7, a6
	addi a7, a7, (SZREG * 8 + 1)
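	/*
	 * For example, on RV64 with a6 = 24: ~24 = -25, and
	 * -25 + (64 + 1) = 40 = 64 - 24, as intended.
	 */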

	/*
	 * Fix Misalignment Copy Loop - Forward
	 * load_val0 = load_ptr[0];
	 * do {
	 * 	load_val1 = load_ptr[1];
	 * 	store_ptr += 2;
	 * 	store_ptr[0 - 2] = (load_val0 >> {a6}) | (load_val1 << {a7});
	 *
	 * 	if (store_ptr == {a2})
	 * 		break;
	 *
	 * 	load_val0 = load_ptr[2];
	 * 	load_ptr += 2;
	 * 	store_ptr[1 - 2] = (load_val1 >> {a6}) | (load_val0 << {a7});
	 *
	 * } while (store_ptr != store_ptr_end);
	 * store_ptr = store_ptr_end;
	 */
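	/*
	 * Example (RV64, little-endian): if src is 1 byte past
	 * alignment, a6 = 8 and a7 = 56, so each aligned store
	 * combines the upper 7 bytes of one aligned src word with
	 * the lowest byte of the next one.
	 */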

	REG_L t0, (0 * SZREG)(a1)
1:
	REG_L t1, (1 * SZREG)(a1)
	addi  t3, t3, (2 * SZREG)
	srl   t0, t0, a6
	sll   t2, t1, a7
	or    t2, t0, t2
	REG_S t2, ((0 * SZREG) - (2 * SZREG))(t3)

	beq   t3, a2, 2f

	REG_L t0, (2 * SZREG)(a1)
	addi  a1, a1, (2 * SZREG)
	srl   t1, t1, a6
	sll   t2, t0, a7
	or    t2, t1, t2
	REG_S t2, ((1 * SZREG) - (2 * SZREG))(t3)

	bne   t3, t6, 1b
2:
	mv    t3, t6 /* Fix the dest pointer in case the loop was broken */

	add  a1, t3, a5 /* Restore the src pointer */
	j    .Lbyte_copy_forward /* Copy any remaining bytes */

.Lmisaligned_fixup_copy_reverse:
	jal  t0, .Lbyte_copy_until_aligned_reverse

	andi a5, a4, (SZREG - 1) /* Find the alignment offset of src (a4) */
	slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
	sub  a5, a4, t4 /* Find the difference between src and dest */
	andi a4, a4, -SZREG /* Align the src pointer */
	addi a2, t5, -SZREG /* The other breakpoint for the unrolled loop */

	/*
	 * Compute The Inverse Shift
	 * a7 = XLEN - a6 = XLEN + -a6
	 * Two's complement negation to find the negative: -a6 = ~a6 + 1
	 * Add that to XLEN. XLEN = SZREG * 8.
	 */
	not  a7, a6
	addi a7, a7, (SZREG * 8 + 1)

	/*
	 * Fix Misalignment Copy Loop - Reverse
	 * load_val1 = load_ptr[0];
	 * do {
	 * 	load_val0 = load_ptr[-1];
	 * 	store_ptr -= 2;
	 * 	store_ptr[1] = (load_val0 >> {a6}) | (load_val1 << {a7});
	 *
	 * 	if (store_ptr == {a2})
	 * 		break;
	 *
	 * 	load_val1 = load_ptr[-2];
	 * 	load_ptr -= 2;
	 * 	store_ptr[0] = (load_val1 >> {a6}) | (load_val0 << {a7});
	 *
	 * } while (store_ptr != store_ptr_end);
	 * store_ptr = store_ptr_end;
	 */

	REG_L t1, ( 0 * SZREG)(a4)
1:
	REG_L t0, (-1 * SZREG)(a4)
	addi  t4, t4, (-2 * SZREG)
	sll   t1, t1, a7
	srl   t2, t0, a6
	or    t2, t1, t2
	REG_S t2, ( 1 * SZREG)(t4)

	beq   t4, a2, 2f

	REG_L t1, (-2 * SZREG)(a4)
	addi  a4, a4, (-2 * SZREG)
	sll   t0, t0, a7
	srl   t2, t1, a6
	or    t2, t0, t2
	REG_S t2, ( 0 * SZREG)(t4)

	bne   t4, t5, 1b
2:
	mv    t4, t5 /* Fix the dest pointer in case the loop was broken */

	add  a4, t4, a5 /* Restore the src pointer */
	j    .Lbyte_copy_reverse /* Copy any remaining bytes */

	/*
	 * Simple copy loops for SZREG co-aligned memory locations.
	 * These also call out to the byte copy helpers to handle any
	 * unaligned data at either end.
	 */
.Lcoaligned_copy:
	bltu a1, a0, .Lcoaligned_copy_reverse

.Lcoaligned_copy_forward:
	jal  t0, .Lbyte_copy_until_aligned_forward

1:
	REG_L t1, ( 0 * SZREG)(a1)
	addi  a1, a1, SZREG
	addi  t3, t3, SZREG
	REG_S t1, (-1 * SZREG)(t3)
	bne   t3, t6, 1b

	j .Lbyte_copy_forward /* Copy any remaining bytes */

.Lcoaligned_copy_reverse:
	jal  t0, .Lbyte_copy_until_aligned_reverse

1:
	REG_L t1, (-1 * SZREG)(a4)
	addi  a4, a4, -SZREG
	addi  t4, t4, -SZREG
	REG_S t1, ( 0 * SZREG)(t4)
	bne   t4, t5, 1b

	j .Lbyte_copy_reverse /* Copy any remaining bytes */

	/*
	 * These are basically sub-functions within the function. They
	 * are used to byte copy until the dest pointer is in alignment,
	 * at which point a bulk copy method can be used by the
	 * calling code. These work on the same registers as the bulk
	 * copy loops. Therefore, the register values can be picked
	 * up from where they were left and we avoid code duplication
	 * without any overhead except the call in and return jumps.
	 */
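	/*
	 * The linkage is a plain register pair: "jal t0, <label>"
	 * records the return address in t0, and "jalr zero, 0x0(t0)"
	 * jumps back to it, giving a cheap call/return that touches
	 * neither ra nor the stack.
	 */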
.Lbyte_copy_until_aligned_forward:
	beq  t3, t5, 2f
1:
	lb   t1, 0(a1)
	addi a1, a1, 1
	addi t3, t3, 1
	sb   t1, -1(t3)
	bne  t3, t5, 1b
2:
	jalr zero, 0x0(t0) /* Return to multibyte copy loop */

.Lbyte_copy_until_aligned_reverse:
	beq  t4, t6, 2f
1:
	lb   t1, -1(a4)
	addi a4, a4, -1
	addi t4, t4, -1
	sb   t1, 0(t4)
	bne  t4, t6, 1b
2:
	jalr zero, 0x0(t0) /* Return to multibyte copy loop */

	/*
	 * Simple byte copy loops.
	 * These will byte copy until they reach the end of the data to
	 * copy, at which point they return from memmove.
	 */
.Lbyte_copy:
	bltu a1, a0, .Lbyte_copy_reverse

.Lbyte_copy_forward:
	beq  t3, t4, 2f
1:
	lb   t1, 0(a1)
	addi a1, a1, 1
	addi t3, t3, 1
	sb   t1, -1(t3)
	bne  t3, t4, 1b
2:
	ret

.Lbyte_copy_reverse:
	beq  t4, t3, 2f
1:
	lb   t1, -1(a4)
	addi a4, a4, -1
	addi t4, t4, -1
	sb   t1, 0(t4)
	bne  t4, t3, 1b
2:

.Lreturn_from_memmove:
	ret

SYM_FUNC_END(__memmove)
SYM_FUNC_ALIAS_WEAK(memmove, __memmove)
SYM_FUNC_ALIAS(__pi_memmove, __memmove)
SYM_FUNC_ALIAS(__pi___memmove, __memmove)