/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * User Space Access Routines
 *
 * Copyright (C) 2000-2002 Hewlett-Packard (John Marvin)
 * Copyright (C) 2000 Richard Hirst <rhirst with parisc-linux.org>
 * Copyright (C) 2001 Matthieu Delahaye <delahaym at esiee.fr>
 * Copyright (C) 2003 Randolph Chung <tausq with parisc-linux.org>
 * Copyright (C) 2017 Helge Deller <deller@gmx.de>
 * Copyright (C) 2017 John David Anglin <dave.anglin@bell.net>
 */

/*
 * These routines still have plenty of room for optimization
 * (word & doubleword load/store, dual issue, store hints, etc.).
 */

/*
 * The following routines assume that space register 3 (sr3) contains
 * the space id associated with the current user's address space.
 */


	.text

#include <asm/assembly.h>
#include <asm/errno.h>
#include <linux/linkage.h>

	/*
	 * unsigned long lclear_user(void *to, unsigned long n)
	 *
	 * Returns 0 on success; otherwise, returns the number of
	 * bytes that could not be cleared.
	 */

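	/*
	 * A rough C equivalent of the routine below, for illustration
	 * only (the real code stores through %sr3 and lets the exception
	 * table handle faults; this sketch is not part of the interface):
	 *
	 *	unsigned long lclear_user(void *to, unsigned long n)
	 *	{
	 *		char *p = to;
	 *
	 *		while (n) {
	 *			*p++ = 0;	// a fault here makes the
	 *			n--;		// fixup return the current n
	 *		}
	 *		return n;		// 0 when everything was cleared
	 *	}
	 */
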
ENTRY_CFI(lclear_user)
	comib,=,n	0,%r25,$lclu_done
$lclu_loop:
	addib,<>	-1,%r25,$lclu_loop
1:	stbs,ma		%r0,1(%sr3,%r26)

$lclu_done:
	bv	%r0(%r2)
	copy	%r25,%r28

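	/*
	 * Fixup for a fault in the store at 1: above.  %r25 was already
	 * decremented for the byte that could not be stored, so add it
	 * back before returning the remaining count via $lclu_done.
	 */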
2:	b	$lclu_done
	ldo	1(%r25),%r25

	ASM_EXCEPTIONTABLE_ENTRY(1b,2b)
ENDPROC_CFI(lclear_user)


/*
 * unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len)
 *
 * Inputs:
 * - sr1 already contains the space of the source region
 * - sr2 already contains the space of the destination region
 *
 * Returns:
 * - number of bytes that could not be copied.
 *   On success, this will be zero.
 *
 * This code is based on a C implementation of a copy routine written by
 * Randolph Chung, which in turn was derived from the glibc version.
 *
 * Several strategies are used to get the best performance under various
 * conditions. In the optimal case, we copy in loops that move 32 or 16 bytes
 * at a time using general registers. Unaligned copies are handled either by
 * aligning the destination and then using a shift-and-write method, or in a
 * few cases by falling back to a byte-at-a-time copy.
 *
 * Testing with various alignments and buffer sizes shows that this code is
 * often >10x faster than a simple byte-at-a-time copy, even for strangely
 * aligned operands. It is interesting to note that the glibc version of
 * memcpy (written in C) is already quite fast. This routine beats it by
 * 30-40% for aligned copies because of the loop unrolling, but in some cases
 * the glibc version is still slightly faster. This lends more credibility to
 * the idea that gcc can generate very good code as long as we are careful.
 *
 * Possible optimizations:
 * - add cache prefetching
 * - try not to use the post-increment address modifiers; they may create
 *   additional interlocks. The assumption is that those were only efficient
 *   on older machines (pre-PA8000 processors).
 */
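
/*
 * A very rough C sketch of the aligned path below, for orientation only.
 * Fault handling, the space registers, the 64-bit doubleword loop and the
 * shift-merge path for unaligned operands are all omitted, and the names
 * are illustrative rather than part of this file:
 *
 *	unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len)
 *	{
 *		unsigned char *d = dstp;
 *		const unsigned char *s = srcp;
 *
 *		if (len >= 16 && !(((unsigned long)d ^ (unsigned long)s) & 3)) {
 *			while ((unsigned long)d & 3) {		// align dst
 *				*d++ = *s++;
 *				len--;
 *			}
 *			while (len >= 16) {			// four words per pass
 *				unsigned int *dw = (unsigned int *)d;
 *				const unsigned int *sw = (const unsigned int *)s;
 *				dw[0] = sw[0]; dw[1] = sw[1];
 *				dw[2] = sw[2]; dw[3] = sw[3];
 *				d += 16; s += 16; len -= 16;
 *			}
 *		}
 *		while (len--)					// byte tail / fallback
 *			*d++ = *s++;
 *		return 0;	// the real routine returns the number of bytes
 *				// left uncopied when a fault is taken
 *	}
 */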

	dst = arg0
	src = arg1
	len = arg2
	end = arg3
	t1  = r19
	t2  = r20
	t3  = r21
	t4  = r22
	srcspc = sr1
	dstspc = sr2

	t0 = r1
	a1 = t1
	a2 = t2
	a3 = t3
	a0 = t4

	save_src = ret0
	save_dst = ret1
	save_len = r31
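
	/* a0-a3 form the rotating window of source words used by the
	   shift-merge loop below; save_src/save_dst/save_len capture the
	   state at .Lcopy_dstaligned so that .Lcda_finish can restart the
	   remainder as a byte copy. */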

ENTRY_CFI(pa_memcpy)
	/* Last destination address */
	add	dst,len,end

	/* short copy with fewer than 16 bytes? */
	cmpib,COND(>>=),n 15,len,.Lbyte_loop

	/* do src and dst share the same word alignment? */
	xor	src,dst,t0
	extru	t0,31,2,t1
	cmpib,<>,n	0,t1,.Lunaligned_copy

#ifdef CONFIG_64BIT
	/* only do 64-bit copies if we can get aligned. */
	extru	t0,31,3,t1
	cmpib,<>,n	0,t1,.Lalign_loop32

	/* loop until we are 64-bit aligned */
.Lalign_loop64:
	extru	dst,31,3,t1
	cmpib,=,n	0,t1,.Lcopy_loop_16_start
20:	ldb,ma	1(srcspc,src),t1
21:	stb,ma	t1,1(dstspc,dst)
	b	.Lalign_loop64
	ldo	-1(len),len

	ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)

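	/* Main 64-bit path: copy 32 bytes (four doublewords) per
	   iteration for as long as at least 32 bytes remain; t0 holds
	   the constant 31 used as the loop threshold. */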
.Lcopy_loop_16_start:
	ldi	31,t0
.Lcopy_loop_16:
	cmpb,COND(>>=),n t0,len,.Lword_loop

10:	ldd	0(srcspc,src),t1
11:	ldd	8(srcspc,src),t2
	ldo	16(src),src
12:	std,ma	t1,8(dstspc,dst)
13:	std,ma	t2,8(dstspc,dst)
14:	ldd	0(srcspc,src),t1
15:	ldd	8(srcspc,src),t2
	ldo	16(src),src
16:	std,ma	t1,8(dstspc,dst)
17:	std,ma	t2,8(dstspc,dst)

	ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(11b,.Lcopy16_fault)
	ASM_EXCEPTIONTABLE_ENTRY(12b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(13b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(14b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(15b,.Lcopy16_fault)
	ASM_EXCEPTIONTABLE_ENTRY(16b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(17b,.Lcopy_done)

	b	.Lcopy_loop_16
	ldo	-32(len),len

.Lword_loop:
	cmpib,COND(>>=),n 3,len,.Lbyte_loop
20:	ldw,ma	4(srcspc,src),t1
21:	stw,ma	t1,4(dstspc,dst)
	b	.Lword_loop
	ldo	-4(len),len

	ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)

#endif /* CONFIG_64BIT */

	/* loop until we are 32-bit aligned */
.Lalign_loop32:
	extru	dst,31,2,t1
	cmpib,=,n	0,t1,.Lcopy_loop_8
20:	ldb,ma	1(srcspc,src),t1
21:	stb,ma	t1,1(dstspc,dst)
	b	.Lalign_loop32
	ldo	-1(len),len

	ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)


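	/* 32-bit path: copy 16 bytes (four words) per iteration for as
	   long as at least 16 bytes remain. */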
.Lcopy_loop_8:
	cmpib,COND(>>=),n 15,len,.Lbyte_loop

10:	ldw	0(srcspc,src),t1
11:	ldw	4(srcspc,src),t2
12:	stw,ma	t1,4(dstspc,dst)
13:	stw,ma	t2,4(dstspc,dst)
14:	ldw	8(srcspc,src),t1
15:	ldw	12(srcspc,src),t2
	ldo	16(src),src
16:	stw,ma	t1,4(dstspc,dst)
17:	stw,ma	t2,4(dstspc,dst)

	ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(11b,.Lcopy8_fault)
	ASM_EXCEPTIONTABLE_ENTRY(12b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(13b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(14b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(15b,.Lcopy8_fault)
	ASM_EXCEPTIONTABLE_ENTRY(16b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(17b,.Lcopy_done)

	b	.Lcopy_loop_8
	ldo	-16(len),len

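	/* Byte-at-a-time tail copy.  The cmpclr below nullifies the
	   branch to .Lcopy_done as long as len is non-zero. */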
.Lbyte_loop:
	cmpclr,COND(<>) len,%r0,%r0
	b,n	.Lcopy_done
20:	ldb	0(srcspc,src),t1
	ldo	1(src),src
21:	stb,ma	t1,1(dstspc,dst)
	b	.Lbyte_loop
	ldo	-1(len),len

	ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)

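	/* Return the number of bytes that could not be copied:
	   'end' was set to the original dst + len on entry, and dst
	   points just past the last byte actually stored. */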
.Lcopy_done:
	bv	%r0(%r2)
	sub	end,dst,ret0


	/* src and dst are not aligned the same way. */
	/* need to go the hard way */
.Lunaligned_copy:
	/* align until dst is 32-bit word aligned */
	extru	dst,31,2,t1
	cmpib,=,n	0,t1,.Lcopy_dstaligned
20:	ldb	0(srcspc,src),t1
	ldo	1(src),src
21:	stb,ma	t1,1(dstspc,dst)
	b	.Lunaligned_copy
	ldo	-1(len),len

	ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)

.Lcopy_dstaligned:

	/* store src, dst and len in a safe place */
	copy	src,save_src
	copy	dst,save_dst
	copy	len,save_len

	/* len now needs to be the number of words to copy */
	SHRREG	len,2,len

	/*
	 * Copy from a not-aligned src to an aligned dst using shifts.
	 * Handles 4 words per loop.
	 */

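	/*
	 * %sar is loaded with 32 - 8*(src & 3) and src is then rounded
	 * down to a word boundary, so every destination word can be
	 * built with shrpw from two consecutive source words.  Roughly,
	 * in C (illustrative only; src & 3 is non-zero on this path and
	 * the words are in big-endian order as on parisc):
	 *
	 *	t0 = (a2 << (8 * (src & 3))) | (a3 >> (32 - 8 * (src & 3)));
	 *
	 * The extru/cmpib sequence below dispatches on len % 4 so that
	 * the main loop (.Ldo4 .. .Ldo1) can retire four words per
	 * iteration regardless of the initial remainder.
	 */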
	depw,z	src,28,2,t0
	subi	32,t0,t0
	mtsar	t0
	extru	len,31,2,t0
	cmpib,=	2,t0,.Lcase2
	/* Make src aligned by rounding it down. */
	depi	0,31,2,src

	cmpiclr,<>	3,t0,%r0
	b,n	.Lcase3
	cmpiclr,<>	1,t0,%r0
	b,n	.Lcase1
.Lcase0:
	cmpb,COND(=)	%r0,len,.Lcda_finish
	nop

1:	ldw,ma	4(srcspc,src), a3
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
1:	ldw,ma	4(srcspc,src), a0
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	b,n	.Ldo3
.Lcase1:
1:	ldw,ma	4(srcspc,src), a2
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
1:	ldw,ma	4(srcspc,src), a3
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	ldo	-1(len),len
	cmpb,COND(=),n	%r0,len,.Ldo0
.Ldo4:
1:	ldw,ma	4(srcspc,src), a0
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	shrpw	a2, a3, %sar, t0
1:	stw,ma	t0, 4(dstspc,dst)
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)
.Ldo3:
1:	ldw,ma	4(srcspc,src), a1
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	shrpw	a3, a0, %sar, t0
1:	stw,ma	t0, 4(dstspc,dst)
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)
.Ldo2:
1:	ldw,ma	4(srcspc,src), a2
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	shrpw	a0, a1, %sar, t0
1:	stw,ma	t0, 4(dstspc,dst)
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)
.Ldo1:
1:	ldw,ma	4(srcspc,src), a3
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	shrpw	a1, a2, %sar, t0
1:	stw,ma	t0, 4(dstspc,dst)
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)
	ldo	-4(len),len
	cmpb,COND(<>)	%r0,len,.Ldo4
	nop
.Ldo0:
	shrpw	a2, a3, %sar, t0
1:	stw,ma	t0, 4(dstspc,dst)
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)

.Lcda_rdfault:
.Lcda_finish:
	/* calculate new src, dst and len and jump to byte-copy loop */
	sub	dst,save_dst,t0
	add	save_src,t0,src
	b	.Lbyte_loop
	sub	save_len,t0,len

.Lcase3:
1:	ldw,ma	4(srcspc,src), a0
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
1:	ldw,ma	4(srcspc,src), a1
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	b	.Ldo2
	ldo	1(len),len
.Lcase2:
1:	ldw,ma	4(srcspc,src), a1
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
1:	ldw,ma	4(srcspc,src), a2
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	b	.Ldo1
	ldo	2(len),len


	/* fault exception fixup handlers: */
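	/*
	 * If the second load of a pair faults (exception table entries
	 * 11b/15b above), the word fetched by the first load is still
	 * sitting in t1.  These handlers store it, post-incrementing dst,
	 * before dropping into .Lcopy_done so that the returned byte
	 * count matches what actually reached the destination.
	 */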
#ifdef CONFIG_64BIT
.Lcopy16_fault:
	b	.Lcopy_done
10:	std,ma	t1,8(dstspc,dst)
	ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done)
#endif

.Lcopy8_fault:
	b	.Lcopy_done
10:	stw,ma	t1,4(dstspc,dst)
	ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done)
ENDPROC_CFI(pa_memcpy)

	.end