1 | /* SPDX-License-Identifier: GPL-2.0 */ |
2 | /* |
3 | * arch/alpha/lib/ev6-memcpy.S |
4 | * 21264 version by Rick Gorton <rick.gorton@alpha-processor.com> |
5 | * |
6 | * Reasonably optimized memcpy() routine for the Alpha 21264 |
7 | * |
8 | * - memory accessed as aligned quadwords only |
 * - misaligned source data is merged a quadword at a time
 *   using ldq_u plus extql/extqh (no byte compares are needed)
10 | * |
11 | * Much of the information about 21264 scheduling/coding comes from: |
12 | * Compiler Writer's Guide for the Alpha 21264 |
13 | * abbreviated as 'CWG' in other comments here |
14 | * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html |
15 | * Scheduling notation: |
16 | * E - either cluster |
17 | * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1 |
18 | * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1 |
19 | * |
20 | * Temp usage notes: |
 * $1-$7 - scratch
22 | */ |
23 | #include <linux/export.h> |
24 | .set noreorder |
25 | .set noat |
26 | |
27 | .align 4 |
28 | .globl memcpy |
29 | .ent memcpy |
30 | memcpy: |
31 | .frame $30,0,$26,0 |
32 | .prologue 0 |
33 | |
34 | mov $16, $0 # E : copy dest to return |
35 | ble $18, $nomoredata # U : done with the copy? |
36 | xor $16, $17, $1 # E : are source and dest alignments the same? |
37 | and $1, 7, $1 # E : are they the same mod 8? |
38 | |
39 | bne $1, $misaligned # U : Nope - gotta do this the slow way |
40 | /* source and dest are same mod 8 address */ |
41 | and $16, 7, $1 # E : Are both 0mod8? |
42 | beq $1, $both_0mod8 # U : Yes |
43 | nop # E : |
44 | |
45 | /* |
46 | * source and dest are same misalignment. move a byte at a time |
47 | * until a 0mod8 alignment for both is reached. |
48 | * At least one byte more to move |
49 | */ |
50 | |
51 | $head_align: |
52 | ldbu $1, 0($17) # L : grab a byte |
53 | subq $18, 1, $18 # E : count-- |
54 | addq $17, 1, $17 # E : src++ |
55 | stb $1, 0($16) # L : |
56 | addq $16, 1, $16 # E : dest++ |
57 | and $16, 7, $1 # E : Are we at 0mod8 yet? |
58 | ble $18, $nomoredata # U : done with the copy? |
59 | bne $1, $head_align # U : |
60 | |
61 | $both_0mod8: |
62 | cmple $18, 127, $1 # E : Can we unroll the loop? |
63 | bne $1, $no_unroll # U : |
64 | and $16, 63, $1 # E : get mod64 alignment |
65 | beq $1, $do_unroll # U : no single quads to fiddle |
66 | |
67 | $single_head_quad: |
68 | ldq $1, 0($17) # L : get 8 bytes |
69 | subq $18, 8, $18 # E : count -= 8 |
70 | addq $17, 8, $17 # E : src += 8 |
71 | nop # E : |
72 | |
73 | stq $1, 0($16) # L : store |
74 | addq $16, 8, $16 # E : dest += 8 |
75 | and $16, 63, $1 # E : get mod64 alignment |
76 | bne $1, $single_head_quad # U : still not fully aligned |
77 | |
78 | $do_unroll: |
79 | addq $16, 64, $7 # E : Initial (+1 trip) wh64 address |
80 | cmple $18, 127, $1 # E : Can we go through the unrolled loop? |
81 | bne $1, $tail_quads # U : Nope |
82 | nop # E : |
83 | |
84 | $unroll_body: |
85 | wh64 ($7) # L1 : memory subsystem hint: 64 bytes at |
86 | # ($7) are about to be over-written |
87 | ldq $6, 0($17) # L0 : bytes 0..7 |
88 | nop # E : |
89 | nop # E : |
90 | |
91 | ldq $4, 8($17) # L : bytes 8..15 |
92 | ldq $5, 16($17) # L : bytes 16..23 |
93 | addq $7, 64, $7 # E : Update next wh64 address |
94 | nop # E : |
95 | |
96 | ldq $3, 24($17) # L : bytes 24..31 |
97 | addq $16, 64, $1 # E : fallback value for wh64 |
98 | nop # E : |
99 | nop # E : |
100 | |
101 | addq $17, 32, $17 # E : src += 32 bytes |
102 | stq $6, 0($16) # L : bytes 0..7 |
103 | nop # E : |
104 | nop # E : |
105 | |
106 | stq $4, 8($16) # L : bytes 8..15 |
107 | stq $5, 16($16) # L : bytes 16..23 |
108 | subq $18, 192, $2 # E : At least two more trips to go? |
109 | nop # E : |
110 | |
111 | stq $3, 24($16) # L : bytes 24..31 |
112 | addq $16, 32, $16 # E : dest += 32 bytes |
113 | nop # E : |
114 | nop # E : |
115 | |
116 | ldq $6, 0($17) # L : bytes 0..7 |
117 | ldq $4, 8($17) # L : bytes 8..15 |
118 | cmovlt $2, $1, $7 # E : Latency 2, extra map slot - Use |
119 | # fallback wh64 address if < 2 more trips |
120 | nop # E : |
121 | |
122 | ldq $5, 16($17) # L : bytes 16..23 |
123 | ldq $3, 24($17) # L : bytes 24..31 |
124 | addq $16, 32, $16 # E : dest += 32 |
125 | subq $18, 64, $18 # E : count -= 64 |
126 | |
127 | addq $17, 32, $17 # E : src += 32 |
128 | stq $6, -32($16) # L : bytes 0..7 |
129 | stq $4, -24($16) # L : bytes 8..15 |
130 | cmple $18, 63, $1 # E : At least one more trip? |
131 | |
132 | stq $5, -16($16) # L : bytes 16..23 |
133 | stq $3, -8($16) # L : bytes 24..31 |
134 | nop # E : |
135 | beq $1, $unroll_body |
136 | |
137 | $tail_quads: |
138 | $no_unroll: |
139 | .align 4 |
140 | subq $18, 8, $18 # E : At least a quad left? |
141 | blt $18, $less_than_8 # U : Nope |
142 | nop # E : |
143 | nop # E : |
144 | |
145 | $move_a_quad: |
146 | ldq $1, 0($17) # L : fetch 8 |
147 | subq $18, 8, $18 # E : count -= 8 |
148 | addq $17, 8, $17 # E : src += 8 |
149 | nop # E : |
150 | |
151 | stq $1, 0($16) # L : store 8 |
152 | addq $16, 8, $16 # E : dest += 8 |
153 | bge $18, $move_a_quad # U : |
154 | nop # E : |
155 | |
156 | $less_than_8: |
157 | .align 4 |
158 | addq $18, 8, $18 # E : add back for trailing bytes |
159 | ble $18, $nomoredata # U : All-done |
160 | nop # E : |
161 | nop # E : |
162 | |
163 | /* Trailing bytes */ |
164 | $tail_bytes: |
165 | subq $18, 1, $18 # E : count-- |
166 | ldbu $1, 0($17) # L : fetch a byte |
167 | addq $17, 1, $17 # E : src++ |
168 | nop # E : |
169 | |
170 | stb $1, 0($16) # L : store a byte |
171 | addq $16, 1, $16 # E : dest++ |
172 | bgt $18, $tail_bytes # U : more to be done? |
173 | nop # E : |
174 | |
175 | /* branching to exit takes 3 extra cycles, so replicate exit here */ |
176 | ret $31, ($26), 1 # L0 : |
177 | nop # E : |
178 | nop # E : |
179 | nop # E : |
180 | |
181 | $misaligned: |
182 | mov $0, $4 # E : dest temp |
183 | and $0, 7, $1 # E : dest alignment mod8 |
184 | beq $1, $dest_0mod8 # U : life doesnt totally suck |
185 | nop |
186 | |
187 | $aligndest: |
188 | ble $18, $nomoredata # U : |
189 | ldbu $1, 0($17) # L : fetch a byte |
190 | subq $18, 1, $18 # E : count-- |
191 | addq $17, 1, $17 # E : src++ |
192 | |
193 | stb $1, 0($4) # L : store it |
194 | addq $4, 1, $4 # E : dest++ |
195 | and $4, 7, $1 # E : dest 0mod8 yet? |
196 | bne $1, $aligndest # U : go until we are aligned. |
197 | |
198 | /* Source has unknown alignment, but dest is known to be 0mod8 */ |
199 | $dest_0mod8: |
200 | subq $18, 8, $18 # E : At least a quad left? |
201 | blt $18, $misalign_tail # U : Nope |
202 | ldq_u $3, 0($17) # L : seed (rotating load) of 8 bytes |
203 | nop # E : |
204 | |
205 | $mis_quad: |
206 | ldq_u $16, 8($17) # L : Fetch next 8 |
207 | extql $3, $17, $3 # U : masking |
208 | extqh $16, $17, $1 # U : masking |
209 | bis $3, $1, $1 # E : merged bytes to store |
210 | |
211 | subq $18, 8, $18 # E : count -= 8 |
212 | addq $17, 8, $17 # E : src += 8 |
213 | stq $1, 0($4) # L : store 8 (aligned) |
214 | mov $16, $3 # E : "rotate" source data |
215 | |
216 | addq $4, 8, $4 # E : dest += 8 |
217 | bge $18, $mis_quad # U : More quads to move |
218 | nop |
219 | nop |
220 | |
221 | $misalign_tail: |
222 | addq $18, 8, $18 # E : account for tail stuff |
223 | ble $18, $nomoredata # U : |
224 | nop |
225 | nop |
226 | |
227 | $misalign_byte: |
228 | ldbu $1, 0($17) # L : fetch 1 |
229 | subq $18, 1, $18 # E : count-- |
230 | addq $17, 1, $17 # E : src++ |
231 | nop # E : |
232 | |
233 | stb $1, 0($4) # L : store |
234 | addq $4, 1, $4 # E : dest++ |
235 | bgt $18, $misalign_byte # U : more to go? |
236 | nop |
237 | |
238 | |
239 | $nomoredata: |
240 | ret $31, ($26), 1 # L0 : |
241 | nop # E : |
242 | nop # E : |
243 | nop # E : |
244 | |
245 | .end memcpy |
246 | EXPORT_SYMBOL(memcpy) |
247 | |
248 | /* For backwards module compatibility. */ |
249 | __memcpy = memcpy |
250 | .globl __memcpy |
251 | |