ev6-divide.S source code [linux/arch/alpha/lib/ev6-divide.S]

/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-divide.S
 *
 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Alpha division..
 */

/*
 * The alpha chip doesn't provide hardware division, so we have to do it
 * by hand.  The compiler expects the functions
 *
 *	__divqu: 64-bit unsigned long divide
 *	__remqu: 64-bit unsigned long remainder
 *	__divq/__remq: signed 64-bit
 *	__divlu/__remlu: unsigned 32-bit
 *	__divl/__reml: signed 32-bit
 *
 * These are not normal C functions: instead of the normal
 * calling sequence, these expect their arguments in registers
 * $24 and $25, and return the result in $27.  Register $28 may
 * be clobbered (assembly temporary), anything else must be saved.
 *
 * In short: painful.
 *
 * This is a rather simple bit-at-a-time algorithm: it's very good
 * at dividing random 64-bit numbers, but the more usual case where
 * the divisor is small is handled better by the DEC algorithm
 * using lookup tables.  This uses much less memory, though, and is
 * nicer on the cache..  Besides, I don't know the copyright status
 * of the DEC code.
 */

/*
 * My temporaries:
 *	$0 - current bit
 *	$1 - shifted divisor
 *	$2 - modulus/quotient
 *	$3 - scratch (tmp1)
 *	$4 - scratch (tmp2, divide variant only)
 *
 *	$23 - return address
 *	$24 - dividend
 *	$25 - divisor
 *
 *	$27 - quotient/modulus
 *	$28 - compare status
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * Try not to change the actual algorithm if possible for consistency.
 */

#include <linux/export.h>
#define halt .long 0

/*
 * Select function type and registers
 *
 * This source is a template.  Preprocessing it with DIV defined yields
 * the quotient routines (__div*); without DIV it yields the remainder
 * routines (__rem*).  The value handed back to the caller always lives
 * in $27, so the roles of $2 and $27 swap between the two variants.
 *
 *	mask     - the quotient bit currently being tried
 *	divisor  - working copy of the divisor, shifted left then right
 *	compare  - result of the unsigned comparisons
 *	tmp1     - candidate "modulus - divisor"
 *	tmp2     - candidate "quotient + mask" (only touched when DIV)
 */
#define mask	$0
#define divisor	$1
#define compare $28
#define tmp1	$3
#define tmp2	$4

/*
 *	DIV_ONLY(insn) - emit insn only in the divide variant
 *	MOD_ONLY(insn) - emit insn only in the remainder variant
 *	                 (the "y..." tail lets the instruction contain commas)
 *	func(x)        - build the exported name: __div<x> or __rem<x>
 *	GETSIGN(x)     - put a value whose sign bit is the sign of the result
 *	                 into x: $24 ^ $25 for a quotient, $24 (the dividend)
 *	                 for a remainder
 *	STACK          - frame size in bytes; the divide variant needs one
 *	                 more save slot (tmp2 at offset 32) than the
 *	                 remainder variant
 */
#ifdef DIV
#define DIV_ONLY(x,y...) x,##y
#define MOD_ONLY(x,y...)
#define func(x) __div##x
#define modulus $2
#define quotient $27
#define GETSIGN(x) xor $24,$25,x
#define STACK 48
#else
#define DIV_ONLY(x,y...)
#define MOD_ONLY(x,y...) x,##y
#define func(x) __rem##x
#define modulus $27
#define quotient $2
#define GETSIGN(x) bis $24,$24,x
#define STACK 32
#endif

/*
 * For 32-bit operations, we need to extend to 64-bit
 *
 * With INTSIZE defined the entry points are func(lu)/func(l) and the
 * operands are narrowed on entry:
 *	LONGIFY(x)  - zapnot with byte mask 15 keeps the low four bytes
 *	              and clears the rest (zero-extend)
 *	SLONGIFY(x) - addl x,0,x rewrites x as a sign-extended longword
 * Without INTSIZE the entry points are func(qu)/func(q) and both macros
 * expand to nothing.
 */
#ifdef INTSIZE
#define ufunction func(lu)
#define sfunction func(l)
#define LONGIFY(x) zapnot x,15,x
#define SLONGIFY(x) addl x,0,x
#else
#define ufunction func(qu)
#define sfunction func(q)
#define LONGIFY(x)
#define SLONGIFY(x)
#endif

/*
 * ufunction: unsigned divide (DIV defined) or unsigned remainder.
 *
 * In:	$24 = dividend, $25 = divisor, $23 = return address
 * Out:	$27 = quotient (DIV) or remainder (otherwise)
 * Clobbers: $28, used for the compare result.  $0-$3 (and $4 for DIV)
 *	are used as temporaries but are saved in the frame and restored.
 *
 * A zero divisor is not trapped: both loops are skipped, so $27 comes
 * back as 0 for DIV and as the (LONGIFY'd) dividend otherwise.
 *
 * Local label 7 doubles as a second entry point: sfunction branches to
 * it, with its own STACK-sized frame already allocated, when neither
 * operand is negative.
 */
.set noat
.align	4
.globl	ufunction
.ent	ufunction
ufunction:
	subq	$30,STACK,$30		# E :
	.frame	$30,STACK,$23
	.prologue 0

	/* Save the temporaries and set up quotient = 0, mask = 1 */
7:	stq	$1, 0($30)		# L :
	bis	$25,$25,divisor		# E :
	stq	$2, 8($30)		# L : L U L U

	bis	$24,$24,modulus		# E :
	stq	$0,16($30)		# L :
	bis	$31,$31,quotient	# E :
	LONGIFY(divisor)		# E : U L L U

	stq	tmp1,24($30)		# L :
	LONGIFY(modulus)		# E :
	bis	$31,1,mask		# E :
	DIV_ONLY(stq tmp2,32($30))	# L : L U U L

	beq	divisor, 9f			/* div by zero */
	/*
	 * In spite of the DIV_ONLY being either a non-instruction
	 * or an actual stq, the addition of the .align directive
	 * below ensures that label 1 is going to be nicely aligned
	 */

	.align	4
#ifdef INTSIZE
	/*
	 * shift divisor left, using 3-bit shifts for
	 * 32-bit divides as we can't overflow. Three-bit
	 * shifts will result in looping three times less
	 * here, but can result in two loops more later.
	 * Thus using a large shift isn't worth it (and
	 * s8add pairs better than a sll..)
	 */
1:	cmpult	divisor,modulus,compare	# E :
	s8addq	divisor,$31,divisor	# E :
	s8addq	mask,$31,mask		# E :
	bne	compare,1b		# U : U L U L
#else
	/*
	 * 64-bit: shift one bit at a time and stop as soon as the
	 * divisor's top bit is set, before the next doubling would
	 * overflow.
	 */
1:	cmpult	divisor,modulus,compare	# E :
	nop				# E :
	nop				# E :
	blt     divisor, 2f		# U : U L U L

	addq	divisor,divisor,divisor	# E :
	addq	mask,mask,mask		# E :
	unop				# E :
	bne	compare,1b		# U : U L U L
#endif

	/* ok, start to go right again.. */
2:
	/*
	 * Keep things nicely bundled... use a nop instead of not
	 * having an instruction for DIV_ONLY
	 *
	 * Each pass tries one bit: when divisor <= modulus the
	 * subtraction (tmp1) and, for DIV, the quotient with this
	 * bit added (tmp2) are committed by the cmovne's.  The loop
	 * ends once mask has been shifted down to zero.
	 */
#ifdef DIV
	DIV_ONLY(addq quotient,mask,tmp2) # E :
#else
	nop				# E :
#endif
	srl	mask,1,mask		# U :
	cmpule	divisor,modulus,compare	# E :
	subq	modulus,divisor,tmp1	# E :

#ifdef DIV
	DIV_ONLY(cmovne compare,tmp2,quotient)	# E : Latency 2, extra map slot
	nop				# E : as part of the cmovne
	srl	divisor,1,divisor	# U :
	nop				# E : L U L U

	nop				# E :
	cmovne	compare,tmp1,modulus	# E : Latency 2, extra map slot
	nop				# E : as part of the cmovne
	bne	mask,2b			# U : U L U L
#else
	srl	divisor,1,divisor	# U :
	cmovne	compare,tmp1,modulus	# E : Latency 2, extra map slot
	nop				# E : as part of the cmovne
	bne	mask,2b			# U : U L L U
#endif

	/* Restore the saved temporaries, drop the frame and return via $23 */
9:	ldq	$1, 0($30)		# L :
	ldq	$2, 8($30)		# L :
	nop				# E :
	nop				# E : U U L L

	ldq	$0,16($30)		# L :
	ldq	tmp1,24($30)		# L :
	nop				# E :
	nop				# E :

#ifdef DIV
	DIV_ONLY(ldq tmp2,32($30))	# L :
#else
	nop				# E :
#endif
	addq	$30,STACK,$30		# E :
	ret	$31,($23),1		# L0 : L U U L
	.end	ufunction
	EXPORT_SYMBOL(ufunction)

/*
 * Uhh.. Ugly signed division. I'd rather not have it at all, but
 * it's needed in some circumstances. There are different ways to
 * handle this, really. This does:
 * 	-a / b = a / -b = -(a / b)
 *	-a % b = -(a % b)
 *	a % -b = a % b
 * which is probably not the best solution, but at least should
 * have the property that (x/y)*y + (x%y) = x.
 *
 * In:	$24 = dividend, $25 = divisor, $23 = return address
 * Out:	$27 = signed quotient (DIV) or remainder (otherwise)
 * Clobbers: $28.  $24, $25, $23 and tmp1 are saved in this routine's
 *	frame (offsets 0, 8, 16, 24) and reloaded before returning.
 *
 * Fast path: if the OR of the two operands has its sign bit clear,
 * neither is negative, so control branches straight to label 7 inside
 * ufunction and that routine returns to the caller directly.
 * Slow path: take absolute values, call ufunction through $23 (it
 * allocates a frame of its own), then negate $27 when GETSIGN says the
 * result should be negative.
 */
.align 4
.globl	sfunction
.ent	sfunction
sfunction:
	subq	$30,STACK,$30		# E :
	.frame	$30,STACK,$23
	.prologue 0
	bis	$24,$25,$28		# E :
	SLONGIFY($28)			# E :
	bge	$28,7b			# U :

	stq	$24,0($30)		# L :
	subq	$31,$24,$28		# E :
	stq	$25,8($30)		# L :
	nop				# E : U L U L

	cmovlt	$24,$28,$24	/* abs($24) */	# E : Latency 2, extra map slot
	nop				# E : as part of the cmov
	stq	$23,16($30)		# L :
	subq	$31,$25,$28		# E : U L U L

	stq	tmp1,24($30)		# L :
	cmovlt	$25,$28,$25	/* abs($25) */	# E : Latency 2, extra map slot
	nop				# E :
	bsr	$23,ufunction		# L0: L U L U

	/* Reload the original operands so GETSIGN sees their real signs */
	ldq	$24,0($30)		# L :
	ldq	$25,8($30)		# L :
	GETSIGN($28)			# E :
	subq	$31,$27,tmp1		# E : U U L L

	SLONGIFY($28)			# E :
	ldq	$23,16($30)		# L :
	cmovlt	$28,tmp1,$27		# E : Latency 2, extra map slot
	nop				# E : U L L U : as part of the cmov

	ldq	tmp1,24($30)		# L :
	nop				# E : as part of the cmov
	addq	$30,STACK,$30		# E :
	ret	$31,($23),1		# L0 : L U U L
	.end	sfunction
	EXPORT_SYMBOL(sfunction)


source code of linux/arch/alpha/lib/ev6-divide.S