divide.S source code [linux/arch/alpha/lib/divide.S]

/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/divide.S
 *
 * (C) 1995 Linus Torvalds
 *
 * Alpha division..
 */

/*
 * The alpha chip doesn't provide hardware division, so we have to do it
 * by hand. The compiler expects the functions
 *
 *	__divqu: 64-bit unsigned long divide
 *	__remqu: 64-bit unsigned long remainder
 *	__divq/__remq: signed 64-bit
 *	__divlu/__remlu: unsigned 32-bit
 *	__divl/__reml: signed 32-bit
 *
 * These are not normal C functions: instead of the normal
 * calling sequence, these expect their arguments in registers
 * $24 and $25, and return the result in $27. Register $28 may
 * be clobbered (assembly temporary), anything else must be saved.
 *
 * In short: painful.
 *
 * This is a rather simple bit-at-a-time algorithm: it's very good
 * at dividing random 64-bit numbers, but the more usual case where
 * the divisor is small is handled better by the DEC algorithm
 * using lookup tables. This uses much less memory, though, and is
 * nicer on the cache.. Besides, I don't know the copyright status
 * of the DEC code.
 */

/*
 * My temporaries:
 *	$0 - current bit
 *	$1 - shifted divisor
 *	$2 - modulus/quotient
 *	$3 - tmp1: candidate remainder (and negated result in the signed wrapper)
 *	$4 - tmp2: candidate quotient (divide build only)
 *
 *	$23 - return address
 *	$24 - dividend
 *	$25 - divisor
 *
 *	$27 - quotient/modulus
 *	$28 - compare status
 */

#include <linux/export.h>

/* Emits a 32-bit zero word where an instruction would go; unused in this file. */
#define halt .long 0

/*
 * Select function type and registers
 *
 * Two preprocessor symbols choose what this file assembles into:
 *   DIV      defined -> the __div* routines (result is the quotient),
 *            otherwise  the __rem* routines (result is the remainder).
 *   INTSIZE  defined -> 32-bit operands, otherwise 64-bit operands.
 */
#define mask     $0     /* single set bit: the quotient bit being tried */
#define divisor  $1     /* working copy of the divisor, shifted left then right */
#define compare  $28    /* result of the unsigned compares (assembler temporary) */
#define tmp1     $3     /* candidate remainder (modulus - divisor) */
#define tmp2     $4     /* candidate quotient (quotient + mask), DIV build only */

/*
 * DIV_ONLY()/MOD_ONLY() emit their argument in one build and nothing in
 * the other.  The "x,y..." / "x,##y" form lets a whole instruction that
 * contains a comma, e.g. DIV_ONLY(stq tmp2,32($30)), be passed as a single
 * macro call.  MOD_ONLY is defined for symmetry but not used in this file.
 *
 * Whichever of modulus/quotient is the function's result is mapped onto
 * $27 (the return register); the by-product lives in $2, which is saved
 * and restored.
 *
 * GETSIGN(x) leaves in x a value whose sign is the sign the signed result
 * must have: sign(dividend) ^ sign(divisor) for a quotient, the dividend
 * itself for a remainder.
 *
 * STACK is the frame size.  The divide build saves five registers
 * (offsets 0..32, i.e. 40 bytes) and uses 48 -- presumably rounded up to
 * keep the stack 16-byte aligned; the remainder build saves four and
 * uses 32.
 */
#ifdef DIV
#define DIV_ONLY(x,y...) x,##y
#define MOD_ONLY(x,y...)
#define func(x) __div##x
#define modulus $2
#define quotient $27
#define GETSIGN(x) xor $24,$25,x
#define STACK 48
#else
#define DIV_ONLY(x,y...)
#define MOD_ONLY(x,y...) x,##y
#define func(x) __rem##x
#define modulus $27
#define quotient $2
#define GETSIGN(x) bis $24,$24,x
#define STACK 32
#endif

/*
 * For 32-bit operations, we need to extend to 64-bit
 *
 *   ufunction / sfunction  names of the unsigned / signed entry points
 *                          (__divlu, __divl, __remlu, __reml for INTSIZE;
 *                           __divqu, __divq, __remqu, __remq otherwise)
 *   LONGIFY(x)   zero-extend the low 32 bits of x (zapnot with byte mask
 *                15 keeps bytes 0-3 and clears bytes 4-7)
 *   SLONGIFY(x)  sign-extend the low 32 bits of x (addl produces a
 *                sign-extended 32-bit sum)
 * Both extension macros expand to nothing in the 64-bit build.
 */
#ifdef INTSIZE
#define ufunction func(lu)
#define sfunction func(l)
#define LONGIFY(x) zapnot x,15,x
#define SLONGIFY(x) addl x,0,x
#else
#define ufunction func(qu)
#define sfunction func(q)
#define LONGIFY(x)
#define SLONGIFY(x)
#endif

/*
 * ufunction: unsigned divide (DIV build) or unsigned remainder (otherwise),
 * on 32-bit operands when INTSIZE is defined and 64-bit operands when not.
 *
 * In:    $24 = dividend, $25 = divisor, $23 = return address
 * Out:   $27 = quotient (DIV) or remainder (!DIV)
 *
 * $28 is used as scratch and is not preserved.  $0-$3, and $4 in the DIV
 * build, are saved on entry and restored on exit.  $24 and $25 are only
 * read here.
 *
 * A zero divisor does not trap: both loops are skipped, so the divide
 * build returns 0 and the remainder build returns the dividend (after
 * LONGIFY).
 *
 * Frame layout, offsets from $30:
 *     0   $1  (divisor)
 *     8   $2  (modulus in the DIV build, quotient otherwise)
 *    16   $0  (mask)
 *    24   $3  (tmp1)
 *    32   $4  (tmp2, DIV build only)
 *
 * ".set noat" is needed because this code uses $28, the assembler
 * temporary, explicitly.
 */
.set noat
.align 3
.globl ufunction
.ent ufunction
ufunction:
        subq    $30,STACK,$30
        .frame  $30,STACK,$23
        .prologue 0

        /*
         * Label 7 is also entered from sfunction when neither operand is
         * negative; sfunction has already allocated a STACK-byte frame by
         * then, so the epilogue below pops it and returns to its caller.
         *
         * Register saves are interleaved with the setup of the working
         * registers: divisor/modulus = operands, quotient = 0, mask = 1.
         */
7:      stq     $1, 0($30)
        bis     $25,$25,divisor
        stq     $2, 8($30)
        bis     $24,$24,modulus
        stq     $0,16($30)
        bis     $31,$31,quotient
        LONGIFY(divisor)
        stq     tmp1,24($30)
        LONGIFY(modulus)
        bis     $31,1,mask
        DIV_ONLY(stq tmp2,32($30))
        beq     divisor, 9f                     /* div by zero */

#ifdef INTSIZE
        /*
         * shift divisor left, using 3-bit shifts for
         * 32-bit divides as we can't overflow. Three-bit
         * shifts will result in looping three times less
         * here, but can result in two loops more later.
         * Thus using a large shift isn't worth it (and
         * s8add pairs better than a sll..)
         *
         * s8addq x,$31,x computes x*8 + 0.  The compare is taken
         * before the shift, so divisor and mask are shifted once
         * more after divisor first reaches modulus.
         */
1:      cmpult  divisor,modulus,compare
        s8addq  divisor,$31,divisor
        s8addq  mask,$31,mask
        bne     compare,1b
#else
        /*
         * 64-bit: shift divisor and mask left one bit at a time while
         * divisor < modulus.  Leave early, before shifting, if the top
         * bit of divisor is already set, since one more shift would
         * lose it.
         */
1:      cmpult  divisor,modulus,compare
        blt     divisor, 2f
        addq    divisor,divisor,divisor
        addq    mask,mask,mask
        bne     compare,1b
        unop                                    /* no-op filler; presumably aligns the loop below */
#endif

        /* ok, start to go right again.. */
        /*
         * One quotient bit per iteration, highest first:
         *   tmp2    = quotient + mask       candidate quotient (DIV only)
         *   compare = (divisor <= modulus)  does the shifted divisor fit?
         *   tmp1    = modulus - divisor     candidate remainder
         * The two cmovne instructions commit the candidates when compare
         * is set.  mask and divisor are shifted right along the way, and
         * the loop ends once the mask bit has been shifted out.
         */
2:      DIV_ONLY(addq quotient,mask,tmp2)
        srl     mask,1,mask
        cmpule  divisor,modulus,compare
        subq    modulus,divisor,tmp1
        DIV_ONLY(cmovne compare,tmp2,quotient)
        srl     divisor,1,divisor
        cmovne  compare,tmp1,modulus
        bne     mask,2b

        /* Restore the saved registers, drop the frame, return through $23. */
9:      ldq     $1, 0($30)
        ldq     $2, 8($30)
        ldq     $0,16($30)
        ldq     tmp1,24($30)
        DIV_ONLY(ldq tmp2,32($30))
        addq    $30,STACK,$30
        ret     $31,($23),1
        .end    ufunction
        EXPORT_SYMBOL(ufunction)

/*
 * Uhh.. Ugly signed division. I'd rather not have it at all, but
 * it's needed in some circumstances. There are different ways to
 * handle this, really. This does:
 *	-a / b = a / -b = -(a / b)
 *	-a % b = -(a % b)
 *	a % -b = a % b
 * which is probably not the best solution, but at least should
 * have the property that (x/y)*y + (x%y) = x.
 *
 * sfunction: signed divide (DIV build) or signed remainder (otherwise).
 *
 * In:    $24 = dividend, $25 = divisor, $23 = return address
 * Out:   $27 = quotient (DIV) or remainder (!DIV)
 *
 * $28 is used as scratch and is not preserved.  $24, $25, $23 and tmp1 are
 * saved in this function's frame (offsets 0, 8, 16, 24) and restored
 * before returning.
 *
 * If neither operand is negative the work is handed straight to the
 * unsigned routine by branching to its label 7.  Otherwise the absolute
 * values are divided by calling ufunction, and the result is negated
 * when GETSIGN says it must be negative.
 */
.align 3
.globl sfunction
.ent sfunction
sfunction:
        subq    $30,STACK,$30
        .frame  $30,STACK,$23
        .prologue 0
        bis     $24,$25,$28             /* sign bit set if either operand is negative */
        SLONGIFY($28)                   /* 32-bit build: make bit 31 the tested sign */
        bge     $28,7b                  /* both non-negative: unsigned path, frame already allocated */
        stq     $24,0($30)
        subq    $31,$24,$28
        stq     $25,8($30)
        cmovlt  $24,$28,$24             /* abs($24) */
        stq     $23,16($30)             /* $23 is reused as the link register below */
        subq    $31,$25,$28
        stq     tmp1,24($30)
        cmovlt  $25,$28,$25             /* abs($25) */
        unop
        bsr     $23,ufunction           /* $27 = |dividend| op |divisor| */
        ldq     $24,0($30)              /* reload the original, signed operands */
        ldq     $25,8($30)
        GETSIGN($28)                    /* $28 < 0 iff the result must be negative */
        subq    $31,$27,tmp1            /* tmp1 = -result */
        SLONGIFY($28)
        ldq     $23,16($30)
        cmovlt  $28,tmp1,$27            /* negate the result if required */
        ldq     tmp1,24($30)
        addq    $30,STACK,$30
        ret     $31,($23),1
        .end    sfunction
        EXPORT_SYMBOL(sfunction)


source code of linux/arch/alpha/lib/divide.S