dfsqrt.S source code [compiler-rt/lib/builtins/hexagon/dfsqrt.S]

1	//===----------------------Hexagon builtin routine ------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8
9	// Double Precision square root
10
11	#define EXP r28
12
13	#define A r1:0
14	#define AH r1
15	#define AL r0
16
17	#define SFSH r3:2
18	#define SF_S r3
19	#define SF_H r2
20
21	#define SFHALF_SONE r5:4
22	#define S_ONE r4
23	#define SFHALF r5
24	#define SF_D r6
25	#define SF_E r7
26	#define RECIPEST r8
27	#define SFRAD r9
28
29	#define FRACRAD r11:10
30	#define FRACRADH r11
31	#define FRACRADL r10
32
33	#define ROOT r13:12
34	#define ROOTHI r13
35	#define ROOTLO r12
36
37	#define PROD r15:14
38	#define PRODHI r15
39	#define PRODLO r14
40
41	#define P_TMP p0
42	#define P_EXP1 p1
43	#define NORMAL p2
44
45	#define SF_EXPBITS 8
46	#define SF_MANTBITS 23
47
48	#define DF_EXPBITS 11
49	#define DF_MANTBITS 52
50
51	#define DF_BIAS 0x3ff
52
53	#define DFCLASS_ZERO 0x01
54	#define DFCLASS_NORMAL 0x02
55	#define DFCLASS_DENORMAL 0x02
56	#define DFCLASS_INFINITE 0x08
57	#define DFCLASS_NAN 0x10
58
59	#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG; .type __qdsp_##TAG,@function
60	#define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG; .type __hexagon_fast_##TAG,@function
61	#define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG; .type __hexagon_fast2_##TAG,@function
62	#define END(TAG) .size TAG,.-TAG
63
64	.text
65	.global __hexagon_sqrtdf2
66	.type __hexagon_sqrtdf2,@function
67	.global __hexagon_sqrt
68	.type __hexagon_sqrt,@function
69	Q6_ALIAS(sqrtdf2)
70	Q6_ALIAS(sqrt)
71	FAST_ALIAS(sqrtdf2)
72	FAST_ALIAS(sqrt)
73	FAST2_ALIAS(sqrtdf2)
74	FAST2_ALIAS(sqrt)
75	.type sqrt,@function
76	.p2align `5`
77	__hexagon_sqrtdf2:
78	__hexagon_sqrt:
79	{
80	PROD = extractu(A,#SF_MANTBITS+`1`,#DF_MANTBITS-SF_MANTBITS)
81	EXP = extractu(AH,#DF_EXPBITS,#DF_MANTBITS-`32`)
82	SFHALF_SONE = combine(##`0x3f000004`,#`1`)
83	}
84	{
85	NORMAL = dfclass(A,#DFCLASS_NORMAL) // Is it normal
86	NORMAL = cmp.gt(AH,#-`1`) // and positive?
87	if (!NORMAL.new) jump:nt .Lsqrt_abnormal
88	SFRAD = or(SFHALF,PRODLO)
89	}
90	#undef NORMAL
91	.Ldenormal_restart:
92	{
93	FRACRAD = A
94	SF_E,P_TMP = sfinvsqrta(SFRAD)
95	SFHALF = and(SFHALF,#-`16`)
96	SFSH = #`0`
97	}
98	#undef A
99	#undef AH
100	#undef AL
101	#define ERROR r1:0
102	#define ERRORHI r1
103	#define ERRORLO r0
104	// SF_E : reciprocal square root
105	// SF_H : half rsqrt
106	// sf_S : square root
107	// SF_D : error term
108	// SFHALF: 0.5
109	{
110	SF_S += sfmpy(SF_E,SFRAD):lib // s0: root
111	SF_H += sfmpy(SF_E,SFHALF):lib // h0: 0.5y0. Could also decrement exponent...*
112	SF_D = SFHALF
113	#undef SFRAD
114	#define SHIFTAMT r9
115	SHIFTAMT = and(EXP,#`1`)
116	}
117	{
118	SF_D -= sfmpy(SF_S,SF_H):lib // d0: 0.5-HS = 0.5-0.5~1
119	FRACRADH = insert(S_ONE,#DF_EXPBITS+`1`,#DF_MANTBITS-`32`) // replace upper bits with hidden
120	P_EXP1 = cmp.gtu(SHIFTAMT,#`0`)
121	}
122	{
123	SF_S += sfmpy(SF_S,SF_D):lib // s1: refine sqrt
124	SF_H += sfmpy(SF_H,SF_D):lib // h1: refine half-recip
125	SF_D = SFHALF
126	SHIFTAMT = mux(P_EXP1,#`8`,#`9`)
127	}
128	{
129	SF_D -= sfmpy(SF_S,SF_H):lib // d1: error term
130	FRACRAD = asl(FRACRAD,SHIFTAMT) // Move fracrad bits to right place
131	SHIFTAMT = mux(P_EXP1,#`3`,#`2`)
132	}
133	{
134	SF_H += sfmpy(SF_H,SF_D):lib // d2: rsqrt
135	// cool trick: half of 1/sqrt(x) has same mantissa as 1/sqrt(x).
136	PROD = asl(FRACRAD,SHIFTAMT) // fracrad<<(2+exp1)
137	}
138	{
139	SF_H = and(SF_H,##`0x007fffff`)
140	}
141	{
142	SF_H = add(SF_H,##`0x00800000` - `3`)
143	SHIFTAMT = mux(P_EXP1,#`7`,#`8`)
144	}
145	{
146	RECIPEST = asl(SF_H,SHIFTAMT)
147	SHIFTAMT = mux(P_EXP1,#`15`-(`1`+`1`),#`15`-(`1`+`0`))
148	}
149	{
150	ROOT = mpyu(RECIPEST,PRODHI) // root = mpyu_full(recipest,hi(fracrad<<(2+exp1)))
151	}
152
153	#undef SFSH // r3:2
154	#undef SF_H // r2
155	#undef SF_S // r3
156	#undef S_ONE // r4
157	#undef SFHALF // r5
158	#undef SFHALF_SONE // r5:4
159	#undef SF_D // r6
160	#undef SF_E // r7
161
162	#define HL r3:2
163	#define LL r5:4
164	#define HH r7:6
165
166	#undef P_EXP1
167	#define P_CARRY0 p1
168	#define P_CARRY1 p2
169	#define P_CARRY2 p3
170
171	// Iteration 0
172	// Maybe we can save a cycle by starting with ERROR=asl(fracrad), then as we multiply
173	// We can shift and subtract instead of shift and add?
174	{
175	ERROR = asl(FRACRAD,#`15`)
176	PROD = mpyu(ROOTHI,ROOTHI)
177	P_CARRY0 = cmp.eq(r0,r0)
178	}
179	{
180	ERROR -= asl(PROD,#`15`)
181	PROD = mpyu(ROOTHI,ROOTLO)
182	P_CARRY1 = cmp.eq(r0,r0)
183	}
184	{
185	ERROR -= lsr(PROD,#`16`)
186	P_CARRY2 = cmp.eq(r0,r0)
187	}
188	{
189	ERROR = mpyu(ERRORHI,RECIPEST)
190	}
191	{
192	ROOT += lsr(ERROR,SHIFTAMT)
193	SHIFTAMT = add(SHIFTAMT,#`16`)
194	ERROR = asl(FRACRAD,#`31`) // for next iter
195	}
196	// Iteration 1
197	{
198	PROD = mpyu(ROOTHI,ROOTHI)
199	ERROR -= mpyu(ROOTHI,ROOTLO) // amount is 31, no shift needed
200	}
201	{
202	ERROR -= asl(PROD,#`31`)
203	PROD = mpyu(ROOTLO,ROOTLO)
204	}
205	{
206	ERROR -= lsr(PROD,#`33`)
207	}
208	{
209	ERROR = mpyu(ERRORHI,RECIPEST)
210	}
211	{
212	ROOT += lsr(ERROR,SHIFTAMT)
213	SHIFTAMT = add(SHIFTAMT,#`16`)
214	ERROR = asl(FRACRAD,#`47`) // for next iter
215	}
216	// Iteration 2
217	{
218	PROD = mpyu(ROOTHI,ROOTHI)
219	}
220	{
221	ERROR -= asl(PROD,#`47`)
222	PROD = mpyu(ROOTHI,ROOTLO)
223	}
224	{
225	ERROR -= asl(PROD,#`16`) // bidir shr 31-47
226	PROD = mpyu(ROOTLO,ROOTLO)
227	}
228	{
229	ERROR -= lsr(PROD,#`17`) // 64-47
230	}
231	{
232	ERROR = mpyu(ERRORHI,RECIPEST)
233	}
234	{
235	ROOT += lsr(ERROR,SHIFTAMT)
236	}
237	#undef ERROR
238	#undef PROD
239	#undef PRODHI
240	#undef PRODLO
241	#define REM_HI r15:14
242	#define REM_HI_HI r15
243	#define REM_LO r1:0
244	#undef RECIPEST
245	#undef SHIFTAMT
246	#define TWOROOT_LO r9:8
247	// Adjust Root
248	{
249	HL = mpyu(ROOTHI,ROOTLO)
250	LL = mpyu(ROOTLO,ROOTLO)
251	REM_HI = #`0`
252	REM_LO = #`0`
253	}
254	{
255	HL += lsr(LL,#`33`)
256	LL += asl(HL,#`33`)
257	P_CARRY0 = cmp.eq(r0,r0)
258	}
259	{
260	HH = mpyu(ROOTHI,ROOTHI)
261	REM_LO = sub(REM_LO,LL,P_CARRY0):carry
262	TWOROOT_LO = #`1`
263	}
264	{
265	HH += lsr(HL,#`31`)
266	TWOROOT_LO += asl(ROOT,#`1`)
267	}
268	#undef HL
269	#undef LL
270	#define REM_HI_TMP r3:2
271	#define REM_HI_TMP_HI r3
272	#define REM_LO_TMP r5:4
273	{
274	REM_HI = sub(FRACRAD,HH,P_CARRY0):carry
275	REM_LO_TMP = sub(REM_LO,TWOROOT_LO,P_CARRY1):carry
276	#undef FRACRAD
277	#undef HH
278	#define ZERO r11:10
279	#define ONE r7:6
280	ONE = #`1`
281	ZERO = #`0`
282	}
283	{
284	REM_HI_TMP = sub(REM_HI,ZERO,P_CARRY1):carry
285	ONE = add(ROOT,ONE)
286	EXP = add(EXP,#-DF_BIAS) // subtract bias --> signed exp
287	}
288	{
289	// If carry set, no borrow: result was still positive
290	if (P_CARRY1) ROOT = ONE
291	if (P_CARRY1) REM_LO = REM_LO_TMP
292	if (P_CARRY1) REM_HI = REM_HI_TMP
293	}
294	{
295	REM_LO_TMP = sub(REM_LO,TWOROOT_LO,P_CARRY2):carry
296	ONE = #`1`
297	EXP = asr(EXP,#`1`) // divide signed exp by 2
298	}
299	{
300	REM_HI_TMP = sub(REM_HI,ZERO,P_CARRY2):carry
301	ONE = add(ROOT,ONE)
302	}
303	{
304	if (P_CARRY2) ROOT = ONE
305	if (P_CARRY2) REM_LO = REM_LO_TMP
306	// since tworoot <= 2^32, remhi must be zero
307	#undef REM_HI_TMP
308	#undef REM_HI_TMP_HI
309	#define S_ONE r2
310	#define ADJ r3
311	S_ONE = #`1`
312	}
313	{
314	P_TMP = cmp.eq(REM_LO,ZERO) // is the low part zero
315	if (!P_TMP.new) ROOTLO = or(ROOTLO,S_ONE) // if so, it's exact... hopefully
316	ADJ = cl0(ROOT)
317	EXP = add(EXP,#-`63`)
318	}
319	#undef REM_LO
320	#define RET r1:0
321	#define RETHI r1
322	{
323	RET = convert_ud2df(ROOT) // set up mantissa, maybe set inexact flag
324	EXP = add(EXP,ADJ) // add back bias
325	}
326	{
327	RETHI += asl(EXP,#DF_MANTBITS-`32`) // add exponent adjust
328	jumpr r31
329	}
330	#undef REM_LO_TMP
331	#undef REM_HI_TMP
332	#undef REM_HI_TMP_HI
333	#undef REM_LO
334	#undef REM_HI
335	#undef TWOROOT_LO
336
337	#undef RET
338	#define A r1:0
339	#define AH r1
340	#define AL r1
341	#undef S_ONE
342	#define TMP r3:2
343	#define TMPHI r3
344	#define TMPLO r2
345	#undef P_CARRY0
346	#define P_NEG p1
347
348
349	#define SFHALF r5
350	#define SFRAD r9
351	.Lsqrt_abnormal:
352	{
353	P_TMP = dfclass(A,#DFCLASS_ZERO) // zero?
354	if (P_TMP.new) jumpr:t r31
355	}
356	{
357	P_TMP = dfclass(A,#DFCLASS_NAN)
358	if (P_TMP.new) jump:nt .Lsqrt_nan
359	}
360	{
361	P_TMP = cmp.gt(AH,#-`1`)
362	if (!P_TMP.new) jump:nt .Lsqrt_invalid_neg
363	if (!P_TMP.new) EXP = ##`0x7F800001` // sNaN
364	}
365	{
366	P_TMP = dfclass(A,#DFCLASS_INFINITE)
367	if (P_TMP.new) jumpr:nt r31
368	}
369	// If we got here, we're denormal
370	// prepare to restart
371	{
372	A = extractu(A,#DF_MANTBITS,#`0`) // Extract mantissa
373	}
374	{
375	EXP = add(clb(A),#-DF_EXPBITS) // how much to normalize?
376	}
377	{
378	A = asl(A,EXP) // Shift mantissa
379	EXP = sub(#`1`,EXP) // Form exponent
380	}
381	{
382	AH = insert(EXP,#`1`,#DF_MANTBITS-`32`) // insert lsb of exponent
383	}
384	{
385	TMP = extractu(A,#SF_MANTBITS+`1`,#DF_MANTBITS-SF_MANTBITS) // get sf value (mant+exp1)
386	SFHALF = ##`0x3f000004` // form half constant
387	}
388	{
389	SFRAD = or(SFHALF,TMPLO) // form sf value
390	SFHALF = and(SFHALF,#-`16`)
391	jump .Ldenormal_restart // restart
392	}
393	.Lsqrt_nan:
394	{
395	EXP = convert_df2sf(A) // if sNaN, get invalid
396	A = #-`1` // qNaN
397	jumpr r31
398	}
399	.Lsqrt_invalid_neg:
400	{
401	A = convert_sf2df(EXP) // Invalid,NaNval
402	jumpr r31
403	}
404	END(__hexagon_sqrt)
405	END(__hexagon_sqrtdf2)
406

Provided by KDAB

Improve your Profiling and Debugging skills

Find out more

source code of compiler-rt/lib/builtins/hexagon/dfsqrt.S