1 | //===----------------------Hexagon builtin routine ------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | // Double Precision square root |
10 | |
11 | #define EXP r28 |
12 | |
13 | #define A r1:0 |
14 | #define AH r1 |
15 | #define AL r0 |
16 | |
17 | #define SFSH r3:2 |
18 | #define SF_S r3 |
19 | #define SF_H r2 |
20 | |
21 | #define SFHALF_SONE r5:4 |
22 | #define S_ONE r4 |
23 | #define SFHALF r5 |
24 | #define SF_D r6 |
25 | #define SF_E r7 |
26 | #define RECIPEST r8 |
27 | #define SFRAD r9 |
28 | |
29 | #define FRACRAD r11:10 |
30 | #define FRACRADH r11 |
31 | #define FRACRADL r10 |
32 | |
33 | #define ROOT r13:12 |
34 | #define ROOTHI r13 |
35 | #define ROOTLO r12 |
36 | |
37 | #define PROD r15:14 |
38 | #define PRODHI r15 |
39 | #define PRODLO r14 |
40 | |
41 | #define P_TMP p0 |
42 | #define P_EXP1 p1 |
43 | #define NORMAL p2 |
44 | |
45 | #define SF_EXPBITS 8 |
46 | #define SF_MANTBITS 23 |
47 | |
48 | #define DF_EXPBITS 11 |
49 | #define DF_MANTBITS 52 |
50 | |
51 | #define DF_BIAS 0x3ff |
52 | |
53 | #define DFCLASS_ZERO 0x01 |
54 | #define DFCLASS_NORMAL 0x02 |
55 | #define DFCLASS_DENORMAL 0x02 |
56 | #define DFCLASS_INFINITE 0x08 |
57 | #define DFCLASS_NAN 0x10 |
58 | |
59 | #define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG; .type __qdsp_##TAG,@function |
60 | #define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG; .type __hexagon_fast_##TAG,@function |
61 | #define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG; .type __hexagon_fast2_##TAG,@function |
62 | #define END(TAG) .size TAG,.-TAG |
63 | |
64 | .text |
65 | .global __hexagon_sqrtdf2 |
66 | .type __hexagon_sqrtdf2,@function |
67 | .global __hexagon_sqrt |
68 | .type __hexagon_sqrt,@function |
69 | Q6_ALIAS(sqrtdf2) |
70 | Q6_ALIAS(sqrt) |
71 | FAST_ALIAS(sqrtdf2) |
72 | FAST_ALIAS(sqrt) |
73 | FAST2_ALIAS(sqrtdf2) |
74 | FAST2_ALIAS(sqrt) |
75 | .type sqrt,@function |
76 | .p2align 5 |
77 | __hexagon_sqrtdf2: |
78 | __hexagon_sqrt: |
79 | { |
80 | PROD = extractu(A,#SF_MANTBITS+1,#DF_MANTBITS-SF_MANTBITS) |
81 | EXP = extractu(AH,#DF_EXPBITS,#DF_MANTBITS-32) |
82 | SFHALF_SONE = combine(##0x3f000004,#1) |
83 | } |
84 | { |
85 | NORMAL = dfclass(A,#DFCLASS_NORMAL) // Is it normal |
86 | NORMAL = cmp.gt(AH,#-1) // and positive? |
87 | if (!NORMAL.new) jump:nt .Lsqrt_abnormal |
88 | SFRAD = or(SFHALF,PRODLO) |
89 | } |
90 | #undef NORMAL |
91 | .Ldenormal_restart: |
92 | { |
93 | FRACRAD = A |
94 | SF_E,P_TMP = sfinvsqrta(SFRAD) |
95 | SFHALF = and(SFHALF,#-16) |
96 | SFSH = #0 |
97 | } |
98 | #undef A |
99 | #undef AH |
100 | #undef AL |
101 | #define ERROR r1:0 |
102 | #define ERRORHI r1 |
103 | #define ERRORLO r0 |
104 | // SF_E : reciprocal square root |
105 | // SF_H : half rsqrt |
106 | // sf_S : square root |
107 | // SF_D : error term |
108 | // SFHALF: 0.5 |
109 | { |
110 | SF_S += sfmpy(SF_E,SFRAD):lib // s0: root |
111 | SF_H += sfmpy(SF_E,SFHALF):lib // h0: 0.5*y0. Could also decrement exponent... |
112 | SF_D = SFHALF |
113 | #undef SFRAD |
114 | #define SHIFTAMT r9 |
115 | SHIFTAMT = and(EXP,#1) |
116 | } |
117 | { |
118 | SF_D -= sfmpy(SF_S,SF_H):lib // d0: 0.5-H*S = 0.5-0.5*~1 |
119 | FRACRADH = insert(S_ONE,#DF_EXPBITS+1,#DF_MANTBITS-32) // replace upper bits with hidden |
120 | P_EXP1 = cmp.gtu(SHIFTAMT,#0) |
121 | } |
122 | { |
123 | SF_S += sfmpy(SF_S,SF_D):lib // s1: refine sqrt |
124 | SF_H += sfmpy(SF_H,SF_D):lib // h1: refine half-recip |
125 | SF_D = SFHALF |
126 | SHIFTAMT = mux(P_EXP1,#8,#9) |
127 | } |
128 | { |
129 | SF_D -= sfmpy(SF_S,SF_H):lib // d1: error term |
130 | FRACRAD = asl(FRACRAD,SHIFTAMT) // Move fracrad bits to right place |
131 | SHIFTAMT = mux(P_EXP1,#3,#2) |
132 | } |
133 | { |
134 | SF_H += sfmpy(SF_H,SF_D):lib // d2: rsqrt |
135 | // cool trick: half of 1/sqrt(x) has same mantissa as 1/sqrt(x). |
136 | PROD = asl(FRACRAD,SHIFTAMT) // fracrad<<(2+exp1) |
137 | } |
138 | { |
139 | SF_H = and(SF_H,##0x007fffff) |
140 | } |
141 | { |
142 | SF_H = add(SF_H,##0x00800000 - 3) |
143 | SHIFTAMT = mux(P_EXP1,#7,#8) |
144 | } |
145 | { |
146 | RECIPEST = asl(SF_H,SHIFTAMT) |
147 | SHIFTAMT = mux(P_EXP1,#15-(1+1),#15-(1+0)) |
148 | } |
149 | { |
150 | ROOT = mpyu(RECIPEST,PRODHI) // root = mpyu_full(recipest,hi(fracrad<<(2+exp1))) |
151 | } |
152 | |
153 | #undef SFSH // r3:2 |
154 | #undef SF_H // r2 |
155 | #undef SF_S // r3 |
156 | #undef S_ONE // r4 |
157 | #undef SFHALF // r5 |
158 | #undef SFHALF_SONE // r5:4 |
159 | #undef SF_D // r6 |
160 | #undef SF_E // r7 |
161 | |
162 | #define HL r3:2 |
163 | #define LL r5:4 |
164 | #define HH r7:6 |
165 | |
166 | #undef P_EXP1 |
167 | #define P_CARRY0 p1 |
168 | #define P_CARRY1 p2 |
169 | #define P_CARRY2 p3 |
170 | |
171 | // Iteration 0 |
172 | // Maybe we can save a cycle by starting with ERROR=asl(fracrad), then as we multiply |
173 | // We can shift and subtract instead of shift and add? |
174 | { |
175 | ERROR = asl(FRACRAD,#15) |
176 | PROD = mpyu(ROOTHI,ROOTHI) |
177 | P_CARRY0 = cmp.eq(r0,r0) |
178 | } |
179 | { |
180 | ERROR -= asl(PROD,#15) |
181 | PROD = mpyu(ROOTHI,ROOTLO) |
182 | P_CARRY1 = cmp.eq(r0,r0) |
183 | } |
184 | { |
185 | ERROR -= lsr(PROD,#16) |
186 | P_CARRY2 = cmp.eq(r0,r0) |
187 | } |
188 | { |
189 | ERROR = mpyu(ERRORHI,RECIPEST) |
190 | } |
191 | { |
192 | ROOT += lsr(ERROR,SHIFTAMT) |
193 | SHIFTAMT = add(SHIFTAMT,#16) |
194 | ERROR = asl(FRACRAD,#31) // for next iter |
195 | } |
196 | // Iteration 1 |
197 | { |
198 | PROD = mpyu(ROOTHI,ROOTHI) |
199 | ERROR -= mpyu(ROOTHI,ROOTLO) // amount is 31, no shift needed |
200 | } |
201 | { |
202 | ERROR -= asl(PROD,#31) |
203 | PROD = mpyu(ROOTLO,ROOTLO) |
204 | } |
205 | { |
206 | ERROR -= lsr(PROD,#33) |
207 | } |
208 | { |
209 | ERROR = mpyu(ERRORHI,RECIPEST) |
210 | } |
211 | { |
212 | ROOT += lsr(ERROR,SHIFTAMT) |
213 | SHIFTAMT = add(SHIFTAMT,#16) |
214 | ERROR = asl(FRACRAD,#47) // for next iter |
215 | } |
216 | // Iteration 2 |
217 | { |
218 | PROD = mpyu(ROOTHI,ROOTHI) |
219 | } |
220 | { |
221 | ERROR -= asl(PROD,#47) |
222 | PROD = mpyu(ROOTHI,ROOTLO) |
223 | } |
224 | { |
225 | ERROR -= asl(PROD,#16) // bidir shr 31-47 |
226 | PROD = mpyu(ROOTLO,ROOTLO) |
227 | } |
228 | { |
229 | ERROR -= lsr(PROD,#17) // 64-47 |
230 | } |
231 | { |
232 | ERROR = mpyu(ERRORHI,RECIPEST) |
233 | } |
234 | { |
235 | ROOT += lsr(ERROR,SHIFTAMT) |
236 | } |
237 | #undef ERROR |
238 | #undef PROD |
239 | #undef PRODHI |
240 | #undef PRODLO |
241 | #define REM_HI r15:14 |
242 | #define REM_HI_HI r15 |
243 | #define REM_LO r1:0 |
244 | #undef RECIPEST |
245 | #undef SHIFTAMT |
246 | #define TWOROOT_LO r9:8 |
247 | // Adjust Root |
248 | { |
249 | HL = mpyu(ROOTHI,ROOTLO) |
250 | LL = mpyu(ROOTLO,ROOTLO) |
251 | REM_HI = #0 |
252 | REM_LO = #0 |
253 | } |
254 | { |
255 | HL += lsr(LL,#33) |
256 | LL += asl(HL,#33) |
257 | P_CARRY0 = cmp.eq(r0,r0) |
258 | } |
259 | { |
260 | HH = mpyu(ROOTHI,ROOTHI) |
261 | REM_LO = sub(REM_LO,LL,P_CARRY0):carry |
262 | TWOROOT_LO = #1 |
263 | } |
264 | { |
265 | HH += lsr(HL,#31) |
266 | TWOROOT_LO += asl(ROOT,#1) |
267 | } |
268 | #undef HL |
269 | #undef LL |
270 | #define REM_HI_TMP r3:2 |
271 | #define REM_HI_TMP_HI r3 |
272 | #define REM_LO_TMP r5:4 |
273 | { |
274 | REM_HI = sub(FRACRAD,HH,P_CARRY0):carry |
275 | REM_LO_TMP = sub(REM_LO,TWOROOT_LO,P_CARRY1):carry |
276 | #undef FRACRAD |
277 | #undef HH |
278 | #define ZERO r11:10 |
279 | #define ONE r7:6 |
280 | ONE = #1 |
281 | ZERO = #0 |
282 | } |
283 | { |
284 | REM_HI_TMP = sub(REM_HI,ZERO,P_CARRY1):carry |
285 | ONE = add(ROOT,ONE) |
286 | EXP = add(EXP,#-DF_BIAS) // subtract bias --> signed exp |
287 | } |
288 | { |
289 | // If carry set, no borrow: result was still positive |
290 | if (P_CARRY1) ROOT = ONE |
291 | if (P_CARRY1) REM_LO = REM_LO_TMP |
292 | if (P_CARRY1) REM_HI = REM_HI_TMP |
293 | } |
294 | { |
295 | REM_LO_TMP = sub(REM_LO,TWOROOT_LO,P_CARRY2):carry |
296 | ONE = #1 |
297 | EXP = asr(EXP,#1) // divide signed exp by 2 |
298 | } |
299 | { |
300 | REM_HI_TMP = sub(REM_HI,ZERO,P_CARRY2):carry |
301 | ONE = add(ROOT,ONE) |
302 | } |
303 | { |
304 | if (P_CARRY2) ROOT = ONE |
305 | if (P_CARRY2) REM_LO = REM_LO_TMP |
306 | // since tworoot <= 2^32, remhi must be zero |
307 | #undef REM_HI_TMP |
308 | #undef REM_HI_TMP_HI |
309 | #define S_ONE r2 |
310 | #define ADJ r3 |
311 | S_ONE = #1 |
312 | } |
313 | { |
314 | P_TMP = cmp.eq(REM_LO,ZERO) // is the low part zero |
315 | if (!P_TMP.new) ROOTLO = or(ROOTLO,S_ONE) // if so, it's exact... hopefully |
316 | ADJ = cl0(ROOT) |
317 | EXP = add(EXP,#-63) |
318 | } |
319 | #undef REM_LO |
320 | #define RET r1:0 |
321 | #define RETHI r1 |
322 | { |
323 | RET = convert_ud2df(ROOT) // set up mantissa, maybe set inexact flag |
324 | EXP = add(EXP,ADJ) // add back bias |
325 | } |
326 | { |
327 | RETHI += asl(EXP,#DF_MANTBITS-32) // add exponent adjust |
328 | jumpr r31 |
329 | } |
330 | #undef REM_LO_TMP |
331 | #undef REM_HI_TMP |
332 | #undef REM_HI_TMP_HI |
333 | #undef REM_LO |
334 | #undef REM_HI |
335 | #undef TWOROOT_LO |
336 | |
337 | #undef RET |
338 | #define A r1:0 |
339 | #define AH r1 |
340 | #define AL r1 |
341 | #undef S_ONE |
342 | #define TMP r3:2 |
343 | #define TMPHI r3 |
344 | #define TMPLO r2 |
345 | #undef P_CARRY0 |
346 | #define P_NEG p1 |
347 | |
348 | |
349 | #define SFHALF r5 |
350 | #define SFRAD r9 |
351 | .Lsqrt_abnormal: |
352 | { |
353 | P_TMP = dfclass(A,#DFCLASS_ZERO) // zero? |
354 | if (P_TMP.new) jumpr:t r31 |
355 | } |
356 | { |
357 | P_TMP = dfclass(A,#DFCLASS_NAN) |
358 | if (P_TMP.new) jump:nt .Lsqrt_nan |
359 | } |
360 | { |
361 | P_TMP = cmp.gt(AH,#-1) |
362 | if (!P_TMP.new) jump:nt .Lsqrt_invalid_neg |
363 | if (!P_TMP.new) EXP = ##0x7F800001 // sNaN |
364 | } |
365 | { |
366 | P_TMP = dfclass(A,#DFCLASS_INFINITE) |
367 | if (P_TMP.new) jumpr:nt r31 |
368 | } |
369 | // If we got here, we're denormal |
370 | // prepare to restart |
371 | { |
372 | A = extractu(A,#DF_MANTBITS,#0) // Extract mantissa |
373 | } |
374 | { |
375 | EXP = add(clb(A),#-DF_EXPBITS) // how much to normalize? |
376 | } |
377 | { |
378 | A = asl(A,EXP) // Shift mantissa |
379 | EXP = sub(#1,EXP) // Form exponent |
380 | } |
381 | { |
382 | AH = insert(EXP,#1,#DF_MANTBITS-32) // insert lsb of exponent |
383 | } |
384 | { |
385 | TMP = extractu(A,#SF_MANTBITS+1,#DF_MANTBITS-SF_MANTBITS) // get sf value (mant+exp1) |
386 | SFHALF = ##0x3f000004 // form half constant |
387 | } |
388 | { |
389 | SFRAD = or(SFHALF,TMPLO) // form sf value |
390 | SFHALF = and(SFHALF,#-16) |
391 | jump .Ldenormal_restart // restart |
392 | } |
393 | .Lsqrt_nan: |
394 | { |
395 | EXP = convert_df2sf(A) // if sNaN, get invalid |
396 | A = #-1 // qNaN |
397 | jumpr r31 |
398 | } |
399 | .Lsqrt_invalid_neg: |
400 | { |
401 | A = convert_sf2df(EXP) // Invalid,NaNval |
402 | jumpr r31 |
403 | } |
404 | END(__hexagon_sqrt) |
405 | END(__hexagon_sqrtdf2) |
406 | |