1 | //===----------------------Hexagon builtin routine ------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | // Double Precision Multiply |
10 | #define A r1:0 |
11 | #define AH r1 |
12 | #define AL r0 |
13 | #define B r3:2 |
14 | #define BH r3 |
15 | #define BL r2 |
16 | |
17 | #define BTMP r5:4 |
18 | #define BTMPH r5 |
19 | #define BTMPL r4 |
20 | |
21 | #define PP_ODD r7:6 |
22 | #define PP_ODD_H r7 |
23 | #define PP_ODD_L r6 |
24 | |
25 | #define ONE r9:8 |
26 | #define S_ONE r8 |
27 | #define S_ZERO r9 |
28 | |
29 | #define PP_HH r11:10 |
30 | #define PP_HH_H r11 |
31 | #define PP_HH_L r10 |
32 | |
33 | #define ATMP r13:12 |
34 | #define ATMPH r13 |
35 | #define ATMPL r12 |
36 | |
37 | #define PP_LL r15:14 |
38 | #define PP_LL_H r15 |
39 | #define PP_LL_L r14 |
40 | |
41 | #define TMP r28 |
42 | |
43 | #define MANTBITS 52 |
44 | #define HI_MANTBITS 20 |
45 | #define EXPBITS 11 |
46 | #define BIAS 1024 |
47 | #define MANTISSA_TO_INT_BIAS 52 |
48 | |
49 | // Some constant to adjust normalization amount in error code |
50 | // Amount to right shift the partial product to get to a denorm |
51 | #define FUDGE 5 |
52 | |
53 | #define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG |
54 | #define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG |
55 | #define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG |
56 | #define END(TAG) .size TAG,.-TAG |
57 | |
58 | #define SR_ROUND_OFF 22 |
59 | .text |
60 | .global __hexagon_muldf3 |
61 | .type __hexagon_muldf3,@function |
62 | Q6_ALIAS(muldf3) |
63 | FAST_ALIAS(muldf3) |
64 | FAST2_ALIAS(muldf3) |
65 | .p2align 5 |
66 | __hexagon_muldf3: |
67 | { |
68 | p0 = dfclass(A,#2) |
69 | p0 = dfclass(B,#2) |
70 | ATMP = combine(##0x40000000,#0) |
71 | } |
72 | { |
73 | ATMP = insert(A,#MANTBITS,#EXPBITS-1) |
74 | BTMP = asl(B,#EXPBITS-1) |
75 | TMP = #-BIAS |
76 | ONE = #1 |
77 | } |
78 | { |
79 | PP_ODD = mpyu(BTMPL,ATMPH) |
80 | BTMP = insert(ONE,#2,#62) |
81 | } |
82 | // since we know that the MSB of the H registers is zero, we should never carry |
83 | // H <= 2^31-1. L <= 2^32-1. Therefore, HL <= 2^63-2^32-2^31+1 |
84 | // Adding 2 HLs, we get 2^64-3*2^32+2 maximum. |
85 | // Therefore, we can add 3 2^32-1 values safely without carry. We only need one. |
86 | { |
87 | PP_LL = mpyu(ATMPL,BTMPL) |
88 | PP_ODD += mpyu(ATMPL,BTMPH) |
89 | } |
90 | { |
91 | PP_ODD += lsr(PP_LL,#32) |
92 | PP_HH = mpyu(ATMPH,BTMPH) |
93 | BTMP = combine(##BIAS+BIAS-4,#0) |
94 | } |
95 | { |
96 | PP_HH += lsr(PP_ODD,#32) |
97 | if (!p0) jump .Lmul_abnormal |
98 | p1 = cmp.eq(PP_LL_L,#0) // 64 lsb's 0? |
99 | p1 = cmp.eq(PP_ODD_L,#0) // 64 lsb's 0? |
100 | } |
101 | |
102 | // PP_HH can have a maximum of 0x3FFF_FFFF_FFFF_FFFF or thereabouts |
103 | // PP_HH can have a minimum of 0x1000_0000_0000_0000 or so |
104 | |
105 | #undef PP_ODD |
106 | #undef PP_ODD_H |
107 | #undef PP_ODD_L |
108 | #define EXP10 r7:6 |
109 | #define EXP1 r7 |
110 | #define EXP0 r6 |
111 | { |
112 | if (!p1) PP_HH_L = or(PP_HH_L,S_ONE) |
113 | EXP0 = extractu(AH,#EXPBITS,#HI_MANTBITS) |
114 | EXP1 = extractu(BH,#EXPBITS,#HI_MANTBITS) |
115 | } |
116 | { |
117 | PP_LL = neg(PP_HH) |
118 | EXP0 += add(TMP,EXP1) |
119 | TMP = xor(AH,BH) |
120 | } |
121 | { |
122 | if (!p2.new) PP_HH = PP_LL |
123 | p2 = cmp.gt(TMP,#-1) |
124 | p0 = !cmp.gt(EXP0,BTMPH) |
125 | p0 = cmp.gt(EXP0,BTMPL) |
126 | if (!p0.new) jump:nt .Lmul_ovf_unf |
127 | } |
128 | { |
129 | A = convert_d2df(PP_HH) |
130 | EXP0 = add(EXP0,#-BIAS-58) |
131 | } |
132 | { |
133 | AH += asl(EXP0,#HI_MANTBITS) |
134 | jumpr r31 |
135 | } |
136 | |
137 | .falign |
138 | .Lpossible_unf: |
139 | // We end up with a positive exponent |
140 | // But we may have rounded up to an exponent of 1. |
141 | // If the exponent is 1, if we rounded up to it |
142 | // we need to also raise underflow |
143 | // Fortunately, this is pretty easy to detect, we must have +/- 0x0010_0000_0000_0000 |
144 | // And the PP should also have more than one bit set |
145 | // |
146 | // Note: ATMP should have abs(PP_HH) |
147 | // Note: BTMPL should have 0x7FEFFFFF |
148 | { |
149 | p0 = cmp.eq(AL,#0) |
150 | p0 = bitsclr(AH,BTMPL) |
151 | if (!p0.new) jumpr:t r31 |
152 | BTMPH = #0x7fff |
153 | } |
154 | { |
155 | p0 = bitsset(ATMPH,BTMPH) |
156 | BTMPL = USR |
157 | BTMPH = #0x030 |
158 | } |
159 | { |
160 | if (p0) BTMPL = or(BTMPL,BTMPH) |
161 | } |
162 | { |
163 | USR = BTMPL |
164 | } |
165 | { |
166 | p0 = dfcmp.eq(A,A) |
167 | jumpr r31 |
168 | } |
169 | .falign |
170 | .Lmul_ovf_unf: |
171 | { |
172 | A = convert_d2df(PP_HH) |
173 | ATMP = abs(PP_HH) // take absolute value |
174 | EXP1 = add(EXP0,#-BIAS-58) |
175 | } |
176 | { |
177 | AH += asl(EXP1,#HI_MANTBITS) |
178 | EXP1 = extractu(AH,#EXPBITS,#HI_MANTBITS) |
179 | BTMPL = ##0x7FEFFFFF |
180 | } |
181 | { |
182 | EXP1 += add(EXP0,##-BIAS-58) |
183 | //BTMPH = add(clb(ATMP),#-2) |
184 | BTMPH = #0 |
185 | } |
186 | { |
187 | p0 = cmp.gt(EXP1,##BIAS+BIAS-2) // overflow |
188 | if (p0.new) jump:nt .Lmul_ovf |
189 | } |
190 | { |
191 | p0 = cmp.gt(EXP1,#0) |
192 | if (p0.new) jump:nt .Lpossible_unf |
193 | BTMPH = sub(EXP0,BTMPH) |
194 | TMP = #63 // max amount to shift |
195 | } |
196 | // Underflow |
197 | // |
198 | // PP_HH has the partial product with sticky LSB. |
199 | // PP_HH can have a maximum of 0x3FFF_FFFF_FFFF_FFFF or thereabouts |
200 | // PP_HH can have a minimum of 0x1000_0000_0000_0000 or so |
201 | // The exponent of PP_HH is in EXP1, which is non-positive (0 or negative) |
202 | // That's the exponent that happens after the normalization |
203 | // |
204 | // EXP0 has the exponent that, when added to the normalized value, is out of range. |
205 | // |
206 | // Strategy: |
207 | // |
208 | // * Shift down bits, with sticky bit, such that the bits are aligned according |
209 | // to the LZ count and appropriate exponent, but not all the way to mantissa |
210 | // field, keep around the last few bits. |
211 | // * Put a 1 near the MSB |
212 | // * Check the LSBs for inexact; if inexact also set underflow |
213 | // * Convert [u]d2df -- will correctly round according to rounding mode |
214 | // * Replace exponent field with zero |
215 | |
216 | { |
217 | BTMPL = #0 // offset for extract |
218 | BTMPH = sub(#FUDGE,BTMPH) // amount to right shift |
219 | } |
220 | { |
221 | p3 = cmp.gt(PP_HH_H,#-1) // is it positive? |
222 | BTMPH = min(BTMPH,TMP) // Don't shift more than 63 |
223 | PP_HH = ATMP |
224 | } |
225 | { |
226 | TMP = USR |
227 | PP_LL = extractu(PP_HH,BTMP) |
228 | } |
229 | { |
230 | PP_HH = asr(PP_HH,BTMPH) |
231 | BTMPL = #0x0030 // underflow flag |
232 | AH = insert(S_ZERO,#EXPBITS,#HI_MANTBITS) |
233 | } |
234 | { |
235 | p0 = cmp.gtu(ONE,PP_LL) // Did we extract all zeros? |
236 | if (!p0.new) PP_HH_L = or(PP_HH_L,S_ONE) // add sticky bit |
237 | PP_HH_H = setbit(PP_HH_H,#HI_MANTBITS+3) // Add back in a bit so we can use convert instruction |
238 | } |
239 | { |
240 | PP_LL = neg(PP_HH) |
241 | p1 = bitsclr(PP_HH_L,#0x7) // Are the LSB's clear? |
242 | if (!p1.new) TMP = or(BTMPL,TMP) // If not, Inexact+Underflow |
243 | } |
244 | { |
245 | if (!p3) PP_HH = PP_LL |
246 | USR = TMP |
247 | } |
248 | { |
249 | A = convert_d2df(PP_HH) // Do rounding |
250 | p0 = dfcmp.eq(A,A) // realize exception |
251 | } |
252 | { |
253 | AH = insert(S_ZERO,#EXPBITS-1,#HI_MANTBITS+1) // Insert correct exponent |
254 | jumpr r31 |
255 | } |
256 | .falign |
257 | .Lmul_ovf: |
258 | // We get either max finite value or infinity. Either way, overflow+inexact |
259 | { |
260 | TMP = USR |
261 | ATMP = combine(##0x7fefffff,#-1) // positive max finite |
262 | A = PP_HH |
263 | } |
264 | { |
265 | PP_LL_L = extractu(TMP,#2,#SR_ROUND_OFF) // rounding bits |
266 | TMP = or(TMP,#0x28) // inexact + overflow |
267 | BTMP = combine(##0x7ff00000,#0) // positive infinity |
268 | } |
269 | { |
270 | USR = TMP |
271 | PP_LL_L ^= lsr(AH,#31) // Does sign match rounding? |
272 | TMP = PP_LL_L // unmodified rounding mode |
273 | } |
274 | { |
275 | p0 = !cmp.eq(TMP,#1) // If not round-to-zero and |
276 | p0 = !cmp.eq(PP_LL_L,#2) // Not rounding the other way, |
277 | if (p0.new) ATMP = BTMP // we should get infinity |
278 | p0 = dfcmp.eq(A,A) // Realize FP exception if enabled |
279 | } |
280 | { |
281 | A = insert(ATMP,#63,#0) // insert inf/maxfinite, leave sign |
282 | jumpr r31 |
283 | } |
284 | |
285 | .Lmul_abnormal: |
286 | { |
287 | ATMP = extractu(A,#63,#0) // strip off sign |
288 | BTMP = extractu(B,#63,#0) // strip off sign |
289 | } |
290 | { |
291 | p3 = cmp.gtu(ATMP,BTMP) |
292 | if (!p3.new) A = B // sort values |
293 | if (!p3.new) B = A // sort values |
294 | } |
295 | { |
296 | // Any NaN --> NaN, possibly raise invalid if sNaN |
297 | p0 = dfclass(A,#0x0f) // A not NaN? |
298 | if (!p0.new) jump:nt .Linvalid_nan |
299 | if (!p3) ATMP = BTMP |
300 | if (!p3) BTMP = ATMP |
301 | } |
302 | { |
303 | // Infinity * nonzero number is infinity |
304 | p1 = dfclass(A,#0x08) // A is infinity |
305 | p1 = dfclass(B,#0x0e) // B is nonzero |
306 | } |
307 | { |
308 | // Infinity * zero --> NaN, raise invalid |
309 | // Other zeros return zero |
310 | p0 = dfclass(A,#0x08) // A is infinity |
311 | p0 = dfclass(B,#0x01) // B is zero |
312 | } |
313 | { |
314 | if (p1) jump .Ltrue_inf |
315 | p2 = dfclass(B,#0x01) |
316 | } |
317 | { |
318 | if (p0) jump .Linvalid_zeroinf |
319 | if (p2) jump .Ltrue_zero // so return zero |
320 | TMP = ##0x7c000000 |
321 | } |
322 | // We are left with a normal or subnormal times a subnormal. A > B |
323 | // If A and B are both very small (exp(a) < BIAS-MANTBITS), |
324 | // we go to a single sticky bit, which we can round easily. |
325 | // If A and B might multiply to something bigger, decrease A exponent and increase |
326 | // B exponent and try again |
327 | { |
328 | p0 = bitsclr(AH,TMP) |
329 | if (p0.new) jump:nt .Lmul_tiny |
330 | } |
331 | { |
332 | TMP = cl0(BTMP) |
333 | } |
334 | { |
335 | TMP = add(TMP,#-EXPBITS) |
336 | } |
337 | { |
338 | BTMP = asl(BTMP,TMP) |
339 | } |
340 | { |
341 | B = insert(BTMP,#63,#0) |
342 | AH -= asl(TMP,#HI_MANTBITS) |
343 | } |
344 | jump __hexagon_muldf3 |
345 | .Lmul_tiny: |
346 | { |
347 | TMP = USR |
348 | A = xor(A,B) // get sign bit |
349 | } |
350 | { |
351 | TMP = or(TMP,#0x30) // Inexact + Underflow |
352 | A = insert(ONE,#63,#0) // put in rounded up value |
353 | BTMPH = extractu(TMP,#2,#SR_ROUND_OFF) // get rounding mode |
354 | } |
355 | { |
356 | USR = TMP |
357 | p0 = cmp.gt(BTMPH,#1) // Round towards pos/neg inf? |
358 | if (!p0.new) AL = #0 // If not, zero |
359 | BTMPH ^= lsr(AH,#31) // rounding my way --> set LSB |
360 | } |
361 | { |
362 | p0 = cmp.eq(BTMPH,#3) // if rounding towards right inf |
363 | if (!p0.new) AL = #0 // don't go to zero |
364 | jumpr r31 |
365 | } |
366 | .Linvalid_zeroinf: |
367 | { |
368 | TMP = USR |
369 | } |
370 | { |
371 | A = #-1 |
372 | TMP = or(TMP,#2) |
373 | } |
374 | { |
375 | USR = TMP |
376 | } |
377 | { |
378 | p0 = dfcmp.uo(A,A) // force exception if enabled |
379 | jumpr r31 |
380 | } |
381 | .Linvalid_nan: |
382 | { |
383 | p0 = dfclass(B,#0x0f) // if B is not NaN |
384 | TMP = convert_df2sf(A) // will generate invalid if sNaN |
385 | if (p0.new) B = A // make it whatever A is |
386 | } |
387 | { |
388 | BL = convert_df2sf(B) // will generate invalid if sNaN |
389 | A = #-1 |
390 | jumpr r31 |
391 | } |
392 | .falign |
393 | .Ltrue_zero: |
394 | { |
395 | A = B |
396 | B = A |
397 | } |
398 | .Ltrue_inf: |
399 | { |
400 | BH = extract(BH,#1,#31) |
401 | } |
402 | { |
403 | AH ^= asl(BH,#31) |
404 | jumpr r31 |
405 | } |
406 | END(__hexagon_muldf3) |
407 | |
408 | #undef ATMP |
409 | #undef ATMPL |
410 | #undef ATMPH |
411 | #undef BTMP |
412 | #undef BTMPL |
413 | #undef BTMPH |
414 | |