//===-- dffma.S - Hexagon builtin fma routine -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
#define END(TAG) .size TAG,.-TAG

// Double Precision Fused Multiply-Add


#define A r1:0
#define AH r1
#define AL r0
#define B r3:2
#define BH r3
#define BL r2
#define C r5:4
#define CH r5
#define CL r4



#define BTMP r15:14
#define BTMPH r15
#define BTMPL r14

#define ATMP r13:12
#define ATMPH r13
#define ATMPL r12

#define CTMP r11:10
#define CTMPH r11
#define CTMPL r10

#define PP_LL r9:8
#define PP_LL_H r9
#define PP_LL_L r8

#define PP_ODD r7:6
#define PP_ODD_H r7
#define PP_ODD_L r6


#define PP_HH r17:16
#define PP_HH_H r17
#define PP_HH_L r16

#define EXPA r18
#define EXPB r19
#define EXPBA r19:18

#define TMP r28

#define P_TMP p0
#define PROD_NEG p3
#define EXACT p2
#define SWAP p1

#define MANTBITS 52
#define HI_MANTBITS 20
#define EXPBITS 11
#define BIAS 1023
#define STACKSPACE 32

#define ADJUST 4

#define FUDGE 7
#define FUDGE2 3

#ifndef SR_ROUND_OFF
#define SR_ROUND_OFF 22
#endif
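
// For reference, the IEEE-754 double layout the constants above describe:
//   bit 63       sign
//   bits 62..52  11-bit biased exponent (EXPBITS, bias = BIAS = 1023)
//   bits 51..0   52-bit mantissa (MANTBITS; the top HI_MANTBITS = 20 of them
//                live in the high word of a register pair)
//   value = (-1)^sign * 1.mantissa * 2^(exponent - 1023) for normal inputs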

  // First, classify for normal values, and abort if abnormal
  //
  // Next, unpack mantissa into 0x1000_0000_0000_0000 + mant<<8
  //
  // Since we know that the 2 MSBs of the H registers are zero, the partial
  // products that involve the H registers should never carry out
  //
  // Try to buy X slots, at the expense of latency if needed
  //
  // We will have PP_HH with the upper bits of the product, PP_LL with the lower
  // PP_HH can have a maximum of 0x03FF_FFFF_FFFF_FFFF or thereabouts
  // PP_HH can have a minimum of 0x0100_0000_0000_0000
  //
  // 0x0100_0000_0000_0000 has EXP of EXPA+EXPB-BIAS
  //
  // We need to align CTMP.
  // If CTMP >> PP, convert PP to 64 bit with sticky, align CTMP, and follow normal add
  // If CTMP << PP, align CTMP and add 128 bits. Then compute sticky
  // If CTMP ~= PP, align CTMP and add 128 bits. May have massive cancellation.
  //
  // Convert partial product and CTMP to 2's complement prior to addition
  //
  // After we add, we need to normalize into upper 64 bits, then compute sticky.
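
  // A rough sketch of that flow, as C-like pseudocode in comments only
  // (u128/i128 and the helper names are illustrative, not real types or
  // functions; the code below does all of this in fixed point):
  //
  //   u128 pp  = (u128)mant_a * mant_b;         // exponent EXPA+EXPB-BIAS
  //   i128 acc = (sign_a ^ sign_b) ? -pp : pp;  // 2's complement product
  //   i128 c   = sign_c ? -mant_c : mant_c;
  //   acc += shift_right_arith_sticky(c, prod_exp - exp_c);
  //   // (when C's exponent is the larger one, the product is aligned instead)
  //   normalize(&acc, &prod_exp);               // absorb any cancellation
  //   return round_and_pack(acc, prod_exp);     // per USR rounding mode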

  .text
  .global __hexagon_fmadf4
  .type __hexagon_fmadf4,@function
  .global __hexagon_fmadf5
  .type __hexagon_fmadf5,@function
  Q6_ALIAS(fmadf5)
  .p2align 5
__hexagon_fmadf4:
__hexagon_fmadf5:
.Lfma_begin:
  {
    P_TMP = dfclass(A,#2)
    P_TMP = dfclass(B,#2)
    ATMP = #0
    BTMP = #0
  }
  {
    ATMP = insert(A,#MANTBITS,#EXPBITS-3)
    BTMP = insert(B,#MANTBITS,#EXPBITS-3)
    PP_ODD_H = ##0x10000000
    allocframe(#STACKSPACE)
  }
  {
    PP_LL = mpyu(ATMPL,BTMPL)
    if (!P_TMP) jump .Lfma_abnormal_ab
    ATMPH = or(ATMPH,PP_ODD_H)
    BTMPH = or(BTMPH,PP_ODD_H)
  }
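  // The insert/or pair above realizes the "0x1000_0000_0000_0000 + mant<<8"
  // unpacking: insert() drops the 52 mantissa bits at bit offset
  // EXPBITS-3 = 8, and OR-ing 0x10000000 into the high word plants the
  // implicit leading 1 at bit 60. Roughly, in C:
  //
  //   uint64_t atmp = (mant_a << 8) | (1ULL << 60);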
  {
    P_TMP = dfclass(C,#2)
    if (!P_TMP.new) jump:nt .Lfma_abnormal_c
    CTMP = combine(PP_ODD_H,#0)  // 0x1000_0000_0000_0000: implicit bit for C
    PP_ODD = combine(#0,PP_LL_H)  // pp_ll >> 32
  }
.Lfma_abnormal_c_restart:
  {
    PP_ODD += mpyu(BTMPL,ATMPH)
    CTMP = insert(C,#MANTBITS,#EXPBITS-3)
    memd(r29+#0) = PP_HH
    memd(r29+#8) = EXPBA
  }
  {
    PP_ODD += mpyu(ATMPL,BTMPH)
    EXPBA = neg(CTMP)
    P_TMP = cmp.gt(CH,#-1)
    TMP = xor(AH,BH)
  }
  {
    EXPA = extractu(AH,#EXPBITS,#HI_MANTBITS)
    EXPB = extractu(BH,#EXPBITS,#HI_MANTBITS)
    PP_HH = combine(#0,PP_ODD_H)
    if (!P_TMP) CTMP = EXPBA  // C < 0: use the negated mantissa
  }
  {
    PP_HH += mpyu(ATMPH,BTMPH)
    PP_LL = combine(PP_ODD_L,PP_LL_L)
#undef PP_ODD
#undef PP_ODD_H
#undef PP_ODD_L
#undef ATMP
#undef ATMPL
#undef ATMPH
#undef BTMP
#undef BTMPL
#undef BTMPH
#define RIGHTLEFTSHIFT r13:12
#define RIGHTSHIFT r13
#define LEFTSHIFT r12

    EXPA = add(EXPA,EXPB)
#undef EXPB
#undef EXPBA
#define EXPC r19
#define EXPCA r19:18
    EXPC = extractu(CH,#EXPBITS,#HI_MANTBITS)
  }
  // PP_HH:PP_LL now has the product
  // CTMP is negated (held in 2's complement form when C was negative)
  // EXPA,B,C are extracted
  // We still need to negate PP if the product sign is negative.
  // Since we will be adding with carry later, if we need to negate,
  // just invert all bits now, which we can do conditionally and in parallel
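  // The four 32x32 partial products above combine as usual for a 64x64
  // multiply; approximately, in C (ignoring the interleaved register reuse):
  //
  //   uint64_t pp_ll  = (uint64_t)a_lo * b_lo;
  //   uint64_t pp_odd = (uint64_t)a_lo * b_hi
  //                   + (uint64_t)a_hi * b_lo
  //                   + (pp_ll >> 32);
  //   uint64_t pp_hh  = (uint64_t)a_hi * b_hi + (pp_odd >> 32);
  //   uint64_t lo     = (pp_odd << 32) | (uint32_t)pp_ll;   // final PP_LL
  //   // 128-bit product = pp_hh : lo
  //
  // The header-comment guarantee that the top bits of the H words are zero
  // is what keeps pp_odd and pp_hh from overflowing 64 bits.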
#define PP_HH_TMP r15:14
#define PP_LL_TMP r7:6
  {
    EXPA = add(EXPA,#-BIAS+(ADJUST))
    PROD_NEG = !cmp.gt(TMP,#-1)
    PP_LL_TMP = #0
    PP_HH_TMP = #0
  }
  {
    PP_LL_TMP = sub(PP_LL_TMP,PP_LL,PROD_NEG):carry
    P_TMP = !cmp.gt(TMP,#-1)
    SWAP = cmp.gt(EXPC,EXPA)  // If C >> PP
    if (SWAP.new) EXPCA = combine(EXPA,EXPC)
  }
  {
    PP_HH_TMP = sub(PP_HH_TMP,PP_HH,PROD_NEG):carry
    if (P_TMP) PP_LL = PP_LL_TMP
#undef PP_LL_TMP
#define CTMP2 r7:6
#define CTMP2H r7
#define CTMP2L r6
    CTMP2 = #0
    EXPC = sub(EXPA,EXPC)
  }
  {
    if (P_TMP) PP_HH = PP_HH_TMP
    P_TMP = cmp.gt(EXPC,#63)
    if (SWAP) PP_LL = CTMP2
    if (SWAP) CTMP2 = PP_LL
  }
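  // sub(...):carry computes dst = src1 + ~src2 + carry-in and writes the
  // carry-out back into the predicate, so with PROD_NEG as the carry-in the
  // two sub packets above form the exact 128-bit two's complement of the
  // product whenever the result sign (TMP = AH ^ BH) is negative; roughly:
  //
  //   neg_lo = 0 - pp_lo;             // borrow propagates via PROD_NEG
  //   neg_hi = 0 - pp_hi - borrow;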
#undef PP_HH_TMP
//#define ONE r15:14
//#define S_ONE r14
#define ZERO r15:14
#define S_ZERO r15
#undef PROD_NEG
#define P_CARRY p3
  {
    if (SWAP) PP_HH = CTMP  // Swap C and PP
    if (SWAP) CTMP = PP_HH
    if (P_TMP) EXPC = add(EXPC,#-64)
    TMP = #63
  }
  {
    // If diff > 63, pre-shift-right by 64...
    if (P_TMP) CTMP2 = CTMP
    TMP = asr(CTMPH,#31)
    RIGHTSHIFT = min(EXPC,TMP)
    LEFTSHIFT = #0
  }
#undef C
#undef CH
#undef CL
#define STICKIES r5:4
#define STICKIESH r5
#define STICKIESL r4
  {
    if (P_TMP) CTMP = combine(TMP,TMP)  // sign extension of pre-shift-right-64
    STICKIES = extract(CTMP2,RIGHTLEFTSHIFT)
    CTMP2 = lsr(CTMP2,RIGHTSHIFT)
    LEFTSHIFT = sub(#64,RIGHTSHIFT)
  }
  {
    ZERO = #0
    TMP = #-2
    CTMP2 |= lsl(CTMP,LEFTSHIFT)
    CTMP = asr(CTMP,RIGHTSHIFT)
  }
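  // The previous two packets perform a 128-bit arithmetic right shift of
  // CTMP:CTMP2 by RIGHTSHIFT, funnel style, first capturing the bits that
  // fall off the bottom as stickies; roughly:
  //
  //   stickies = c2 & ((1ULL << rshift) - 1);       // extract(...)
  //   c2 = (c2 >> rshift) | (c << (64 - rshift));   // lsr + lsl funnel
  //   c  = (int64_t)c >> rshift;                    // asr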
  {
    P_CARRY = cmp.gtu(STICKIES,ZERO)  // If we have sticky bits from C shift
    if (P_CARRY.new) CTMP2L = and(CTMP2L,TMP)  // make sure adding 1 == OR
#undef ZERO
#define ONE r15:14
#define S_ONE r14
    ONE = #1
    STICKIES = #0
  }
  {
    PP_LL = add(CTMP2,PP_LL,P_CARRY):carry  // use the carry to add the sticky
  }
  {
    PP_HH = add(CTMP,PP_HH,P_CARRY):carry
    TMP = #62
  }
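  // The shifted-out sticky bits must not be lost: P_CARRY is fed in as the
  // carry of the 128-bit add, and clearing bit 0 of CTMP2L first (the and
  // with #-2) makes that "+1" behave exactly like OR-ing in a sticky 1:
  //
  //   if (stickies != 0) c2_lo &= ~1ULL;        // so +1 acts like |1
  //   sum_lo = c2 + pp_lo + (stickies != 0);    // add(...):carry
  //   sum_hi = c  + pp_hi + carry_out(sum_lo);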
  // PP_HH:PP_LL now holds the sum
  // We may need to normalize left, up to ??? bits.
  //
  // I think that if we have massive cancellation, the range we normalize by
  // is still limited
  {
    LEFTSHIFT = add(clb(PP_HH),#-2)
    if (!cmp.eq(LEFTSHIFT.new,TMP)) jump:t 1f  // all sign bits?
  }
  // We had all sign bits, shift left by 62.
  {
    CTMP = extractu(PP_LL,#62,#2)
    PP_LL = asl(PP_LL,#62)
    EXPA = add(EXPA,#-62)  // And adjust exponent of result
  }
  {
    PP_HH = insert(CTMP,#62,#0)  // complete the 62-bit left shift of the pair
  }
  {
    LEFTSHIFT = add(clb(PP_HH),#-2)
  }
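  // clb() counts leading redundant bits (the larger of leading zeros and
  // leading ones), so LEFTSHIFT == 62 above means PP_HH was all sign bits
  // and the significant bits live entirely in PP_LL. The extractu/asl/insert
  // sequence moves the 128-bit pair left by 62 in one step, and clb is then
  // recomputed on the now-meaningful PP_HH.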
  .falign
1:
  {
    CTMP = asl(PP_HH,LEFTSHIFT)
    STICKIES |= asl(PP_LL,LEFTSHIFT)
    RIGHTSHIFT = sub(#64,LEFTSHIFT)
    EXPA = sub(EXPA,LEFTSHIFT)
  }
  {
    CTMP |= lsr(PP_LL,RIGHTSHIFT)
    EXACT = cmp.gtu(ONE,STICKIES)
    TMP = #BIAS+BIAS-2
  }
  {
    if (!EXACT) CTMPL = or(CTMPL,S_ONE)
    // If EXPA is overflow/underflow, jump to ovf_unf
    P_TMP = !cmp.gt(EXPA,TMP)
    P_TMP = cmp.gt(EXPA,#1)
    if (!P_TMP.new) jump:nt .Lfma_ovf_unf
  }
  {
    // XXX: FIXME: should PP_HH for check of zero be CTMP?
    P_TMP = cmp.gtu(ONE,CTMP)  // is result true zero?
    A = convert_d2df(CTMP)
    EXPA = add(EXPA,#-BIAS-60)
    PP_HH = memd(r29+#0)
  }
  {
    AH += asl(EXPA,#HI_MANTBITS)
    EXPCA = memd(r29+#8)
    if (!P_TMP) dealloc_return  // not zero, return
  }
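  // convert_d2df does the rounding: CTMP is treated as a 64-bit signed
  // integer and converted to double per the USR rounding mode, and the true
  // scale is then restored by adding the pre-adjusted exponent correction
  // (EXPA - BIAS - 60) straight into the 11-bit exponent field; roughly:
  //
  //   double a = (double)(int64_t)ctmp;   // rounds, sets some exponent
  //   a_hi += expa << HI_MANTBITS;        // rescale the biased exponent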
.Ladd_yields_zero:
  // We had full cancellation. Return +/- zero (-0 when round-down)
  {
    TMP = USR
    A = #0
  }
  {
    TMP = extractu(TMP,#2,#SR_ROUND_OFF)
    PP_HH = memd(r29+#0)
    EXPCA = memd(r29+#8)
  }
  {
    p0 = cmp.eq(TMP,#2)
    if (p0.new) AH = ##0x80000000
    dealloc_return
  }
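  // The two USR bits at SR_ROUND_OFF hold the rounding mode; mode 2 is
  // round toward negative infinity, the only mode in which IEEE-754 defines
  // the exactly cancelled sum (+x) + (-x) to be -0 rather than +0.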

#undef RIGHTLEFTSHIFT
#undef RIGHTSHIFT
#undef LEFTSHIFT
#undef CTMP2
#undef CTMP2H
#undef CTMP2L

.Lfma_ovf_unf:
  {
    p0 = cmp.gtu(ONE,CTMP)
    if (p0.new) jump:nt .Ladd_yields_zero
  }
  {
    A = convert_d2df(CTMP)
    EXPA = add(EXPA,#-BIAS-60)
    TMP = EXPA
  }
#define NEW_EXPB r7
#define NEW_EXPA r6
  {
    AH += asl(EXPA,#HI_MANTBITS)
    NEW_EXPB = extractu(AH,#EXPBITS,#HI_MANTBITS)
  }
  {
    NEW_EXPA = add(EXPA,NEW_EXPB)
    PP_HH = memd(r29+#0)
    EXPCA = memd(r29+#8)
#undef PP_HH
#undef PP_HH_H
#undef PP_HH_L
#undef EXPCA
#undef EXPC
#undef EXPA
#undef PP_LL
#undef PP_LL_H
#undef PP_LL_L
#define EXPA r6
#define EXPB r7
#define EXPBA r7:6
#define ATMP r9:8
#define ATMPH r9
#define ATMPL r8
#undef NEW_EXPB
#undef NEW_EXPA
    ATMP = abs(CTMP)
  }
  {
    p0 = cmp.gt(EXPA,##BIAS+BIAS)
    if (p0.new) jump:nt .Lfma_ovf
  }
  {
    p0 = cmp.gt(EXPA,#0)
    if (p0.new) jump:nt .Lpossible_unf
  }
  {
    // TMP has the original EXPA.
    // ATMP is the corresponding value
    // Normalize ATMP and shift right to the correct location
    EXPB = add(clb(ATMP),#-2)  // Amount to left shift to normalize
    EXPA = sub(#1+5,TMP)       // Amount to right shift to denormalize
    p3 = cmp.gt(CTMPH,#-1)
  }
  // Underflow
  // We know that the infinite-range exponent should be EXPA
  // CTMP is 2's complement, ATMP is abs(CTMP)
  {
    EXPA = add(EXPA,EXPB)  // how much to shift back right
    ATMP = asl(ATMP,EXPB)  // shift left
    AH = USR
    TMP = #63
  }
  {
    EXPB = min(EXPA,TMP)
    EXPA = #0
    AL = #0x0030  // USR underflow | inexact bits
  }
  {
    B = extractu(ATMP,EXPBA)
    ATMP = asr(ATMP,EXPB)
  }
  {
    p0 = cmp.gtu(ONE,B)
    if (!p0.new) ATMPL = or(ATMPL,S_ONE)
    ATMPH = setbit(ATMPH,#HI_MANTBITS+FUDGE2)
  }
  {
    CTMP = neg(ATMP)
    p1 = bitsclr(ATMPL,#(1<<FUDGE2)-1)
    if (!p1.new) AH = or(AH,AL)
    B = #0
  }
  {
    if (p3) CTMP = ATMP
    USR = AH
    TMP = #-BIAS-(MANTBITS+FUDGE2)
  }
  {
    A = convert_d2df(CTMP)
  }
  {
    AH += asl(TMP,#HI_MANTBITS)
    dealloc_return
  }
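  // In effect, the underflow path above rounds a denormal via the same
  // convert_d2df trick: abs(CTMP) is renormalized, shifted right so the bits
  // below the subnormal's precision end up as FUDGE2 guard bits plus a
  // sticky OR, and a marker bit is planted at HI_MANTBITS+FUDGE2 to pin the
  // converted exponent. The final subtraction of BIAS+MANTBITS+FUDGE2 in the
  // exponent field cancels the marker's weight, leaving a correctly rounded
  // denormal, with underflow|inexact raised when bits were lost.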
.Lpossible_unf:
  {
    TMP = ##0x7fefffff
    ATMP = abs(CTMP)
  }
  {
    p0 = cmp.eq(AL,#0)
    p0 = bitsclr(AH,TMP)
    if (!p0.new) dealloc_return:t
    TMP = #0x7fff
  }
  {
    p0 = bitsset(ATMPH,TMP)
    BH = USR
    BL = #0x0030  // USR underflow | inexact bits
  }
  {
    if (p0) BH = or(BH,BL)
  }
  {
    USR = BH
  }
  {
    p0 = dfcmp.eq(A,A)
    dealloc_return
  }
.Lfma_ovf:
  {
    TMP = USR
    CTMP = combine(##0x7fefffff,#-1)
    A = CTMP
  }
  {
    ATMP = combine(##0x7ff00000,#0)
    BH = extractu(TMP,#2,#SR_ROUND_OFF)
    TMP = or(TMP,#0x28)  // set overflow | inexact
  }
  {
    USR = TMP
    BH ^= lsr(AH,#31)
    BL = BH
  }
  {
    p0 = !cmp.eq(BL,#1)
    p0 = !cmp.eq(BH,#2)
  }
  {
    p0 = dfcmp.eq(ATMP,ATMP)
    if (p0.new) CTMP = ATMP
  }
  {
    A = insert(CTMP,#63,#0)
    dealloc_return
  }
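  // In the overflow path above, overflow|inexact are set in USR and the
  // result is either the maximum finite value or infinity, the choice
  // depending on the rounding mode (the two USR bits) combined with the
  // result sign; the final insert replaces only the lower 63 bits, keeping
  // A's sign bit.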
#undef CTMP
#undef CTMPH
#undef CTMPL
#define BTMP r11:10
#define BTMPH r11
#define BTMPL r10

#undef STICKIES
#undef STICKIESH
#undef STICKIESL
#define C r5:4
#define CH r5
#define CL r4

.Lfma_abnormal_ab:
  {
    ATMP = extractu(A,#63,#0)
    BTMP = extractu(B,#63,#0)
    deallocframe
  }
  {
    p3 = cmp.gtu(ATMP,BTMP)
    if (!p3.new) A = B  // sort so A has the larger magnitude
    if (!p3.new) B = A  // (packet-parallel swap)
  }
  {
    p0 = dfclass(A,#0x0f)  // is A numeric (not NaN)?
    if (!p0.new) jump:nt .Lnan
    if (!p3) ATMP = BTMP
    if (!p3) BTMP = ATMP
  }
  {
    p1 = dfclass(A,#0x08)  // A is infinity
    p1 = dfclass(B,#0x0e)  // B is nonzero
  }
  {
    p0 = dfclass(A,#0x08)  // A is inf
    p0 = dfclass(B,#0x01)  // B is zero
  }
  {
    if (p1) jump .Lab_inf
    p2 = dfclass(B,#0x01)
  }
  {
    if (p0) jump .Linvalid
    if (p2) jump .Lab_true_zero
    TMP = ##0x7c000000  // top bits of the exponent field
  }
  // We are left with a normal or subnormal times a subnormal, with A > B.
  // If A and B are both very small, we will go to a single sticky bit; replace
  // the lower 63 bits of A and B with 0x0010_0000_0000_0000, which yields
  // equivalent results.
  // If A and B might multiply to something bigger, decrease A's exponent,
  // increase B's exponent, and start over.
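  // The rescaling below preserves the product: B is normalized by shifting
  // its mantissa left by TMP = clb(B) - EXPBITS bits, and A's exponent field
  // is decreased by the same TMP, so roughly
  // (A * 2^-TMP) * (B * 2^TMP) == A * B, now with a representable B.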
  {
    p0 = bitsclr(AH,TMP)  // is A's exponent tiny (top exponent bits clear)?
    if (p0.new) jump:nt .Lfma_ab_tiny
  }
  {
    TMP = add(clb(BTMP),#-EXPBITS)
  }
  {
    BTMP = asl(BTMP,TMP)
  }
  {
    B = insert(BTMP,#63,#0)
    AH -= asl(TMP,#HI_MANTBITS)
  }
  jump .Lfma_begin

.Lfma_ab_tiny:
  ATMP = combine(##0x00100000,#0)
  {
    A = insert(ATMP,#63,#0)
    B = insert(ATMP,#63,#0)
  }
  jump .Lfma_begin

.Lab_inf:
  {
    B = lsr(B,#63)  // extract sign of B
    p0 = dfclass(C,#0x10)
  }
  {
    A ^= asl(B,#63)  // apply the product sign to the infinity
    if (p0) jump .Lnan
  }
  {
    p1 = dfclass(C,#0x08)
    if (p1.new) jump:nt .Lfma_inf_plus_inf
  }
  // A*B is +/- inf, C is finite. Return A
  {
    jumpr r31
  }
  .falign
.Lfma_inf_plus_inf:
  { // adding infinities of different signs is invalid
    p0 = dfcmp.eq(A,C)
    if (!p0.new) jump:nt .Linvalid
  }
  {
    jumpr r31
  }

.Lnan:
  {
    p0 = dfclass(B,#0x10)
    p1 = dfclass(C,#0x10)
    if (!p0.new) B = A
    if (!p1.new) C = A
  }
  { // find sNaNs: converting an sNaN raises the invalid flag
    BH = convert_df2sf(B)
    BL = convert_df2sf(C)
  }
  {
    BH = convert_df2sf(A)
    A = #-1  // return the all-ones default qNaN
    jumpr r31
  }

.Linvalid:
  {
    TMP = ##0x7f800001  // single-precision sNaN
  }
  {
    A = convert_sf2df(TMP)
    jumpr r31
  }
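  // Converting the single-precision sNaN 0x7f800001 to double raises the
  // IEEE invalid-operation flag and delivers a quiet NaN, which is the
  // required behavior for inf * 0 and inf - inf.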

.Lab_true_zero:
  // B is zero, A is a finite number
  {
    p0 = dfclass(C,#0x10)  // is C NaN?
    if (p0.new) jump:nt .Lnan
    if (p0.new) A = C
  }
  {
    p0 = dfcmp.eq(B,C)  // is C also zero?
    AH = lsr(AH,#31)  // get sign
  }
  {
    BH ^= asl(AH,#31)  // form correctly signed zero in B
    if (!p0) A = C  // If C is not zero, return C
    if (!p0) jumpr r31
  }
  // B has the correctly signed zero, C is also zero
.Lzero_plus_zero:
  {
    p0 = cmp.eq(B,C)  // scalar equality: +0 + +0 or -0 + -0
    if (p0.new) jumpr:t r31
    A = B
  }
  {
    TMP = USR
  }
  {
    TMP = extractu(TMP,#2,#SR_ROUND_OFF)
    A = #0
  }
  {
    p0 = cmp.eq(TMP,#2)
    if (p0.new) AH = ##0x80000000  // signs differ: -0 only when rounding down
    jumpr r31
  }
#undef BTMP
#undef BTMPH
#undef BTMPL
#define CTMP r11:10
  .falign
.Lfma_abnormal_c:
  // We know that A*B is normal * normal
  // C is not normal: zero, subnormal, inf, or NaN.
  {
    p0 = dfclass(C,#0x10)  // is C NaN?
    if (p0.new) jump:nt .Lnan
    if (p0.new) A = C  // move NaN to A
    deallocframe
  }
  {
    p0 = dfclass(C,#0x08)  // is C inf?
    if (p0.new) A = C  // return C
    if (p0.new) jumpr:nt r31
  }
  // zero or subnormal
  // If C is zero, and we know A*B is normal*normal, we can just call the
  // normal multiply
  {
    p0 = dfclass(C,#0x01)  // is C zero?
    if (p0.new) jump:nt __hexagon_muldf3
    TMP = #1
  }
  // Left with: subnormal
  // Adjust C and jump back to restart
  {
    allocframe(#STACKSPACE)  // deallocated above, so re-allocate the frame
    CTMP = #0
    CH = insert(TMP,#EXPBITS,#HI_MANTBITS)
    jump .Lfma_abnormal_c_restart
  }
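  // Subnormal C: setting the exponent field to 1 makes EXPC extract as 1,
  // the true scale of a subnormal (2^(1-BIAS)); and because the restart path
  // inserts only the mantissa into a zeroed CTMP, no implicit leading 1 is
  // planted, so C is treated as 0.mant * 2^(1-BIAS) as required.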
END(__hexagon_fmadf4)
