//===----------------------Hexagon builtin routine ------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
#define END(TAG) .size TAG,.-TAG
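// Q6_ALIAS publishes the legacy __qdsp_* name as an alias of the
// __hexagon_* entry point; END records the symbol size for tools.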

// Double Precision Fused Multiply-Add


#define A r1:0
#define AH r1
#define AL r0
#define B r3:2
#define BH r3
#define BL r2
#define C r5:4
#define CH r5
#define CL r4



#define BTMP r15:14
#define BTMPH r15
#define BTMPL r14

#define ATMP r13:12
#define ATMPH r13
#define ATMPL r12

#define CTMP r11:10
#define CTMPH r11
#define CTMPL r10

#define PP_LL r9:8
#define PP_LL_H r9
#define PP_LL_L r8

#define PP_ODD r7:6
#define PP_ODD_H r7
#define PP_ODD_L r6


#define PP_HH r17:16
#define PP_HH_H r17
#define PP_HH_L r16

#define EXPA r18
#define EXPB r19
#define EXPBA r19:18

#define TMP r28

#define P_TMP p0
#define PROD_NEG p3
#define EXACT p2
#define SWAP p1

#define MANTBITS 52
#define HI_MANTBITS 20
#define EXPBITS 11
#define BIAS 1023
#define STACKSPACE 32

#define ADJUST 4

#define FUDGE 7
#define FUDGE2 3

#ifndef SR_ROUND_OFF
#define SR_ROUND_OFF 22
#endif
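// The IEEE rounding mode lives in the two USR bits at SR_ROUND_OFF. The
// code below assumes the usual Hexagon encoding: 0 = round to nearest
// even, 1 = toward zero, 2 = downward (-inf), 3 = upward (+inf).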

// First, classify for normal values, and abort if abnormal
//
// Next, unpack mantissa into 0x1000_0000_0000_0000 + mant<<8
//
// Since we know that the 2 MSBs of the H registers are zero, the partial
// products that involve the H registers can never carry out
//
// Try to buy X slots, at the expense of latency if needed
//
// We will have PP_HH with the upper bits of the product, PP_LL with the lower
// PP_HH can have a maximum of 0x03FF_FFFF_FFFF_FFFF or thereabouts
// PP_HH can have a minimum of 0x0100_0000_0000_0000
//
// 0x0100_0000_0000_0000 has EXP of EXPA+EXPB-BIAS
//
// We need to align CTMP.
// If CTMP >> PP, convert PP to 64 bits with sticky, align CTMP, and follow the normal add
// If CTMP << PP, align CTMP and add 128 bits. Then compute sticky
// If CTMP ~= PP, align CTMP and add 128 bits. May have massive cancellation.
//
// Convert the partial product and CTMP to 2's complement prior to addition
//
// After we add, we need to normalize into the upper 64 bits, then compute sticky.
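//
// A rough C-level sketch of the flow (illustrative only; the code below
// works on 128-bit partial products and uses packet-level tricks that the
// sketch glosses over):
//
//   unpack a, b, c into sign / exponent / (0x1000_0000_0000_0000 | mant<<8)
//   pp = (u128) mant_a * mant_b              // schoolbook 32x32 partials
//   if (sign_a ^ sign_b) pp = -pp            // work in 2's complement
//   if (sign_c) mant_c = -mant_c
//   align the smaller of pp and mant_c to the larger, keeping sticky bits
//   sum = pp + mant_c                        // 128-bit add, chained carry
//   normalize sum into the top 64 bits, folding leftovers into sticky
//   round via convert_d2df, then patch the exponent field of the result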

    .text
    .global __hexagon_fmadf4
    .type __hexagon_fmadf4,@function
    .global __hexagon_fmadf5
    .type __hexagon_fmadf5,@function
    Q6_ALIAS(fmadf5)
    .p2align 5
__hexagon_fmadf4:
__hexagon_fmadf5:
.Lfma_begin:
{
    P_TMP = dfclass(A,#2)
    P_TMP = dfclass(B,#2)
    ATMP = #0
    BTMP = #0
}
{
    ATMP = insert(A,#MANTBITS,#EXPBITS-3)
    BTMP = insert(B,#MANTBITS,#EXPBITS-3)
    PP_ODD_H = ##0x10000000
    allocframe(#STACKSPACE)
}
{
    PP_LL = mpyu(ATMPL,BTMPL)
    if (!P_TMP) jump .Lfma_abnormal_ab
    ATMPH = or(ATMPH,PP_ODD_H)
    BTMPH = or(BTMPH,PP_ODD_H)
}
{
    P_TMP = dfclass(C,#2)
    if (!P_TMP.new) jump:nt .Lfma_abnormal_c
    CTMP = combine(PP_ODD_H,#0)
    PP_ODD = combine(#0,PP_LL_H)
}
.Lfma_abnormal_c_restart:
{
    PP_ODD += mpyu(BTMPL,ATMPH)
    CTMP = insert(C,#MANTBITS,#EXPBITS-3)
    memd(r29+#0) = PP_HH
    memd(r29+#8) = EXPBA
}
{
    PP_ODD += mpyu(ATMPL,BTMPH)
    EXPBA = neg(CTMP)
    P_TMP = cmp.gt(CH,#-1)
    TMP = xor(AH,BH)
}
{
    EXPA = extractu(AH,#EXPBITS,#HI_MANTBITS)
    EXPB = extractu(BH,#EXPBITS,#HI_MANTBITS)
    PP_HH = combine(#0,PP_ODD_H)
    if (!P_TMP) CTMP = EXPBA
}
{
    PP_HH += mpyu(ATMPH,BTMPH)
    PP_LL = combine(PP_ODD_L,PP_LL_L)
#undef PP_ODD
#undef PP_ODD_H
#undef PP_ODD_L
#undef ATMP
#undef ATMPL
#undef ATMPH
#undef BTMP
#undef BTMPL
#undef BTMPH
#define RIGHTLEFTSHIFT r13:12
#define RIGHTSHIFT r13
#define LEFTSHIFT r12

    EXPA = add(EXPA,EXPB)
#undef EXPB
#undef EXPBA
#define EXPC r19
#define EXPCA r19:18
    EXPC = extractu(CH,#EXPBITS,#HI_MANTBITS)
}
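// The 64x64 -> 128 bit mantissa product was assembled schoolbook-style
// from 32-bit halves: A*B = (AH'*BH' << 64) + ((AH'*BL' + AL'*BH') << 32)
// + AL'*BL', where primes denote the unpacked mantissa halves. PP_LL
// collected the low 64 bits and PP_HH the high 64, with the middle (odd)
// partial products folded through PP_ODD.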
// PP_HH:PP_LL now has product
// CTMP is negated
// EXPA,B,C are extracted
// We need to negate PP
// Since we will be adding with carry later, if we need to negate,
// just invert all bits now, which we can do conditionally and in parallel
#define PP_HH_TMP r15:14
#define PP_LL_TMP r7:6
{
    EXPA = add(EXPA,#-BIAS+(ADJUST))
    PROD_NEG = !cmp.gt(TMP,#-1)
    PP_LL_TMP = #0
    PP_HH_TMP = #0
}
{
    PP_LL_TMP = sub(PP_LL_TMP,PP_LL,PROD_NEG):carry
    P_TMP = !cmp.gt(TMP,#-1)
    SWAP = cmp.gt(EXPC,EXPA)    // If C >> PP
    if (SWAP.new) EXPCA = combine(EXPA,EXPC)
}
{
    PP_HH_TMP = sub(PP_HH_TMP,PP_HH,PROD_NEG):carry
    if (P_TMP) PP_LL = PP_LL_TMP
#undef PP_LL_TMP
#define CTMP2 r7:6
#define CTMP2H r7
#define CTMP2L r6
    CTMP2 = #0
    EXPC = sub(EXPA,EXPC)
}
{
    if (P_TMP) PP_HH = PP_HH_TMP
    P_TMP = cmp.gt(EXPC,#63)
    if (SWAP) PP_LL = CTMP2
    if (SWAP) CTMP2 = PP_LL
}
#undef PP_HH_TMP
//#define ONE r15:14
//#define S_ONE r14
#define ZERO r15:14
#define S_ZERO r15
#undef PROD_NEG
#define P_CARRY p3
{
    if (SWAP) PP_HH = CTMP    // Swap C and PP
    if (SWAP) CTMP = PP_HH
    if (P_TMP) EXPC = add(EXPC,#-64)
    TMP = #63
}
{
    // If diff > 63, pre-shift-right by 64...
    if (P_TMP) CTMP2 = CTMP
    TMP = asr(CTMPH,#31)
    RIGHTSHIFT = min(EXPC,TMP)
    LEFTSHIFT = #0
}
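// Packet-semantics note: register reads within a packet see the values
// from before the packet, so RIGHTSHIFT = min(EXPC,TMP) above used the
// TMP = #63 clamp set in the previous packet, while the new TMP (the sign
// extension of CTMPH) feeds the combine() in the next packet.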
#undef C
#undef CH
#undef CL
#define STICKIES r5:4
#define STICKIESH r5
#define STICKIESL r4
{
    if (P_TMP) CTMP = combine(TMP,TMP)    // sign extension of pre-shift-right-64
    STICKIES = extract(CTMP2,RIGHTLEFTSHIFT)
    CTMP2 = lsr(CTMP2,RIGHTSHIFT)
    LEFTSHIFT = sub(#64,RIGHTSHIFT)
}
{
    ZERO = #0
    TMP = #-2
    CTMP2 |= lsl(CTMP,LEFTSHIFT)
    CTMP = asr(CTMP,RIGHTSHIFT)
}
{
    P_CARRY = cmp.gtu(STICKIES,ZERO)    // If we have sticky bits from C shift
    if (P_CARRY.new) CTMP2L = and(CTMP2L,TMP)    // make sure adding 1 == OR
#undef ZERO
#define ONE r15:14
#define S_ONE r14
    ONE = #1
    STICKIES = #0
}
{
    PP_LL = add(CTMP2,PP_LL,P_CARRY):carry    // use the carry to add the sticky
}
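// The :carry adds chain P_CARRY from the low 64 bits into the high 64,
// forming a full 128-bit sum. P_CARRY was preset from the C sticky bits,
// and the LSB cleared above makes that +1 behave exactly like an OR.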
{
    PP_HH = add(CTMP,PP_HH,P_CARRY):carry
    TMP = #62
}
// PP_HH:PP_LL now holds the sum
// We may need to normalize left, up to ??? bits.
//
// I think that if we have massive cancellation, the range we normalize by
// is still limited
{
    LEFTSHIFT = add(clb(PP_HH),#-2)
    if (!cmp.eq(LEFTSHIFT.new,TMP)) jump:t 1f    // all sign bits?
}
// We had all sign bits, shift left by 62.
{
    CTMP = extractu(PP_LL,#62,#2)
    PP_LL = asl(PP_LL,#62)
    EXPA = add(EXPA,#-62)    // And adjust exponent of result
}
{
    PP_HH = insert(CTMP,#62,#0)    // Fold the bits shifted out of PP_LL into PP_HH
}
{
    LEFTSHIFT = add(clb(PP_HH),#-2)
}
    .falign
1:
{
    CTMP = asl(PP_HH,LEFTSHIFT)
    STICKIES |= asl(PP_LL,LEFTSHIFT)
    RIGHTSHIFT = sub(#64,LEFTSHIFT)
    EXPA = sub(EXPA,LEFTSHIFT)
}
{
    CTMP |= lsr(PP_LL,RIGHTSHIFT)
    EXACT = cmp.gtu(ONE,STICKIES)
    TMP = #BIAS+BIAS-2
}
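// CTMP now holds the top 64 bits of the normalized sum; anything left
// behind in STICKIES makes the result inexact and is ORed into the LSB
// below so that the final conversion rounds correctly.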
{
    if (!EXACT) CTMPL = or(CTMPL,S_ONE)
    // If EXPA is overflow/underflow, jump to ovf_unf
    P_TMP = !cmp.gt(EXPA,TMP)
    P_TMP = cmp.gt(EXPA,#1)
    if (!P_TMP.new) jump:nt .Lfma_ovf_unf
}
{
    // XXX: FIXME: should PP_HH for check of zero be CTMP?
    P_TMP = cmp.gtu(ONE,CTMP)    // is result true zero?
    A = convert_d2df(CTMP)
    EXPA = add(EXPA,#-BIAS-60)
    PP_HH = memd(r29+#0)
}
{
    AH += asl(EXPA,#HI_MANTBITS)
    EXPCA = memd(r29+#8)
    if (!P_TMP) dealloc_return    // not zero, return
}
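// Result packing: convert_d2df rounds the 64-bit 2's-complement value to
// a double at integer scale, and adding EXPA into the exponent field
// (EXPA << HI_MANTBITS) then rescales by 2^EXPA in a single add; the
// earlier range check ensures the adjusted field stays in normal range.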
.Ladd_yields_zero:
// We had full cancellation. Return +/- zero (-0 when round-down)
{
    TMP = USR
    A = #0
}
{
    TMP = extractu(TMP,#2,#SR_ROUND_OFF)
    PP_HH = memd(r29+#0)
    EXPCA = memd(r29+#8)
}
{
    p0 = cmp.eq(TMP,#2)
    if (p0.new) AH = ##0x80000000
    dealloc_return
}

#undef RIGHTLEFTSHIFT
#undef RIGHTSHIFT
#undef LEFTSHIFT
#undef CTMP2
#undef CTMP2H
#undef CTMP2L

.Lfma_ovf_unf:
{
    p0 = cmp.gtu(ONE,CTMP)
    if (p0.new) jump:nt .Ladd_yields_zero
}
{
    A = convert_d2df(CTMP)
    EXPA = add(EXPA,#-BIAS-60)
    TMP = EXPA
}
#define NEW_EXPB r7
#define NEW_EXPA r6
{
    AH += asl(EXPA,#HI_MANTBITS)
    NEW_EXPB = extractu(AH,#EXPBITS,#HI_MANTBITS)
}
{
    NEW_EXPA = add(EXPA,NEW_EXPB)
    PP_HH = memd(r29+#0)
    EXPCA = memd(r29+#8)
#undef PP_HH
#undef PP_HH_H
#undef PP_HH_L
#undef EXPCA
#undef EXPC
#undef EXPA
#undef PP_LL
#undef PP_LL_H
#undef PP_LL_L
#define EXPA r6
#define EXPB r7
#define EXPBA r7:6
#define ATMP r9:8
#define ATMPH r9
#define ATMPL r8
#undef NEW_EXPB
#undef NEW_EXPA
    ATMP = abs(CTMP)
}
{
    p0 = cmp.gt(EXPA,##BIAS+BIAS)
    if (p0.new) jump:nt .Lfma_ovf
}
{
    p0 = cmp.gt(EXPA,#0)
    if (p0.new) jump:nt .Lpossible_unf
}
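// EXPA now holds the would-be biased exponent of the result. Above
// 2*BIAS the result overflows. If it is still positive, the packed value
// is representable, but a result landing exactly on the smallest normal
// may have rounded up out of the subnormal range, so .Lpossible_unf
// checks whether underflow/inexact must still be raised.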
{
    // TMP has original EXPA.
    // ATMP is the corresponding magnitude (abs of the 2's-complement result)
    // Normalize ATMP and shift right to correct location
    EXPB = add(clb(ATMP),#-2)    // Amount to left shift to normalize
    EXPA = sub(#1+5,TMP)         // Amount to right shift to denormalize
    p3 = cmp.gt(CTMPH,#-1)
}
// Underflow
// We know that the infinite-range exponent should be EXPA
// CTMP is 2's complement, ATMP is abs(CTMP)
{
    EXPA = add(EXPA,EXPB)    // how much to shift back right
    ATMP = asl(ATMP,EXPB)    // shift left
    AH = USR
    TMP = #63
}
{
    EXPB = min(EXPA,TMP)
    EXPA = #0
    AL = #0x0030
}
{
    B = extractu(ATMP,EXPBA)
    ATMP = asr(ATMP,EXPB)
}
{
    p0 = cmp.gtu(ONE,B)
    if (!p0.new) ATMPL = or(ATMPL,S_ONE)
    ATMPH = setbit(ATMPH,#HI_MANTBITS+FUDGE2)
}
{
    CTMP = neg(ATMP)
    p1 = bitsclr(ATMPL,#(1<<FUDGE2)-1)
    if (!p1.new) AH = or(AH,AL)
    B = #0
}
{
    if (p3) CTMP = ATMP
    USR = AH
    TMP = #-BIAS-(MANTBITS+FUDGE2)
}
{
    A = convert_d2df(CTMP)
}
{
    AH += asl(TMP,#HI_MANTBITS)
    dealloc_return
}
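// Denormal rounding, a sketch of the trick above: the magnitude was
// normalized, shifted right into denormal position (saving the shifted-
// out bits as sticky), and a marker bit was set at HI_MANTBITS+FUDGE2 so
// the integer converts at a known scale; subtracting BIAS+MANTBITS+FUDGE2
// from the exponent field afterwards drops the value into the denormal
// encoding, already rounded in the current mode. 0x0030 is assumed to be
// the underflow|inexact sticky-flag bits in USR.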
.Lpossible_unf:
{
    TMP = ##0x7fefffff
    ATMP = abs(CTMP)
}
{
    p0 = cmp.eq(AL,#0)
    p0 = bitsclr(AH,TMP)
    if (!p0.new) dealloc_return:t
    TMP = #0x7fff
}
{
    p0 = bitsset(ATMPH,TMP)
    BH = USR
    BL = #0x0030
}
{
    if (p0) BH = or(BH,BL)
}
{
    USR = BH
}
{
    p0 = dfcmp.eq(A,A)
    dealloc_return
}
.Lfma_ovf:
{
    TMP = USR
    CTMP = combine(##0x7fefffff,#-1)
    A = CTMP
}
{
    ATMP = combine(##0x7ff00000,#0)
    BH = extractu(TMP,#2,#SR_ROUND_OFF)
    TMP = or(TMP,#0x28)
}
{
    USR = TMP
    BH ^= lsr(AH,#31)
    BL = BH
}
{
    p0 = !cmp.eq(BL,#1)
    p0 = !cmp.eq(BH,#2)
}
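// Packet-semantics notes for the overflow path: A = CTMP above captured
// the signed 2's-complement result before CTMP was overwritten with max
// finite, and BL = BH captured the raw rounding mode before the sign was
// XORed in. The test above therefore keeps max finite exactly when the
// mode is toward-zero or when mode^sign selects the directed mode that
// points away from the overflow; every other case becomes a correctly
// signed infinity below. (Rounding encoding as assumed earlier: 0 =
// nearest, 1 = toward zero, 2 = down, 3 = up.)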
{
    p0 = dfcmp.eq(ATMP,ATMP)
    if (p0.new) CTMP = ATMP
}
{
    A = insert(CTMP,#63,#0)
    dealloc_return
}
#undef CTMP
#undef CTMPH
#undef CTMPL
#define BTMP r11:10
#define BTMPH r11
#define BTMPL r10

#undef STICKIES
#undef STICKIESH
#undef STICKIESL
#define C r5:4
#define CH r5
#define CL r4

.Lfma_abnormal_ab:
{
    ATMP = extractu(A,#63,#0)
    BTMP = extractu(B,#63,#0)
    deallocframe
}
{
    p3 = cmp.gtu(ATMP,BTMP)
    if (!p3.new) A = B    // sort values
    if (!p3.new) B = A
}
{
    p0 = dfclass(A,#0x0f)    // A NaN?
    if (!p0.new) jump:nt .Lnan
    if (!p3) ATMP = BTMP
    if (!p3) BTMP = ATMP
}
{
    p1 = dfclass(A,#0x08)    // A is infinity
    p1 = dfclass(B,#0x0e)    // B is nonzero
}
{
    p0 = dfclass(A,#0x08)    // a is inf
    p0 = dfclass(B,#0x01)    // b is zero
}
{
    if (p1) jump .Lab_inf
    p2 = dfclass(B,#0x01)
}
{
    if (p0) jump .Linvalid
    if (p2) jump .Lab_true_zero
    TMP = ##0x7c000000
}
// We are left with a normal or subnormal times a subnormal, with A > B.
// If A and B are both very small, the product collapses to a single sticky
// bit; replace the lower 63 bits of A and B with 0x0010_0000_0000_0000,
// which yields equivalent results.
// If A*B might be bigger, decrease A's exponent and raise B's (by
// normalizing its mantissa), then start over.
{
    p0 = bitsclr(AH,TMP)
    if (p0.new) jump:nt .Lfma_ab_tiny
}
{
    TMP = add(clb(BTMP),#-EXPBITS)
}
{
    BTMP = asl(BTMP,TMP)
}
{
    B = insert(BTMP,#63,#0)
    AH -= asl(TMP,#HI_MANTBITS)
}
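// Net effect: BTMP was shifted left so its leading bit lands in the
// hidden-bit position, turning the subnormal B into a normal encoding of
// B * 2^TMP, while A's exponent field was lowered by TMP; the product
// A*B is unchanged and both operands are now normal, so retry from the
// top.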
    jump .Lfma_begin

.Lfma_ab_tiny:
    ATMP = combine(##0x00100000,#0)
{
    A = insert(ATMP,#63,#0)
    B = insert(ATMP,#63,#0)
}
    jump .Lfma_begin

.Lab_inf:
{
    B = lsr(B,#63)
    p0 = dfclass(C,#0x10)
}
{
    A ^= asl(B,#63)
    if (p0) jump .Lnan
}
{
    p1 = dfclass(C,#0x08)
    if (p1.new) jump:nt .Lfma_inf_plus_inf
}
// A*B is +/- inf, C is finite. Return A
{
    jumpr r31
}
    .falign
.Lfma_inf_plus_inf:
{    // adding infinities of different signs is invalid
    p0 = dfcmp.eq(A,C)
    if (!p0.new) jump:nt .Linvalid
}
{
    jumpr r31
}

.Lnan:
{
    p0 = dfclass(B,#0x10)
    p1 = dfclass(C,#0x10)
    if (!p0.new) B = A
    if (!p1.new) C = A
}
{    // find sNaNs
    BH = convert_df2sf(B)
    BL = convert_df2sf(C)
}
{
    BH = convert_df2sf(A)
    A = #-1
    jumpr r31
}
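// The df2sf conversions above exist only for their side effect: they
// raise the invalid flag if any operand is a signaling NaN. A = #-1 is
// the all-ones quiet-NaN pattern returned for any NaN input.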

.Linvalid:
{
    TMP = ##0x7f800001    // sp snan
}
{
    A = convert_sf2df(TMP)
    jumpr r31
}
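// Converting a single-precision sNaN to double raises the invalid flag
// and yields the default quiet NaN as the result.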

.Lab_true_zero:
// B is zero, A is a finite number
{
    p0 = dfclass(C,#0x10)
    if (p0.new) jump:nt .Lnan
    if (p0.new) A = C
}
{
    p0 = dfcmp.eq(B,C)    // is C also zero?
    AH = lsr(AH,#31)      // get sign
}
{
    BH ^= asl(AH,#31)    // form correctly signed zero in B
    if (!p0) A = C       // If C is not zero, return C
    if (!p0) jumpr r31
}
// B has correctly signed zero, C is also zero
.Lzero_plus_zero:
{
    p0 = cmp.eq(B,C)    // same bit patterns: +0 + +0 or -0 + -0
    if (p0.new) jumpr:t r31
    A = B
}
{
    TMP = USR
}
{
    TMP = extractu(TMP,#2,#SR_ROUND_OFF)
    A = #0
}
{
    p0 = cmp.eq(TMP,#2)
    if (p0.new) AH = ##0x80000000
    jumpr r31
}
#undef BTMP
#undef BTMPH
#undef BTMPL
#define CTMP r11:10
    .falign
.Lfma_abnormal_c:
// We know that A*B is normal * normal
// C is not normal: zero, subnormal, inf, or NaN.
{
    p0 = dfclass(C,#0x10)    // is C NaN?
    if (p0.new) jump:nt .Lnan
    if (p0.new) A = C        // move NaN to A
    deallocframe
}
{
    p0 = dfclass(C,#0x08)    // is C inf?
    if (p0.new) A = C        // return C
    if (p0.new) jumpr:nt r31
}
// zero or subnormal
// If C is zero and we know A*B is normal*normal, we can just tail-call the
// ordinary multiply
{
    p0 = dfclass(C,#0x01)    // is C zero?
    if (p0.new) jump:nt __hexagon_muldf3
    TMP = #1
}
// Left with: subnormal
// Adjust C and jump back to restart
{
    allocframe(#STACKSPACE)    // deallocated above, so re-allocate the frame
    CTMP = #0
    CH = insert(TMP,#EXPBITS,#HI_MANTBITS)
    jump .Lfma_abnormal_c_restart
}
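// A subnormal C has effective exponent 1-BIAS and no hidden bit, so its
// exponent field is forced to 1 while CTMP is left without the hidden-bit
// prefix that the normal path ORs in; the restart then re-unpacks the
// mantissa as usual.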
END(__hexagon_fmadf4)