floatundisf.S source code [compiler-rt/lib/builtins/i386/floatundisf.S]

1	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
2	// See https://llvm.org/LICENSE.txt for license information.
3	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4
5	#include "../assembly.h"
6
7	// float __floatundisf(du_int a);
8
9	// Note that there is a hardware instruction, fildll, that does most of what
10	// this function needs to do. However, because of our ia32 ABI, it will take
11	// a write-small read-large stall, so the software implementation here is
12	// actually several cycles faster.
13
14	// This is a branch-free implementation. A branchy implementation might be
15	// faster for the common case if you know something a priori about the input
16	// distribution.
17
18	/* branch-free x87 implementation - one cycle slower than without x87.
19
20	#ifdef __i386__
21
22	CONST_SECTION
23	.balign 3
24
25	.quad 0x43f0000000000000
26	twop64: .quad 0x0000000000000000
27
28	#define TWOp64 twop64-0b(%ecx,%eax,8)
29
30	.text
31	.balign 4
32	DEFINE_COMPILERRT_FUNCTION(__floatundisf)
33	movl 8(%esp), %eax
34	movd 8(%esp), %xmm1
35	movd 4(%esp), %xmm0
36	punpckldq %xmm1, %xmm0
37	calll 0f
38	0: popl %ecx
39	sarl $31, %eax
40	movq %xmm0, 4(%esp)
41	fildll 4(%esp)
42	faddl TWOp64
43	fstps 4(%esp)
44	flds 4(%esp)
45	ret
46	END_COMPILERRT_FUNCTION(__floatundisf)
47
48	#endif // __i386__
49
50	*/
51
52	// branch-free, x87-free implementation - faster at the expense of code size
53
#ifdef __i386__

CONST_SECTION

// Constant pool, addressed relative to local label 0 through %ecx (see the
// calll/popl pair in the function body).
//
// Layout matters: STICKY is indexed by %eax, which is -1 or 0, scaled by 8.
// twop52 is 16-byte aligned and exactly 16 bytes long, so the .balign before
// sticky inserts no padding and the quad just below sticky (the second quad
// of twop52) is the 0xfff mask, while the quad at sticky itself is zero.
	.balign 16
twop52:
	.quad 0x4330000000000000	// bit pattern of the double 0x1.0p52
	.quad 0x0000000000000fff	// sticky[-1]: mask of the 12 bits shifted out

	.balign 16
sticky:
	.quad 0x0000000000000000	// sticky[0]: empty mask for small inputs
	.long 0x00000012		// appears unused: nothing in this block reads it

	.balign 16
twelve:
	.long 0x00000000		// appears unused: the code uses an immediate $12

#define TWOp52 twop52-0b(%ecx)
#define STICKY sticky-0b(%ecx,%eax,8)

	.text
	.balign 4

// float __floatundisf(du_int a);
//
// In:    a on the stack, low word at 4(%esp), high word at 8(%esp).
// Out:   (float)a in st(0).
// Uses:  %eax, %ecx, %edx, %xmm0-%xmm3, flags. The argument slot at 4(%esp)
//        is overwritten as scratch for the final x87 load.
//
// Method: an input is "big" when it is >= 2^52, i.e. it no longer fits in the
// 52-bit fraction field of a double. Small inputs are converted exactly by
// OR-ing them into the fraction of 0x1.0p52 and subtracting 0x1.0p52. Big
// inputs are first shifted right by 12, with the 12 discarded bits OR-ed back
// into the low end so they still influence the single rounding performed by
// cvtsd2ss, and the result is rescaled by 2^12 by adding 12 to the exponent
// field of the float.
DEFINE_COMPILERRT_FUNCTION(__floatundisf)
	movl		8(%esp),	%eax	// high word of input
	movd		8(%esp),	%xmm1
	movd		4(%esp),	%xmm0
	punpckldq	%xmm1,		%xmm0	// xmm0 = 64-bit input

	calll		0f			// position-independent addressing:
0:	popl		%ecx			// ecx = address of label 0
	shrl		%eax			// high 31 bits of input as sint32
	addl		$0x7ff80000,	%eax	// sign bit set iff input >= 2^52
	sarl		$31,		%eax	// (big input) ? -1 : 0
	movsd		STICKY,		%xmm1	// (big input) ? 0xfff : 0
	movl		$12,		%edx
	andl		%eax,		%edx	// (big input) ? 12 : 0
	movd		%edx,		%xmm3
	andpd		%xmm0,		%xmm1	// (big input) ? input & 0xfff : 0
	movsd		TWOp52,		%xmm2	// 0x1.0p52
	psrlq		%xmm3,		%xmm0	// (big input) ? input >> 12 : input
	orpd		%xmm2,		%xmm1	// 0x1.0p52 + ((big input) ? input & 0xfff : 0)
	orpd		%xmm1,		%xmm0	// 0x1.0p52 + ((big input) ? (input >> 12 | input & 0xfff) : input)
	subsd		%xmm2,		%xmm0	// (double)((big input) ? (input >> 12 | input & 0xfff) : input)
	cvtsd2ss	%xmm0,		%xmm0	// (float)((big input) ? (input >> 12 | input & 0xfff) : input)
	pslld		$23,		%xmm3	// ((big input) ? 12 : 0) moved to the float exponent field
	paddd		%xmm3,		%xmm0	// (float)input
	movd		%xmm0,		4(%esp)	// spill so the result can be loaded onto the x87 stack
	flds		4(%esp)			// return value in st(0)
	ret
END_COMPILERRT_FUNCTION(__floatundisf)

#endif // __i386__

NO_EXEC_STACK_DIRECTIVE
109
110

source code of compiler-rt/lib/builtins/i386/floatundisf.S