s_cbrtl.S source code [glibc/sysdeps/i386/fpu/s_cbrtl.S]

1	/ Compute cubic root of long double value.*
2	Copyright (C) 1997-2024 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4
5	The GNU C Library is free software; you can redistribute it and/or
6	modify it under the terms of the GNU Lesser General Public
7	License as published by the Free Software Foundation; either
8	version 2.1 of the License, or (at your option) any later version.
9
10	The GNU C Library is distributed in the hope that it will be useful,
11	but WITHOUT ANY WARRANTY; without even the implied warranty of
12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13	Lesser General Public License for more details.
14
15	You should have received a copy of the GNU Lesser General Public
16	License along with the GNU C Library; if not, see
17	<https://www.gnu.org/licenses/>. /*
18
19	#include <libm-alias-ldouble.h>
20	#include <machine/asm.h>
21
22	.section .rodata
23
24	.align ALIGNARG(`4`)
25	.type f8,@object
26	f8: .quad `0xa57ef3d83a542839` / 0.161617097923756032 /
27	.short `0x3ffc`
28	ASM_SIZE_DIRECTIVE(f8)
29	.align ALIGNARG(`4`)
30	.type f7,@object
31	f7: .quad `0xfd11da7820029014` / -0.988553671195413709 /
32	.short `0xbffe`
33	ASM_SIZE_DIRECTIVE(f7)
34	.align ALIGNARG(`4`)
35	.type f6,@object
36	f6: .quad `0xa9ca93fcade3b4ad` / 2.65298938441952296 /
37	.short `0x4000`
38	ASM_SIZE_DIRECTIVE(f6)
39	.align ALIGNARG(`4`)
40	.type f5,@object
41	f5: .quad `0x839186562c931c34` / -4.11151425200350531 /
42	.short `0xc001`
43	ASM_SIZE_DIRECTIVE(f5)
44	.align ALIGNARG(`4`)
45	.type f4,@object
46	f4: .quad `0x830f25c9ee304594` / 4.09559907378707839 /
47	.short `0x4001`
48	ASM_SIZE_DIRECTIVE(f4)
49	.align ALIGNARG(`4`)
50	.type f3,@object
51	f3: .quad `0xb4bedd1d5fa2f0c6` / -2.82414939754975962 /
52	.short `0xc000`
53	ASM_SIZE_DIRECTIVE(f3)
54	.align ALIGNARG(`4`)
55	.type f2,@object
56	f2: .quad `0xd685a163b08586e3` / 1.67595307700780102 /
57	.short `0x3fff`
58	ASM_SIZE_DIRECTIVE(f2)
59	.align ALIGNARG(`4`)
60	.type f1,@object
61	f1: .quad `0xad16073ed4ec3b45` / 0.338058687610520237 /
62	.short `0x3ffd`
63	ASM_SIZE_DIRECTIVE(f1)
64
65	/ We make the entries in the following table all 16 bytes*
66	wide to avoid having to implement a multiplication by 10. /*
67	.type factor,@object
68	.align ALIGNARG(`4`)
69	factor: / 1.0 / cbrt (2.0) ^ 2 = 0.629960524947436582364439673883 /
70	.quad `0xa14517cc6b945711`
71	.short `0x3ffe`
72	.byte `0`, `0`, `0`, `0`, `0`, `0`
73	/ 1.0 / cbrt (2.0) = 0.793700525984099737355196796584 /
74	.quad `0xcb2ff529eb71e415`
75	.short `0x3ffe`
76	.byte `0`, `0`, `0`, `0`, `0`, `0`
77	/ 1.0L /
78	.quad `0x8000000000000000`
79	.short `0x3fff`
80	.byte `0`, `0`, `0`, `0`, `0`, `0`
81	/ cbrt (2.0) = 1.2599210498948731648 /
82	.quad `0xa14517cc6b945711`
83	.short `0x3fff`
84	.byte `0`, `0`, `0`, `0`, `0`, `0`
85	/ cbrt (2.0) ^ 2 = 1.5874010519681994748 /
86	.quad `0xcb2ff529eb71e416`
87	.short `0x3fff`
88	ASM_SIZE_DIRECTIVE(factor)
89
90	.type two64,@object
91	.align ALIGNARG(`4`)
92	two64: .byte `0`, `0`, `0`, `0`, `0`, `0`, `0xf0`, `0x43`
93	ASM_SIZE_DIRECTIVE(two64)
94
95	#ifdef PIC
96	#define MO(op) op##@GOTOFF(%ebx)
97	#define MOX(op,x) op##@GOTOFF(%ebx,x,1)
98	#else
99	#define MO(op) op
100	#define MOX(op,x) op(x)
101	#endif
102
103	.text
104	ENTRY(__cbrtl)
105	movl `4`(%esp), %ecx
106	movl `12`(%esp), %eax
107	orl `8`(%esp), %ecx
108	movl %eax, %edx
109	andl $`0x7fff`, %eax
110	orl %eax, %ecx
111	jz `1f`
112	xorl %ecx, %ecx
113	cmpl $`0x7fff`, %eax
114	je `1f`
115
116	#ifdef PIC
117	pushl %ebx
118	cfi_adjust_cfa_offset (`4`)
119	cfi_rel_offset (ebx, `0`)
120	LOAD_PIC_REG (bx)
121	#endif
122
123	cmpl $`0`, %eax
124	jne `2f`
125
126	#ifdef PIC
127	fldt `8`(%esp)
128	#else
129	fldt `4`(%esp)
130	#endif
131	fmull MO(two64)
132	movl $-`64`, %ecx
133	#ifdef PIC
134	fstpt `8`(%esp)
135	movl `16`(%esp), %eax
136	#else
137	fstpt `4`(%esp)
138	movl `12`(%esp), %eax
139	#endif
140	movl %eax, %edx
141	andl $`0x7fff`, %eax
142
143	`2`: andl $`0x8000`, %edx
144	subl $`16382`, %eax
145	orl $`0x3ffe`, %edx
146	addl %eax, %ecx
147	#ifdef PIC
148	movl %edx, `16`(%esp)
149
150	fldt `8`(%esp) / xm /
151	#else
152	movl %edx, `12`(%esp)
153
154	fldt `4`(%esp) / xm /
155	#endif
156	fabs
157
158	/ The following code has two tracks:*
159	a) compute the normalized cbrt value
160	b) compute xe/3 and xe%3
161	The right track computes the value for b) and this is done
162	in an optimized way by avoiding division.
163
164	But why two tracks at all? Very easy: efficiency. Some FP
165	instruction can overlap with a certain amount of integer (and
166	FP) instructions. So we get (except for the imull) all
167	instructions for free. /*
168
169	fldt MO(f8) / f8 : xm /
170	fmul %st(`1`) / f8xm : xm /*
171
172	fldt MO(f7)
173	faddp / f7+f8xm : xm /*
174	fmul %st(`1`) / (f7+f8xm)xm : xm /
175	movl $`1431655766`, %eax
176	fldt MO(f6)
177	faddp / f6+(f7+f8xm)xm : xm /
178	imull %ecx
179	fmul %st(`1`) / (f6+(f7+f8xm)xm)xm : xm /*
180	movl %ecx, %eax
181	fldt MO(f5)
182	faddp / f5+(f6+(f7+f8xm)xm)xm : xm /*
183	sarl $`31`, %eax
184	fmul %st(`1`) / (f5+(f6+(f7+f8xm)xm)xm)xm : xm /
185	subl %eax, %edx
186	fldt MO(f4)
187	faddp / f4+(f5+(f6+(f7+f8xm)xm)xm)xm : xm /
188	fmul %st(`1`) / (f4+(f5+(f6+(f7+f8xm)xm)xm)xm)xm : xm /*
189	fldt MO(f3)
190	faddp / f3+(f4+(f5+(f6+(f7+f8xm)xm)xm)xm)xm : xm /*
191	fmul %st(`1`) / (f3+(f4+(f5+(f6+(f7+f8xm)xm)xm)xm)xm)xm : xm /
192	fldt MO(f2)
193	faddp / f2+(f3+(f4+(f5+(f6+(f7+f8xm)xm)xm)xm)xm)xm : xm /
194	fmul %st(`1`) / (f2+(f3+(f4+(f5+(f6+(f7+f8xm)xm)xm)xm)xm)xm)xm : xm /*
195	fldt MO(f1)
196	faddp / u:=f1+(f2+(f3+(f4+(f5+(f6+(f7+f8xm)xm)xm)xm)xm)xm)xm : xm /*
197
198	fld %st / u : u : xm /
199	fmul %st(`1`) / uu : u : xm /*
200	fld %st(`2`) / xm : uu : u : xm /*
201	fadd %st / 2xm : uu : u : xm /
202	fxch %st(`1`) / uu : 2xm : u : xm /
203	fmul %st(`2`) / t2:=uuu : 2xm : u : xm /*
204	movl %edx, %eax
205	fadd %st, %st(`1`) / t2 : t2+2xm : u : xm /*
206	leal (%edx,%edx,`2`),%edx
207	fadd %st(`0`) / 2t2 : t2+2xm : u : xm /
208	subl %edx, %ecx
209	faddp %st, %st(`3`) / t2+2xm : u : 2t2+xm /
210	shll $`4`, %ecx
211	fmulp / u(t2+2xm) : 2t2+xm /*
212	fdivp %st, %st(`1`) / u(t2+2xm)/(2t2+xm) /*
213	fldt MOX(`32`+factor,%ecx)
214	fmulp / u(t2+2xm)/(2t2+xm)FACT /
215	pushl %eax
216	cfi_adjust_cfa_offset (`4`)
217	fildl (%esp) / xe/3 : u(t2+2xm)/(2t2+xm)FACT /
218	fxch / u(t2+2xm)/(2t2+xm)FACT : xe/3 /
219	fscale / u(t2+2xm)/(2t2+xm)FACT2^xe/3 /*
220	popl %edx
221	cfi_adjust_cfa_offset (-`4`)
222	#ifdef PIC
223	movl `16`(%esp), %eax
224	popl %ebx
225	cfi_adjust_cfa_offset (-`4`)
226	cfi_restore (ebx)
227	#else
228	movl `12`(%esp), %eax
229	#endif
230	testl $`0x8000`, %eax
231	fstp %st(`1`)
232	jz `4f`
233	fchs
234	`4`: ret
235
236	/ Return the argument. /
237	`1`: fldt `4`(%esp)
238	fadd %st
239	ret
240	END(__cbrtl)
241	libm_alias_ldouble (__cbrt, cbrt)
242

source code of glibc/sysdeps/i386/fpu/s_cbrtl.S