1/* Compute cubic root of long double value.
2 Copyright (C) 1997-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#include <libm-alias-ldouble.h>
20#include <machine/asm.h>
21
22 .section .rodata
23
24 .align ALIGNARG(4)
25 .type f8,@object
26f8: .quad 0xa57ef3d83a542839 /* 0.161617097923756032 */
27 .short 0x3ffc
28 ASM_SIZE_DIRECTIVE(f8)
29 .align ALIGNARG(4)
30 .type f7,@object
31f7: .quad 0xfd11da7820029014 /* -0.988553671195413709 */
32 .short 0xbffe
33 ASM_SIZE_DIRECTIVE(f7)
34 .align ALIGNARG(4)
35 .type f6,@object
36f6: .quad 0xa9ca93fcade3b4ad /* 2.65298938441952296 */
37 .short 0x4000
38 ASM_SIZE_DIRECTIVE(f6)
39 .align ALIGNARG(4)
40 .type f5,@object
41f5: .quad 0x839186562c931c34 /* -4.11151425200350531 */
42 .short 0xc001
43 ASM_SIZE_DIRECTIVE(f5)
44 .align ALIGNARG(4)
45 .type f4,@object
46f4: .quad 0x830f25c9ee304594 /* 4.09559907378707839 */
47 .short 0x4001
48 ASM_SIZE_DIRECTIVE(f4)
49 .align ALIGNARG(4)
50 .type f3,@object
51f3: .quad 0xb4bedd1d5fa2f0c6 /* -2.82414939754975962 */
52 .short 0xc000
53 ASM_SIZE_DIRECTIVE(f3)
54 .align ALIGNARG(4)
55 .type f2,@object
56f2: .quad 0xd685a163b08586e3 /* 1.67595307700780102 */
57 .short 0x3fff
58 ASM_SIZE_DIRECTIVE(f2)
59 .align ALIGNARG(4)
60 .type f1,@object
61f1: .quad 0xad16073ed4ec3b45 /* 0.338058687610520237 */
62 .short 0x3ffd
63 ASM_SIZE_DIRECTIVE(f1)
64
65 /* We make the entries in the following table all 16 bytes
66 wide to avoid having to implement a multiplication by 10. */
67 .type factor,@object
68 .align ALIGNARG(4)
69factor: /* 1.0 / cbrt (2.0) ^ 2 = 0.629960524947436582364439673883 */
70 .quad 0xa14517cc6b945711
71 .short 0x3ffe
72 .byte 0, 0, 0, 0, 0, 0
73 /* 1.0 / cbrt (2.0) = 0.793700525984099737355196796584 */
74 .quad 0xcb2ff529eb71e415
75 .short 0x3ffe
76 .byte 0, 0, 0, 0, 0, 0
77 /* 1.0L */
78 .quad 0x8000000000000000
79 .short 0x3fff
80 .byte 0, 0, 0, 0, 0, 0
81 /* cbrt (2.0) = 1.2599210498948731648 */
82 .quad 0xa14517cc6b945711
83 .short 0x3fff
84 .byte 0, 0, 0, 0, 0, 0
85 /* cbrt (2.0) ^ 2 = 1.5874010519681994748 */
86 .quad 0xcb2ff529eb71e416
87 .short 0x3fff
88 ASM_SIZE_DIRECTIVE(factor)
89
90 .type two64,@object
91 .align ALIGNARG(4)
92two64: .byte 0, 0, 0, 0, 0, 0, 0xf0, 0x43
93 ASM_SIZE_DIRECTIVE(two64)
94
95#ifdef PIC
96#define MO(op) op##@GOTOFF(%ebx)
97#define MOX(op,x) op##@GOTOFF(%ebx,x,1)
98#else
99#define MO(op) op
100#define MOX(op,x) op(x)
101#endif
102
103 .text
104ENTRY(__cbrtl)
105 movl 4(%esp), %ecx
106 movl 12(%esp), %eax
107 orl 8(%esp), %ecx
108 movl %eax, %edx
109 andl $0x7fff, %eax
110 orl %eax, %ecx
111 jz 1f
112 xorl %ecx, %ecx
113 cmpl $0x7fff, %eax
114 je 1f
115
116#ifdef PIC
117 pushl %ebx
118 cfi_adjust_cfa_offset (4)
119 cfi_rel_offset (ebx, 0)
120 LOAD_PIC_REG (bx)
121#endif
122
123 cmpl $0, %eax
124 jne 2f
125
126#ifdef PIC
127 fldt 8(%esp)
128#else
129 fldt 4(%esp)
130#endif
131 fmull MO(two64)
132 movl $-64, %ecx
133#ifdef PIC
134 fstpt 8(%esp)
135 movl 16(%esp), %eax
136#else
137 fstpt 4(%esp)
138 movl 12(%esp), %eax
139#endif
140 movl %eax, %edx
141 andl $0x7fff, %eax
142
1432: andl $0x8000, %edx
144 subl $16382, %eax
145 orl $0x3ffe, %edx
146 addl %eax, %ecx
147#ifdef PIC
148 movl %edx, 16(%esp)
149
150 fldt 8(%esp) /* xm */
151#else
152 movl %edx, 12(%esp)
153
154 fldt 4(%esp) /* xm */
155#endif
156 fabs
157
158 /* The following code has two tracks:
159 a) compute the normalized cbrt value
160 b) compute xe/3 and xe%3
161 The right track computes the value for b) and this is done
162 in an optimized way by avoiding division.
163
164 But why two tracks at all? Very easy: efficiency. Some FP
165 instruction can overlap with a certain amount of integer (and
166 FP) instructions. So we get (except for the imull) all
167 instructions for free. */
168
169 fldt MO(f8) /* f8 : xm */
170 fmul %st(1) /* f8*xm : xm */
171
172 fldt MO(f7)
173 faddp /* f7+f8*xm : xm */
174 fmul %st(1) /* (f7+f8*xm)*xm : xm */
175 movl $1431655766, %eax
176 fldt MO(f6)
177 faddp /* f6+(f7+f8*xm)*xm : xm */
178 imull %ecx
179 fmul %st(1) /* (f6+(f7+f8*xm)*xm)*xm : xm */
180 movl %ecx, %eax
181 fldt MO(f5)
182 faddp /* f5+(f6+(f7+f8*xm)*xm)*xm : xm */
183 sarl $31, %eax
184 fmul %st(1) /* (f5+(f6+(f7+f8*xm)*xm)*xm)*xm : xm */
185 subl %eax, %edx
186 fldt MO(f4)
187 faddp /* f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm : xm */
188 fmul %st(1) /* (f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm : xm */
189 fldt MO(f3)
190 faddp /* f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm : xm */
191 fmul %st(1) /* (f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm : xm */
192 fldt MO(f2)
193 faddp /* f2+(f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm : xm */
194 fmul %st(1) /* (f2+(f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm)*xm : xm */
195 fldt MO(f1)
196 faddp /* u:=f1+(f2+(f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm)*xm : xm */
197
198 fld %st /* u : u : xm */
199 fmul %st(1) /* u*u : u : xm */
200 fld %st(2) /* xm : u*u : u : xm */
201 fadd %st /* 2*xm : u*u : u : xm */
202 fxch %st(1) /* u*u : 2*xm : u : xm */
203 fmul %st(2) /* t2:=u*u*u : 2*xm : u : xm */
204 movl %edx, %eax
205 fadd %st, %st(1) /* t2 : t2+2*xm : u : xm */
206 leal (%edx,%edx,2),%edx
207 fadd %st(0) /* 2*t2 : t2+2*xm : u : xm */
208 subl %edx, %ecx
209 faddp %st, %st(3) /* t2+2*xm : u : 2*t2+xm */
210 shll $4, %ecx
211 fmulp /* u*(t2+2*xm) : 2*t2+xm */
212 fdivp %st, %st(1) /* u*(t2+2*xm)/(2*t2+xm) */
213 fldt MOX(32+factor,%ecx)
214 fmulp /* u*(t2+2*xm)/(2*t2+xm)*FACT */
215 pushl %eax
216 cfi_adjust_cfa_offset (4)
217 fildl (%esp) /* xe/3 : u*(t2+2*xm)/(2*t2+xm)*FACT */
218 fxch /* u*(t2+2*xm)/(2*t2+xm)*FACT : xe/3 */
219 fscale /* u*(t2+2*xm)/(2*t2+xm)*FACT*2^xe/3 */
220 popl %edx
221 cfi_adjust_cfa_offset (-4)
222#ifdef PIC
223 movl 16(%esp), %eax
224 popl %ebx
225 cfi_adjust_cfa_offset (-4)
226 cfi_restore (ebx)
227#else
228 movl 12(%esp), %eax
229#endif
230 testl $0x8000, %eax
231 fstp %st(1)
232 jz 4f
233 fchs
2344: ret
235
236 /* Return the argument. */
2371: fldt 4(%esp)
238 fadd %st
239 ret
240END(__cbrtl)
241libm_alias_ldouble (__cbrt, cbrt)
242

source code of glibc/sysdeps/i386/fpu/s_cbrtl.S