1 | /* Compute cubic root of long double value. |
2 | Copyright (C) 1997-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <libm-alias-ldouble.h> |
20 | #include <machine/asm.h> |
21 | |
22 | .section .rodata |
23 | |
24 | .align ALIGNARG(4) |
25 | .type f8,@object |
26 | f8: .quad 0xa57ef3d83a542839 /* 0.161617097923756032 */ |
27 | .short 0x3ffc |
28 | ASM_SIZE_DIRECTIVE(f8) |
29 | .align ALIGNARG(4) |
30 | .type f7,@object |
31 | f7: .quad 0xfd11da7820029014 /* -0.988553671195413709 */ |
32 | .short 0xbffe |
33 | ASM_SIZE_DIRECTIVE(f7) |
34 | .align ALIGNARG(4) |
35 | .type f6,@object |
36 | f6: .quad 0xa9ca93fcade3b4ad /* 2.65298938441952296 */ |
37 | .short 0x4000 |
38 | ASM_SIZE_DIRECTIVE(f6) |
39 | .align ALIGNARG(4) |
40 | .type f5,@object |
41 | f5: .quad 0x839186562c931c34 /* -4.11151425200350531 */ |
42 | .short 0xc001 |
43 | ASM_SIZE_DIRECTIVE(f5) |
44 | .align ALIGNARG(4) |
45 | .type f4,@object |
46 | f4: .quad 0x830f25c9ee304594 /* 4.09559907378707839 */ |
47 | .short 0x4001 |
48 | ASM_SIZE_DIRECTIVE(f4) |
49 | .align ALIGNARG(4) |
50 | .type f3,@object |
51 | f3: .quad 0xb4bedd1d5fa2f0c6 /* -2.82414939754975962 */ |
52 | .short 0xc000 |
53 | ASM_SIZE_DIRECTIVE(f3) |
54 | .align ALIGNARG(4) |
55 | .type f2,@object |
56 | f2: .quad 0xd685a163b08586e3 /* 1.67595307700780102 */ |
57 | .short 0x3fff |
58 | ASM_SIZE_DIRECTIVE(f2) |
59 | .align ALIGNARG(4) |
60 | .type f1,@object |
61 | f1: .quad 0xad16073ed4ec3b45 /* 0.338058687610520237 */ |
62 | .short 0x3ffd |
63 | ASM_SIZE_DIRECTIVE(f1) |
64 | |
65 | /* We make the entries in the following table all 16 bytes |
66 | wide to avoid having to implement a multiplication by 10. */ |
67 | .type factor,@object |
68 | .align ALIGNARG(4) |
69 | factor: /* 1.0 / cbrt (2.0) ^ 2 = 0.629960524947436582364439673883 */ |
70 | .quad 0xa14517cc6b945711 |
71 | .short 0x3ffe |
72 | .byte 0, 0, 0, 0, 0, 0 |
73 | /* 1.0 / cbrt (2.0) = 0.793700525984099737355196796584 */ |
74 | .quad 0xcb2ff529eb71e415 |
75 | .short 0x3ffe |
76 | .byte 0, 0, 0, 0, 0, 0 |
77 | /* 1.0L */ |
78 | .quad 0x8000000000000000 |
79 | .short 0x3fff |
80 | .byte 0, 0, 0, 0, 0, 0 |
81 | /* cbrt (2.0) = 1.2599210498948731648 */ |
82 | .quad 0xa14517cc6b945711 |
83 | .short 0x3fff |
84 | .byte 0, 0, 0, 0, 0, 0 |
85 | /* cbrt (2.0) ^ 2 = 1.5874010519681994748 */ |
86 | .quad 0xcb2ff529eb71e416 |
87 | .short 0x3fff |
88 | ASM_SIZE_DIRECTIVE(factor) |
89 | |
90 | .type two64,@object |
91 | .align ALIGNARG(4) |
92 | two64: .byte 0, 0, 0, 0, 0, 0, 0xf0, 0x43 |
93 | ASM_SIZE_DIRECTIVE(two64) |
94 | |
95 | #ifdef PIC |
96 | #define MO(op) op##@GOTOFF(%ebx) |
97 | #define MOX(op,x) op##@GOTOFF(%ebx,x,1) |
98 | #else |
99 | #define MO(op) op |
100 | #define MOX(op,x) op(x) |
101 | #endif |
102 | |
103 | .text |
104 | ENTRY(__cbrtl) |
105 | movl 4(%esp), %ecx |
106 | movl 12(%esp), %eax |
107 | orl 8(%esp), %ecx |
108 | movl %eax, %edx |
109 | andl $0x7fff, %eax |
110 | orl %eax, %ecx |
111 | jz 1f |
112 | xorl %ecx, %ecx |
113 | cmpl $0x7fff, %eax |
114 | je 1f |
115 | |
116 | #ifdef PIC |
117 | pushl %ebx |
118 | cfi_adjust_cfa_offset (4) |
119 | cfi_rel_offset (ebx, 0) |
120 | LOAD_PIC_REG (bx) |
121 | #endif |
122 | |
123 | cmpl $0, %eax |
124 | jne 2f |
125 | |
126 | #ifdef PIC |
127 | fldt 8(%esp) |
128 | #else |
129 | fldt 4(%esp) |
130 | #endif |
131 | fmull MO(two64) |
132 | movl $-64, %ecx |
133 | #ifdef PIC |
134 | fstpt 8(%esp) |
135 | movl 16(%esp), %eax |
136 | #else |
137 | fstpt 4(%esp) |
138 | movl 12(%esp), %eax |
139 | #endif |
140 | movl %eax, %edx |
141 | andl $0x7fff, %eax |
142 | |
143 | 2: andl $0x8000, %edx |
144 | subl $16382, %eax |
145 | orl $0x3ffe, %edx |
146 | addl %eax, %ecx |
147 | #ifdef PIC |
148 | movl %edx, 16(%esp) |
149 | |
150 | fldt 8(%esp) /* xm */ |
151 | #else |
152 | movl %edx, 12(%esp) |
153 | |
154 | fldt 4(%esp) /* xm */ |
155 | #endif |
156 | fabs |
157 | |
158 | /* The following code has two tracks: |
159 | a) compute the normalized cbrt value |
160 | b) compute xe/3 and xe%3 |
161 | The right track computes the value for b) and this is done |
162 | in an optimized way by avoiding division. |
163 | |
164 | But why two tracks at all? Very easy: efficiency. Some FP |
165 | instruction can overlap with a certain amount of integer (and |
166 | FP) instructions. So we get (except for the imull) all |
167 | instructions for free. */ |
168 | |
169 | fldt MO(f8) /* f8 : xm */ |
170 | fmul %st(1) /* f8*xm : xm */ |
171 | |
172 | fldt MO(f7) |
173 | faddp /* f7+f8*xm : xm */ |
174 | fmul %st(1) /* (f7+f8*xm)*xm : xm */ |
175 | movl $1431655766, %eax |
176 | fldt MO(f6) |
177 | faddp /* f6+(f7+f8*xm)*xm : xm */ |
178 | imull %ecx |
179 | fmul %st(1) /* (f6+(f7+f8*xm)*xm)*xm : xm */ |
180 | movl %ecx, %eax |
181 | fldt MO(f5) |
182 | faddp /* f5+(f6+(f7+f8*xm)*xm)*xm : xm */ |
183 | sarl $31, %eax |
184 | fmul %st(1) /* (f5+(f6+(f7+f8*xm)*xm)*xm)*xm : xm */ |
185 | subl %eax, %edx |
186 | fldt MO(f4) |
187 | faddp /* f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm : xm */ |
188 | fmul %st(1) /* (f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm : xm */ |
189 | fldt MO(f3) |
190 | faddp /* f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm : xm */ |
191 | fmul %st(1) /* (f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm : xm */ |
192 | fldt MO(f2) |
193 | faddp /* f2+(f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm : xm */ |
194 | fmul %st(1) /* (f2+(f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm)*xm : xm */ |
195 | fldt MO(f1) |
196 | faddp /* u:=f1+(f2+(f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm)*xm : xm */ |
197 | |
198 | fld %st /* u : u : xm */ |
199 | fmul %st(1) /* u*u : u : xm */ |
200 | fld %st(2) /* xm : u*u : u : xm */ |
201 | fadd %st /* 2*xm : u*u : u : xm */ |
202 | fxch %st(1) /* u*u : 2*xm : u : xm */ |
203 | fmul %st(2) /* t2:=u*u*u : 2*xm : u : xm */ |
204 | movl %edx, %eax |
205 | fadd %st, %st(1) /* t2 : t2+2*xm : u : xm */ |
206 | leal (%edx,%edx,2),%edx |
207 | fadd %st(0) /* 2*t2 : t2+2*xm : u : xm */ |
208 | subl %edx, %ecx |
209 | faddp %st, %st(3) /* t2+2*xm : u : 2*t2+xm */ |
210 | shll $4, %ecx |
211 | fmulp /* u*(t2+2*xm) : 2*t2+xm */ |
212 | fdivp %st, %st(1) /* u*(t2+2*xm)/(2*t2+xm) */ |
213 | fldt MOX(32+factor,%ecx) |
214 | fmulp /* u*(t2+2*xm)/(2*t2+xm)*FACT */ |
215 | pushl %eax |
216 | cfi_adjust_cfa_offset (4) |
217 | fildl (%esp) /* xe/3 : u*(t2+2*xm)/(2*t2+xm)*FACT */ |
218 | fxch /* u*(t2+2*xm)/(2*t2+xm)*FACT : xe/3 */ |
219 | fscale /* u*(t2+2*xm)/(2*t2+xm)*FACT*2^xe/3 */ |
220 | popl %edx |
221 | cfi_adjust_cfa_offset (-4) |
222 | #ifdef PIC |
223 | movl 16(%esp), %eax |
224 | popl %ebx |
225 | cfi_adjust_cfa_offset (-4) |
226 | cfi_restore (ebx) |
227 | #else |
228 | movl 12(%esp), %eax |
229 | #endif |
230 | testl $0x8000, %eax |
231 | fstp %st(1) |
232 | jz 4f |
233 | fchs |
234 | 4: ret |
235 | |
236 | /* Return the argument. */ |
237 | 1: fldt 4(%esp) |
238 | fadd %st |
239 | ret |
240 | END(__cbrtl) |
241 | libm_alias_ldouble (__cbrt, cbrt) |
242 | |