1 | // (C) Copyright John Maddock 2006. |
2 | // Use, modification and distribution are subject to the |
3 | // Boost Software License, Version 1.0. (See accompanying file |
4 | // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) |
5 | |
6 | #ifndef BOOST_MATH_SPECIAL_FUNCTIONS_LANCZOS_SSE2 |
7 | #define BOOST_MATH_SPECIAL_FUNCTIONS_LANCZOS_SSE2 |
8 | |
9 | #ifdef _MSC_VER |
10 | #pragma once |
11 | #endif |
12 | |
13 | #include <emmintrin.h> |
14 | |
15 | #if defined(__GNUC__) || defined(__PGI) || defined(__SUNPRO_CC) |
16 | #define ALIGN16 __attribute__((__aligned__(16))) |
17 | #else |
18 | #define ALIGN16 __declspec(align(16)) |
19 | #endif |
20 | |
21 | namespace boost{ namespace math{ namespace lanczos{ |
22 | |
23 | template <> |
24 | inline double lanczos13m53::lanczos_sum<double>(const double& x) |
25 | { |
26 | static const ALIGN16 double coeff[26] = { |
27 | static_cast<double>(2.506628274631000270164908177133837338626L), |
28 | static_cast<double>(1u), |
29 | static_cast<double>(210.8242777515793458725097339207133627117L), |
30 | static_cast<double>(66u), |
31 | static_cast<double>(8071.672002365816210638002902272250613822L), |
32 | static_cast<double>(1925u), |
33 | static_cast<double>(186056.2653952234950402949897160456992822L), |
34 | static_cast<double>(32670u), |
35 | static_cast<double>(2876370.628935372441225409051620849613599L), |
36 | static_cast<double>(357423u), |
37 | static_cast<double>(31426415.58540019438061423162831820536287L), |
38 | static_cast<double>(2637558u), |
39 | static_cast<double>(248874557.8620541565114603864132294232163L), |
40 | static_cast<double>(13339535u), |
41 | static_cast<double>(1439720407.311721673663223072794912393972L), |
42 | static_cast<double>(45995730u), |
43 | static_cast<double>(6039542586.35202800506429164430729792107L), |
44 | static_cast<double>(105258076u), |
45 | static_cast<double>(17921034426.03720969991975575445893111267L), |
46 | static_cast<double>(150917976u), |
47 | static_cast<double>(35711959237.35566804944018545154716670596L), |
48 | static_cast<double>(120543840u), |
49 | static_cast<double>(42919803642.64909876895789904700198885093L), |
50 | static_cast<double>(39916800u), |
51 | static_cast<double>(23531376880.41075968857200767445163675473L), |
52 | static_cast<double>(0u) |
53 | }; |
54 | |
55 | static const double lim = 4.31965e+25; // By experiment, the largest x for which the SSE2 code does not go bad. |
56 | |
57 | if (x > lim) |
58 | { |
59 | double z = 1 / x; |
60 | return ((((((((((((coeff[24] * z + coeff[22]) * z + coeff[20]) * z + coeff[18]) * z + coeff[16]) * z + coeff[14]) * z + coeff[12]) * z + coeff[10]) * z + coeff[8]) * z + coeff[6]) * z + coeff[4]) * z + coeff[2]) * z + coeff[0]) / ((((((((((((coeff[25] * z + coeff[23]) * z + coeff[21]) * z + coeff[19]) * z + coeff[17]) * z + coeff[15]) * z + coeff[13]) * z + coeff[11]) * z + coeff[9]) * z + coeff[7]) * z + coeff[5]) * z + coeff[3]) * z + coeff[1]); |
61 | } |
62 | |
63 | __m128d vx = _mm_load1_pd(dp: &x); |
64 | __m128d sum_even = _mm_load_pd(dp: coeff); |
65 | __m128d sum_odd = _mm_load_pd(dp: coeff+2); |
66 | __m128d nc_odd, nc_even; |
67 | __m128d vx2 = _mm_mul_pd(a: vx, b: vx); |
68 | |
69 | sum_even = _mm_mul_pd(a: sum_even, b: vx2); |
70 | nc_even = _mm_load_pd(dp: coeff + 4); |
71 | sum_odd = _mm_mul_pd(a: sum_odd, b: vx2); |
72 | nc_odd = _mm_load_pd(dp: coeff + 6); |
73 | sum_even = _mm_add_pd(a: sum_even, b: nc_even); |
74 | sum_odd = _mm_add_pd(a: sum_odd, b: nc_odd); |
75 | |
76 | sum_even = _mm_mul_pd(a: sum_even, b: vx2); |
77 | nc_even = _mm_load_pd(dp: coeff + 8); |
78 | sum_odd = _mm_mul_pd(a: sum_odd, b: vx2); |
79 | nc_odd = _mm_load_pd(dp: coeff + 10); |
80 | sum_even = _mm_add_pd(a: sum_even, b: nc_even); |
81 | sum_odd = _mm_add_pd(a: sum_odd, b: nc_odd); |
82 | |
83 | sum_even = _mm_mul_pd(a: sum_even, b: vx2); |
84 | nc_even = _mm_load_pd(dp: coeff + 12); |
85 | sum_odd = _mm_mul_pd(a: sum_odd, b: vx2); |
86 | nc_odd = _mm_load_pd(dp: coeff + 14); |
87 | sum_even = _mm_add_pd(a: sum_even, b: nc_even); |
88 | sum_odd = _mm_add_pd(a: sum_odd, b: nc_odd); |
89 | |
90 | sum_even = _mm_mul_pd(a: sum_even, b: vx2); |
91 | nc_even = _mm_load_pd(dp: coeff + 16); |
92 | sum_odd = _mm_mul_pd(a: sum_odd, b: vx2); |
93 | nc_odd = _mm_load_pd(dp: coeff + 18); |
94 | sum_even = _mm_add_pd(a: sum_even, b: nc_even); |
95 | sum_odd = _mm_add_pd(a: sum_odd, b: nc_odd); |
96 | |
97 | sum_even = _mm_mul_pd(a: sum_even, b: vx2); |
98 | nc_even = _mm_load_pd(dp: coeff + 20); |
99 | sum_odd = _mm_mul_pd(a: sum_odd, b: vx2); |
100 | nc_odd = _mm_load_pd(dp: coeff + 22); |
101 | sum_even = _mm_add_pd(a: sum_even, b: nc_even); |
102 | sum_odd = _mm_add_pd(a: sum_odd, b: nc_odd); |
103 | |
104 | sum_even = _mm_mul_pd(a: sum_even, b: vx2); |
105 | nc_even = _mm_load_pd(dp: coeff + 24); |
106 | sum_odd = _mm_mul_pd(a: sum_odd, b: vx); |
107 | sum_even = _mm_add_pd(a: sum_even, b: nc_even); |
108 | sum_even = _mm_add_pd(a: sum_even, b: sum_odd); |
109 | |
110 | |
111 | double ALIGN16 t[2]; |
112 | _mm_store_pd(dp: t, a: sum_even); |
113 | |
114 | return t[0] / t[1]; |
115 | } |
116 | |
117 | template <> |
118 | inline double lanczos13m53::lanczos_sum_expG_scaled<double>(const double& x) |
119 | { |
120 | static const ALIGN16 double coeff[26] = { |
121 | static_cast<double>(0.006061842346248906525783753964555936883222L), |
122 | static_cast<double>(1u), |
123 | static_cast<double>(0.5098416655656676188125178644804694509993L), |
124 | static_cast<double>(66u), |
125 | static_cast<double>(19.51992788247617482847860966235652136208L), |
126 | static_cast<double>(1925u), |
127 | static_cast<double>(449.9445569063168119446858607650988409623L), |
128 | static_cast<double>(32670u), |
129 | static_cast<double>(6955.999602515376140356310115515198987526L), |
130 | static_cast<double>(357423u), |
131 | static_cast<double>(75999.29304014542649875303443598909137092L), |
132 | static_cast<double>(2637558u), |
133 | static_cast<double>(601859.6171681098786670226533699352302507L), |
134 | static_cast<double>(13339535u), |
135 | static_cast<double>(3481712.15498064590882071018964774556468L), |
136 | static_cast<double>(45995730u), |
137 | static_cast<double>(14605578.08768506808414169982791359218571L), |
138 | static_cast<double>(105258076u), |
139 | static_cast<double>(43338889.32467613834773723740590533316085L), |
140 | static_cast<double>(150917976u), |
141 | static_cast<double>(86363131.28813859145546927288977868422342L), |
142 | static_cast<double>(120543840u), |
143 | static_cast<double>(103794043.1163445451906271053616070238554L), |
144 | static_cast<double>(39916800u), |
145 | static_cast<double>(56906521.91347156388090791033559122686859L), |
146 | static_cast<double>(0u) |
147 | }; |
148 | |
149 | static const double lim = 4.76886e+25; // By experiment, the largest x for which the SSE2 code does not go bad. |
150 | |
151 | if (x > lim) |
152 | { |
153 | double z = 1 / x; |
154 | return ((((((((((((coeff[24] * z + coeff[22]) * z + coeff[20]) * z + coeff[18]) * z + coeff[16]) * z + coeff[14]) * z + coeff[12]) * z + coeff[10]) * z + coeff[8]) * z + coeff[6]) * z + coeff[4]) * z + coeff[2]) * z + coeff[0]) / ((((((((((((coeff[25] * z + coeff[23]) * z + coeff[21]) * z + coeff[19]) * z + coeff[17]) * z + coeff[15]) * z + coeff[13]) * z + coeff[11]) * z + coeff[9]) * z + coeff[7]) * z + coeff[5]) * z + coeff[3]) * z + coeff[1]); |
155 | } |
156 | |
157 | __m128d vx = _mm_load1_pd(dp: &x); |
158 | __m128d sum_even = _mm_load_pd(dp: coeff); |
159 | __m128d sum_odd = _mm_load_pd(dp: coeff+2); |
160 | __m128d nc_odd, nc_even; |
161 | __m128d vx2 = _mm_mul_pd(a: vx, b: vx); |
162 | |
163 | sum_even = _mm_mul_pd(a: sum_even, b: vx2); |
164 | nc_even = _mm_load_pd(dp: coeff + 4); |
165 | sum_odd = _mm_mul_pd(a: sum_odd, b: vx2); |
166 | nc_odd = _mm_load_pd(dp: coeff + 6); |
167 | sum_even = _mm_add_pd(a: sum_even, b: nc_even); |
168 | sum_odd = _mm_add_pd(a: sum_odd, b: nc_odd); |
169 | |
170 | sum_even = _mm_mul_pd(a: sum_even, b: vx2); |
171 | nc_even = _mm_load_pd(dp: coeff + 8); |
172 | sum_odd = _mm_mul_pd(a: sum_odd, b: vx2); |
173 | nc_odd = _mm_load_pd(dp: coeff + 10); |
174 | sum_even = _mm_add_pd(a: sum_even, b: nc_even); |
175 | sum_odd = _mm_add_pd(a: sum_odd, b: nc_odd); |
176 | |
177 | sum_even = _mm_mul_pd(a: sum_even, b: vx2); |
178 | nc_even = _mm_load_pd(dp: coeff + 12); |
179 | sum_odd = _mm_mul_pd(a: sum_odd, b: vx2); |
180 | nc_odd = _mm_load_pd(dp: coeff + 14); |
181 | sum_even = _mm_add_pd(a: sum_even, b: nc_even); |
182 | sum_odd = _mm_add_pd(a: sum_odd, b: nc_odd); |
183 | |
184 | sum_even = _mm_mul_pd(a: sum_even, b: vx2); |
185 | nc_even = _mm_load_pd(dp: coeff + 16); |
186 | sum_odd = _mm_mul_pd(a: sum_odd, b: vx2); |
187 | nc_odd = _mm_load_pd(dp: coeff + 18); |
188 | sum_even = _mm_add_pd(a: sum_even, b: nc_even); |
189 | sum_odd = _mm_add_pd(a: sum_odd, b: nc_odd); |
190 | |
191 | sum_even = _mm_mul_pd(a: sum_even, b: vx2); |
192 | nc_even = _mm_load_pd(dp: coeff + 20); |
193 | sum_odd = _mm_mul_pd(a: sum_odd, b: vx2); |
194 | nc_odd = _mm_load_pd(dp: coeff + 22); |
195 | sum_even = _mm_add_pd(a: sum_even, b: nc_even); |
196 | sum_odd = _mm_add_pd(a: sum_odd, b: nc_odd); |
197 | |
198 | sum_even = _mm_mul_pd(a: sum_even, b: vx2); |
199 | nc_even = _mm_load_pd(dp: coeff + 24); |
200 | sum_odd = _mm_mul_pd(a: sum_odd, b: vx); |
201 | sum_even = _mm_add_pd(a: sum_even, b: nc_even); |
202 | sum_even = _mm_add_pd(a: sum_even, b: sum_odd); |
203 | |
204 | |
205 | double ALIGN16 t[2]; |
206 | _mm_store_pd(dp: t, a: sum_even); |
207 | |
208 | return t[0] / t[1]; |
209 | } |
210 | |
211 | #ifdef _MSC_VER |
212 | |
// The long double forwarders below are only valid because MSVC's long double
// is the same 64-bit type as double; enforce that at compile time.
BOOST_STATIC_ASSERT(sizeof(double) == sizeof(long double));
214 | |
215 | template <> |
216 | inline long double lanczos13m53::lanczos_sum<long double>(const long double& x) |
217 | { |
218 | return lanczos_sum<double>(static_cast<double>(x)); |
219 | } |
220 | template <> |
221 | inline long double lanczos13m53::lanczos_sum_expG_scaled<long double>(const long double& x) |
222 | { |
223 | return lanczos_sum_expG_scaled<double>(static_cast<double>(x)); |
224 | } |
225 | #endif |
226 | |
227 | } // namespace lanczos |
228 | } // namespace math |
229 | } // namespace boost |
230 | |
231 | #undef ALIGN16 |
232 | |
#endif // BOOST_MATH_SPECIAL_FUNCTIONS_LANCZOS_SSE2
234 | |
235 | |
236 | |
237 | |
238 | |
239 | |