| 1 | // (C) Copyright John Maddock 2006. |
| 2 | // Use, modification and distribution are subject to the |
| 3 | // Boost Software License, Version 1.0. (See accompanying file |
| 4 | // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) |
| 5 | |
| 6 | #ifndef BOOST_MATH_SPECIAL_FUNCTIONS_LANCZOS_SSE2 |
| 7 | #define BOOST_MATH_SPECIAL_FUNCTIONS_LANCZOS_SSE2 |
| 8 | |
| 9 | #ifdef _MSC_VER |
| 10 | #pragma once |
| 11 | #endif |
| 12 | |
| 13 | #include <emmintrin.h> |
| 14 | |
| 15 | #if defined(__GNUC__) || defined(__PGI) || defined(__SUNPRO_CC) |
| 16 | #define ALIGN16 __attribute__((__aligned__(16))) |
| 17 | #else |
| 18 | #define ALIGN16 __declspec(align(16)) |
| 19 | #endif |
| 20 | |
| 21 | namespace boost{ namespace math{ namespace lanczos{ |
| 22 | |
| 23 | template <> |
| 24 | inline double lanczos13m53::lanczos_sum<double>(const double& x) |
| 25 | { |
| 26 | static const ALIGN16 double coeff[26] = { |
| 27 | static_cast<double>(2.506628274631000270164908177133837338626L), |
| 28 | static_cast<double>(1u), |
| 29 | static_cast<double>(210.8242777515793458725097339207133627117L), |
| 30 | static_cast<double>(66u), |
| 31 | static_cast<double>(8071.672002365816210638002902272250613822L), |
| 32 | static_cast<double>(1925u), |
| 33 | static_cast<double>(186056.2653952234950402949897160456992822L), |
| 34 | static_cast<double>(32670u), |
| 35 | static_cast<double>(2876370.628935372441225409051620849613599L), |
| 36 | static_cast<double>(357423u), |
| 37 | static_cast<double>(31426415.58540019438061423162831820536287L), |
| 38 | static_cast<double>(2637558u), |
| 39 | static_cast<double>(248874557.8620541565114603864132294232163L), |
| 40 | static_cast<double>(13339535u), |
| 41 | static_cast<double>(1439720407.311721673663223072794912393972L), |
| 42 | static_cast<double>(45995730u), |
| 43 | static_cast<double>(6039542586.35202800506429164430729792107L), |
| 44 | static_cast<double>(105258076u), |
| 45 | static_cast<double>(17921034426.03720969991975575445893111267L), |
| 46 | static_cast<double>(150917976u), |
| 47 | static_cast<double>(35711959237.35566804944018545154716670596L), |
| 48 | static_cast<double>(120543840u), |
| 49 | static_cast<double>(42919803642.64909876895789904700198885093L), |
| 50 | static_cast<double>(39916800u), |
| 51 | static_cast<double>(23531376880.41075968857200767445163675473L), |
| 52 | static_cast<double>(0u) |
| 53 | }; |
| 54 | |
| 55 | static const double lim = 4.31965e+25; // By experiment, the largest x for which the SSE2 code does not go bad. |
| 56 | |
| 57 | if (x > lim) |
| 58 | { |
| 59 | double z = 1 / x; |
| 60 | return ((((((((((((coeff[24] * z + coeff[22]) * z + coeff[20]) * z + coeff[18]) * z + coeff[16]) * z + coeff[14]) * z + coeff[12]) * z + coeff[10]) * z + coeff[8]) * z + coeff[6]) * z + coeff[4]) * z + coeff[2]) * z + coeff[0]) / ((((((((((((coeff[25] * z + coeff[23]) * z + coeff[21]) * z + coeff[19]) * z + coeff[17]) * z + coeff[15]) * z + coeff[13]) * z + coeff[11]) * z + coeff[9]) * z + coeff[7]) * z + coeff[5]) * z + coeff[3]) * z + coeff[1]); |
| 61 | } |
| 62 | |
| 63 | __m128d vx = _mm_load1_pd(dp: &x); |
| 64 | __m128d sum_even = _mm_load_pd(dp: coeff); |
| 65 | __m128d sum_odd = _mm_load_pd(dp: coeff+2); |
| 66 | __m128d nc_odd, nc_even; |
| 67 | __m128d vx2 = _mm_mul_pd(a: vx, b: vx); |
| 68 | |
| 69 | sum_even = _mm_mul_pd(a: sum_even, b: vx2); |
| 70 | nc_even = _mm_load_pd(dp: coeff + 4); |
| 71 | sum_odd = _mm_mul_pd(a: sum_odd, b: vx2); |
| 72 | nc_odd = _mm_load_pd(dp: coeff + 6); |
| 73 | sum_even = _mm_add_pd(a: sum_even, b: nc_even); |
| 74 | sum_odd = _mm_add_pd(a: sum_odd, b: nc_odd); |
| 75 | |
| 76 | sum_even = _mm_mul_pd(a: sum_even, b: vx2); |
| 77 | nc_even = _mm_load_pd(dp: coeff + 8); |
| 78 | sum_odd = _mm_mul_pd(a: sum_odd, b: vx2); |
| 79 | nc_odd = _mm_load_pd(dp: coeff + 10); |
| 80 | sum_even = _mm_add_pd(a: sum_even, b: nc_even); |
| 81 | sum_odd = _mm_add_pd(a: sum_odd, b: nc_odd); |
| 82 | |
| 83 | sum_even = _mm_mul_pd(a: sum_even, b: vx2); |
| 84 | nc_even = _mm_load_pd(dp: coeff + 12); |
| 85 | sum_odd = _mm_mul_pd(a: sum_odd, b: vx2); |
| 86 | nc_odd = _mm_load_pd(dp: coeff + 14); |
| 87 | sum_even = _mm_add_pd(a: sum_even, b: nc_even); |
| 88 | sum_odd = _mm_add_pd(a: sum_odd, b: nc_odd); |
| 89 | |
| 90 | sum_even = _mm_mul_pd(a: sum_even, b: vx2); |
| 91 | nc_even = _mm_load_pd(dp: coeff + 16); |
| 92 | sum_odd = _mm_mul_pd(a: sum_odd, b: vx2); |
| 93 | nc_odd = _mm_load_pd(dp: coeff + 18); |
| 94 | sum_even = _mm_add_pd(a: sum_even, b: nc_even); |
| 95 | sum_odd = _mm_add_pd(a: sum_odd, b: nc_odd); |
| 96 | |
| 97 | sum_even = _mm_mul_pd(a: sum_even, b: vx2); |
| 98 | nc_even = _mm_load_pd(dp: coeff + 20); |
| 99 | sum_odd = _mm_mul_pd(a: sum_odd, b: vx2); |
| 100 | nc_odd = _mm_load_pd(dp: coeff + 22); |
| 101 | sum_even = _mm_add_pd(a: sum_even, b: nc_even); |
| 102 | sum_odd = _mm_add_pd(a: sum_odd, b: nc_odd); |
| 103 | |
| 104 | sum_even = _mm_mul_pd(a: sum_even, b: vx2); |
| 105 | nc_even = _mm_load_pd(dp: coeff + 24); |
| 106 | sum_odd = _mm_mul_pd(a: sum_odd, b: vx); |
| 107 | sum_even = _mm_add_pd(a: sum_even, b: nc_even); |
| 108 | sum_even = _mm_add_pd(a: sum_even, b: sum_odd); |
| 109 | |
| 110 | |
| 111 | double ALIGN16 t[2]; |
| 112 | _mm_store_pd(dp: t, a: sum_even); |
| 113 | |
| 114 | return t[0] / t[1]; |
| 115 | } |
| 116 | |
| 117 | template <> |
| 118 | inline double lanczos13m53::lanczos_sum_expG_scaled<double>(const double& x) |
| 119 | { |
| 120 | static const ALIGN16 double coeff[26] = { |
| 121 | static_cast<double>(0.006061842346248906525783753964555936883222L), |
| 122 | static_cast<double>(1u), |
| 123 | static_cast<double>(0.5098416655656676188125178644804694509993L), |
| 124 | static_cast<double>(66u), |
| 125 | static_cast<double>(19.51992788247617482847860966235652136208L), |
| 126 | static_cast<double>(1925u), |
| 127 | static_cast<double>(449.9445569063168119446858607650988409623L), |
| 128 | static_cast<double>(32670u), |
| 129 | static_cast<double>(6955.999602515376140356310115515198987526L), |
| 130 | static_cast<double>(357423u), |
| 131 | static_cast<double>(75999.29304014542649875303443598909137092L), |
| 132 | static_cast<double>(2637558u), |
| 133 | static_cast<double>(601859.6171681098786670226533699352302507L), |
| 134 | static_cast<double>(13339535u), |
| 135 | static_cast<double>(3481712.15498064590882071018964774556468L), |
| 136 | static_cast<double>(45995730u), |
| 137 | static_cast<double>(14605578.08768506808414169982791359218571L), |
| 138 | static_cast<double>(105258076u), |
| 139 | static_cast<double>(43338889.32467613834773723740590533316085L), |
| 140 | static_cast<double>(150917976u), |
| 141 | static_cast<double>(86363131.28813859145546927288977868422342L), |
| 142 | static_cast<double>(120543840u), |
| 143 | static_cast<double>(103794043.1163445451906271053616070238554L), |
| 144 | static_cast<double>(39916800u), |
| 145 | static_cast<double>(56906521.91347156388090791033559122686859L), |
| 146 | static_cast<double>(0u) |
| 147 | }; |
| 148 | |
| 149 | static const double lim = 4.76886e+25; // By experiment, the largest x for which the SSE2 code does not go bad. |
| 150 | |
| 151 | if (x > lim) |
| 152 | { |
| 153 | double z = 1 / x; |
| 154 | return ((((((((((((coeff[24] * z + coeff[22]) * z + coeff[20]) * z + coeff[18]) * z + coeff[16]) * z + coeff[14]) * z + coeff[12]) * z + coeff[10]) * z + coeff[8]) * z + coeff[6]) * z + coeff[4]) * z + coeff[2]) * z + coeff[0]) / ((((((((((((coeff[25] * z + coeff[23]) * z + coeff[21]) * z + coeff[19]) * z + coeff[17]) * z + coeff[15]) * z + coeff[13]) * z + coeff[11]) * z + coeff[9]) * z + coeff[7]) * z + coeff[5]) * z + coeff[3]) * z + coeff[1]); |
| 155 | } |
| 156 | |
| 157 | __m128d vx = _mm_load1_pd(dp: &x); |
| 158 | __m128d sum_even = _mm_load_pd(dp: coeff); |
| 159 | __m128d sum_odd = _mm_load_pd(dp: coeff+2); |
| 160 | __m128d nc_odd, nc_even; |
| 161 | __m128d vx2 = _mm_mul_pd(a: vx, b: vx); |
| 162 | |
| 163 | sum_even = _mm_mul_pd(a: sum_even, b: vx2); |
| 164 | nc_even = _mm_load_pd(dp: coeff + 4); |
| 165 | sum_odd = _mm_mul_pd(a: sum_odd, b: vx2); |
| 166 | nc_odd = _mm_load_pd(dp: coeff + 6); |
| 167 | sum_even = _mm_add_pd(a: sum_even, b: nc_even); |
| 168 | sum_odd = _mm_add_pd(a: sum_odd, b: nc_odd); |
| 169 | |
| 170 | sum_even = _mm_mul_pd(a: sum_even, b: vx2); |
| 171 | nc_even = _mm_load_pd(dp: coeff + 8); |
| 172 | sum_odd = _mm_mul_pd(a: sum_odd, b: vx2); |
| 173 | nc_odd = _mm_load_pd(dp: coeff + 10); |
| 174 | sum_even = _mm_add_pd(a: sum_even, b: nc_even); |
| 175 | sum_odd = _mm_add_pd(a: sum_odd, b: nc_odd); |
| 176 | |
| 177 | sum_even = _mm_mul_pd(a: sum_even, b: vx2); |
| 178 | nc_even = _mm_load_pd(dp: coeff + 12); |
| 179 | sum_odd = _mm_mul_pd(a: sum_odd, b: vx2); |
| 180 | nc_odd = _mm_load_pd(dp: coeff + 14); |
| 181 | sum_even = _mm_add_pd(a: sum_even, b: nc_even); |
| 182 | sum_odd = _mm_add_pd(a: sum_odd, b: nc_odd); |
| 183 | |
| 184 | sum_even = _mm_mul_pd(a: sum_even, b: vx2); |
| 185 | nc_even = _mm_load_pd(dp: coeff + 16); |
| 186 | sum_odd = _mm_mul_pd(a: sum_odd, b: vx2); |
| 187 | nc_odd = _mm_load_pd(dp: coeff + 18); |
| 188 | sum_even = _mm_add_pd(a: sum_even, b: nc_even); |
| 189 | sum_odd = _mm_add_pd(a: sum_odd, b: nc_odd); |
| 190 | |
| 191 | sum_even = _mm_mul_pd(a: sum_even, b: vx2); |
| 192 | nc_even = _mm_load_pd(dp: coeff + 20); |
| 193 | sum_odd = _mm_mul_pd(a: sum_odd, b: vx2); |
| 194 | nc_odd = _mm_load_pd(dp: coeff + 22); |
| 195 | sum_even = _mm_add_pd(a: sum_even, b: nc_even); |
| 196 | sum_odd = _mm_add_pd(a: sum_odd, b: nc_odd); |
| 197 | |
| 198 | sum_even = _mm_mul_pd(a: sum_even, b: vx2); |
| 199 | nc_even = _mm_load_pd(dp: coeff + 24); |
| 200 | sum_odd = _mm_mul_pd(a: sum_odd, b: vx); |
| 201 | sum_even = _mm_add_pd(a: sum_even, b: nc_even); |
| 202 | sum_even = _mm_add_pd(a: sum_even, b: sum_odd); |
| 203 | |
| 204 | |
| 205 | double ALIGN16 t[2]; |
| 206 | _mm_store_pd(dp: t, a: sum_even); |
| 207 | |
| 208 | return t[0] / t[1]; |
| 209 | } |
| 210 | |
#ifdef _MSC_VER

// On MSVC `long double` is the same 64-bit IEEE type as `double`, so the
// long double specializations can forward to the SSE2 double versions
// without any loss of precision; the assertion pins that assumption.
BOOST_STATIC_ASSERT(sizeof(double) == sizeof(long double));

template <>
inline long double lanczos13m53::lanczos_sum<long double>(const long double& x)
{
   const double xd = static_cast<double>(x);
   return lanczos_sum<double>(xd);
}
template <>
inline long double lanczos13m53::lanczos_sum_expG_scaled<long double>(const long double& x)
{
   const double xd = static_cast<double>(x);
   return lanczos_sum_expG_scaled<double>(xd);
}
#endif
| 226 | |
| 227 | } // namespace lanczos |
| 228 | } // namespace math |
| 229 | } // namespace boost |
| 230 | |
| 231 | #undef ALIGN16 |
| 232 | |
#endif // BOOST_MATH_SPECIAL_FUNCTIONS_LANCZOS_SSE2
| 234 | |
| 235 | |
| 236 | |
| 237 | |
| 238 | |
| 239 | |