1// (C) Copyright John Maddock 2006.
2// Use, modification and distribution are subject to the
3// Boost Software License, Version 1.0. (See accompanying file
4// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
5
6#ifndef BOOST_MATH_SPECIAL_FUNCTIONS_LANCZOS_SSE2
7#define BOOST_MATH_SPECIAL_FUNCTIONS_LANCZOS_SSE2
8
9#ifdef _MSC_VER
10#pragma once
11#endif
12
13#include <emmintrin.h>
14
15#if defined(__GNUC__) || defined(__PGI) || defined(__SUNPRO_CC)
16#define ALIGN16 __attribute__((__aligned__(16)))
17#else
18#define ALIGN16 __declspec(align(16))
19#endif
20
21namespace boost{ namespace math{ namespace lanczos{
22
23template <>
24inline double lanczos13m53::lanczos_sum<double>(const double& x)
25{
26 static const ALIGN16 double coeff[26] = {
27 static_cast<double>(2.506628274631000270164908177133837338626L),
28 static_cast<double>(1u),
29 static_cast<double>(210.8242777515793458725097339207133627117L),
30 static_cast<double>(66u),
31 static_cast<double>(8071.672002365816210638002902272250613822L),
32 static_cast<double>(1925u),
33 static_cast<double>(186056.2653952234950402949897160456992822L),
34 static_cast<double>(32670u),
35 static_cast<double>(2876370.628935372441225409051620849613599L),
36 static_cast<double>(357423u),
37 static_cast<double>(31426415.58540019438061423162831820536287L),
38 static_cast<double>(2637558u),
39 static_cast<double>(248874557.8620541565114603864132294232163L),
40 static_cast<double>(13339535u),
41 static_cast<double>(1439720407.311721673663223072794912393972L),
42 static_cast<double>(45995730u),
43 static_cast<double>(6039542586.35202800506429164430729792107L),
44 static_cast<double>(105258076u),
45 static_cast<double>(17921034426.03720969991975575445893111267L),
46 static_cast<double>(150917976u),
47 static_cast<double>(35711959237.35566804944018545154716670596L),
48 static_cast<double>(120543840u),
49 static_cast<double>(42919803642.64909876895789904700198885093L),
50 static_cast<double>(39916800u),
51 static_cast<double>(23531376880.41075968857200767445163675473L),
52 static_cast<double>(0u)
53 };
54
55 static const double lim = 4.31965e+25; // By experiment, the largest x for which the SSE2 code does not go bad.
56
57 if (x > lim)
58 {
59 double z = 1 / x;
60 return ((((((((((((coeff[24] * z + coeff[22]) * z + coeff[20]) * z + coeff[18]) * z + coeff[16]) * z + coeff[14]) * z + coeff[12]) * z + coeff[10]) * z + coeff[8]) * z + coeff[6]) * z + coeff[4]) * z + coeff[2]) * z + coeff[0]) / ((((((((((((coeff[25] * z + coeff[23]) * z + coeff[21]) * z + coeff[19]) * z + coeff[17]) * z + coeff[15]) * z + coeff[13]) * z + coeff[11]) * z + coeff[9]) * z + coeff[7]) * z + coeff[5]) * z + coeff[3]) * z + coeff[1]);
61 }
62
63 __m128d vx = _mm_load1_pd(dp: &x);
64 __m128d sum_even = _mm_load_pd(dp: coeff);
65 __m128d sum_odd = _mm_load_pd(dp: coeff+2);
66 __m128d nc_odd, nc_even;
67 __m128d vx2 = _mm_mul_pd(a: vx, b: vx);
68
69 sum_even = _mm_mul_pd(a: sum_even, b: vx2);
70 nc_even = _mm_load_pd(dp: coeff + 4);
71 sum_odd = _mm_mul_pd(a: sum_odd, b: vx2);
72 nc_odd = _mm_load_pd(dp: coeff + 6);
73 sum_even = _mm_add_pd(a: sum_even, b: nc_even);
74 sum_odd = _mm_add_pd(a: sum_odd, b: nc_odd);
75
76 sum_even = _mm_mul_pd(a: sum_even, b: vx2);
77 nc_even = _mm_load_pd(dp: coeff + 8);
78 sum_odd = _mm_mul_pd(a: sum_odd, b: vx2);
79 nc_odd = _mm_load_pd(dp: coeff + 10);
80 sum_even = _mm_add_pd(a: sum_even, b: nc_even);
81 sum_odd = _mm_add_pd(a: sum_odd, b: nc_odd);
82
83 sum_even = _mm_mul_pd(a: sum_even, b: vx2);
84 nc_even = _mm_load_pd(dp: coeff + 12);
85 sum_odd = _mm_mul_pd(a: sum_odd, b: vx2);
86 nc_odd = _mm_load_pd(dp: coeff + 14);
87 sum_even = _mm_add_pd(a: sum_even, b: nc_even);
88 sum_odd = _mm_add_pd(a: sum_odd, b: nc_odd);
89
90 sum_even = _mm_mul_pd(a: sum_even, b: vx2);
91 nc_even = _mm_load_pd(dp: coeff + 16);
92 sum_odd = _mm_mul_pd(a: sum_odd, b: vx2);
93 nc_odd = _mm_load_pd(dp: coeff + 18);
94 sum_even = _mm_add_pd(a: sum_even, b: nc_even);
95 sum_odd = _mm_add_pd(a: sum_odd, b: nc_odd);
96
97 sum_even = _mm_mul_pd(a: sum_even, b: vx2);
98 nc_even = _mm_load_pd(dp: coeff + 20);
99 sum_odd = _mm_mul_pd(a: sum_odd, b: vx2);
100 nc_odd = _mm_load_pd(dp: coeff + 22);
101 sum_even = _mm_add_pd(a: sum_even, b: nc_even);
102 sum_odd = _mm_add_pd(a: sum_odd, b: nc_odd);
103
104 sum_even = _mm_mul_pd(a: sum_even, b: vx2);
105 nc_even = _mm_load_pd(dp: coeff + 24);
106 sum_odd = _mm_mul_pd(a: sum_odd, b: vx);
107 sum_even = _mm_add_pd(a: sum_even, b: nc_even);
108 sum_even = _mm_add_pd(a: sum_even, b: sum_odd);
109
110
111 double ALIGN16 t[2];
112 _mm_store_pd(dp: t, a: sum_even);
113
114 return t[0] / t[1];
115}
116
117template <>
118inline double lanczos13m53::lanczos_sum_expG_scaled<double>(const double& x)
119{
120 static const ALIGN16 double coeff[26] = {
121 static_cast<double>(0.006061842346248906525783753964555936883222L),
122 static_cast<double>(1u),
123 static_cast<double>(0.5098416655656676188125178644804694509993L),
124 static_cast<double>(66u),
125 static_cast<double>(19.51992788247617482847860966235652136208L),
126 static_cast<double>(1925u),
127 static_cast<double>(449.9445569063168119446858607650988409623L),
128 static_cast<double>(32670u),
129 static_cast<double>(6955.999602515376140356310115515198987526L),
130 static_cast<double>(357423u),
131 static_cast<double>(75999.29304014542649875303443598909137092L),
132 static_cast<double>(2637558u),
133 static_cast<double>(601859.6171681098786670226533699352302507L),
134 static_cast<double>(13339535u),
135 static_cast<double>(3481712.15498064590882071018964774556468L),
136 static_cast<double>(45995730u),
137 static_cast<double>(14605578.08768506808414169982791359218571L),
138 static_cast<double>(105258076u),
139 static_cast<double>(43338889.32467613834773723740590533316085L),
140 static_cast<double>(150917976u),
141 static_cast<double>(86363131.28813859145546927288977868422342L),
142 static_cast<double>(120543840u),
143 static_cast<double>(103794043.1163445451906271053616070238554L),
144 static_cast<double>(39916800u),
145 static_cast<double>(56906521.91347156388090791033559122686859L),
146 static_cast<double>(0u)
147 };
148
149 static const double lim = 4.76886e+25; // By experiment, the largest x for which the SSE2 code does not go bad.
150
151 if (x > lim)
152 {
153 double z = 1 / x;
154 return ((((((((((((coeff[24] * z + coeff[22]) * z + coeff[20]) * z + coeff[18]) * z + coeff[16]) * z + coeff[14]) * z + coeff[12]) * z + coeff[10]) * z + coeff[8]) * z + coeff[6]) * z + coeff[4]) * z + coeff[2]) * z + coeff[0]) / ((((((((((((coeff[25] * z + coeff[23]) * z + coeff[21]) * z + coeff[19]) * z + coeff[17]) * z + coeff[15]) * z + coeff[13]) * z + coeff[11]) * z + coeff[9]) * z + coeff[7]) * z + coeff[5]) * z + coeff[3]) * z + coeff[1]);
155 }
156
157 __m128d vx = _mm_load1_pd(dp: &x);
158 __m128d sum_even = _mm_load_pd(dp: coeff);
159 __m128d sum_odd = _mm_load_pd(dp: coeff+2);
160 __m128d nc_odd, nc_even;
161 __m128d vx2 = _mm_mul_pd(a: vx, b: vx);
162
163 sum_even = _mm_mul_pd(a: sum_even, b: vx2);
164 nc_even = _mm_load_pd(dp: coeff + 4);
165 sum_odd = _mm_mul_pd(a: sum_odd, b: vx2);
166 nc_odd = _mm_load_pd(dp: coeff + 6);
167 sum_even = _mm_add_pd(a: sum_even, b: nc_even);
168 sum_odd = _mm_add_pd(a: sum_odd, b: nc_odd);
169
170 sum_even = _mm_mul_pd(a: sum_even, b: vx2);
171 nc_even = _mm_load_pd(dp: coeff + 8);
172 sum_odd = _mm_mul_pd(a: sum_odd, b: vx2);
173 nc_odd = _mm_load_pd(dp: coeff + 10);
174 sum_even = _mm_add_pd(a: sum_even, b: nc_even);
175 sum_odd = _mm_add_pd(a: sum_odd, b: nc_odd);
176
177 sum_even = _mm_mul_pd(a: sum_even, b: vx2);
178 nc_even = _mm_load_pd(dp: coeff + 12);
179 sum_odd = _mm_mul_pd(a: sum_odd, b: vx2);
180 nc_odd = _mm_load_pd(dp: coeff + 14);
181 sum_even = _mm_add_pd(a: sum_even, b: nc_even);
182 sum_odd = _mm_add_pd(a: sum_odd, b: nc_odd);
183
184 sum_even = _mm_mul_pd(a: sum_even, b: vx2);
185 nc_even = _mm_load_pd(dp: coeff + 16);
186 sum_odd = _mm_mul_pd(a: sum_odd, b: vx2);
187 nc_odd = _mm_load_pd(dp: coeff + 18);
188 sum_even = _mm_add_pd(a: sum_even, b: nc_even);
189 sum_odd = _mm_add_pd(a: sum_odd, b: nc_odd);
190
191 sum_even = _mm_mul_pd(a: sum_even, b: vx2);
192 nc_even = _mm_load_pd(dp: coeff + 20);
193 sum_odd = _mm_mul_pd(a: sum_odd, b: vx2);
194 nc_odd = _mm_load_pd(dp: coeff + 22);
195 sum_even = _mm_add_pd(a: sum_even, b: nc_even);
196 sum_odd = _mm_add_pd(a: sum_odd, b: nc_odd);
197
198 sum_even = _mm_mul_pd(a: sum_even, b: vx2);
199 nc_even = _mm_load_pd(dp: coeff + 24);
200 sum_odd = _mm_mul_pd(a: sum_odd, b: vx);
201 sum_even = _mm_add_pd(a: sum_even, b: nc_even);
202 sum_even = _mm_add_pd(a: sum_even, b: sum_odd);
203
204
205 double ALIGN16 t[2];
206 _mm_store_pd(dp: t, a: sum_even);
207
208 return t[0] / t[1];
209}
210
211#ifdef _MSC_VER
212
// Guard for the long double forwarders below: they silently narrow to
// double, which is only lossless when MSVC's long double is the same
// 64-bit type as double.
BOOST_STATIC_ASSERT(sizeof(double) == sizeof(long double));
214
215template <>
216inline long double lanczos13m53::lanczos_sum<long double>(const long double& x)
217{
218 return lanczos_sum<double>(static_cast<double>(x));
219}
220template <>
221inline long double lanczos13m53::lanczos_sum_expG_scaled<long double>(const long double& x)
222{
223 return lanczos_sum_expG_scaled<double>(static_cast<double>(x));
224}
225#endif
226
227} // namespace lanczos
228} // namespace math
229} // namespace boost
230
231#undef ALIGN16
232
233#endif // BOOST_MATH_SPECIAL_FUNCTIONS_LANCZOS
234
235
236
237
238
239

source code of include/boost/math/special_functions/detail/lanczos_sse2.hpp