1 | // Math overloads for simd -*- C++ -*- |
2 | |
3 | // Copyright (C) 2020-2021 Free Software Foundation, Inc. |
4 | // |
5 | // This file is part of the GNU ISO C++ Library. This library is free |
6 | // software; you can redistribute it and/or modify it under the |
7 | // terms of the GNU General Public License as published by the |
8 | // Free Software Foundation; either version 3, or (at your option) |
9 | // any later version. |
10 | |
11 | // This library is distributed in the hope that it will be useful, |
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 | // GNU General Public License for more details. |
15 | |
16 | // Under Section 7 of GPL version 3, you are granted additional |
17 | // permissions described in the GCC Runtime Library Exception, version |
18 | // 3.1, as published by the Free Software Foundation. |
19 | |
20 | // You should have received a copy of the GNU General Public License and |
21 | // a copy of the GCC Runtime Library Exception along with this program; |
22 | // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see |
23 | // <http://www.gnu.org/licenses/>. |
24 | |
25 | #ifndef _GLIBCXX_EXPERIMENTAL_SIMD_MATH_H_ |
26 | #define _GLIBCXX_EXPERIMENTAL_SIMD_MATH_H_ |
27 | |
28 | #if __cplusplus >= 201703L |
29 | |
30 | #include <utility> |
31 | #include <iomanip> |
32 | |
33 | _GLIBCXX_SIMD_BEGIN_NAMESPACE |
34 | template <typename _Tp, typename _V> |
35 | using _Samesize = fixed_size_simd<_Tp, _V::size()>; |
36 | |
37 | // _Math_return_type {{{ |
38 | template <typename _DoubleR, typename _Tp, typename _Abi> |
39 | struct _Math_return_type; |
40 | |
41 | template <typename _DoubleR, typename _Tp, typename _Abi> |
42 | using _Math_return_type_t = |
43 | typename _Math_return_type<_DoubleR, _Tp, _Abi>::type; |
44 | |
45 | template <typename _Tp, typename _Abi> |
46 | struct _Math_return_type<double, _Tp, _Abi> |
47 | { using type = simd<_Tp, _Abi>; }; |
48 | |
49 | template <typename _Tp, typename _Abi> |
50 | struct _Math_return_type<bool, _Tp, _Abi> |
51 | { using type = simd_mask<_Tp, _Abi>; }; |
52 | |
53 | template <typename _DoubleR, typename _Tp, typename _Abi> |
54 | struct _Math_return_type |
55 | { using type = fixed_size_simd<_DoubleR, simd_size_v<_Tp, _Abi>>; }; |
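
// Maps the return type of the scalar <cmath> function onto the matching simd
// type, e.g. (illustrative): std::sin(double) -> double gives
// simd<_Tp, _Abi>; std::ilogb(double) -> int gives
// fixed_size_simd<int, simd_size_v<_Tp, _Abi>>; predicates returning bool
// give simd_mask<_Tp, _Abi>.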
56 | |
57 | //}}} |
58 | // _GLIBCXX_SIMD_MATH_CALL_ {{{ |
59 | #define _GLIBCXX_SIMD_MATH_CALL_(__name) \ |
60 | template <typename _Tp, typename _Abi, typename..., \ |
61 | typename _R = _Math_return_type_t< \ |
62 | decltype(std::__name(declval<double>())), _Tp, _Abi>> \ |
63 | enable_if_t<is_floating_point_v<_Tp>, _R> \ |
64 | __name(simd<_Tp, _Abi> __x) \ |
65 | { return {__private_init, _Abi::_SimdImpl::_S_##__name(__data(__x))}; } |
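
// For reference, _GLIBCXX_SIMD_MATH_CALL_(acos) expands to (roughly):
//
//   template <typename _Tp, typename _Abi, typename...,
//             typename _R = _Math_return_type_t<
//               decltype(std::acos(declval<double>())), _Tp, _Abi>>
//   enable_if_t<is_floating_point_v<_Tp>, _R>
//   acos(simd<_Tp, _Abi> __x)
//   { return {__private_init, _Abi::_SimdImpl::_S_acos(__data(__x))}; }
//
// i.e. each element-wise math function simply forwards to the ABI-specific
// implementation of the same name.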
66 | |
67 | // }}} |
68 | //_Extra_argument_type{{{ |
69 | template <typename _Up, typename _Tp, typename _Abi> |
70 | struct _Extra_argument_type; |
71 | |
template <typename _Tp, typename _Abi>
struct _Extra_argument_type<_Tp*, _Tp, _Abi>
{
  using type = simd<_Tp, _Abi>*;
  static constexpr double* declval();
  static constexpr bool __needs_temporary_scalar = true;

  _GLIBCXX_SIMD_INTRINSIC static constexpr auto _S_data(type __x)
  { return &__data(*__x); }
};

template <typename _Up, typename _Tp, typename _Abi>
struct _Extra_argument_type<_Up*, _Tp, _Abi>
{
  static_assert(is_integral_v<_Up>);
  using type = fixed_size_simd<_Up, simd_size_v<_Tp, _Abi>>*;
  static constexpr _Up* declval();
  static constexpr bool __needs_temporary_scalar = true;

  _GLIBCXX_SIMD_INTRINSIC static constexpr auto _S_data(type __x)
  { return &__data(*__x); }
};

template <typename _Tp, typename _Abi>
struct _Extra_argument_type<_Tp, _Tp, _Abi>
{
  using type = simd<_Tp, _Abi>;
  static constexpr double declval();
  static constexpr bool __needs_temporary_scalar = false;

  _GLIBCXX_SIMD_INTRINSIC static constexpr decltype(auto)
  _S_data(const type& __x)
  { return __data(__x); }
};

template <typename _Up, typename _Tp, typename _Abi>
struct _Extra_argument_type
{
  static_assert(is_integral_v<_Up>);
  using type = fixed_size_simd<_Up, simd_size_v<_Tp, _Abi>>;
  static constexpr _Up declval();
  static constexpr bool __needs_temporary_scalar = false;

  _GLIBCXX_SIMD_INTRINSIC static constexpr decltype(auto)
  _S_data(const type& __x)
  { return __data(__x); }
};
119 | |
120 | //}}} |
121 | // _GLIBCXX_SIMD_MATH_CALL2_ {{{ |
122 | #define _GLIBCXX_SIMD_MATH_CALL2_(__name, arg2_) \ |
123 | template < \ |
124 | typename _Tp, typename _Abi, typename..., \ |
125 | typename _Arg2 = _Extra_argument_type<arg2_, _Tp, _Abi>, \ |
126 | typename _R = _Math_return_type_t< \ |
127 | decltype(std::__name(declval<double>(), _Arg2::declval())), _Tp, _Abi>> \ |
128 | enable_if_t<is_floating_point_v<_Tp>, _R> \ |
129 | __name(const simd<_Tp, _Abi>& __x, const typename _Arg2::type& __y) \ |
130 | { \ |
131 | return {__private_init, \ |
132 | _Abi::_SimdImpl::_S_##__name(__data(__x), _Arg2::_S_data(__y))}; \ |
133 | } \ |
134 | template <typename _Up, typename _Tp, typename _Abi> \ |
135 | _GLIBCXX_SIMD_INTRINSIC _Math_return_type_t< \ |
136 | decltype(std::__name( \ |
137 | declval<double>(), \ |
138 | declval<enable_if_t< \ |
139 | conjunction_v< \ |
140 | is_same<arg2_, _Tp>, \ |
141 | negation<is_same<__remove_cvref_t<_Up>, simd<_Tp, _Abi>>>, \ |
142 | is_convertible<_Up, simd<_Tp, _Abi>>, is_floating_point<_Tp>>, \ |
143 | double>>())), \ |
144 | _Tp, _Abi> \ |
145 | __name(_Up&& __xx, const simd<_Tp, _Abi>& __yy) \ |
146 | { return __name(simd<_Tp, _Abi>(static_cast<_Up&&>(__xx)), __yy); } |
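
// E.g. _GLIBCXX_SIMD_MATH_CALL2_(ldexp, int) instantiates
// ldexp(simd<_Tp, _Abi>, fixed_size_simd<int, N>) via the first overload.
// The second overload only participates when arg2_ is the value type itself
// (as in _GLIBCXX_SIMD_MATH_CALL2_(pow, _Tp)); it lets a convertible first
// argument broadcast, so pow(2.f, __v) works.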
147 | |
148 | // }}} |
149 | // _GLIBCXX_SIMD_MATH_CALL3_ {{{ |
150 | #define _GLIBCXX_SIMD_MATH_CALL3_(__name, arg2_, arg3_) \ |
151 | template <typename _Tp, typename _Abi, typename..., \ |
152 | typename _Arg2 = _Extra_argument_type<arg2_, _Tp, _Abi>, \ |
153 | typename _Arg3 = _Extra_argument_type<arg3_, _Tp, _Abi>, \ |
154 | typename _R = _Math_return_type_t< \ |
155 | decltype(std::__name(declval<double>(), _Arg2::declval(), \ |
156 | _Arg3::declval())), \ |
157 | _Tp, _Abi>> \ |
158 | enable_if_t<is_floating_point_v<_Tp>, _R> \ |
159 | __name(const simd<_Tp, _Abi>& __x, const typename _Arg2::type& __y, \ |
160 | const typename _Arg3::type& __z) \ |
161 | { \ |
162 | return {__private_init, \ |
163 | _Abi::_SimdImpl::_S_##__name(__data(__x), _Arg2::_S_data(__y), \ |
164 | _Arg3::_S_data(__z))}; \ |
165 | } \ |
166 | template < \ |
167 | typename _T0, typename _T1, typename _T2, typename..., \ |
168 | typename _U0 = __remove_cvref_t<_T0>, \ |
169 | typename _U1 = __remove_cvref_t<_T1>, \ |
170 | typename _U2 = __remove_cvref_t<_T2>, \ |
171 | typename _Simd = conditional_t<is_simd_v<_U1>, _U1, _U2>, \ |
172 | typename = enable_if_t<conjunction_v< \ |
173 | is_simd<_Simd>, is_convertible<_T0&&, _Simd>, \ |
174 | is_convertible<_T1&&, _Simd>, is_convertible<_T2&&, _Simd>, \ |
175 | negation<conjunction< \ |
176 | is_simd<_U0>, is_floating_point<__value_type_or_identity_t<_U0>>>>>>> \ |
177 | _GLIBCXX_SIMD_INTRINSIC decltype(__name(declval<const _Simd&>(), \ |
178 | declval<const _Simd&>(), \ |
179 | declval<const _Simd&>())) \ |
180 | __name(_T0&& __xx, _T1&& __yy, _T2&& __zz) \ |
181 | { \ |
182 | return __name(_Simd(static_cast<_T0&&>(__xx)), \ |
183 | _Simd(static_cast<_T1&&>(__yy)), \ |
184 | _Simd(static_cast<_T2&&>(__zz))); \ |
185 | } |
186 | |
187 | // }}} |
188 | // __cosSeries {{{ |
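// __cosSeries/__sinSeries evaluate fixed-degree polynomial approximations
// of cos and sin on [-π/4, π/4] using Horner's scheme in __x²; the
// hex-float constants are (slightly tuned) Taylor coefficients ±1/n!.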
189 | template <typename _Abi> |
190 | _GLIBCXX_SIMD_ALWAYS_INLINE static simd<float, _Abi> |
191 | __cosSeries(const simd<float, _Abi>& __x) |
192 | { |
193 | const simd<float, _Abi> __x2 = __x * __x; |
194 | simd<float, _Abi> __y; |
195 | __y = 0x1.ap-16f; // 1/8! |
196 | __y = __y * __x2 - 0x1.6c1p-10f; // -1/6! |
197 | __y = __y * __x2 + 0x1.555556p-5f; // 1/4! |
198 | return __y * (__x2 * __x2) - .5f * __x2 + 1.f; |
199 | } |
200 | |
201 | template <typename _Abi> |
202 | _GLIBCXX_SIMD_ALWAYS_INLINE static simd<double, _Abi> |
203 | __cosSeries(const simd<double, _Abi>& __x) |
204 | { |
205 | const simd<double, _Abi> __x2 = __x * __x; |
206 | simd<double, _Abi> __y; |
207 | __y = 0x1.AC00000000000p-45; // 1/16! |
208 | __y = __y * __x2 - 0x1.9394000000000p-37; // -1/14! |
209 | __y = __y * __x2 + 0x1.1EED8C0000000p-29; // 1/12! |
210 | __y = __y * __x2 - 0x1.27E4FB7400000p-22; // -1/10! |
211 | __y = __y * __x2 + 0x1.A01A01A018000p-16; // 1/8! |
212 | __y = __y * __x2 - 0x1.6C16C16C16C00p-10; // -1/6! |
213 | __y = __y * __x2 + 0x1.5555555555554p-5; // 1/4! |
214 | return (__y * __x2 - .5f) * __x2 + 1.f; |
215 | } |
216 | |
217 | // }}} |
218 | // __sinSeries {{{ |
219 | template <typename _Abi> |
220 | _GLIBCXX_SIMD_ALWAYS_INLINE static simd<float, _Abi> |
221 | __sinSeries(const simd<float, _Abi>& __x) |
222 | { |
223 | const simd<float, _Abi> __x2 = __x * __x; |
224 | simd<float, _Abi> __y; |
225 | __y = -0x1.9CC000p-13f; // -1/7! |
226 | __y = __y * __x2 + 0x1.111100p-7f; // 1/5! |
227 | __y = __y * __x2 - 0x1.555556p-3f; // -1/3! |
228 | return __y * (__x2 * __x) + __x; |
229 | } |
230 | |
231 | template <typename _Abi> |
232 | _GLIBCXX_SIMD_ALWAYS_INLINE static simd<double, _Abi> |
233 | __sinSeries(const simd<double, _Abi>& __x) |
234 | { |
// __x  = [0, 0.7854 = pi/4]
// __x² = [0, 0.6169 = pi²/16]
237 | const simd<double, _Abi> __x2 = __x * __x; |
238 | simd<double, _Abi> __y; |
239 | __y = -0x1.ACF0000000000p-41; // -1/15! |
240 | __y = __y * __x2 + 0x1.6124400000000p-33; // 1/13! |
241 | __y = __y * __x2 - 0x1.AE64567000000p-26; // -1/11! |
242 | __y = __y * __x2 + 0x1.71DE3A5540000p-19; // 1/9! |
243 | __y = __y * __x2 - 0x1.A01A01A01A000p-13; // -1/7! |
244 | __y = __y * __x2 + 0x1.1111111111110p-7; // 1/5! |
245 | __y = __y * __x2 - 0x1.5555555555555p-3; // -1/3! |
246 | return __y * (__x2 * __x) + __x; |
247 | } |
248 | |
249 | // }}} |
250 | // __zero_low_bits {{{ |
251 | template <int _Bits, typename _Tp, typename _Abi> |
252 | _GLIBCXX_SIMD_INTRINSIC simd<_Tp, _Abi> |
253 | __zero_low_bits(simd<_Tp, _Abi> __x) |
254 | { |
255 | const simd<_Tp, _Abi> __bitmask |
256 | = __bit_cast<_Tp>(~make_unsigned_t<__int_for_sizeof_t<_Tp>>() << _Bits); |
257 | return {__private_init, |
258 | _Abi::_SimdImpl::_S_bit_and(__data(__x), __data(__bitmask))}; |
259 | } |
260 | |
261 | // }}} |
262 | // __fold_input {{{ |
263 | |
264 | /**@internal |
265 | * Fold @p x into [-¼π, ¼π] and remember the quadrant it came from: |
266 | * quadrant 0: [-¼π, ¼π] |
267 | * quadrant 1: [ ¼π, ¾π] |
268 | * quadrant 2: [ ¾π, 1¼π] |
269 | * quadrant 3: [1¼π, 1¾π] |
270 | * |
 * The algorithm determines `y` as the multiple of ¼π such that
 * `x - y * ¼π ∈ [-¼π, ¼π]`. Using a bitmask, `y` is reduced to `quadrant`.
 * `y` can be calculated as
 * ```
 * y = trunc(x / ¼π);
 * y += fmod(y, 2);
 * ```
 * This can be simplified by moving the (implicit) division by 2 into the
 * truncation expression. The `+= fmod` effect can then be achieved by using
 * rounding instead of truncation: `y = round(x / ½π) * 2`. If precision
 * allows, `x * 2/π` is better (faster).
 */
282 | template <typename _Tp, typename _Abi> |
283 | struct _Folded |
284 | { |
285 | simd<_Tp, _Abi> _M_x; |
286 | rebind_simd_t<int, simd<_Tp, _Abi>> _M_quadrant; |
287 | }; |
288 | |
289 | namespace __math_float { |
290 | inline constexpr float __pi_over_4 = 0x1.921FB6p-1f; // π/4 |
inline constexpr float __2_over_pi = 0x1.45F306p-1f; // 2/π
292 | inline constexpr float __pi_2_5bits0 |
293 | = 0x1.921fc0p0f; // π/2, 5 0-bits (least significant) |
294 | inline constexpr float __pi_2_5bits0_rem |
295 | = -0x1.5777a6p-21f; // π/2 - __pi_2_5bits0 |
296 | } // namespace __math_float |
297 | namespace __math_double { |
298 | inline constexpr double __pi_over_4 = 0x1.921fb54442d18p-1; // π/4 |
inline constexpr double __2_over_pi = 0x1.45F306DC9C883p-1; // 2/π
300 | inline constexpr double __pi_2 = 0x1.921fb54442d18p0; // π/2 |
301 | } // namespace __math_double |
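
// A minimal scalar sketch of the single-step float reduction below (assuming
// ¼π <= |x| < 6·¼π, so __y <= 3 and the first product is exact):
//
//   float __ax = std::fabs(__x);
//   float __y  = std::nearbyint(__ax * __2_over_pi); // multiples of ½π
//   int   __quadrant = int(__y) & 3;                 // __y mod 4
//   float __folded = (__ax - __y * __pi_2_5bits0) - __y * __pi_2_5bits0_rem;
//
// __pi_2_5bits0 has trailing zero mantissa bits, so __y * __pi_2_5bits0 is
// exact for these small __y; subtracting __y * __pi_2_5bits0_rem restores
// the precision lost by rounding π/2 to __pi_2_5bits0.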
302 | |
303 | template <typename _Abi> |
304 | _GLIBCXX_SIMD_ALWAYS_INLINE _Folded<float, _Abi> |
305 | __fold_input(const simd<float, _Abi>& __x) |
306 | { |
307 | using _V = simd<float, _Abi>; |
308 | using _IV = rebind_simd_t<int, _V>; |
309 | using namespace __math_float; |
310 | _Folded<float, _Abi> __r; |
311 | __r._M_x = abs(__x); |
312 | #if 0 |
313 | // zero most mantissa bits: |
constexpr float __1_over_pi = 0x1.45F306p-2f; // 1/π
315 | const auto __y = (__r._M_x * __1_over_pi + 0x1.8p23f) - 0x1.8p23f; |
316 | // split π into 4 parts, the first three with 13 trailing zeros (to make the |
317 | // following multiplications precise): |
318 | constexpr float __pi0 = 0x1.920000p1f; |
319 | constexpr float __pi1 = 0x1.fb4000p-11f; |
320 | constexpr float __pi2 = 0x1.444000p-23f; |
321 | constexpr float __pi3 = 0x1.68c234p-38f; |
322 | __r._M_x - __y*__pi0 - __y*__pi1 - __y*__pi2 - __y*__pi3 |
323 | #else |
324 | if (_GLIBCXX_SIMD_IS_UNLIKELY(all_of(__r._M_x < __pi_over_4))) |
325 | __r._M_quadrant = 0; |
326 | else if (_GLIBCXX_SIMD_IS_LIKELY(all_of(__r._M_x < 6 * __pi_over_4))) |
327 | { |
328 | const _V __y = nearbyint(__r._M_x * __2_over_pi); |
329 | __r._M_quadrant = static_simd_cast<_IV>(__y) & 3; // __y mod 4 |
330 | __r._M_x -= __y * __pi_2_5bits0; |
331 | __r._M_x -= __y * __pi_2_5bits0_rem; |
332 | } |
333 | else |
334 | { |
335 | using __math_double::__2_over_pi; |
336 | using __math_double::__pi_2; |
337 | using _VD = rebind_simd_t<double, _V>; |
338 | _VD __xd = static_simd_cast<_VD>(__r._M_x); |
339 | _VD __y = nearbyint(__xd * __2_over_pi); |
340 | __r._M_quadrant = static_simd_cast<_IV>(__y) & 3; // = __y mod 4 |
341 | __r._M_x = static_simd_cast<_V>(__xd - __y * __pi_2); |
342 | } |
343 | #endif |
344 | return __r; |
345 | } |
346 | |
347 | template <typename _Abi> |
348 | _GLIBCXX_SIMD_ALWAYS_INLINE _Folded<double, _Abi> |
349 | __fold_input(const simd<double, _Abi>& __x) |
350 | { |
351 | using _V = simd<double, _Abi>; |
352 | using _IV = rebind_simd_t<int, _V>; |
353 | using namespace __math_double; |
354 | |
355 | _Folded<double, _Abi> __r; |
356 | __r._M_x = abs(__x); |
357 | if (_GLIBCXX_SIMD_IS_UNLIKELY(all_of(__r._M_x < __pi_over_4))) |
358 | { |
359 | __r._M_quadrant = 0; |
360 | return __r; |
361 | } |
362 | const _V __y = nearbyint(__r._M_x / (2 * __pi_over_4)); |
363 | __r._M_quadrant = static_simd_cast<_IV>(__y) & 3; |
364 | |
365 | if (_GLIBCXX_SIMD_IS_LIKELY(all_of(__r._M_x < 1025 * __pi_over_4))) |
366 | { |
367 | // x - y * pi/2, y uses no more than 11 mantissa bits |
368 | __r._M_x -= __y * 0x1.921FB54443000p0; |
369 | __r._M_x -= __y * -0x1.73DCB3B39A000p-43; |
370 | __r._M_x -= __y * 0x1.45C06E0E68948p-86; |
371 | } |
372 | else if (_GLIBCXX_SIMD_IS_LIKELY(all_of(__y <= 0x1.0p30))) |
373 | { |
374 | // x - y * pi/2, y uses no more than 29 mantissa bits |
375 | __r._M_x -= __y * 0x1.921FB40000000p0; |
376 | __r._M_x -= __y * 0x1.4442D00000000p-24; |
377 | __r._M_x -= __y * 0x1.8469898CC5170p-48; |
378 | } |
379 | else |
380 | { |
381 | // x - y * pi/2, y may require all mantissa bits |
382 | const _V __y_hi = __zero_low_bits<26>(__y); |
383 | const _V __y_lo = __y - __y_hi; |
384 | const auto __pi_2_1 = 0x1.921FB50000000p0; |
385 | const auto __pi_2_2 = 0x1.110B460000000p-26; |
386 | const auto __pi_2_3 = 0x1.1A62630000000p-54; |
387 | const auto __pi_2_4 = 0x1.8A2E03707344Ap-81; |
388 | __r._M_x = __r._M_x - __y_hi * __pi_2_1 |
389 | - max(__y_hi * __pi_2_2, __y_lo * __pi_2_1) |
390 | - min(__y_hi * __pi_2_2, __y_lo * __pi_2_1) |
391 | - max(__y_hi * __pi_2_3, __y_lo * __pi_2_2) |
392 | - min(__y_hi * __pi_2_3, __y_lo * __pi_2_2) |
393 | - max(__y * __pi_2_4, __y_lo * __pi_2_3) |
394 | - min(__y * __pi_2_4, __y_lo * __pi_2_3); |
395 | } |
396 | return __r; |
397 | } |
398 | |
399 | // }}} |
400 | // __extract_exponent_as_int {{{ |
401 | template <typename _Tp, typename _Abi> |
402 | rebind_simd_t<int, simd<_Tp, _Abi>> |
__extract_exponent_as_int(const simd<_Tp, _Abi>& __v)
404 | { |
405 | using _Vp = simd<_Tp, _Abi>; |
406 | using _Up = make_unsigned_t<__int_for_sizeof_t<_Tp>>; |
407 | using namespace std::experimental::__float_bitwise_operators; |
408 | const _Vp __exponent_mask |
409 | = __infinity_v<_Tp>; // 0x7f800000 or 0x7ff0000000000000 |
410 | return static_simd_cast<rebind_simd_t<int, _Vp>>( |
411 | __bit_cast<rebind_simd_t<_Up, _Vp>>(__v & __exponent_mask) |
412 | >> (__digits_v<_Tp> - 1)); |
413 | } |
414 | |
415 | // }}} |
416 | // __impl_or_fallback {{{ |
template <typename _ImplFun, typename _FallbackFun, typename... _Args>
_GLIBCXX_SIMD_INTRINSIC auto
__impl_or_fallback_dispatch(int, _ImplFun&& __impl_fun, _FallbackFun&&,
                            _Args&&... __args)
-> decltype(__impl_fun(static_cast<_Args&&>(__args)...))
{ return __impl_fun(static_cast<_Args&&>(__args)...); }
423 | |
template <typename _ImplFun, typename _FallbackFun, typename... _Args>
inline auto
__impl_or_fallback_dispatch(float, _ImplFun&&, _FallbackFun&& __fallback_fun,
                            _Args&&... __args)
-> decltype(__fallback_fun(static_cast<_Args&&>(__args)...))
{ return __fallback_fun(static_cast<_Args&&>(__args)...); }
430 | |
431 | template <typename... _Args> |
432 | _GLIBCXX_SIMD_INTRINSIC auto |
433 | __impl_or_fallback(_Args&&... __args) |
434 | { |
435 | return __impl_or_fallback_dispatch(int(), static_cast<_Args&&>(__args)...); |
436 | } |
437 | //}}} |
438 | |
439 | // trigonometric functions {{{ |
440 | _GLIBCXX_SIMD_MATH_CALL_(acos) |
441 | _GLIBCXX_SIMD_MATH_CALL_(asin) |
442 | _GLIBCXX_SIMD_MATH_CALL_(atan) |
443 | _GLIBCXX_SIMD_MATH_CALL2_(atan2, _Tp) |
444 | |
445 | /* |
446 | * algorithm for sine and cosine: |
447 | * |
 * The result can be calculated with sine or cosine depending on the π/4
 * section the input is in:
 *   sine   ≈ __x + __x³
 *   cosine ≈ 1 - __x²
450 | * |
451 | * sine: |
452 | * Map -__x to __x and invert the output |
453 | * Extend precision of __x - n * π/4 by calculating |
454 | * ((__x - n * p1) - n * p2) - n * p3 (p1 + p2 + p3 = π/4) |
455 | * |
456 | * Calculate Taylor series with tuned coefficients. |
457 | * Fix sign. |
458 | */ |
459 | // cos{{{ |
460 | template <typename _Tp, typename _Abi> |
461 | enable_if_t<is_floating_point_v<_Tp>, simd<_Tp, _Abi>> |
462 | cos(const simd<_Tp, _Abi>& __x) |
463 | { |
464 | using _V = simd<_Tp, _Abi>; |
465 | if constexpr (__is_scalar_abi<_Abi>() || __is_fixed_size_abi_v<_Abi>) |
466 | return {__private_init, _Abi::_SimdImpl::_S_cos(__data(__x))}; |
467 | else |
468 | { |
469 | if constexpr (is_same_v<_Tp, float>) |
470 | if (_GLIBCXX_SIMD_IS_UNLIKELY(any_of(abs(__x) >= 393382))) |
471 | return static_simd_cast<_V>( |
472 | cos(static_simd_cast<rebind_simd_t<double, _V>>(__x))); |
473 | |
474 | const auto __f = __fold_input(__x); |
475 | // quadrant | effect |
476 | // 0 | cosSeries, + |
477 | // 1 | sinSeries, - |
478 | // 2 | cosSeries, - |
479 | // 3 | sinSeries, + |
480 | using namespace std::experimental::__float_bitwise_operators; |
481 | const _V __sign_flip |
482 | = _V(-0.f) & static_simd_cast<_V>((1 + __f._M_quadrant) << 30); |
483 | |
484 | const auto __need_cos = (__f._M_quadrant & 1) == 0; |
485 | if (_GLIBCXX_SIMD_IS_UNLIKELY(all_of(__need_cos))) |
486 | return __sign_flip ^ __cosSeries(__f._M_x); |
487 | else if (_GLIBCXX_SIMD_IS_UNLIKELY(none_of(__need_cos))) |
488 | return __sign_flip ^ __sinSeries(__f._M_x); |
489 | else // some_of(__need_cos) |
490 | { |
491 | _V __r = __sinSeries(__f._M_x); |
492 | where(__need_cos.__cvt(), __r) = __cosSeries(__f._M_x); |
493 | return __r ^ __sign_flip; |
494 | } |
495 | } |
496 | } |
497 | |
498 | template <typename _Tp> |
499 | _GLIBCXX_SIMD_ALWAYS_INLINE |
500 | enable_if_t<is_floating_point<_Tp>::value, simd<_Tp, simd_abi::scalar>> |
501 | cos(simd<_Tp, simd_abi::scalar> __x) |
502 | { return std::cos(__data(__x)); } |
503 | |
504 | //}}} |
505 | // sin{{{ |
506 | template <typename _Tp, typename _Abi> |
507 | enable_if_t<is_floating_point_v<_Tp>, simd<_Tp, _Abi>> |
508 | sin(const simd<_Tp, _Abi>& __x) |
509 | { |
510 | using _V = simd<_Tp, _Abi>; |
511 | if constexpr (__is_scalar_abi<_Abi>() || __is_fixed_size_abi_v<_Abi>) |
512 | return {__private_init, _Abi::_SimdImpl::_S_sin(__data(__x))}; |
513 | else |
514 | { |
515 | if constexpr (is_same_v<_Tp, float>) |
516 | if (_GLIBCXX_SIMD_IS_UNLIKELY(any_of(abs(__x) >= 527449))) |
517 | return static_simd_cast<_V>( |
518 | sin(static_simd_cast<rebind_simd_t<double, _V>>(__x))); |
519 | |
520 | const auto __f = __fold_input(__x); |
521 | // quadrant | effect |
522 | // 0 | sinSeries |
523 | // 1 | cosSeries |
524 | // 2 | sinSeries, sign flip |
525 | // 3 | cosSeries, sign flip |
526 | using namespace std::experimental::__float_bitwise_operators; |
527 | const auto __sign_flip |
528 | = (__x ^ static_simd_cast<_V>(1 - __f._M_quadrant)) & _V(_Tp(-0.)); |
529 | |
530 | const auto __need_sin = (__f._M_quadrant & 1) == 0; |
531 | if (_GLIBCXX_SIMD_IS_UNLIKELY(all_of(__need_sin))) |
532 | return __sign_flip ^ __sinSeries(__f._M_x); |
533 | else if (_GLIBCXX_SIMD_IS_UNLIKELY(none_of(__need_sin))) |
534 | return __sign_flip ^ __cosSeries(__f._M_x); |
535 | else // some_of(__need_sin) |
536 | { |
537 | _V __r = __cosSeries(__f._M_x); |
538 | where(__need_sin.__cvt(), __r) = __sinSeries(__f._M_x); |
539 | return __sign_flip ^ __r; |
540 | } |
541 | } |
542 | } |
543 | |
544 | template <typename _Tp> |
545 | _GLIBCXX_SIMD_ALWAYS_INLINE |
546 | enable_if_t<is_floating_point<_Tp>::value, simd<_Tp, simd_abi::scalar>> |
547 | sin(simd<_Tp, simd_abi::scalar> __x) |
548 | { return std::sin(__data(__x)); } |
549 | |
550 | //}}} |
551 | _GLIBCXX_SIMD_MATH_CALL_(tan) |
552 | _GLIBCXX_SIMD_MATH_CALL_(acosh) |
553 | _GLIBCXX_SIMD_MATH_CALL_(asinh) |
554 | _GLIBCXX_SIMD_MATH_CALL_(atanh) |
555 | _GLIBCXX_SIMD_MATH_CALL_(cosh) |
556 | _GLIBCXX_SIMD_MATH_CALL_(sinh) |
557 | _GLIBCXX_SIMD_MATH_CALL_(tanh) |
558 | // }}} |
559 | // exponential functions {{{ |
560 | _GLIBCXX_SIMD_MATH_CALL_(exp) |
561 | _GLIBCXX_SIMD_MATH_CALL_(exp2) |
562 | _GLIBCXX_SIMD_MATH_CALL_(expm1) |
563 | |
564 | // }}} |
565 | // frexp {{{ |
566 | #if _GLIBCXX_SIMD_X86INTRIN |
567 | template <typename _Tp, size_t _Np> |
568 | _SimdWrapper<_Tp, _Np> |
569 | __getexp(_SimdWrapper<_Tp, _Np> __x) |
570 | { |
571 | if constexpr (__have_avx512vl && __is_sse_ps<_Tp, _Np>()) |
572 | return __auto_bitcast(_mm_getexp_ps(__to_intrin(__x))); |
573 | else if constexpr (__have_avx512f && __is_sse_ps<_Tp, _Np>()) |
574 | return __auto_bitcast(_mm512_getexp_ps(__auto_bitcast(__to_intrin(__x)))); |
575 | else if constexpr (__have_avx512vl && __is_sse_pd<_Tp, _Np>()) |
576 | return _mm_getexp_pd(__x); |
577 | else if constexpr (__have_avx512f && __is_sse_pd<_Tp, _Np>()) |
578 | return __lo128(_mm512_getexp_pd(__auto_bitcast(__x))); |
579 | else if constexpr (__have_avx512vl && __is_avx_ps<_Tp, _Np>()) |
580 | return _mm256_getexp_ps(__x); |
581 | else if constexpr (__have_avx512f && __is_avx_ps<_Tp, _Np>()) |
582 | return __lo256(_mm512_getexp_ps(__auto_bitcast(__x))); |
583 | else if constexpr (__have_avx512vl && __is_avx_pd<_Tp, _Np>()) |
584 | return _mm256_getexp_pd(__x); |
585 | else if constexpr (__have_avx512f && __is_avx_pd<_Tp, _Np>()) |
586 | return __lo256(_mm512_getexp_pd(__auto_bitcast(__x))); |
587 | else if constexpr (__is_avx512_ps<_Tp, _Np>()) |
588 | return _mm512_getexp_ps(__x); |
589 | else if constexpr (__is_avx512_pd<_Tp, _Np>()) |
590 | return _mm512_getexp_pd(__x); |
591 | else |
592 | __assert_unreachable<_Tp>(); |
593 | } |
594 | |
595 | template <typename _Tp, size_t _Np> |
596 | _SimdWrapper<_Tp, _Np> |
597 | __getmant_avx512(_SimdWrapper<_Tp, _Np> __x) |
598 | { |
599 | if constexpr (__have_avx512vl && __is_sse_ps<_Tp, _Np>()) |
600 | return __auto_bitcast(_mm_getmant_ps(__to_intrin(__x), _MM_MANT_NORM_p5_1, |
601 | _MM_MANT_SIGN_src)); |
602 | else if constexpr (__have_avx512f && __is_sse_ps<_Tp, _Np>()) |
603 | return __auto_bitcast(_mm512_getmant_ps(__auto_bitcast(__to_intrin(__x)), |
604 | _MM_MANT_NORM_p5_1, |
605 | _MM_MANT_SIGN_src)); |
606 | else if constexpr (__have_avx512vl && __is_sse_pd<_Tp, _Np>()) |
607 | return _mm_getmant_pd(__x, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src); |
608 | else if constexpr (__have_avx512f && __is_sse_pd<_Tp, _Np>()) |
609 | return __lo128(_mm512_getmant_pd(__auto_bitcast(__x), _MM_MANT_NORM_p5_1, |
610 | _MM_MANT_SIGN_src)); |
611 | else if constexpr (__have_avx512vl && __is_avx_ps<_Tp, _Np>()) |
612 | return _mm256_getmant_ps(__x, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src); |
613 | else if constexpr (__have_avx512f && __is_avx_ps<_Tp, _Np>()) |
614 | return __lo256(_mm512_getmant_ps(__auto_bitcast(__x), _MM_MANT_NORM_p5_1, |
615 | _MM_MANT_SIGN_src)); |
616 | else if constexpr (__have_avx512vl && __is_avx_pd<_Tp, _Np>()) |
617 | return _mm256_getmant_pd(__x, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src); |
618 | else if constexpr (__have_avx512f && __is_avx_pd<_Tp, _Np>()) |
619 | return __lo256(_mm512_getmant_pd(__auto_bitcast(__x), _MM_MANT_NORM_p5_1, |
620 | _MM_MANT_SIGN_src)); |
621 | else if constexpr (__is_avx512_ps<_Tp, _Np>()) |
622 | return _mm512_getmant_ps(__x, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src); |
623 | else if constexpr (__is_avx512_pd<_Tp, _Np>()) |
624 | return _mm512_getmant_pd(__x, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src); |
625 | else |
626 | __assert_unreachable<_Tp>(); |
627 | } |
628 | #endif // _GLIBCXX_SIMD_X86INTRIN |
629 | |
630 | /** |
631 | * splits @p __v into exponent and mantissa, the sign is kept with the mantissa |
632 | * |
633 | * The return value will be in the range [0.5, 1.0[ |
634 | * The @p __e value will be an integer defining the power-of-two exponent |
635 | */ |
636 | template <typename _Tp, typename _Abi> |
637 | enable_if_t<is_floating_point_v<_Tp>, simd<_Tp, _Abi>> |
638 | frexp(const simd<_Tp, _Abi>& __x, _Samesize<int, simd<_Tp, _Abi>>* __exp) |
639 | { |
640 | if constexpr (simd_size_v<_Tp, _Abi> == 1) |
641 | { |
642 | int __tmp; |
643 | const auto __r = std::frexp(__x[0], &__tmp); |
644 | (*__exp)[0] = __tmp; |
645 | return __r; |
646 | } |
647 | else if constexpr (__is_fixed_size_abi_v<_Abi>) |
648 | { |
649 | return {__private_init, |
650 | _Abi::_SimdImpl::_S_frexp(__data(__x), __data(*__exp))}; |
651 | #if _GLIBCXX_SIMD_X86INTRIN |
652 | } |
653 | else if constexpr (__have_avx512f) |
654 | { |
655 | constexpr size_t _Np = simd_size_v<_Tp, _Abi>; |
656 | constexpr size_t _NI = _Np < 4 ? 4 : _Np; |
657 | const auto __v = __data(__x); |
658 | const auto __isnonzero |
659 | = _Abi::_SimdImpl::_S_isnonzerovalue_mask(__v._M_data); |
660 | const _SimdWrapper<int, _NI> __exp_plus1 |
661 | = 1 + __convert<_SimdWrapper<int, _NI>>(__getexp(__v))._M_data; |
662 | const _SimdWrapper<int, _Np> __e = __wrapper_bitcast<int, _Np>( |
663 | _Abi::_CommonImpl::_S_blend(_SimdWrapper<bool, _NI>(__isnonzero), |
664 | _SimdWrapper<int, _NI>(), __exp_plus1)); |
665 | simd_abi::deduce_t<int, _Np>::_CommonImpl::_S_store(__e, __exp); |
666 | return {__private_init, |
667 | _Abi::_CommonImpl::_S_blend(_SimdWrapper<bool, _Np>( |
668 | __isnonzero), |
669 | __v, __getmant_avx512(__v))}; |
670 | #endif // _GLIBCXX_SIMD_X86INTRIN |
671 | } |
672 | else |
673 | { |
674 | // fallback implementation |
675 | static_assert(sizeof(_Tp) == 4 || sizeof(_Tp) == 8); |
676 | using _V = simd<_Tp, _Abi>; |
677 | using _IV = rebind_simd_t<int, _V>; |
678 | using namespace std::experimental::__proposed; |
679 | using namespace std::experimental::__float_bitwise_operators; |
680 | |
681 | constexpr int __exp_adjust = sizeof(_Tp) == 4 ? 0x7e : 0x3fe; |
682 | constexpr int __exp_offset = sizeof(_Tp) == 4 ? 0x70 : 0x200; |
683 | constexpr _Tp __subnorm_scale = sizeof(_Tp) == 4 ? 0x1p112 : 0x1p512; |
684 | _GLIBCXX_SIMD_USE_CONSTEXPR_API _V __exponent_mask |
685 | = __infinity_v<_Tp>; // 0x7f800000 or 0x7ff0000000000000 |
686 | _GLIBCXX_SIMD_USE_CONSTEXPR_API _V __p5_1_exponent |
687 | = -(2 - __epsilon_v<_Tp>) / 2; // 0xbf7fffff or 0xbfefffffffffffff |
688 | |
689 | _V __mant = __p5_1_exponent & (__exponent_mask | __x); // +/-[.5, 1) |
690 | const _IV __exponent_bits = __extract_exponent_as_int(__x); |
691 | if (_GLIBCXX_SIMD_IS_LIKELY(all_of(isnormal(__x)))) |
692 | { |
693 | *__exp |
694 | = simd_cast<_Samesize<int, _V>>(__exponent_bits - __exp_adjust); |
695 | return __mant; |
696 | } |
697 | |
698 | #if __FINITE_MATH_ONLY__ |
699 | // at least one element of __x is 0 or subnormal, the rest is normal |
700 | // (inf and NaN are excluded by -ffinite-math-only) |
701 | const auto __iszero_inf_nan = __x == 0; |
702 | #else |
703 | const auto __as_int |
704 | = __bit_cast<rebind_simd_t<__int_for_sizeof_t<_Tp>, _V>>(abs(__x)); |
705 | const auto __inf |
706 | = __bit_cast<rebind_simd_t<__int_for_sizeof_t<_Tp>, _V>>( |
707 | _V(__infinity_v<_Tp>)); |
708 | const auto __iszero_inf_nan = static_simd_cast<typename _V::mask_type>( |
709 | __as_int == 0 || __as_int >= __inf); |
710 | #endif |
711 | |
712 | const _V __scaled_subnormal = __x * __subnorm_scale; |
713 | const _V __mant_subnormal |
714 | = __p5_1_exponent & (__exponent_mask | __scaled_subnormal); |
715 | where(!isnormal(__x), __mant) = __mant_subnormal; |
716 | where(__iszero_inf_nan, __mant) = __x; |
717 | _IV __e = __extract_exponent_as_int(__scaled_subnormal); |
718 | using _MaskType = |
719 | typename conditional_t<sizeof(typename _V::value_type) == sizeof(int), |
720 | _V, _IV>::mask_type; |
721 | const _MaskType __value_isnormal = isnormal(__x).__cvt(); |
722 | where(__value_isnormal.__cvt(), __e) = __exponent_bits; |
723 | static_assert(sizeof(_IV) == sizeof(__value_isnormal)); |
724 | const _IV __offset |
725 | = (__bit_cast<_IV>(__value_isnormal) & _IV(__exp_adjust)) |
726 | | (__bit_cast<_IV>(static_simd_cast<_MaskType>(__exponent_bits == 0) |
727 | & static_simd_cast<_MaskType>(__x != 0)) |
728 | & _IV(__exp_adjust + __exp_offset)); |
729 | *__exp = simd_cast<_Samesize<int, _V>>(__e - __offset); |
730 | return __mant; |
731 | } |
732 | } |
733 | |
734 | // }}} |
735 | _GLIBCXX_SIMD_MATH_CALL2_(ldexp, int) |
736 | _GLIBCXX_SIMD_MATH_CALL_(ilogb) |
737 | |
738 | // logarithms {{{ |
739 | _GLIBCXX_SIMD_MATH_CALL_(log) |
740 | _GLIBCXX_SIMD_MATH_CALL_(log10) |
741 | _GLIBCXX_SIMD_MATH_CALL_(log1p) |
742 | _GLIBCXX_SIMD_MATH_CALL_(log2) |
743 | |
744 | //}}} |
745 | // logb{{{ |
746 | template <typename _Tp, typename _Abi> |
747 | enable_if_t<is_floating_point<_Tp>::value, simd<_Tp, _Abi>> |
748 | logb(const simd<_Tp, _Abi>& __x) |
749 | { |
750 | constexpr size_t _Np = simd_size_v<_Tp, _Abi>; |
751 | if constexpr (_Np == 1) |
752 | return std::logb(__x[0]); |
753 | else if constexpr (__is_fixed_size_abi_v<_Abi>) |
754 | { |
755 | return {__private_init, |
756 | __data(__x)._M_apply_per_chunk([](auto __impl, auto __xx) { |
757 | using _V = typename decltype(__impl)::simd_type; |
758 | return __data( |
759 | std::experimental::logb(_V(__private_init, __xx))); |
760 | })}; |
761 | } |
762 | #if _GLIBCXX_SIMD_X86INTRIN // {{{ |
763 | else if constexpr (__have_avx512vl && __is_sse_ps<_Tp, _Np>()) |
764 | return {__private_init, |
765 | __auto_bitcast(_mm_getexp_ps(__to_intrin(__as_vector(__x))))}; |
766 | else if constexpr (__have_avx512vl && __is_sse_pd<_Tp, _Np>()) |
767 | return {__private_init, _mm_getexp_pd(__data(__x))}; |
768 | else if constexpr (__have_avx512vl && __is_avx_ps<_Tp, _Np>()) |
769 | return {__private_init, _mm256_getexp_ps(__data(__x))}; |
770 | else if constexpr (__have_avx512vl && __is_avx_pd<_Tp, _Np>()) |
771 | return {__private_init, _mm256_getexp_pd(__data(__x))}; |
772 | else if constexpr (__have_avx512f && __is_avx_ps<_Tp, _Np>()) |
773 | return {__private_init, |
774 | __lo256(_mm512_getexp_ps(__auto_bitcast(__data(__x))))}; |
775 | else if constexpr (__have_avx512f && __is_avx_pd<_Tp, _Np>()) |
776 | return {__private_init, |
777 | __lo256(_mm512_getexp_pd(__auto_bitcast(__data(__x))))}; |
778 | else if constexpr (__is_avx512_ps<_Tp, _Np>()) |
779 | return {__private_init, _mm512_getexp_ps(__data(__x))}; |
780 | else if constexpr (__is_avx512_pd<_Tp, _Np>()) |
781 | return {__private_init, _mm512_getexp_pd(__data(__x))}; |
782 | #endif // _GLIBCXX_SIMD_X86INTRIN }}} |
783 | else |
784 | { |
785 | using _V = simd<_Tp, _Abi>; |
786 | using namespace std::experimental::__proposed; |
787 | auto __is_normal = isnormal(__x); |
788 | |
789 | // work on abs(__x) to reflect the return value on Linux for negative |
790 | // inputs (domain-error => implementation-defined value is returned) |
791 | const _V abs_x = abs(__x); |
792 | |
793 | // __exponent(__x) returns the exponent value (bias removed) as |
794 | // simd<_Up> with integral _Up |
795 | auto&& __exponent = [](const _V& __v) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
796 | using namespace std::experimental::__proposed; |
797 | using _IV = rebind_simd_t< |
798 | conditional_t<sizeof(_Tp) == sizeof(_LLong), _LLong, int>, _V>; |
799 | return (__bit_cast<_IV>(__v) >> (__digits_v<_Tp> - 1)) |
800 | - (__max_exponent_v<_Tp> - 1); |
801 | }; |
802 | _V __r = static_simd_cast<_V>(__exponent(abs_x)); |
803 | if (_GLIBCXX_SIMD_IS_LIKELY(all_of(__is_normal))) |
804 | // without corner cases (nan, inf, subnormal, zero) we have our |
805 | // answer: |
806 | return __r; |
807 | const auto __is_zero = __x == 0; |
808 | const auto __is_nan = isnan(__x); |
809 | const auto __is_inf = isinf(__x); |
810 | where(__is_zero, __r) = -__infinity_v<_Tp>; |
811 | where(__is_nan, __r) = __x; |
812 | where(__is_inf, __r) = __infinity_v<_Tp>; |
813 | __is_normal |= __is_zero || __is_nan || __is_inf; |
814 | if (all_of(__is_normal)) |
815 | // at this point everything but subnormals is handled |
816 | return __r; |
817 | // subnormals repeat the exponent extraction after multiplication of the |
// input with a floating point value that has 112 (0x70) in its exponent
819 | // (not too big for sp and large enough for dp) |
820 | const _V __scaled = abs_x * _Tp(0x1p112); |
821 | _V __scaled_exp = static_simd_cast<_V>(__exponent(__scaled) - 112); |
822 | where(__is_normal, __scaled_exp) = __r; |
823 | return __scaled_exp; |
824 | } |
825 | } |
826 | |
827 | //}}} |
828 | template <typename _Tp, typename _Abi> |
829 | enable_if_t<is_floating_point_v<_Tp>, simd<_Tp, _Abi>> |
830 | modf(const simd<_Tp, _Abi>& __x, simd<_Tp, _Abi>* __iptr) |
831 | { |
832 | if constexpr (__is_scalar_abi<_Abi>() |
833 | || (__is_fixed_size_abi_v< |
834 | _Abi> && simd_size_v<_Tp, _Abi> == 1)) |
835 | { |
836 | _Tp __tmp; |
837 | _Tp __r = std::modf(__x[0], &__tmp); |
838 | __iptr[0] = __tmp; |
839 | return __r; |
840 | } |
841 | else |
842 | { |
843 | const auto __integral = trunc(__x); |
844 | *__iptr = __integral; |
845 | auto __r = __x - __integral; |
846 | #if !__FINITE_MATH_ONLY__ |
847 | where(isinf(__x), __r) = _Tp(); |
848 | #endif |
849 | return copysign(__r, __x); |
850 | } |
851 | } |
852 | |
853 | _GLIBCXX_SIMD_MATH_CALL2_(scalbn, int) |
854 | _GLIBCXX_SIMD_MATH_CALL2_(scalbln, long) |
855 | |
856 | _GLIBCXX_SIMD_MATH_CALL_(cbrt) |
857 | |
858 | _GLIBCXX_SIMD_MATH_CALL_(abs) |
859 | _GLIBCXX_SIMD_MATH_CALL_(fabs) |
860 | |
861 | // [parallel.simd.math] only asks for is_floating_point_v<_Tp> and forgot to |
862 | // allow signed integral _Tp |
863 | template <typename _Tp, typename _Abi> |
864 | enable_if_t<!is_floating_point_v<_Tp> && is_signed_v<_Tp>, simd<_Tp, _Abi>> |
865 | abs(const simd<_Tp, _Abi>& __x) |
866 | { return {__private_init, _Abi::_SimdImpl::_S_abs(__data(__x))}; } |
867 | |
868 | template <typename _Tp, typename _Abi> |
869 | enable_if_t<!is_floating_point_v<_Tp> && is_signed_v<_Tp>, simd<_Tp, _Abi>> |
870 | fabs(const simd<_Tp, _Abi>& __x) |
871 | { return {__private_init, _Abi::_SimdImpl::_S_abs(__data(__x))}; } |
872 | |
873 | // the following are overloads for functions in <cstdlib> and not covered by |
874 | // [parallel.simd.math]. I don't see much value in making them work, though |
875 | /* |
876 | template <typename _Abi> simd<long, _Abi> labs(const simd<long, _Abi> &__x) |
877 | { return {__private_init, _Abi::_SimdImpl::abs(__data(__x))}; } |
878 | |
879 | template <typename _Abi> simd<long long, _Abi> llabs(const simd<long long, _Abi> |
880 | &__x) |
881 | { return {__private_init, _Abi::_SimdImpl::abs(__data(__x))}; } |
882 | */ |
883 | |
884 | #define _GLIBCXX_SIMD_CVTING2(_NAME) \ |
885 | template <typename _Tp, typename _Abi> \ |
886 | _GLIBCXX_SIMD_INTRINSIC simd<_Tp, _Abi> _NAME( \ |
887 | const simd<_Tp, _Abi>& __x, const __type_identity_t<simd<_Tp, _Abi>>& __y) \ |
888 | { \ |
889 | return _NAME(__x, __y); \ |
890 | } \ |
891 | \ |
892 | template <typename _Tp, typename _Abi> \ |
893 | _GLIBCXX_SIMD_INTRINSIC simd<_Tp, _Abi> _NAME( \ |
894 | const __type_identity_t<simd<_Tp, _Abi>>& __x, const simd<_Tp, _Abi>& __y) \ |
895 | { \ |
896 | return _NAME(__x, __y); \ |
897 | } |
898 | |
899 | #define _GLIBCXX_SIMD_CVTING3(_NAME) \ |
900 | template <typename _Tp, typename _Abi> \ |
901 | _GLIBCXX_SIMD_INTRINSIC simd<_Tp, _Abi> _NAME( \ |
902 | const __type_identity_t<simd<_Tp, _Abi>>& __x, const simd<_Tp, _Abi>& __y, \ |
903 | const simd<_Tp, _Abi>& __z) \ |
904 | { \ |
905 | return _NAME(__x, __y, __z); \ |
906 | } \ |
907 | \ |
908 | template <typename _Tp, typename _Abi> \ |
909 | _GLIBCXX_SIMD_INTRINSIC simd<_Tp, _Abi> _NAME( \ |
910 | const simd<_Tp, _Abi>& __x, const __type_identity_t<simd<_Tp, _Abi>>& __y, \ |
911 | const simd<_Tp, _Abi>& __z) \ |
912 | { \ |
913 | return _NAME(__x, __y, __z); \ |
914 | } \ |
915 | \ |
916 | template <typename _Tp, typename _Abi> \ |
917 | _GLIBCXX_SIMD_INTRINSIC simd<_Tp, _Abi> _NAME( \ |
918 | const simd<_Tp, _Abi>& __x, const simd<_Tp, _Abi>& __y, \ |
919 | const __type_identity_t<simd<_Tp, _Abi>>& __z) \ |
920 | { \ |
921 | return _NAME(__x, __y, __z); \ |
922 | } \ |
923 | \ |
924 | template <typename _Tp, typename _Abi> \ |
925 | _GLIBCXX_SIMD_INTRINSIC simd<_Tp, _Abi> _NAME( \ |
926 | const simd<_Tp, _Abi>& __x, const __type_identity_t<simd<_Tp, _Abi>>& __y, \ |
927 | const __type_identity_t<simd<_Tp, _Abi>>& __z) \ |
928 | { \ |
929 | return _NAME(__x, __y, __z); \ |
930 | } \ |
931 | \ |
932 | template <typename _Tp, typename _Abi> \ |
933 | _GLIBCXX_SIMD_INTRINSIC simd<_Tp, _Abi> _NAME( \ |
934 | const __type_identity_t<simd<_Tp, _Abi>>& __x, const simd<_Tp, _Abi>& __y, \ |
935 | const __type_identity_t<simd<_Tp, _Abi>>& __z) \ |
936 | { \ |
937 | return _NAME(__x, __y, __z); \ |
938 | } \ |
939 | \ |
940 | template <typename _Tp, typename _Abi> \ |
941 | _GLIBCXX_SIMD_INTRINSIC simd<_Tp, _Abi> _NAME( \ |
942 | const __type_identity_t<simd<_Tp, _Abi>>& __x, \ |
943 | const __type_identity_t<simd<_Tp, _Abi>>& __y, const simd<_Tp, _Abi>& __z) \ |
944 | { \ |
945 | return _NAME(__x, __y, __z); \ |
946 | } |
947 | |
948 | template <typename _R, typename _ToApply, typename _Tp, typename... _Tps> |
949 | _GLIBCXX_SIMD_INTRINSIC _R |
950 | __fixed_size_apply(_ToApply&& __apply, const _Tp& __arg0, |
951 | const _Tps&... __args) |
952 | { |
953 | return {__private_init, |
954 | __data(__arg0)._M_apply_per_chunk( |
955 | [&](auto __impl, const auto&... __inner) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
956 | using _V = typename decltype(__impl)::simd_type; |
957 | return __data(__apply(_V(__private_init, __inner)...)); |
958 | }, |
959 | __data(__args)...)}; |
960 | } |
961 | |
962 | template <typename _VV> |
963 | __remove_cvref_t<_VV> |
964 | __hypot(_VV __x, _VV __y) |
965 | { |
966 | using _V = __remove_cvref_t<_VV>; |
967 | using _Tp = typename _V::value_type; |
968 | if constexpr (_V::size() == 1) |
969 | return std::hypot(_Tp(__x[0]), _Tp(__y[0])); |
970 | else if constexpr (__is_fixed_size_abi_v<typename _V::abi_type>) |
971 | { |
972 | return __fixed_size_apply<_V>([](auto __a, |
973 | auto __b) { return hypot(__a, __b); }, |
974 | __x, __y); |
975 | } |
976 | else |
977 | { |
978 | // A simple solution for _Tp == float would be to cast to double and |
979 | // simply calculate sqrt(x²+y²) as it can't over-/underflow anymore with |
980 | // dp. It still needs the Annex F fixups though and isn't faster on |
981 | // Skylake-AVX512 (not even for SSE and AVX vectors, and really bad for |
982 | // AVX-512). |
983 | using namespace __float_bitwise_operators; |
984 | _V __absx = abs(__x); // no error |
985 | _V __absy = abs(__y); // no error |
986 | _V __hi = max(__absx, __absy); // no error |
987 | _V __lo = min(__absy, __absx); // no error |
988 | |
989 | // round __hi down to the next power-of-2: |
990 | _GLIBCXX_SIMD_USE_CONSTEXPR_API _V __inf(__infinity_v<_Tp>); |
991 | |
992 | #ifndef __FAST_MATH__ |
993 | if constexpr (__have_neon && !__have_neon_a32) |
{ // With ARMv7 NEON, we have no subnormals and must use a slightly
  // different strategy.
996 | const _V __hi_exp = __hi & __inf; |
997 | _V __scale_back = __hi_exp; |
998 | // For large exponents (max & max/2) the inversion comes too close |
999 | // to subnormals. Subtract 3 from the exponent: |
1000 | where(__hi_exp > 1, __scale_back) = __hi_exp * _Tp(0.125); |
1001 | // Invert and adjust for the off-by-one error of inversion via xor: |
1002 | const _V __scale = (__scale_back ^ __inf) * _Tp(.5); |
1003 | const _V __h1 = __hi * __scale; |
1004 | const _V __l1 = __lo * __scale; |
1005 | _V __r = __scale_back * sqrt(__h1 * __h1 + __l1 * __l1); |
1006 | // Fix up hypot(0, 0) to not be NaN: |
1007 | where(__hi == 0, __r) = 0; |
1008 | return __r; |
1009 | } |
1010 | #endif |
1011 | |
1012 | #ifdef __FAST_MATH__ |
1013 | // With fast-math, ignore precision of subnormals and inputs from |
1014 | // __finite_max_v/2 to __finite_max_v. This removes all |
1015 | // branching/masking. |
1016 | if constexpr (true) |
1017 | #else |
1018 | if (_GLIBCXX_SIMD_IS_LIKELY(all_of(isnormal(__x)) |
1019 | && all_of(isnormal(__y)))) |
1020 | #endif |
1021 | { |
1022 | const _V __hi_exp = __hi & __inf; |
// ((__hi + __hi) & __inf) ^ __inf almost works for computing __scale,
// except when (__hi + __hi) & __inf == __inf, in which case __scale
// becomes 0 (should be min/2 instead) and thus loses the information
// from __lo.
1028 | #ifdef __FAST_MATH__ |
1029 | using _Ip = __int_for_sizeof_t<_Tp>; |
1030 | using _IV = rebind_simd_t<_Ip, _V>; |
1031 | const auto __as_int = __bit_cast<_IV>(__hi_exp); |
1032 | const _V __scale |
1033 | = __bit_cast<_V>(2 * __bit_cast<_Ip>(_Tp(1)) - __as_int); |
1034 | #else |
1035 | const _V __scale = (__hi_exp ^ __inf) * _Tp(.5); |
1036 | #endif |
1037 | _GLIBCXX_SIMD_USE_CONSTEXPR_API _V __mant_mask |
1038 | = __norm_min_v<_Tp> - __denorm_min_v<_Tp>; |
1039 | const _V __h1 = (__hi & __mant_mask) | _V(1); |
1040 | const _V __l1 = __lo * __scale; |
1041 | return __hi_exp * sqrt(__h1 * __h1 + __l1 * __l1); |
1042 | } |
1043 | else |
1044 | { |
1045 | // slower path to support subnormals |
1046 | // if __hi is subnormal, avoid scaling by inf & final mul by 0 |
1047 | // (which yields NaN) by using min() |
1048 | _V __scale = _V(1 / __norm_min_v<_Tp>); |
1049 | // invert exponent w/o error and w/o using the slow divider unit: |
1050 | // xor inverts the exponent but off by 1. Multiplication with .5 |
1051 | // adjusts for the discrepancy. |
1052 | where(__hi >= __norm_min_v<_Tp>, __scale) |
1053 | = ((__hi & __inf) ^ __inf) * _Tp(.5); |
1054 | // adjust final exponent for subnormal inputs |
1055 | _V __hi_exp = __norm_min_v<_Tp>; |
1056 | where(__hi >= __norm_min_v<_Tp>, __hi_exp) |
1057 | = __hi & __inf; // no error |
1058 | _V __h1 = __hi * __scale; // no error |
1059 | _V __l1 = __lo * __scale; // no error |
1060 | |
1061 | // sqrt(x²+y²) = e*sqrt((x/e)²+(y/e)²): |
1062 | // this ensures no overflow in the argument to sqrt |
1063 | _V __r = __hi_exp * sqrt(__h1 * __h1 + __l1 * __l1); |
1064 | #ifdef __STDC_IEC_559__ |
1065 | // fixup for Annex F requirements |
1066 | // the naive fixup goes like this: |
1067 | // |
1068 | // where(__l1 == 0, __r) = __hi; |
1069 | // where(isunordered(__x, __y), __r) = __quiet_NaN_v<_Tp>; |
1070 | // where(isinf(__absx) || isinf(__absy), __r) = __inf; |
1071 | // |
1072 | // The fixup can be prepared in parallel with the sqrt, requiring a |
1073 | // single blend step after hi_exp * sqrt, reducing latency and |
1074 | // throughput: |
1075 | _V __fixup = __hi; // __lo == 0 |
1076 | where(isunordered(__x, __y), __fixup) = __quiet_NaN_v<_Tp>; |
1077 | where(isinf(__absx) || isinf(__absy), __fixup) = __inf; |
1078 | where(!(__lo == 0 || isunordered(__x, __y) |
1079 | || (isinf(__absx) || isinf(__absy))), |
1080 | __fixup) |
1081 | = __r; |
1082 | __r = __fixup; |
1083 | #endif |
1084 | return __r; |
1085 | } |
1086 | } |
1087 | } |
1088 | |
1089 | template <typename _Tp, typename _Abi> |
1090 | _GLIBCXX_SIMD_INTRINSIC simd<_Tp, _Abi> |
1091 | hypot(const simd<_Tp, _Abi>& __x, const simd<_Tp, _Abi>& __y) |
1092 | { |
1093 | return __hypot<conditional_t<__is_fixed_size_abi_v<_Abi>, |
1094 | const simd<_Tp, _Abi>&, simd<_Tp, _Abi>>>(__x, |
1095 | __y); |
1096 | } |
1097 | |
1098 | _GLIBCXX_SIMD_CVTING2(hypot) |
1099 | |
1100 | template <typename _VV> |
1101 | __remove_cvref_t<_VV> |
1102 | __hypot(_VV __x, _VV __y, _VV __z) |
1103 | { |
1104 | using _V = __remove_cvref_t<_VV>; |
1105 | using _Abi = typename _V::abi_type; |
1106 | using _Tp = typename _V::value_type; |
1107 | /* FIXME: enable after PR77776 is resolved |
1108 | if constexpr (_V::size() == 1) |
1109 | return std::hypot(_Tp(__x[0]), _Tp(__y[0]), _Tp(__z[0])); |
1110 | else |
1111 | */ |
1112 | if constexpr (__is_fixed_size_abi_v<_Abi> && _V::size() > 1) |
1113 | { |
1114 | return __fixed_size_apply<simd<_Tp, _Abi>>( |
1115 | [](auto __a, auto __b, auto __c) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
1116 | return hypot(__a, __b, __c); |
1117 | }, __x, __y, __z); |
1118 | } |
1119 | else |
1120 | { |
1121 | using namespace __float_bitwise_operators; |
1122 | const _V __absx = abs(__x); // no error |
1123 | const _V __absy = abs(__y); // no error |
1124 | const _V __absz = abs(__z); // no error |
1125 | _V __hi = max(max(__absx, __absy), __absz); // no error |
1126 | _V __l0 = min(__absz, max(__absx, __absy)); // no error |
1127 | _V __l1 = min(__absy, __absx); // no error |
1128 | if constexpr (__digits_v<_Tp> == 64 && __max_exponent_v<_Tp> == 0x4000 |
1129 | && __min_exponent_v<_Tp> == -0x3FFD && _V::size() == 1) |
{ // Seems like x87 fp80, where bit 63 is always 1 unless subnormal or
  // NaN. In this case the bit-tricks don't work; they require IEC559
  // binary32 or binary64 format.
1133 | #ifdef __STDC_IEC_559__ |
1134 | // fixup for Annex F requirements |
1135 | if (isinf(__absx[0]) || isinf(__absy[0]) || isinf(__absz[0])) |
1136 | return __infinity_v<_Tp>; |
1137 | else if (isunordered(__absx[0], __absy[0] + __absz[0])) |
1138 | return __quiet_NaN_v<_Tp>; |
1139 | else if (__l0[0] == 0 && __l1[0] == 0) |
1140 | return __hi; |
1141 | #endif |
1142 | _V __hi_exp = __hi; |
1143 | const _ULLong __tmp = 0x8000'0000'0000'0000ull; |
1144 | __builtin_memcpy(&__data(__hi_exp), &__tmp, 8); |
1145 | const _V __scale = 1 / __hi_exp; |
1146 | __hi *= __scale; |
1147 | __l0 *= __scale; |
1148 | __l1 *= __scale; |
1149 | return __hi_exp * sqrt((__l0 * __l0 + __l1 * __l1) + __hi * __hi); |
1150 | } |
1151 | else |
1152 | { |
1153 | // round __hi down to the next power-of-2: |
1154 | _GLIBCXX_SIMD_USE_CONSTEXPR_API _V __inf(__infinity_v<_Tp>); |
1155 | |
1156 | #ifndef __FAST_MATH__ |
1157 | if constexpr (_V::size() > 1 && __have_neon && !__have_neon_a32) |
{ // With ARMv7 NEON, we have no subnormals and must use a slightly
  // different strategy.
1160 | const _V __hi_exp = __hi & __inf; |
1161 | _V __scale_back = __hi_exp; |
1162 | // For large exponents (max & max/2) the inversion comes too |
1163 | // close to subnormals. Subtract 3 from the exponent: |
1164 | where(__hi_exp > 1, __scale_back) = __hi_exp * _Tp(0.125); |
1165 | // Invert and adjust for the off-by-one error of inversion via |
1166 | // xor: |
1167 | const _V __scale = (__scale_back ^ __inf) * _Tp(.5); |
1168 | const _V __h1 = __hi * __scale; |
1169 | __l0 *= __scale; |
1170 | __l1 *= __scale; |
1171 | _V __lo = __l0 * __l0 |
1172 | + __l1 * __l1; // add the two smaller values first |
1173 | asm("" : "+m" (__lo)); |
1174 | _V __r = __scale_back * sqrt(__h1 * __h1 + __lo); |
1175 | // Fix up hypot(0, 0, 0) to not be NaN: |
1176 | where(__hi == 0, __r) = 0; |
1177 | return __r; |
1178 | } |
1179 | #endif |
1180 | |
1181 | #ifdef __FAST_MATH__ |
1182 | // With fast-math, ignore precision of subnormals and inputs from |
1183 | // __finite_max_v/2 to __finite_max_v. This removes all |
1184 | // branching/masking. |
1185 | if constexpr (true) |
1186 | #else |
1187 | if (_GLIBCXX_SIMD_IS_LIKELY(all_of(isnormal(__x)) |
1188 | && all_of(isnormal(__y)) |
1189 | && all_of(isnormal(__z)))) |
1190 | #endif |
1191 | { |
1192 | const _V __hi_exp = __hi & __inf; |
// ((__hi + __hi) & __inf) ^ __inf almost works for computing __scale,
// except when (__hi + __hi) & __inf == __inf, in which case __scale
// becomes 0 (should be min/2 instead) and thus loses the information
// from __lo.
1198 | #ifdef __FAST_MATH__ |
1199 | using _Ip = __int_for_sizeof_t<_Tp>; |
1200 | using _IV = rebind_simd_t<_Ip, _V>; |
1201 | const auto __as_int = __bit_cast<_IV>(__hi_exp); |
1202 | const _V __scale |
1203 | = __bit_cast<_V>(2 * __bit_cast<_Ip>(_Tp(1)) - __as_int); |
1204 | #else |
1205 | const _V __scale = (__hi_exp ^ __inf) * _Tp(.5); |
1206 | #endif |
1207 | constexpr _Tp __mant_mask |
1208 | = __norm_min_v<_Tp> - __denorm_min_v<_Tp>; |
1209 | const _V __h1 = (__hi & _V(__mant_mask)) | _V(1); |
1210 | __l0 *= __scale; |
1211 | __l1 *= __scale; |
1212 | const _V __lo |
1213 | = __l0 * __l0 |
1214 | + __l1 * __l1; // add the two smaller values first |
1215 | return __hi_exp * sqrt(__lo + __h1 * __h1); |
1216 | } |
1217 | else |
1218 | { |
1219 | // slower path to support subnormals |
1220 | // if __hi is subnormal, avoid scaling by inf & final mul by 0 |
1221 | // (which yields NaN) by using min() |
1222 | _V __scale = _V(1 / __norm_min_v<_Tp>); |
1223 | // invert exponent w/o error and w/o using the slow divider |
1224 | // unit: xor inverts the exponent but off by 1. Multiplication |
1225 | // with .5 adjusts for the discrepancy. |
1226 | where(__hi >= __norm_min_v<_Tp>, __scale) |
1227 | = ((__hi & __inf) ^ __inf) * _Tp(.5); |
1228 | // adjust final exponent for subnormal inputs |
1229 | _V __hi_exp = __norm_min_v<_Tp>; |
1230 | where(__hi >= __norm_min_v<_Tp>, __hi_exp) |
1231 | = __hi & __inf; // no error |
1232 | _V __h1 = __hi * __scale; // no error |
1233 | __l0 *= __scale; // no error |
1234 | __l1 *= __scale; // no error |
1235 | _V __lo = __l0 * __l0 |
1236 | + __l1 * __l1; // add the two smaller values first |
1237 | _V __r = __hi_exp * sqrt(__lo + __h1 * __h1); |
1238 | #ifdef __STDC_IEC_559__ |
1239 | // fixup for Annex F requirements |
1240 | _V __fixup = __hi; // __lo == 0 |
1241 | // where(__lo == 0, __fixup) = __hi; |
1242 | where(isunordered(__x, __y + __z), __fixup) |
1243 | = __quiet_NaN_v<_Tp>; |
1244 | where(isinf(__absx) || isinf(__absy) || isinf(__absz), __fixup) |
1245 | = __inf; |
// Instead of __lo == 0, the following could depend on
// __h1² == __h1² + __lo (i.e. __hi is so much larger than the other
// two inputs that the result is exactly __hi). While this may improve
// precision, it is likely to reduce efficiency if the ISA has FMAs
// (because __h1² + __lo is an FMA, but the intermediate __h1² must be
// kept).
1253 | where(!(__lo == 0 || isunordered(__x, __y + __z) |
1254 | || isinf(__absx) || isinf(__absy) || isinf(__absz)), |
1255 | __fixup) |
1256 | = __r; |
1257 | __r = __fixup; |
1258 | #endif |
1259 | return __r; |
1260 | } |
1261 | } |
1262 | } |
1263 | } |
1264 | |
1265 | template <typename _Tp, typename _Abi> |
1266 | _GLIBCXX_SIMD_INTRINSIC simd<_Tp, _Abi> |
1267 | hypot(const simd<_Tp, _Abi>& __x, const simd<_Tp, _Abi>& __y, |
1268 | const simd<_Tp, _Abi>& __z) |
1269 | { |
1270 | return __hypot<conditional_t<__is_fixed_size_abi_v<_Abi>, |
1271 | const simd<_Tp, _Abi>&, simd<_Tp, _Abi>>>(__x, |
1272 | __y, |
1273 | __z); |
1274 | } |
1275 | |
1276 | _GLIBCXX_SIMD_CVTING3(hypot) |
1277 | |
1278 | _GLIBCXX_SIMD_MATH_CALL2_(pow, _Tp) |
1279 | |
1280 | _GLIBCXX_SIMD_MATH_CALL_(sqrt) |
1281 | _GLIBCXX_SIMD_MATH_CALL_(erf) |
1282 | _GLIBCXX_SIMD_MATH_CALL_(erfc) |
1283 | _GLIBCXX_SIMD_MATH_CALL_(lgamma) |
1284 | _GLIBCXX_SIMD_MATH_CALL_(tgamma) |
1285 | _GLIBCXX_SIMD_MATH_CALL_(ceil) |
1286 | _GLIBCXX_SIMD_MATH_CALL_(floor) |
1287 | _GLIBCXX_SIMD_MATH_CALL_(nearbyint) |
1288 | _GLIBCXX_SIMD_MATH_CALL_(rint) |
1289 | _GLIBCXX_SIMD_MATH_CALL_(lrint) |
1290 | _GLIBCXX_SIMD_MATH_CALL_(llrint) |
1291 | |
1292 | _GLIBCXX_SIMD_MATH_CALL_(round) |
1293 | _GLIBCXX_SIMD_MATH_CALL_(lround) |
1294 | _GLIBCXX_SIMD_MATH_CALL_(llround) |
1295 | |
1296 | _GLIBCXX_SIMD_MATH_CALL_(trunc) |
1297 | |
1298 | _GLIBCXX_SIMD_MATH_CALL2_(fmod, _Tp) |
1299 | _GLIBCXX_SIMD_MATH_CALL2_(remainder, _Tp) |
1300 | _GLIBCXX_SIMD_MATH_CALL3_(remquo, _Tp, int*) |
1301 | |
1302 | template <typename _Tp, typename _Abi> |
1303 | enable_if_t<is_floating_point_v<_Tp>, simd<_Tp, _Abi>> |
1304 | copysign(const simd<_Tp, _Abi>& __x, const simd<_Tp, _Abi>& __y) |
1305 | { |
1306 | if constexpr (simd_size_v<_Tp, _Abi> == 1) |
1307 | return std::copysign(__x[0], __y[0]); |
1308 | else if constexpr (is_same_v<_Tp, long double> && sizeof(_Tp) == 12) |
1309 | // Remove this case once __bit_cast is implemented via __builtin_bit_cast. |
// It is necessary because __signmask below cannot be computed at compile
1311 | // time. |
1312 | return simd<_Tp, _Abi>( |
1313 | [&](auto __i) { return std::copysign(__x[__i], __y[__i]); }); |
1314 | else |
1315 | { |
1316 | using _V = simd<_Tp, _Abi>; |
1317 | using namespace std::experimental::__float_bitwise_operators; |
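// _V(1) and _V(-1) differ only in the sign bit, so __signmask has exactly
// the sign bit set; __x & (__x ^ __signmask) clears the sign bit of __x and
// __y & __signmask contributes the sign of __y.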
1318 | _GLIBCXX_SIMD_USE_CONSTEXPR_API auto __signmask = _V(1) ^ _V(-1); |
1319 | return (__x & (__x ^ __signmask)) | (__y & __signmask); |
1320 | } |
1321 | } |
1322 | |
1323 | _GLIBCXX_SIMD_MATH_CALL2_(nextafter, _Tp) |
1324 | // not covered in [parallel.simd.math]: |
1325 | // _GLIBCXX_SIMD_MATH_CALL2_(nexttoward, long double) |
1326 | _GLIBCXX_SIMD_MATH_CALL2_(fdim, _Tp) |
1327 | _GLIBCXX_SIMD_MATH_CALL2_(fmax, _Tp) |
1328 | _GLIBCXX_SIMD_MATH_CALL2_(fmin, _Tp) |
1329 | |
1330 | _GLIBCXX_SIMD_MATH_CALL3_(fma, _Tp, _Tp) |
1331 | _GLIBCXX_SIMD_MATH_CALL_(fpclassify) |
1332 | _GLIBCXX_SIMD_MATH_CALL_(isfinite) |
1333 | |
1334 | // isnan and isinf require special treatment because old glibc may declare |
1335 | // `int isinf(double)`. |
1336 | template <typename _Tp, typename _Abi, typename..., |
1337 | typename _R = _Math_return_type_t<bool, _Tp, _Abi>> |
1338 | enable_if_t<is_floating_point_v<_Tp>, _R> |
1339 | isinf(simd<_Tp, _Abi> __x) |
1340 | { return {__private_init, _Abi::_SimdImpl::_S_isinf(__data(__x))}; } |
1341 | |
1342 | template <typename _Tp, typename _Abi, typename..., |
1343 | typename _R = _Math_return_type_t<bool, _Tp, _Abi>> |
1344 | enable_if_t<is_floating_point_v<_Tp>, _R> |
1345 | isnan(simd<_Tp, _Abi> __x) |
1346 | { return {__private_init, _Abi::_SimdImpl::_S_isnan(__data(__x))}; } |
1347 | |
1348 | _GLIBCXX_SIMD_MATH_CALL_(isnormal) |
1349 | |
1350 | template <typename..., typename _Tp, typename _Abi> |
1351 | simd_mask<_Tp, _Abi> |
1352 | signbit(simd<_Tp, _Abi> __x) |
1353 | { |
1354 | if constexpr (is_integral_v<_Tp>) |
1355 | { |
1356 | if constexpr (is_unsigned_v<_Tp>) |
1357 | return simd_mask<_Tp, _Abi>{}; // false |
1358 | else |
1359 | return __x < 0; |
1360 | } |
1361 | else |
1362 | return {__private_init, _Abi::_SimdImpl::_S_signbit(__data(__x))}; |
1363 | } |
1364 | |
1365 | _GLIBCXX_SIMD_MATH_CALL2_(isgreater, _Tp) |
1366 | _GLIBCXX_SIMD_MATH_CALL2_(isgreaterequal, _Tp) |
1367 | _GLIBCXX_SIMD_MATH_CALL2_(isless, _Tp) |
1368 | _GLIBCXX_SIMD_MATH_CALL2_(islessequal, _Tp) |
1369 | _GLIBCXX_SIMD_MATH_CALL2_(islessgreater, _Tp) |
1370 | _GLIBCXX_SIMD_MATH_CALL2_(isunordered, _Tp) |
1371 | |
1372 | /* not covered in [parallel.simd.math] |
1373 | template <typename _Abi> __doublev<_Abi> nan(const char* tagp); |
1374 | template <typename _Abi> __floatv<_Abi> nanf(const char* tagp); |
1375 | template <typename _Abi> __ldoublev<_Abi> nanl(const char* tagp); |
1376 | |
1377 | template <typename _V> struct simd_div_t { |
1378 | _V quot, rem; |
1379 | }; |
1380 | |
1381 | template <typename _Abi> |
1382 | simd_div_t<_SCharv<_Abi>> div(_SCharv<_Abi> numer, |
1383 | _SCharv<_Abi> denom); |
1384 | template <typename _Abi> |
1385 | simd_div_t<__shortv<_Abi>> div(__shortv<_Abi> numer, |
1386 | __shortv<_Abi> denom); |
1387 | template <typename _Abi> |
1388 | simd_div_t<__intv<_Abi>> div(__intv<_Abi> numer, __intv<_Abi> denom); |
1389 | template <typename _Abi> |
1390 | simd_div_t<__longv<_Abi>> div(__longv<_Abi> numer, |
1391 | __longv<_Abi> denom); |
1392 | template <typename _Abi> |
1393 | simd_div_t<__llongv<_Abi>> div(__llongv<_Abi> numer, |
1394 | __llongv<_Abi> denom); |
1395 | */ |
1396 | |
1397 | // special math {{{ |
1398 | template <typename _Tp, typename _Abi> |
1399 | enable_if_t<is_floating_point_v<_Tp>, simd<_Tp, _Abi>> |
1400 | assoc_laguerre(const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __n, |
1401 | const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __m, |
1402 | const simd<_Tp, _Abi>& __x) |
1403 | { |
1404 | return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
1405 | return std::assoc_laguerre(__n[__i], __m[__i], __x[__i]); |
1406 | }); |
1407 | } |
1408 | |
1409 | template <typename _Tp, typename _Abi> |
1410 | enable_if_t<is_floating_point_v<_Tp>, simd<_Tp, _Abi>> |
1411 | assoc_legendre(const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __n, |
1412 | const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __m, |
1413 | const simd<_Tp, _Abi>& __x) |
1414 | { |
1415 | return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
1416 | return std::assoc_legendre(__n[__i], __m[__i], __x[__i]); |
1417 | }); |
1418 | } |
1419 | |
1420 | _GLIBCXX_SIMD_MATH_CALL2_(beta, _Tp) |
1421 | _GLIBCXX_SIMD_MATH_CALL_(comp_ellint_1) |
1422 | _GLIBCXX_SIMD_MATH_CALL_(comp_ellint_2) |
1423 | _GLIBCXX_SIMD_MATH_CALL2_(comp_ellint_3, _Tp) |
1424 | _GLIBCXX_SIMD_MATH_CALL2_(cyl_bessel_i, _Tp) |
1425 | _GLIBCXX_SIMD_MATH_CALL2_(cyl_bessel_j, _Tp) |
1426 | _GLIBCXX_SIMD_MATH_CALL2_(cyl_bessel_k, _Tp) |
1427 | _GLIBCXX_SIMD_MATH_CALL2_(cyl_neumann, _Tp) |
1428 | _GLIBCXX_SIMD_MATH_CALL2_(ellint_1, _Tp) |
1429 | _GLIBCXX_SIMD_MATH_CALL2_(ellint_2, _Tp) |
1430 | _GLIBCXX_SIMD_MATH_CALL3_(ellint_3, _Tp, _Tp) |
1431 | _GLIBCXX_SIMD_MATH_CALL_(expint) |
1432 | |
1433 | template <typename _Tp, typename _Abi> |
1434 | enable_if_t<is_floating_point_v<_Tp>, simd<_Tp, _Abi>> |
1435 | hermite(const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __n, |
1436 | const simd<_Tp, _Abi>& __x) |
1437 | { |
1438 | return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
1439 | return std::hermite(__n[__i], __x[__i]); |
1440 | }); |
1441 | } |
1442 | |
1443 | template <typename _Tp, typename _Abi> |
1444 | enable_if_t<is_floating_point_v<_Tp>, simd<_Tp, _Abi>> |
1445 | laguerre(const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __n, |
1446 | const simd<_Tp, _Abi>& __x) |
1447 | { |
1448 | return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
1449 | return std::laguerre(__n[__i], __x[__i]); |
1450 | }); |
1451 | } |
1452 | |
1453 | template <typename _Tp, typename _Abi> |
1454 | enable_if_t<is_floating_point_v<_Tp>, simd<_Tp, _Abi>> |
1455 | legendre(const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __n, |
1456 | const simd<_Tp, _Abi>& __x) |
1457 | { |
1458 | return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
1459 | return std::legendre(__n[__i], __x[__i]); |
1460 | }); |
1461 | } |
1462 | |
1463 | _GLIBCXX_SIMD_MATH_CALL_(riemann_zeta) |
1464 | |
1465 | template <typename _Tp, typename _Abi> |
1466 | enable_if_t<is_floating_point_v<_Tp>, simd<_Tp, _Abi>> |
1467 | sph_bessel(const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __n, |
1468 | const simd<_Tp, _Abi>& __x) |
1469 | { |
1470 | return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
1471 | return std::sph_bessel(__n[__i], __x[__i]); |
1472 | }); |
1473 | } |
1474 | |
1475 | template <typename _Tp, typename _Abi> |
1476 | enable_if_t<is_floating_point_v<_Tp>, simd<_Tp, _Abi>> |
sph_legendre(const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __l,
             const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __m,
             const simd<_Tp, _Abi>& __theta)
{
return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
return std::sph_legendre(__l[__i], __m[__i], __theta[__i]);
});
}
1485 | |
1486 | template <typename _Tp, typename _Abi> |
1487 | enable_if_t<is_floating_point_v<_Tp>, simd<_Tp, _Abi>> |
1488 | sph_neumann(const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __n, |
1489 | const simd<_Tp, _Abi>& __x) |
1490 | { |
1491 | return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
1492 | return std::sph_neumann(__n[__i], __x[__i]); |
1493 | }); |
1494 | } |
1495 | // }}} |
1496 | |
1497 | #undef _GLIBCXX_SIMD_MATH_CALL_ |
1498 | #undef _GLIBCXX_SIMD_MATH_CALL2_ |
1499 | #undef _GLIBCXX_SIMD_MATH_CALL3_ |
1500 | |
1501 | _GLIBCXX_SIMD_END_NAMESPACE |
1502 | |
1503 | #endif // __cplusplus >= 201703L |
1504 | #endif // _GLIBCXX_EXPERIMENTAL_SIMD_MATH_H_ |
1505 | |
1506 | // vim: foldmethod=marker sw=2 ts=8 noet sts=2 |
1507 | |