| 1 | // Math overloads for simd -*- C++ -*- |
| 2 | |
| 3 | // Copyright (C) 2020-2021 Free Software Foundation, Inc. |
| 4 | // |
| 5 | // This file is part of the GNU ISO C++ Library. This library is free |
| 6 | // software; you can redistribute it and/or modify it under the |
| 7 | // terms of the GNU General Public License as published by the |
| 8 | // Free Software Foundation; either version 3, or (at your option) |
| 9 | // any later version. |
| 10 | |
| 11 | // This library is distributed in the hope that it will be useful, |
| 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 14 | // GNU General Public License for more details. |
| 15 | |
| 16 | // Under Section 7 of GPL version 3, you are granted additional |
| 17 | // permissions described in the GCC Runtime Library Exception, version |
| 18 | // 3.1, as published by the Free Software Foundation. |
| 19 | |
| 20 | // You should have received a copy of the GNU General Public License and |
| 21 | // a copy of the GCC Runtime Library Exception along with this program; |
| 22 | // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see |
| 23 | // <http://www.gnu.org/licenses/>. |
| 24 | |
| 25 | #ifndef _GLIBCXX_EXPERIMENTAL_SIMD_MATH_H_ |
| 26 | #define _GLIBCXX_EXPERIMENTAL_SIMD_MATH_H_ |
| 27 | |
| 28 | #if __cplusplus >= 201703L |
| 29 | |
| 30 | #include <utility> |
| 31 | #include <iomanip> |
| 32 | |
| 33 | _GLIBCXX_SIMD_BEGIN_NAMESPACE |
| 34 | template <typename _Tp, typename _V> |
| 35 | using _Samesize = fixed_size_simd<_Tp, _V::size()>; |
| 36 | |
| 37 | // _Math_return_type {{{ |
| 38 | template <typename _DoubleR, typename _Tp, typename _Abi> |
| 39 | struct _Math_return_type; |
| 40 | |
| 41 | template <typename _DoubleR, typename _Tp, typename _Abi> |
| 42 | using _Math_return_type_t = |
| 43 | typename _Math_return_type<_DoubleR, _Tp, _Abi>::type; |
| 44 | |
| 45 | template <typename _Tp, typename _Abi> |
| 46 | struct _Math_return_type<double, _Tp, _Abi> |
| 47 | { using type = simd<_Tp, _Abi>; }; |
| 48 | |
| 49 | template <typename _Tp, typename _Abi> |
| 50 | struct _Math_return_type<bool, _Tp, _Abi> |
| 51 | { using type = simd_mask<_Tp, _Abi>; }; |
| 52 | |
| 53 | template <typename _DoubleR, typename _Tp, typename _Abi> |
| 54 | struct _Math_return_type |
| 55 | { using type = fixed_size_simd<_DoubleR, simd_size_v<_Tp, _Abi>>; }; |
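
// Example (illustration only): `std::ilogb(double)` returns int, so the
// macros below map ilogb to `fixed_size_simd<int, simd_size_v<_Tp, _Abi>>`;
// a `double` return type maps back to `simd<_Tp, _Abi>` and `bool` to
// `simd_mask<_Tp, _Abi>` (cf. isnan further down).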
| 56 | |
| 57 | //}}} |
| 58 | // _GLIBCXX_SIMD_MATH_CALL_ {{{ |
| 59 | #define _GLIBCXX_SIMD_MATH_CALL_(__name) \ |
| 60 | template <typename _Tp, typename _Abi, typename..., \ |
| 61 | typename _R = _Math_return_type_t< \ |
| 62 | decltype(std::__name(declval<double>())), _Tp, _Abi>> \ |
| 63 | enable_if_t<is_floating_point_v<_Tp>, _R> \ |
| 64 | __name(simd<_Tp, _Abi> __x) \ |
| 65 | { return {__private_init, _Abi::_SimdImpl::_S_##__name(__data(__x))}; } |
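
// Example (illustration only): `_GLIBCXX_SIMD_MATH_CALL_(acos)` (used below)
// declares `acos(simd<_Tp, _Abi>)` for floating-point _Tp, returning
// `simd<_Tp, _Abi>` (because `std::acos(double)` returns double) and
// forwarding element-wise to `_Abi::_SimdImpl::_S_acos`.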
| 66 | |
| 67 | // }}} |
| 68 | //_Extra_argument_type{{{ |
| 69 | template <typename _Up, typename _Tp, typename _Abi> |
| 70 | struct _Extra_argument_type; |
| 71 | |
template <typename _Tp, typename _Abi>
  struct _Extra_argument_type<_Tp*, _Tp, _Abi>
  {
    using type = simd<_Tp, _Abi>*;
    static constexpr double* declval();
    static constexpr bool __needs_temporary_scalar = true;

    _GLIBCXX_SIMD_INTRINSIC static constexpr auto _S_data(type __x)
    { return &__data(*__x); }
  };
| 82 | |
template <typename _Up, typename _Tp, typename _Abi>
  struct _Extra_argument_type<_Up*, _Tp, _Abi>
  {
    static_assert(is_integral_v<_Up>);
    using type = fixed_size_simd<_Up, simd_size_v<_Tp, _Abi>>*;
    static constexpr _Up* declval();
    static constexpr bool __needs_temporary_scalar = true;

    _GLIBCXX_SIMD_INTRINSIC static constexpr auto _S_data(type __x)
    { return &__data(*__x); }
  };
| 94 | |
template <typename _Tp, typename _Abi>
  struct _Extra_argument_type<_Tp, _Tp, _Abi>
  {
    using type = simd<_Tp, _Abi>;
    static constexpr double declval();
    static constexpr bool __needs_temporary_scalar = false;

    _GLIBCXX_SIMD_INTRINSIC static constexpr decltype(auto)
    _S_data(const type& __x)
    { return __data(__x); }
  };
| 106 | |
template <typename _Up, typename _Tp, typename _Abi>
  struct _Extra_argument_type
  {
    static_assert(is_integral_v<_Up>);
    using type = fixed_size_simd<_Up, simd_size_v<_Tp, _Abi>>;
    static constexpr _Up declval();
    static constexpr bool __needs_temporary_scalar = false;

    _GLIBCXX_SIMD_INTRINSIC static constexpr decltype(auto)
    _S_data(const type& __x)
    { return __data(__x); }
  };
| 119 | |
| 120 | //}}} |
| 121 | // _GLIBCXX_SIMD_MATH_CALL2_ {{{ |
| 122 | #define _GLIBCXX_SIMD_MATH_CALL2_(__name, arg2_) \ |
| 123 | template < \ |
| 124 | typename _Tp, typename _Abi, typename..., \ |
| 125 | typename _Arg2 = _Extra_argument_type<arg2_, _Tp, _Abi>, \ |
| 126 | typename _R = _Math_return_type_t< \ |
| 127 | decltype(std::__name(declval<double>(), _Arg2::declval())), _Tp, _Abi>> \ |
| 128 | enable_if_t<is_floating_point_v<_Tp>, _R> \ |
| 129 | __name(const simd<_Tp, _Abi>& __x, const typename _Arg2::type& __y) \ |
| 130 | { \ |
| 131 | return {__private_init, \ |
| 132 | _Abi::_SimdImpl::_S_##__name(__data(__x), _Arg2::_S_data(__y))}; \ |
| 133 | } \ |
| 134 | template <typename _Up, typename _Tp, typename _Abi> \ |
| 135 | _GLIBCXX_SIMD_INTRINSIC _Math_return_type_t< \ |
| 136 | decltype(std::__name( \ |
| 137 | declval<double>(), \ |
| 138 | declval<enable_if_t< \ |
| 139 | conjunction_v< \ |
| 140 | is_same<arg2_, _Tp>, \ |
| 141 | negation<is_same<__remove_cvref_t<_Up>, simd<_Tp, _Abi>>>, \ |
| 142 | is_convertible<_Up, simd<_Tp, _Abi>>, is_floating_point<_Tp>>, \ |
| 143 | double>>())), \ |
| 144 | _Tp, _Abi> \ |
| 145 | __name(_Up&& __xx, const simd<_Tp, _Abi>& __yy) \ |
| 146 | { return __name(simd<_Tp, _Abi>(static_cast<_Up&&>(__xx)), __yy); } |
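
// Example (illustration only): `_GLIBCXX_SIMD_MATH_CALL2_(pow, _Tp)` (used
// below) generates `pow(simd, simd)` plus an overload whose first argument
// only needs to be convertible to the simd type, so `pow(2.f, __v)`
// broadcasts the scalar and delegates to the simd-simd overload.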
| 147 | |
| 148 | // }}} |
| 149 | // _GLIBCXX_SIMD_MATH_CALL3_ {{{ |
| 150 | #define _GLIBCXX_SIMD_MATH_CALL3_(__name, arg2_, arg3_) \ |
| 151 | template <typename _Tp, typename _Abi, typename..., \ |
| 152 | typename _Arg2 = _Extra_argument_type<arg2_, _Tp, _Abi>, \ |
| 153 | typename _Arg3 = _Extra_argument_type<arg3_, _Tp, _Abi>, \ |
| 154 | typename _R = _Math_return_type_t< \ |
| 155 | decltype(std::__name(declval<double>(), _Arg2::declval(), \ |
| 156 | _Arg3::declval())), \ |
| 157 | _Tp, _Abi>> \ |
| 158 | enable_if_t<is_floating_point_v<_Tp>, _R> \ |
| 159 | __name(const simd<_Tp, _Abi>& __x, const typename _Arg2::type& __y, \ |
| 160 | const typename _Arg3::type& __z) \ |
| 161 | { \ |
| 162 | return {__private_init, \ |
| 163 | _Abi::_SimdImpl::_S_##__name(__data(__x), _Arg2::_S_data(__y), \ |
| 164 | _Arg3::_S_data(__z))}; \ |
| 165 | } \ |
| 166 | template < \ |
| 167 | typename _T0, typename _T1, typename _T2, typename..., \ |
| 168 | typename _U0 = __remove_cvref_t<_T0>, \ |
| 169 | typename _U1 = __remove_cvref_t<_T1>, \ |
| 170 | typename _U2 = __remove_cvref_t<_T2>, \ |
| 171 | typename _Simd = conditional_t<is_simd_v<_U1>, _U1, _U2>, \ |
| 172 | typename = enable_if_t<conjunction_v< \ |
| 173 | is_simd<_Simd>, is_convertible<_T0&&, _Simd>, \ |
| 174 | is_convertible<_T1&&, _Simd>, is_convertible<_T2&&, _Simd>, \ |
| 175 | negation<conjunction< \ |
| 176 | is_simd<_U0>, is_floating_point<__value_type_or_identity_t<_U0>>>>>>> \ |
| 177 | _GLIBCXX_SIMD_INTRINSIC decltype(__name(declval<const _Simd&>(), \ |
| 178 | declval<const _Simd&>(), \ |
| 179 | declval<const _Simd&>())) \ |
| 180 | __name(_T0&& __xx, _T1&& __yy, _T2&& __zz) \ |
| 181 | { \ |
| 182 | return __name(_Simd(static_cast<_T0&&>(__xx)), \ |
| 183 | _Simd(static_cast<_T1&&>(__yy)), \ |
| 184 | _Simd(static_cast<_T2&&>(__zz))); \ |
| 185 | } |
| 186 | |
| 187 | // }}} |
| 188 | // __cosSeries {{{ |
| 189 | template <typename _Abi> |
| 190 | _GLIBCXX_SIMD_ALWAYS_INLINE static simd<float, _Abi> |
| 191 | __cosSeries(const simd<float, _Abi>& __x) |
| 192 | { |
| 193 | const simd<float, _Abi> __x2 = __x * __x; |
| 194 | simd<float, _Abi> __y; |
| 195 | __y = 0x1.ap-16f; // 1/8! |
| 196 | __y = __y * __x2 - 0x1.6c1p-10f; // -1/6! |
| 197 | __y = __y * __x2 + 0x1.555556p-5f; // 1/4! |
| 198 | return __y * (__x2 * __x2) - .5f * __x2 + 1.f; |
| 199 | } |
| 200 | |
| 201 | template <typename _Abi> |
| 202 | _GLIBCXX_SIMD_ALWAYS_INLINE static simd<double, _Abi> |
| 203 | __cosSeries(const simd<double, _Abi>& __x) |
| 204 | { |
| 205 | const simd<double, _Abi> __x2 = __x * __x; |
| 206 | simd<double, _Abi> __y; |
| 207 | __y = 0x1.AC00000000000p-45; // 1/16! |
| 208 | __y = __y * __x2 - 0x1.9394000000000p-37; // -1/14! |
| 209 | __y = __y * __x2 + 0x1.1EED8C0000000p-29; // 1/12! |
| 210 | __y = __y * __x2 - 0x1.27E4FB7400000p-22; // -1/10! |
| 211 | __y = __y * __x2 + 0x1.A01A01A018000p-16; // 1/8! |
| 212 | __y = __y * __x2 - 0x1.6C16C16C16C00p-10; // -1/6! |
| 213 | __y = __y * __x2 + 0x1.5555555555554p-5; // 1/4! |
| 214 | return (__y * __x2 - .5f) * __x2 + 1.f; |
| 215 | } |
| 216 | |
| 217 | // }}} |
| 218 | // __sinSeries {{{ |
| 219 | template <typename _Abi> |
| 220 | _GLIBCXX_SIMD_ALWAYS_INLINE static simd<float, _Abi> |
| 221 | __sinSeries(const simd<float, _Abi>& __x) |
| 222 | { |
| 223 | const simd<float, _Abi> __x2 = __x * __x; |
| 224 | simd<float, _Abi> __y; |
| 225 | __y = -0x1.9CC000p-13f; // -1/7! |
| 226 | __y = __y * __x2 + 0x1.111100p-7f; // 1/5! |
| 227 | __y = __y * __x2 - 0x1.555556p-3f; // -1/3! |
| 228 | return __y * (__x2 * __x) + __x; |
| 229 | } |
| 230 | |
| 231 | template <typename _Abi> |
| 232 | _GLIBCXX_SIMD_ALWAYS_INLINE static simd<double, _Abi> |
| 233 | __sinSeries(const simd<double, _Abi>& __x) |
| 234 | { |
    // __x  = [0, 0.7854 = π/4]
    // __x² = [0, 0.6169 = π²/16]
| 237 | const simd<double, _Abi> __x2 = __x * __x; |
| 238 | simd<double, _Abi> __y; |
| 239 | __y = -0x1.ACF0000000000p-41; // -1/15! |
| 240 | __y = __y * __x2 + 0x1.6124400000000p-33; // 1/13! |
| 241 | __y = __y * __x2 - 0x1.AE64567000000p-26; // -1/11! |
| 242 | __y = __y * __x2 + 0x1.71DE3A5540000p-19; // 1/9! |
| 243 | __y = __y * __x2 - 0x1.A01A01A01A000p-13; // -1/7! |
| 244 | __y = __y * __x2 + 0x1.1111111111110p-7; // 1/5! |
| 245 | __y = __y * __x2 - 0x1.5555555555555p-3; // -1/3! |
| 246 | return __y * (__x2 * __x) + __x; |
| 247 | } |
| 248 | |
| 249 | // }}} |
| 250 | // __zero_low_bits {{{ |
| 251 | template <int _Bits, typename _Tp, typename _Abi> |
| 252 | _GLIBCXX_SIMD_INTRINSIC simd<_Tp, _Abi> |
| 253 | __zero_low_bits(simd<_Tp, _Abi> __x) |
| 254 | { |
| 255 | const simd<_Tp, _Abi> __bitmask |
| 256 | = __bit_cast<_Tp>(~make_unsigned_t<__int_for_sizeof_t<_Tp>>() << _Bits); |
| 257 | return {__private_init, |
| 258 | _Abi::_SimdImpl::_S_bit_and(__data(__x), __data(__bitmask))}; |
| 259 | } |
| 260 | |
| 261 | // }}} |
| 262 | // __fold_input {{{ |
| 263 | |
/**@internal
 * Fold @p __x into [-¼π, ¼π] and remember the quadrant it came from:
 * quadrant 0: [-¼π, ¼π]
 * quadrant 1: [ ¼π, ¾π]
 * quadrant 2: [ ¾π, 1¼π]
 * quadrant 3: [1¼π, 1¾π]
 *
 * The algorithm determines `y` as the multiple such that
 * `x - y * ¼π ∈ [-¼π, ¼π]`. Using a bitmask, `y` is reduced to `quadrant`.
 * `y` can be calculated as
 * ```
 * y = trunc(x / ¼π);
 * y += fmod(y, 2);
 * ```
 * This can be simplified by moving the (implicit) division by 2 into the
 * truncation expression. The `+= fmod` effect can then be achieved by using
 * rounding instead of truncation: `y = round(x / ½π) * 2`. If precision
 * allows, `2/π * x` is better (faster).
 */
| 282 | template <typename _Tp, typename _Abi> |
| 283 | struct _Folded |
| 284 | { |
| 285 | simd<_Tp, _Abi> _M_x; |
| 286 | rebind_simd_t<int, simd<_Tp, _Abi>> _M_quadrant; |
| 287 | }; |
| 288 | |
| 289 | namespace __math_float { |
| 290 | inline constexpr float __pi_over_4 = 0x1.921FB6p-1f; // π/4 |
  inline constexpr float __2_over_pi = 0x1.45F306p-1f; // 2/π
| 292 | inline constexpr float __pi_2_5bits0 |
| 293 | = 0x1.921fc0p0f; // π/2, 5 0-bits (least significant) |
| 294 | inline constexpr float __pi_2_5bits0_rem |
| 295 | = -0x1.5777a6p-21f; // π/2 - __pi_2_5bits0 |
| 296 | } // namespace __math_float |
| 297 | namespace __math_double { |
| 298 | inline constexpr double __pi_over_4 = 0x1.921fb54442d18p-1; // π/4 |
  inline constexpr double __2_over_pi = 0x1.45F306DC9C883p-1; // 2/π
| 300 | inline constexpr double __pi_2 = 0x1.921fb54442d18p0; // π/2 |
| 301 | } // namespace __math_double |
| 302 | |
| 303 | template <typename _Abi> |
| 304 | _GLIBCXX_SIMD_ALWAYS_INLINE _Folded<float, _Abi> |
| 305 | __fold_input(const simd<float, _Abi>& __x) |
| 306 | { |
| 307 | using _V = simd<float, _Abi>; |
| 308 | using _IV = rebind_simd_t<int, _V>; |
| 309 | using namespace __math_float; |
| 310 | _Folded<float, _Abi> __r; |
| 311 | __r._M_x = abs(__x); |
| 312 | #if 0 |
| 313 | // zero most mantissa bits: |
    constexpr float __1_over_pi = 0x1.45F306p-2f; // 1/π
| 315 | const auto __y = (__r._M_x * __1_over_pi + 0x1.8p23f) - 0x1.8p23f; |
| 316 | // split π into 4 parts, the first three with 13 trailing zeros (to make the |
| 317 | // following multiplications precise): |
| 318 | constexpr float __pi0 = 0x1.920000p1f; |
| 319 | constexpr float __pi1 = 0x1.fb4000p-11f; |
| 320 | constexpr float __pi2 = 0x1.444000p-23f; |
| 321 | constexpr float __pi3 = 0x1.68c234p-38f; |
| 322 | __r._M_x - __y*__pi0 - __y*__pi1 - __y*__pi2 - __y*__pi3 |
| 323 | #else |
| 324 | if (_GLIBCXX_SIMD_IS_UNLIKELY(all_of(__r._M_x < __pi_over_4))) |
| 325 | __r._M_quadrant = 0; |
| 326 | else if (_GLIBCXX_SIMD_IS_LIKELY(all_of(__r._M_x < 6 * __pi_over_4))) |
| 327 | { |
| 328 | const _V __y = nearbyint(__r._M_x * __2_over_pi); |
| 329 | __r._M_quadrant = static_simd_cast<_IV>(__y) & 3; // __y mod 4 |
| 330 | __r._M_x -= __y * __pi_2_5bits0; |
| 331 | __r._M_x -= __y * __pi_2_5bits0_rem; |
| 332 | } |
| 333 | else |
| 334 | { |
| 335 | using __math_double::__2_over_pi; |
| 336 | using __math_double::__pi_2; |
| 337 | using _VD = rebind_simd_t<double, _V>; |
| 338 | _VD __xd = static_simd_cast<_VD>(__r._M_x); |
| 339 | _VD __y = nearbyint(__xd * __2_over_pi); |
| 340 | __r._M_quadrant = static_simd_cast<_IV>(__y) & 3; // = __y mod 4 |
| 341 | __r._M_x = static_simd_cast<_V>(__xd - __y * __pi_2); |
| 342 | } |
| 343 | #endif |
| 344 | return __r; |
| 345 | } |
| 346 | |
| 347 | template <typename _Abi> |
| 348 | _GLIBCXX_SIMD_ALWAYS_INLINE _Folded<double, _Abi> |
| 349 | __fold_input(const simd<double, _Abi>& __x) |
| 350 | { |
| 351 | using _V = simd<double, _Abi>; |
| 352 | using _IV = rebind_simd_t<int, _V>; |
| 353 | using namespace __math_double; |
| 354 | |
| 355 | _Folded<double, _Abi> __r; |
| 356 | __r._M_x = abs(__x); |
| 357 | if (_GLIBCXX_SIMD_IS_UNLIKELY(all_of(__r._M_x < __pi_over_4))) |
| 358 | { |
| 359 | __r._M_quadrant = 0; |
| 360 | return __r; |
| 361 | } |
| 362 | const _V __y = nearbyint(__r._M_x / (2 * __pi_over_4)); |
| 363 | __r._M_quadrant = static_simd_cast<_IV>(__y) & 3; |
| 364 | |
| 365 | if (_GLIBCXX_SIMD_IS_LIKELY(all_of(__r._M_x < 1025 * __pi_over_4))) |
| 366 | { |
| 367 | // x - y * pi/2, y uses no more than 11 mantissa bits |
| 368 | __r._M_x -= __y * 0x1.921FB54443000p0; |
| 369 | __r._M_x -= __y * -0x1.73DCB3B39A000p-43; |
| 370 | __r._M_x -= __y * 0x1.45C06E0E68948p-86; |
| 371 | } |
| 372 | else if (_GLIBCXX_SIMD_IS_LIKELY(all_of(__y <= 0x1.0p30))) |
| 373 | { |
| 374 | // x - y * pi/2, y uses no more than 29 mantissa bits |
| 375 | __r._M_x -= __y * 0x1.921FB40000000p0; |
| 376 | __r._M_x -= __y * 0x1.4442D00000000p-24; |
| 377 | __r._M_x -= __y * 0x1.8469898CC5170p-48; |
| 378 | } |
| 379 | else |
| 380 | { |
| 381 | // x - y * pi/2, y may require all mantissa bits |
| 382 | const _V __y_hi = __zero_low_bits<26>(__y); |
| 383 | const _V __y_lo = __y - __y_hi; |
| 384 | const auto __pi_2_1 = 0x1.921FB50000000p0; |
| 385 | const auto __pi_2_2 = 0x1.110B460000000p-26; |
| 386 | const auto __pi_2_3 = 0x1.1A62630000000p-54; |
| 387 | const auto __pi_2_4 = 0x1.8A2E03707344Ap-81; |
| 388 | __r._M_x = __r._M_x - __y_hi * __pi_2_1 |
| 389 | - max(__y_hi * __pi_2_2, __y_lo * __pi_2_1) |
| 390 | - min(__y_hi * __pi_2_2, __y_lo * __pi_2_1) |
| 391 | - max(__y_hi * __pi_2_3, __y_lo * __pi_2_2) |
| 392 | - min(__y_hi * __pi_2_3, __y_lo * __pi_2_2) |
| 393 | - max(__y * __pi_2_4, __y_lo * __pi_2_3) |
| 394 | - min(__y * __pi_2_4, __y_lo * __pi_2_3); |
| 395 | } |
| 396 | return __r; |
| 397 | } |
| 398 | |
| 399 | // }}} |
| 400 | // __extract_exponent_as_int {{{ |
| 401 | template <typename _Tp, typename _Abi> |
| 402 | rebind_simd_t<int, simd<_Tp, _Abi>> |
  __extract_exponent_as_int(const simd<_Tp, _Abi>& __v)
| 404 | { |
| 405 | using _Vp = simd<_Tp, _Abi>; |
| 406 | using _Up = make_unsigned_t<__int_for_sizeof_t<_Tp>>; |
| 407 | using namespace std::experimental::__float_bitwise_operators; |
| 408 | const _Vp __exponent_mask |
| 409 | = __infinity_v<_Tp>; // 0x7f800000 or 0x7ff0000000000000 |
| 410 | return static_simd_cast<rebind_simd_t<int, _Vp>>( |
| 411 | __bit_cast<rebind_simd_t<_Up, _Vp>>(__v & __exponent_mask) |
| 412 | >> (__digits_v<_Tp> - 1)); |
| 413 | } |
| 414 | |
| 415 | // }}} |
| 416 | // __impl_or_fallback {{{ |
template <typename _ImplFun, typename _FallbackFun, typename... _Args>
  _GLIBCXX_SIMD_INTRINSIC auto
  __impl_or_fallback_dispatch(int, _ImplFun&& __impl_fun, _FallbackFun&&,
                              _Args&&... __args)
    -> decltype(__impl_fun(static_cast<_Args&&>(__args)...))
  { return __impl_fun(static_cast<_Args&&>(__args)...); }

template <typename _ImplFun, typename _FallbackFun, typename... _Args>
  inline auto
  __impl_or_fallback_dispatch(float, _ImplFun&&, _FallbackFun&& __fallback_fun,
                              _Args&&... __args)
    -> decltype(__fallback_fun(static_cast<_Args&&>(__args)...))
  { return __fallback_fun(static_cast<_Args&&>(__args)...); }
| 430 | |
| 431 | template <typename... _Args> |
| 432 | _GLIBCXX_SIMD_INTRINSIC auto |
| 433 | __impl_or_fallback(_Args&&... __args) |
| 434 | { |
| 435 | return __impl_or_fallback_dispatch(int(), static_cast<_Args&&>(__args)...); |
| 436 | } |
| 437 | //}}} |
| 438 | |
| 439 | // trigonometric functions {{{ |
| 440 | _GLIBCXX_SIMD_MATH_CALL_(acos) |
| 441 | _GLIBCXX_SIMD_MATH_CALL_(asin) |
| 442 | _GLIBCXX_SIMD_MATH_CALL_(atan) |
| 443 | _GLIBCXX_SIMD_MATH_CALL2_(atan2, _Tp) |
| 444 | |
| 445 | /* |
| 446 | * algorithm for sine and cosine: |
| 447 | * |
 * The result can be calculated with sine or cosine depending on the π/4
 * section the input is in.
 * sine   ≈ __x + __x³
 * cosine ≈ 1 - __x²
| 450 | * |
| 451 | * sine: |
| 452 | * Map -__x to __x and invert the output |
| 453 | * Extend precision of __x - n * π/4 by calculating |
| 454 | * ((__x - n * p1) - n * p2) - n * p3 (p1 + p2 + p3 = π/4) |
| 455 | * |
| 456 | * Calculate Taylor series with tuned coefficients. |
| 457 | * Fix sign. |
| 458 | */ |
| 459 | // cos{{{ |
| 460 | template <typename _Tp, typename _Abi> |
| 461 | enable_if_t<is_floating_point_v<_Tp>, simd<_Tp, _Abi>> |
| 462 | cos(const simd<_Tp, _Abi>& __x) |
| 463 | { |
| 464 | using _V = simd<_Tp, _Abi>; |
| 465 | if constexpr (__is_scalar_abi<_Abi>() || __is_fixed_size_abi_v<_Abi>) |
| 466 | return {__private_init, _Abi::_SimdImpl::_S_cos(__data(__x))}; |
| 467 | else |
| 468 | { |
| 469 | if constexpr (is_same_v<_Tp, float>) |
| 470 | if (_GLIBCXX_SIMD_IS_UNLIKELY(any_of(abs(__x) >= 393382))) |
| 471 | return static_simd_cast<_V>( |
| 472 | cos(static_simd_cast<rebind_simd_t<double, _V>>(__x))); |
| 473 | |
| 474 | const auto __f = __fold_input(__x); |
| 475 | // quadrant | effect |
| 476 | // 0 | cosSeries, + |
| 477 | // 1 | sinSeries, - |
| 478 | // 2 | cosSeries, - |
| 479 | // 3 | sinSeries, + |
| 480 | using namespace std::experimental::__float_bitwise_operators; |
| 481 | const _V __sign_flip |
| 482 | = _V(-0.f) & static_simd_cast<_V>((1 + __f._M_quadrant) << 30); |
| 483 | |
| 484 | const auto __need_cos = (__f._M_quadrant & 1) == 0; |
| 485 | if (_GLIBCXX_SIMD_IS_UNLIKELY(all_of(__need_cos))) |
| 486 | return __sign_flip ^ __cosSeries(__f._M_x); |
| 487 | else if (_GLIBCXX_SIMD_IS_UNLIKELY(none_of(__need_cos))) |
| 488 | return __sign_flip ^ __sinSeries(__f._M_x); |
| 489 | else // some_of(__need_cos) |
| 490 | { |
| 491 | _V __r = __sinSeries(__f._M_x); |
| 492 | where(__need_cos.__cvt(), __r) = __cosSeries(__f._M_x); |
| 493 | return __r ^ __sign_flip; |
| 494 | } |
| 495 | } |
| 496 | } |
| 497 | |
| 498 | template <typename _Tp> |
| 499 | _GLIBCXX_SIMD_ALWAYS_INLINE |
| 500 | enable_if_t<is_floating_point<_Tp>::value, simd<_Tp, simd_abi::scalar>> |
| 501 | cos(simd<_Tp, simd_abi::scalar> __x) |
| 502 | { return std::cos(__data(__x)); } |
| 503 | |
| 504 | //}}} |
| 505 | // sin{{{ |
| 506 | template <typename _Tp, typename _Abi> |
| 507 | enable_if_t<is_floating_point_v<_Tp>, simd<_Tp, _Abi>> |
| 508 | sin(const simd<_Tp, _Abi>& __x) |
| 509 | { |
| 510 | using _V = simd<_Tp, _Abi>; |
| 511 | if constexpr (__is_scalar_abi<_Abi>() || __is_fixed_size_abi_v<_Abi>) |
| 512 | return {__private_init, _Abi::_SimdImpl::_S_sin(__data(__x))}; |
| 513 | else |
| 514 | { |
| 515 | if constexpr (is_same_v<_Tp, float>) |
| 516 | if (_GLIBCXX_SIMD_IS_UNLIKELY(any_of(abs(__x) >= 527449))) |
| 517 | return static_simd_cast<_V>( |
| 518 | sin(static_simd_cast<rebind_simd_t<double, _V>>(__x))); |
| 519 | |
| 520 | const auto __f = __fold_input(__x); |
| 521 | // quadrant | effect |
| 522 | // 0 | sinSeries |
| 523 | // 1 | cosSeries |
| 524 | // 2 | sinSeries, sign flip |
| 525 | // 3 | cosSeries, sign flip |
| 526 | using namespace std::experimental::__float_bitwise_operators; |
| 527 | const auto __sign_flip |
| 528 | = (__x ^ static_simd_cast<_V>(1 - __f._M_quadrant)) & _V(_Tp(-0.)); |
| 529 | |
| 530 | const auto __need_sin = (__f._M_quadrant & 1) == 0; |
| 531 | if (_GLIBCXX_SIMD_IS_UNLIKELY(all_of(__need_sin))) |
| 532 | return __sign_flip ^ __sinSeries(__f._M_x); |
| 533 | else if (_GLIBCXX_SIMD_IS_UNLIKELY(none_of(__need_sin))) |
| 534 | return __sign_flip ^ __cosSeries(__f._M_x); |
| 535 | else // some_of(__need_sin) |
| 536 | { |
| 537 | _V __r = __cosSeries(__f._M_x); |
| 538 | where(__need_sin.__cvt(), __r) = __sinSeries(__f._M_x); |
| 539 | return __sign_flip ^ __r; |
| 540 | } |
| 541 | } |
| 542 | } |
| 543 | |
| 544 | template <typename _Tp> |
| 545 | _GLIBCXX_SIMD_ALWAYS_INLINE |
| 546 | enable_if_t<is_floating_point<_Tp>::value, simd<_Tp, simd_abi::scalar>> |
| 547 | sin(simd<_Tp, simd_abi::scalar> __x) |
| 548 | { return std::sin(__data(__x)); } |
| 549 | |
| 550 | //}}} |
| 551 | _GLIBCXX_SIMD_MATH_CALL_(tan) |
| 552 | _GLIBCXX_SIMD_MATH_CALL_(acosh) |
| 553 | _GLIBCXX_SIMD_MATH_CALL_(asinh) |
| 554 | _GLIBCXX_SIMD_MATH_CALL_(atanh) |
| 555 | _GLIBCXX_SIMD_MATH_CALL_(cosh) |
| 556 | _GLIBCXX_SIMD_MATH_CALL_(sinh) |
| 557 | _GLIBCXX_SIMD_MATH_CALL_(tanh) |
| 558 | // }}} |
| 559 | // exponential functions {{{ |
| 560 | _GLIBCXX_SIMD_MATH_CALL_(exp) |
| 561 | _GLIBCXX_SIMD_MATH_CALL_(exp2) |
| 562 | _GLIBCXX_SIMD_MATH_CALL_(expm1) |
| 563 | |
| 564 | // }}} |
| 565 | // frexp {{{ |
| 566 | #if _GLIBCXX_SIMD_X86INTRIN |
| 567 | template <typename _Tp, size_t _Np> |
| 568 | _SimdWrapper<_Tp, _Np> |
| 569 | __getexp(_SimdWrapper<_Tp, _Np> __x) |
| 570 | { |
| 571 | if constexpr (__have_avx512vl && __is_sse_ps<_Tp, _Np>()) |
| 572 | return __auto_bitcast(_mm_getexp_ps(__to_intrin(__x))); |
| 573 | else if constexpr (__have_avx512f && __is_sse_ps<_Tp, _Np>()) |
| 574 | return __auto_bitcast(_mm512_getexp_ps(__auto_bitcast(__to_intrin(__x)))); |
| 575 | else if constexpr (__have_avx512vl && __is_sse_pd<_Tp, _Np>()) |
| 576 | return _mm_getexp_pd(__x); |
| 577 | else if constexpr (__have_avx512f && __is_sse_pd<_Tp, _Np>()) |
| 578 | return __lo128(_mm512_getexp_pd(__auto_bitcast(__x))); |
| 579 | else if constexpr (__have_avx512vl && __is_avx_ps<_Tp, _Np>()) |
| 580 | return _mm256_getexp_ps(__x); |
| 581 | else if constexpr (__have_avx512f && __is_avx_ps<_Tp, _Np>()) |
| 582 | return __lo256(_mm512_getexp_ps(__auto_bitcast(__x))); |
| 583 | else if constexpr (__have_avx512vl && __is_avx_pd<_Tp, _Np>()) |
| 584 | return _mm256_getexp_pd(__x); |
| 585 | else if constexpr (__have_avx512f && __is_avx_pd<_Tp, _Np>()) |
| 586 | return __lo256(_mm512_getexp_pd(__auto_bitcast(__x))); |
| 587 | else if constexpr (__is_avx512_ps<_Tp, _Np>()) |
| 588 | return _mm512_getexp_ps(__x); |
| 589 | else if constexpr (__is_avx512_pd<_Tp, _Np>()) |
| 590 | return _mm512_getexp_pd(__x); |
| 591 | else |
| 592 | __assert_unreachable<_Tp>(); |
| 593 | } |
| 594 | |
| 595 | template <typename _Tp, size_t _Np> |
| 596 | _SimdWrapper<_Tp, _Np> |
| 597 | __getmant_avx512(_SimdWrapper<_Tp, _Np> __x) |
| 598 | { |
| 599 | if constexpr (__have_avx512vl && __is_sse_ps<_Tp, _Np>()) |
| 600 | return __auto_bitcast(_mm_getmant_ps(__to_intrin(__x), _MM_MANT_NORM_p5_1, |
| 601 | _MM_MANT_SIGN_src)); |
| 602 | else if constexpr (__have_avx512f && __is_sse_ps<_Tp, _Np>()) |
| 603 | return __auto_bitcast(_mm512_getmant_ps(__auto_bitcast(__to_intrin(__x)), |
| 604 | _MM_MANT_NORM_p5_1, |
| 605 | _MM_MANT_SIGN_src)); |
| 606 | else if constexpr (__have_avx512vl && __is_sse_pd<_Tp, _Np>()) |
| 607 | return _mm_getmant_pd(__x, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src); |
| 608 | else if constexpr (__have_avx512f && __is_sse_pd<_Tp, _Np>()) |
| 609 | return __lo128(_mm512_getmant_pd(__auto_bitcast(__x), _MM_MANT_NORM_p5_1, |
| 610 | _MM_MANT_SIGN_src)); |
| 611 | else if constexpr (__have_avx512vl && __is_avx_ps<_Tp, _Np>()) |
| 612 | return _mm256_getmant_ps(__x, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src); |
| 613 | else if constexpr (__have_avx512f && __is_avx_ps<_Tp, _Np>()) |
| 614 | return __lo256(_mm512_getmant_ps(__auto_bitcast(__x), _MM_MANT_NORM_p5_1, |
| 615 | _MM_MANT_SIGN_src)); |
| 616 | else if constexpr (__have_avx512vl && __is_avx_pd<_Tp, _Np>()) |
| 617 | return _mm256_getmant_pd(__x, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src); |
| 618 | else if constexpr (__have_avx512f && __is_avx_pd<_Tp, _Np>()) |
| 619 | return __lo256(_mm512_getmant_pd(__auto_bitcast(__x), _MM_MANT_NORM_p5_1, |
| 620 | _MM_MANT_SIGN_src)); |
| 621 | else if constexpr (__is_avx512_ps<_Tp, _Np>()) |
| 622 | return _mm512_getmant_ps(__x, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src); |
| 623 | else if constexpr (__is_avx512_pd<_Tp, _Np>()) |
| 624 | return _mm512_getmant_pd(__x, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src); |
| 625 | else |
| 626 | __assert_unreachable<_Tp>(); |
| 627 | } |
| 628 | #endif // _GLIBCXX_SIMD_X86INTRIN |
| 629 | |
| 630 | /** |
| 631 | * splits @p __v into exponent and mantissa, the sign is kept with the mantissa |
| 632 | * |
| 633 | * The return value will be in the range [0.5, 1.0[ |
| 634 | * The @p __e value will be an integer defining the power-of-two exponent |
| 635 | */ |
| 636 | template <typename _Tp, typename _Abi> |
| 637 | enable_if_t<is_floating_point_v<_Tp>, simd<_Tp, _Abi>> |
| 638 | frexp(const simd<_Tp, _Abi>& __x, _Samesize<int, simd<_Tp, _Abi>>* __exp) |
| 639 | { |
| 640 | if constexpr (simd_size_v<_Tp, _Abi> == 1) |
| 641 | { |
| 642 | int __tmp; |
| 643 | const auto __r = std::frexp(__x[0], &__tmp); |
| 644 | (*__exp)[0] = __tmp; |
| 645 | return __r; |
| 646 | } |
| 647 | else if constexpr (__is_fixed_size_abi_v<_Abi>) |
| 648 | { |
| 649 | return {__private_init, |
| 650 | _Abi::_SimdImpl::_S_frexp(__data(__x), __data(*__exp))}; |
| 651 | #if _GLIBCXX_SIMD_X86INTRIN |
| 652 | } |
| 653 | else if constexpr (__have_avx512f) |
| 654 | { |
| 655 | constexpr size_t _Np = simd_size_v<_Tp, _Abi>; |
| 656 | constexpr size_t _NI = _Np < 4 ? 4 : _Np; |
| 657 | const auto __v = __data(__x); |
| 658 | const auto __isnonzero |
| 659 | = _Abi::_SimdImpl::_S_isnonzerovalue_mask(__v._M_data); |
| 660 | const _SimdWrapper<int, _NI> __exp_plus1 |
| 661 | = 1 + __convert<_SimdWrapper<int, _NI>>(__getexp(__v))._M_data; |
| 662 | const _SimdWrapper<int, _Np> __e = __wrapper_bitcast<int, _Np>( |
| 663 | _Abi::_CommonImpl::_S_blend(_SimdWrapper<bool, _NI>(__isnonzero), |
| 664 | _SimdWrapper<int, _NI>(), __exp_plus1)); |
| 665 | simd_abi::deduce_t<int, _Np>::_CommonImpl::_S_store(__e, __exp); |
| 666 | return {__private_init, |
| 667 | _Abi::_CommonImpl::_S_blend(_SimdWrapper<bool, _Np>( |
| 668 | __isnonzero), |
| 669 | __v, __getmant_avx512(__v))}; |
| 670 | #endif // _GLIBCXX_SIMD_X86INTRIN |
| 671 | } |
| 672 | else |
| 673 | { |
| 674 | // fallback implementation |
| 675 | static_assert(sizeof(_Tp) == 4 || sizeof(_Tp) == 8); |
| 676 | using _V = simd<_Tp, _Abi>; |
| 677 | using _IV = rebind_simd_t<int, _V>; |
| 678 | using namespace std::experimental::__proposed; |
| 679 | using namespace std::experimental::__float_bitwise_operators; |
| 680 | |
| 681 | constexpr int __exp_adjust = sizeof(_Tp) == 4 ? 0x7e : 0x3fe; |
| 682 | constexpr int __exp_offset = sizeof(_Tp) == 4 ? 0x70 : 0x200; |
| 683 | constexpr _Tp __subnorm_scale = sizeof(_Tp) == 4 ? 0x1p112 : 0x1p512; |
| 684 | _GLIBCXX_SIMD_USE_CONSTEXPR_API _V __exponent_mask |
| 685 | = __infinity_v<_Tp>; // 0x7f800000 or 0x7ff0000000000000 |
| 686 | _GLIBCXX_SIMD_USE_CONSTEXPR_API _V __p5_1_exponent |
| 687 | = -(2 - __epsilon_v<_Tp>) / 2; // 0xbf7fffff or 0xbfefffffffffffff |
| 688 | |
| 689 | _V __mant = __p5_1_exponent & (__exponent_mask | __x); // +/-[.5, 1) |
| 690 | const _IV __exponent_bits = __extract_exponent_as_int(__x); |
| 691 | if (_GLIBCXX_SIMD_IS_LIKELY(all_of(isnormal(__x)))) |
| 692 | { |
| 693 | *__exp |
| 694 | = simd_cast<_Samesize<int, _V>>(__exponent_bits - __exp_adjust); |
| 695 | return __mant; |
| 696 | } |
| 697 | |
| 698 | #if __FINITE_MATH_ONLY__ |
| 699 | // at least one element of __x is 0 or subnormal, the rest is normal |
| 700 | // (inf and NaN are excluded by -ffinite-math-only) |
| 701 | const auto __iszero_inf_nan = __x == 0; |
| 702 | #else |
| 703 | const auto __as_int |
| 704 | = __bit_cast<rebind_simd_t<__int_for_sizeof_t<_Tp>, _V>>(abs(__x)); |
| 705 | const auto __inf |
| 706 | = __bit_cast<rebind_simd_t<__int_for_sizeof_t<_Tp>, _V>>( |
| 707 | _V(__infinity_v<_Tp>)); |
| 708 | const auto __iszero_inf_nan = static_simd_cast<typename _V::mask_type>( |
| 709 | __as_int == 0 || __as_int >= __inf); |
| 710 | #endif |
| 711 | |
| 712 | const _V __scaled_subnormal = __x * __subnorm_scale; |
| 713 | const _V __mant_subnormal |
| 714 | = __p5_1_exponent & (__exponent_mask | __scaled_subnormal); |
| 715 | where(!isnormal(__x), __mant) = __mant_subnormal; |
| 716 | where(__iszero_inf_nan, __mant) = __x; |
| 717 | _IV __e = __extract_exponent_as_int(__scaled_subnormal); |
| 718 | using _MaskType = |
| 719 | typename conditional_t<sizeof(typename _V::value_type) == sizeof(int), |
| 720 | _V, _IV>::mask_type; |
| 721 | const _MaskType __value_isnormal = isnormal(__x).__cvt(); |
| 722 | where(__value_isnormal.__cvt(), __e) = __exponent_bits; |
| 723 | static_assert(sizeof(_IV) == sizeof(__value_isnormal)); |
| 724 | const _IV __offset |
| 725 | = (__bit_cast<_IV>(__value_isnormal) & _IV(__exp_adjust)) |
| 726 | | (__bit_cast<_IV>(static_simd_cast<_MaskType>(__exponent_bits == 0) |
| 727 | & static_simd_cast<_MaskType>(__x != 0)) |
| 728 | & _IV(__exp_adjust + __exp_offset)); |
| 729 | *__exp = simd_cast<_Samesize<int, _V>>(__e - __offset); |
| 730 | return __mant; |
| 731 | } |
| 732 | } |
| 733 | |
| 734 | // }}} |
| 735 | _GLIBCXX_SIMD_MATH_CALL2_(ldexp, int) |
| 736 | _GLIBCXX_SIMD_MATH_CALL_(ilogb) |
| 737 | |
| 738 | // logarithms {{{ |
| 739 | _GLIBCXX_SIMD_MATH_CALL_(log) |
| 740 | _GLIBCXX_SIMD_MATH_CALL_(log10) |
| 741 | _GLIBCXX_SIMD_MATH_CALL_(log1p) |
| 742 | _GLIBCXX_SIMD_MATH_CALL_(log2) |
| 743 | |
| 744 | //}}} |
| 745 | // logb{{{ |
| 746 | template <typename _Tp, typename _Abi> |
| 747 | enable_if_t<is_floating_point<_Tp>::value, simd<_Tp, _Abi>> |
| 748 | logb(const simd<_Tp, _Abi>& __x) |
| 749 | { |
| 750 | constexpr size_t _Np = simd_size_v<_Tp, _Abi>; |
| 751 | if constexpr (_Np == 1) |
| 752 | return std::logb(__x[0]); |
| 753 | else if constexpr (__is_fixed_size_abi_v<_Abi>) |
| 754 | { |
| 755 | return {__private_init, |
| 756 | __data(__x)._M_apply_per_chunk([](auto __impl, auto __xx) { |
| 757 | using _V = typename decltype(__impl)::simd_type; |
| 758 | return __data( |
| 759 | std::experimental::logb(_V(__private_init, __xx))); |
| 760 | })}; |
| 761 | } |
| 762 | #if _GLIBCXX_SIMD_X86INTRIN // {{{ |
| 763 | else if constexpr (__have_avx512vl && __is_sse_ps<_Tp, _Np>()) |
| 764 | return {__private_init, |
| 765 | __auto_bitcast(_mm_getexp_ps(__to_intrin(__as_vector(__x))))}; |
| 766 | else if constexpr (__have_avx512vl && __is_sse_pd<_Tp, _Np>()) |
| 767 | return {__private_init, _mm_getexp_pd(__data(__x))}; |
| 768 | else if constexpr (__have_avx512vl && __is_avx_ps<_Tp, _Np>()) |
| 769 | return {__private_init, _mm256_getexp_ps(__data(__x))}; |
| 770 | else if constexpr (__have_avx512vl && __is_avx_pd<_Tp, _Np>()) |
| 771 | return {__private_init, _mm256_getexp_pd(__data(__x))}; |
| 772 | else if constexpr (__have_avx512f && __is_avx_ps<_Tp, _Np>()) |
| 773 | return {__private_init, |
| 774 | __lo256(_mm512_getexp_ps(__auto_bitcast(__data(__x))))}; |
| 775 | else if constexpr (__have_avx512f && __is_avx_pd<_Tp, _Np>()) |
| 776 | return {__private_init, |
| 777 | __lo256(_mm512_getexp_pd(__auto_bitcast(__data(__x))))}; |
| 778 | else if constexpr (__is_avx512_ps<_Tp, _Np>()) |
| 779 | return {__private_init, _mm512_getexp_ps(__data(__x))}; |
| 780 | else if constexpr (__is_avx512_pd<_Tp, _Np>()) |
| 781 | return {__private_init, _mm512_getexp_pd(__data(__x))}; |
| 782 | #endif // _GLIBCXX_SIMD_X86INTRIN }}} |
| 783 | else |
| 784 | { |
| 785 | using _V = simd<_Tp, _Abi>; |
| 786 | using namespace std::experimental::__proposed; |
| 787 | auto __is_normal = isnormal(__x); |
| 788 | |
      // work on abs(__x) to reflect the return value on Linux for negative
      // inputs (domain-error => implementation-defined value is returned)
      const _V __abs_x = abs(__x);
| 792 | |
| 793 | // __exponent(__x) returns the exponent value (bias removed) as |
| 794 | // simd<_Up> with integral _Up |
| 795 | auto&& __exponent = [](const _V& __v) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
| 796 | using namespace std::experimental::__proposed; |
| 797 | using _IV = rebind_simd_t< |
| 798 | conditional_t<sizeof(_Tp) == sizeof(_LLong), _LLong, int>, _V>; |
| 799 | return (__bit_cast<_IV>(__v) >> (__digits_v<_Tp> - 1)) |
| 800 | - (__max_exponent_v<_Tp> - 1); |
| 801 | }; |
      _V __r = static_simd_cast<_V>(__exponent(__abs_x));
| 803 | if (_GLIBCXX_SIMD_IS_LIKELY(all_of(__is_normal))) |
| 804 | // without corner cases (nan, inf, subnormal, zero) we have our |
| 805 | // answer: |
| 806 | return __r; |
| 807 | const auto __is_zero = __x == 0; |
| 808 | const auto __is_nan = isnan(__x); |
| 809 | const auto __is_inf = isinf(__x); |
| 810 | where(__is_zero, __r) = -__infinity_v<_Tp>; |
| 811 | where(__is_nan, __r) = __x; |
| 812 | where(__is_inf, __r) = __infinity_v<_Tp>; |
| 813 | __is_normal |= __is_zero || __is_nan || __is_inf; |
| 814 | if (all_of(__is_normal)) |
| 815 | // at this point everything but subnormals is handled |
| 816 | return __r; |
      // subnormals repeat the exponent extraction after multiplying the
      // input with a floating-point value that has 112 (0x70) in its exponent
      // (not too big for sp and large enough for dp)
      const _V __scaled = __abs_x * _Tp(0x1p112);
| 821 | _V __scaled_exp = static_simd_cast<_V>(__exponent(__scaled) - 112); |
| 822 | where(__is_normal, __scaled_exp) = __r; |
| 823 | return __scaled_exp; |
| 824 | } |
| 825 | } |
| 826 | |
| 827 | //}}} |
| 828 | template <typename _Tp, typename _Abi> |
| 829 | enable_if_t<is_floating_point_v<_Tp>, simd<_Tp, _Abi>> |
| 830 | modf(const simd<_Tp, _Abi>& __x, simd<_Tp, _Abi>* __iptr) |
| 831 | { |
| 832 | if constexpr (__is_scalar_abi<_Abi>() |
| 833 | || (__is_fixed_size_abi_v< |
| 834 | _Abi> && simd_size_v<_Tp, _Abi> == 1)) |
| 835 | { |
| 836 | _Tp __tmp; |
| 837 | _Tp __r = std::modf(__x[0], &__tmp); |
| 838 | __iptr[0] = __tmp; |
| 839 | return __r; |
| 840 | } |
| 841 | else |
| 842 | { |
| 843 | const auto __integral = trunc(__x); |
| 844 | *__iptr = __integral; |
| 845 | auto __r = __x - __integral; |
| 846 | #if !__FINITE_MATH_ONLY__ |
| 847 | where(isinf(__x), __r) = _Tp(); |
| 848 | #endif |
| 849 | return copysign(__r, __x); |
| 850 | } |
| 851 | } |
| 852 | |
| 853 | _GLIBCXX_SIMD_MATH_CALL2_(scalbn, int) |
| 854 | _GLIBCXX_SIMD_MATH_CALL2_(scalbln, long) |
| 855 | |
| 856 | _GLIBCXX_SIMD_MATH_CALL_(cbrt) |
| 857 | |
| 858 | _GLIBCXX_SIMD_MATH_CALL_(abs) |
| 859 | _GLIBCXX_SIMD_MATH_CALL_(fabs) |
| 860 | |
| 861 | // [parallel.simd.math] only asks for is_floating_point_v<_Tp> and forgot to |
| 862 | // allow signed integral _Tp |
| 863 | template <typename _Tp, typename _Abi> |
| 864 | enable_if_t<!is_floating_point_v<_Tp> && is_signed_v<_Tp>, simd<_Tp, _Abi>> |
| 865 | abs(const simd<_Tp, _Abi>& __x) |
| 866 | { return {__private_init, _Abi::_SimdImpl::_S_abs(__data(__x))}; } |
| 867 | |
| 868 | template <typename _Tp, typename _Abi> |
| 869 | enable_if_t<!is_floating_point_v<_Tp> && is_signed_v<_Tp>, simd<_Tp, _Abi>> |
| 870 | fabs(const simd<_Tp, _Abi>& __x) |
| 871 | { return {__private_init, _Abi::_SimdImpl::_S_abs(__data(__x))}; } |
| 872 | |
| 873 | // the following are overloads for functions in <cstdlib> and not covered by |
| 874 | // [parallel.simd.math]. I don't see much value in making them work, though |
| 875 | /* |
| 876 | template <typename _Abi> simd<long, _Abi> labs(const simd<long, _Abi> &__x) |
| 877 | { return {__private_init, _Abi::_SimdImpl::abs(__data(__x))}; } |
| 878 | |
| 879 | template <typename _Abi> simd<long long, _Abi> llabs(const simd<long long, _Abi> |
| 880 | &__x) |
| 881 | { return {__private_init, _Abi::_SimdImpl::abs(__data(__x))}; } |
| 882 | */ |
| 883 | |
| 884 | #define _GLIBCXX_SIMD_CVTING2(_NAME) \ |
| 885 | template <typename _Tp, typename _Abi> \ |
| 886 | _GLIBCXX_SIMD_INTRINSIC simd<_Tp, _Abi> _NAME( \ |
| 887 | const simd<_Tp, _Abi>& __x, const __type_identity_t<simd<_Tp, _Abi>>& __y) \ |
| 888 | { \ |
| 889 | return _NAME(__x, __y); \ |
| 890 | } \ |
| 891 | \ |
| 892 | template <typename _Tp, typename _Abi> \ |
| 893 | _GLIBCXX_SIMD_INTRINSIC simd<_Tp, _Abi> _NAME( \ |
| 894 | const __type_identity_t<simd<_Tp, _Abi>>& __x, const simd<_Tp, _Abi>& __y) \ |
| 895 | { \ |
| 896 | return _NAME(__x, __y); \ |
| 897 | } |
| 898 | |
| 899 | #define _GLIBCXX_SIMD_CVTING3(_NAME) \ |
| 900 | template <typename _Tp, typename _Abi> \ |
| 901 | _GLIBCXX_SIMD_INTRINSIC simd<_Tp, _Abi> _NAME( \ |
| 902 | const __type_identity_t<simd<_Tp, _Abi>>& __x, const simd<_Tp, _Abi>& __y, \ |
| 903 | const simd<_Tp, _Abi>& __z) \ |
| 904 | { \ |
| 905 | return _NAME(__x, __y, __z); \ |
| 906 | } \ |
| 907 | \ |
| 908 | template <typename _Tp, typename _Abi> \ |
| 909 | _GLIBCXX_SIMD_INTRINSIC simd<_Tp, _Abi> _NAME( \ |
| 910 | const simd<_Tp, _Abi>& __x, const __type_identity_t<simd<_Tp, _Abi>>& __y, \ |
| 911 | const simd<_Tp, _Abi>& __z) \ |
| 912 | { \ |
| 913 | return _NAME(__x, __y, __z); \ |
| 914 | } \ |
| 915 | \ |
| 916 | template <typename _Tp, typename _Abi> \ |
| 917 | _GLIBCXX_SIMD_INTRINSIC simd<_Tp, _Abi> _NAME( \ |
| 918 | const simd<_Tp, _Abi>& __x, const simd<_Tp, _Abi>& __y, \ |
| 919 | const __type_identity_t<simd<_Tp, _Abi>>& __z) \ |
| 920 | { \ |
| 921 | return _NAME(__x, __y, __z); \ |
| 922 | } \ |
| 923 | \ |
| 924 | template <typename _Tp, typename _Abi> \ |
| 925 | _GLIBCXX_SIMD_INTRINSIC simd<_Tp, _Abi> _NAME( \ |
| 926 | const simd<_Tp, _Abi>& __x, const __type_identity_t<simd<_Tp, _Abi>>& __y, \ |
| 927 | const __type_identity_t<simd<_Tp, _Abi>>& __z) \ |
| 928 | { \ |
| 929 | return _NAME(__x, __y, __z); \ |
| 930 | } \ |
| 931 | \ |
| 932 | template <typename _Tp, typename _Abi> \ |
| 933 | _GLIBCXX_SIMD_INTRINSIC simd<_Tp, _Abi> _NAME( \ |
| 934 | const __type_identity_t<simd<_Tp, _Abi>>& __x, const simd<_Tp, _Abi>& __y, \ |
| 935 | const __type_identity_t<simd<_Tp, _Abi>>& __z) \ |
| 936 | { \ |
| 937 | return _NAME(__x, __y, __z); \ |
| 938 | } \ |
| 939 | \ |
| 940 | template <typename _Tp, typename _Abi> \ |
| 941 | _GLIBCXX_SIMD_INTRINSIC simd<_Tp, _Abi> _NAME( \ |
| 942 | const __type_identity_t<simd<_Tp, _Abi>>& __x, \ |
| 943 | const __type_identity_t<simd<_Tp, _Abi>>& __y, const simd<_Tp, _Abi>& __z) \ |
| 944 | { \ |
| 945 | return _NAME(__x, __y, __z); \ |
| 946 | } |
| 947 | |
| 948 | template <typename _R, typename _ToApply, typename _Tp, typename... _Tps> |
| 949 | _GLIBCXX_SIMD_INTRINSIC _R |
| 950 | __fixed_size_apply(_ToApply&& __apply, const _Tp& __arg0, |
| 951 | const _Tps&... __args) |
| 952 | { |
| 953 | return {__private_init, |
| 954 | __data(__arg0)._M_apply_per_chunk( |
| 955 | [&](auto __impl, const auto&... __inner) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
| 956 | using _V = typename decltype(__impl)::simd_type; |
| 957 | return __data(__apply(_V(__private_init, __inner)...)); |
| 958 | }, |
| 959 | __data(__args)...)}; |
| 960 | } |
| 961 | |
| 962 | template <typename _VV> |
| 963 | __remove_cvref_t<_VV> |
| 964 | __hypot(_VV __x, _VV __y) |
| 965 | { |
| 966 | using _V = __remove_cvref_t<_VV>; |
| 967 | using _Tp = typename _V::value_type; |
| 968 | if constexpr (_V::size() == 1) |
| 969 | return std::hypot(_Tp(__x[0]), _Tp(__y[0])); |
| 970 | else if constexpr (__is_fixed_size_abi_v<typename _V::abi_type>) |
| 971 | { |
| 972 | return __fixed_size_apply<_V>([](auto __a, |
| 973 | auto __b) { return hypot(__a, __b); }, |
| 974 | __x, __y); |
| 975 | } |
| 976 | else |
| 977 | { |
| 978 | // A simple solution for _Tp == float would be to cast to double and |
| 979 | // simply calculate sqrt(x²+y²) as it can't over-/underflow anymore with |
| 980 | // dp. It still needs the Annex F fixups though and isn't faster on |
| 981 | // Skylake-AVX512 (not even for SSE and AVX vectors, and really bad for |
| 982 | // AVX-512). |
| 983 | using namespace __float_bitwise_operators; |
| 984 | _V __absx = abs(__x); // no error |
| 985 | _V __absy = abs(__y); // no error |
| 986 | _V __hi = max(__absx, __absy); // no error |
| 987 | _V __lo = min(__absy, __absx); // no error |
| 988 | |
| 989 | // round __hi down to the next power-of-2: |
| 990 | _GLIBCXX_SIMD_USE_CONSTEXPR_API _V __inf(__infinity_v<_Tp>); |
| 991 | |
| 992 | #ifndef __FAST_MATH__ |
| 993 | if constexpr (__have_neon && !__have_neon_a32) |
| 994 | { // With ARMv7 NEON, we have no subnormals and must use slightly |
| 995 | // different strategy |
| 996 | const _V __hi_exp = __hi & __inf; |
| 997 | _V __scale_back = __hi_exp; |
| 998 | // For large exponents (max & max/2) the inversion comes too close |
| 999 | // to subnormals. Subtract 3 from the exponent: |
| 1000 | where(__hi_exp > 1, __scale_back) = __hi_exp * _Tp(0.125); |
| 1001 | // Invert and adjust for the off-by-one error of inversion via xor: |
| 1002 | const _V __scale = (__scale_back ^ __inf) * _Tp(.5); |
| 1003 | const _V __h1 = __hi * __scale; |
| 1004 | const _V __l1 = __lo * __scale; |
| 1005 | _V __r = __scale_back * sqrt(__h1 * __h1 + __l1 * __l1); |
| 1006 | // Fix up hypot(0, 0) to not be NaN: |
| 1007 | where(__hi == 0, __r) = 0; |
| 1008 | return __r; |
| 1009 | } |
| 1010 | #endif |
| 1011 | |
| 1012 | #ifdef __FAST_MATH__ |
| 1013 | // With fast-math, ignore precision of subnormals and inputs from |
| 1014 | // __finite_max_v/2 to __finite_max_v. This removes all |
| 1015 | // branching/masking. |
| 1016 | if constexpr (true) |
| 1017 | #else |
| 1018 | if (_GLIBCXX_SIMD_IS_LIKELY(all_of(isnormal(__x)) |
| 1019 | && all_of(isnormal(__y)))) |
| 1020 | #endif |
| 1021 | { |
| 1022 | const _V __hi_exp = __hi & __inf; |
          // ((__hi + __hi) & __inf) ^ __inf almost works for computing
          // __scale, except when (__hi + __hi) & __inf == __inf, in which
          // case __scale becomes 0 (should be min/2 instead) and thus loses
          // the information from __lo.
| 1028 | #ifdef __FAST_MATH__ |
| 1029 | using _Ip = __int_for_sizeof_t<_Tp>; |
| 1030 | using _IV = rebind_simd_t<_Ip, _V>; |
| 1031 | const auto __as_int = __bit_cast<_IV>(__hi_exp); |
| 1032 | const _V __scale |
| 1033 | = __bit_cast<_V>(2 * __bit_cast<_Ip>(_Tp(1)) - __as_int); |
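          // For a power of two, 2 * __bit_cast<_Ip>(_Tp(1)) - __as_int
          // negates the unbiased exponent in the integer representation,
          // computing 1 / __hi_exp without a division (__hi_exp has zero
          // mantissa and sign bits).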
| 1034 | #else |
| 1035 | const _V __scale = (__hi_exp ^ __inf) * _Tp(.5); |
| 1036 | #endif |
| 1037 | _GLIBCXX_SIMD_USE_CONSTEXPR_API _V __mant_mask |
| 1038 | = __norm_min_v<_Tp> - __denorm_min_v<_Tp>; |
| 1039 | const _V __h1 = (__hi & __mant_mask) | _V(1); |
| 1040 | const _V __l1 = __lo * __scale; |
| 1041 | return __hi_exp * sqrt(__h1 * __h1 + __l1 * __l1); |
| 1042 | } |
| 1043 | else |
| 1044 | { |
          // slower path to support subnormals
          // if __hi is subnormal, avoid scaling by inf & a final mul by 0
          // (which yields NaN): such elements keep the defaults below
          // (__scale = 1 / __norm_min_v, __hi_exp = __norm_min_v)
| 1048 | _V __scale = _V(1 / __norm_min_v<_Tp>); |
| 1049 | // invert exponent w/o error and w/o using the slow divider unit: |
| 1050 | // xor inverts the exponent but off by 1. Multiplication with .5 |
| 1051 | // adjusts for the discrepancy. |
| 1052 | where(__hi >= __norm_min_v<_Tp>, __scale) |
| 1053 | = ((__hi & __inf) ^ __inf) * _Tp(.5); |
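          // E.g. binary32: __hi_exp = 2^(e-127) has exponent field e; XOR
          // with __inf (field 0xFF) gives field 255 - e, i.e. the value
          // 2^(128-e), which is twice 1 / __hi_exp, hence the factor .5.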
| 1054 | // adjust final exponent for subnormal inputs |
| 1055 | _V __hi_exp = __norm_min_v<_Tp>; |
| 1056 | where(__hi >= __norm_min_v<_Tp>, __hi_exp) |
| 1057 | = __hi & __inf; // no error |
| 1058 | _V __h1 = __hi * __scale; // no error |
| 1059 | _V __l1 = __lo * __scale; // no error |
| 1060 | |
| 1061 | // sqrt(x²+y²) = e*sqrt((x/e)²+(y/e)²): |
| 1062 | // this ensures no overflow in the argument to sqrt |
| 1063 | _V __r = __hi_exp * sqrt(__h1 * __h1 + __l1 * __l1); |
| 1064 | #ifdef __STDC_IEC_559__ |
| 1065 | // fixup for Annex F requirements |
| 1066 | // the naive fixup goes like this: |
| 1067 | // |
| 1068 | // where(__l1 == 0, __r) = __hi; |
| 1069 | // where(isunordered(__x, __y), __r) = __quiet_NaN_v<_Tp>; |
| 1070 | // where(isinf(__absx) || isinf(__absy), __r) = __inf; |
| 1071 | // |
| 1072 | // The fixup can be prepared in parallel with the sqrt, requiring a |
| 1073 | // single blend step after hi_exp * sqrt, reducing latency and |
| 1074 | // throughput: |
| 1075 | _V __fixup = __hi; // __lo == 0 |
| 1076 | where(isunordered(__x, __y), __fixup) = __quiet_NaN_v<_Tp>; |
| 1077 | where(isinf(__absx) || isinf(__absy), __fixup) = __inf; |
| 1078 | where(!(__lo == 0 || isunordered(__x, __y) |
| 1079 | || (isinf(__absx) || isinf(__absy))), |
| 1080 | __fixup) |
| 1081 | = __r; |
| 1082 | __r = __fixup; |
| 1083 | #endif |
| 1084 | return __r; |
| 1085 | } |
| 1086 | } |
| 1087 | } |
| 1088 | |
| 1089 | template <typename _Tp, typename _Abi> |
| 1090 | _GLIBCXX_SIMD_INTRINSIC simd<_Tp, _Abi> |
| 1091 | hypot(const simd<_Tp, _Abi>& __x, const simd<_Tp, _Abi>& __y) |
| 1092 | { |
| 1093 | return __hypot<conditional_t<__is_fixed_size_abi_v<_Abi>, |
| 1094 | const simd<_Tp, _Abi>&, simd<_Tp, _Abi>>>(__x, |
| 1095 | __y); |
| 1096 | } |
| 1097 | |
| 1098 | _GLIBCXX_SIMD_CVTING2(hypot) |
| 1099 | |
| 1100 | template <typename _VV> |
| 1101 | __remove_cvref_t<_VV> |
| 1102 | __hypot(_VV __x, _VV __y, _VV __z) |
| 1103 | { |
| 1104 | using _V = __remove_cvref_t<_VV>; |
| 1105 | using _Abi = typename _V::abi_type; |
| 1106 | using _Tp = typename _V::value_type; |
| 1107 | /* FIXME: enable after PR77776 is resolved |
| 1108 | if constexpr (_V::size() == 1) |
| 1109 | return std::hypot(_Tp(__x[0]), _Tp(__y[0]), _Tp(__z[0])); |
| 1110 | else |
| 1111 | */ |
| 1112 | if constexpr (__is_fixed_size_abi_v<_Abi> && _V::size() > 1) |
| 1113 | { |
| 1114 | return __fixed_size_apply<simd<_Tp, _Abi>>( |
| 1115 | [](auto __a, auto __b, auto __c) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
| 1116 | return hypot(__a, __b, __c); |
| 1117 | }, __x, __y, __z); |
| 1118 | } |
| 1119 | else |
| 1120 | { |
| 1121 | using namespace __float_bitwise_operators; |
| 1122 | const _V __absx = abs(__x); // no error |
| 1123 | const _V __absy = abs(__y); // no error |
| 1124 | const _V __absz = abs(__z); // no error |
| 1125 | _V __hi = max(max(__absx, __absy), __absz); // no error |
| 1126 | _V __l0 = min(__absz, max(__absx, __absy)); // no error |
| 1127 | _V __l1 = min(__absy, __absx); // no error |
| 1128 | if constexpr (__digits_v<_Tp> == 64 && __max_exponent_v<_Tp> == 0x4000 |
| 1129 | && __min_exponent_v<_Tp> == -0x3FFD && _V::size() == 1) |
| 1130 | { // Seems like x87 fp80, where bit 63 is always 1 unless subnormal or |
| 1131 | // NaN. In this case the bit-tricks don't work, they require IEC559 |
| 1132 | // binary32 or binary64 format. |
| 1133 | #ifdef __STDC_IEC_559__ |
| 1134 | // fixup for Annex F requirements |
| 1135 | if (isinf(__absx[0]) || isinf(__absy[0]) || isinf(__absz[0])) |
| 1136 | return __infinity_v<_Tp>; |
| 1137 | else if (isunordered(__absx[0], __absy[0] + __absz[0])) |
| 1138 | return __quiet_NaN_v<_Tp>; |
| 1139 | else if (__l0[0] == 0 && __l1[0] == 0) |
| 1140 | return __hi; |
| 1141 | #endif |
| 1142 | _V __hi_exp = __hi; |
| 1143 | const _ULLong __tmp = 0x8000'0000'0000'0000ull; |
| 1144 | __builtin_memcpy(&__data(__hi_exp), &__tmp, 8); |
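          // Overwriting the low 8 bytes sets the significand to just the
          // explicit integer bit while keeping sign and exponent (x86 is
          // little-endian), i.e. __hi_exp is __hi rounded down to a power
          // of two.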
| 1145 | const _V __scale = 1 / __hi_exp; |
| 1146 | __hi *= __scale; |
| 1147 | __l0 *= __scale; |
| 1148 | __l1 *= __scale; |
| 1149 | return __hi_exp * sqrt((__l0 * __l0 + __l1 * __l1) + __hi * __hi); |
| 1150 | } |
| 1151 | else |
| 1152 | { |
| 1153 | // round __hi down to the next power-of-2: |
| 1154 | _GLIBCXX_SIMD_USE_CONSTEXPR_API _V __inf(__infinity_v<_Tp>); |
| 1155 | |
| 1156 | #ifndef __FAST_MATH__ |
| 1157 | if constexpr (_V::size() > 1 && __have_neon && !__have_neon_a32) |
| 1158 | { // With ARMv7 NEON, we have no subnormals and must use slightly |
| 1159 | // different strategy |
| 1160 | const _V __hi_exp = __hi & __inf; |
| 1161 | _V __scale_back = __hi_exp; |
| 1162 | // For large exponents (max & max/2) the inversion comes too |
| 1163 | // close to subnormals. Subtract 3 from the exponent: |
| 1164 | where(__hi_exp > 1, __scale_back) = __hi_exp * _Tp(0.125); |
| 1165 | // Invert and adjust for the off-by-one error of inversion via |
| 1166 | // xor: |
| 1167 | const _V __scale = (__scale_back ^ __inf) * _Tp(.5); |
| 1168 | const _V __h1 = __hi * __scale; |
| 1169 | __l0 *= __scale; |
| 1170 | __l1 *= __scale; |
| 1171 | _V __lo = __l0 * __l0 |
| 1172 | + __l1 * __l1; // add the two smaller values first |
| 1173 | asm("" : "+m" (__lo)); |
| 1174 | _V __r = __scale_back * sqrt(__h1 * __h1 + __lo); |
| 1175 | // Fix up hypot(0, 0, 0) to not be NaN: |
| 1176 | where(__hi == 0, __r) = 0; |
| 1177 | return __r; |
| 1178 | } |
| 1179 | #endif |
| 1180 | |
| 1181 | #ifdef __FAST_MATH__ |
| 1182 | // With fast-math, ignore precision of subnormals and inputs from |
| 1183 | // __finite_max_v/2 to __finite_max_v. This removes all |
| 1184 | // branching/masking. |
| 1185 | if constexpr (true) |
| 1186 | #else |
| 1187 | if (_GLIBCXX_SIMD_IS_LIKELY(all_of(isnormal(__x)) |
| 1188 | && all_of(isnormal(__y)) |
| 1189 | && all_of(isnormal(__z)))) |
| 1190 | #endif |
| 1191 | { |
| 1192 | const _V __hi_exp = __hi & __inf; |
              // ((__hi + __hi) & __inf) ^ __inf almost works for computing
              // __scale, except when (__hi + __hi) & __inf == __inf, in
              // which case __scale becomes 0 (should be min/2 instead) and
              // thus loses the information from __lo.
| 1198 | #ifdef __FAST_MATH__ |
| 1199 | using _Ip = __int_for_sizeof_t<_Tp>; |
| 1200 | using _IV = rebind_simd_t<_Ip, _V>; |
| 1201 | const auto __as_int = __bit_cast<_IV>(__hi_exp); |
| 1202 | const _V __scale |
| 1203 | = __bit_cast<_V>(2 * __bit_cast<_Ip>(_Tp(1)) - __as_int); |
| 1204 | #else |
| 1205 | const _V __scale = (__hi_exp ^ __inf) * _Tp(.5); |
| 1206 | #endif |
| 1207 | constexpr _Tp __mant_mask |
| 1208 | = __norm_min_v<_Tp> - __denorm_min_v<_Tp>; |
| 1209 | const _V __h1 = (__hi & _V(__mant_mask)) | _V(1); |
| 1210 | __l0 *= __scale; |
| 1211 | __l1 *= __scale; |
| 1212 | const _V __lo |
| 1213 | = __l0 * __l0 |
| 1214 | + __l1 * __l1; // add the two smaller values first |
| 1215 | return __hi_exp * sqrt(__lo + __h1 * __h1); |
| 1216 | } |
| 1217 | else |
| 1218 | { |
              // slower path to support subnormals
              // if __hi is subnormal, avoid scaling by inf & a final mul
              // by 0 (which yields NaN): such elements keep the defaults
              // below (__scale = 1 / __norm_min_v, __hi_exp = __norm_min_v)
| 1222 | _V __scale = _V(1 / __norm_min_v<_Tp>); |
| 1223 | // invert exponent w/o error and w/o using the slow divider |
| 1224 | // unit: xor inverts the exponent but off by 1. Multiplication |
| 1225 | // with .5 adjusts for the discrepancy. |
| 1226 | where(__hi >= __norm_min_v<_Tp>, __scale) |
| 1227 | = ((__hi & __inf) ^ __inf) * _Tp(.5); |
| 1228 | // adjust final exponent for subnormal inputs |
| 1229 | _V __hi_exp = __norm_min_v<_Tp>; |
| 1230 | where(__hi >= __norm_min_v<_Tp>, __hi_exp) |
| 1231 | = __hi & __inf; // no error |
| 1232 | _V __h1 = __hi * __scale; // no error |
| 1233 | __l0 *= __scale; // no error |
| 1234 | __l1 *= __scale; // no error |
| 1235 | _V __lo = __l0 * __l0 |
| 1236 | + __l1 * __l1; // add the two smaller values first |
| 1237 | _V __r = __hi_exp * sqrt(__lo + __h1 * __h1); |
| 1238 | #ifdef __STDC_IEC_559__ |
| 1239 | // fixup for Annex F requirements |
| 1240 | _V __fixup = __hi; // __lo == 0 |
| 1241 | // where(__lo == 0, __fixup) = __hi; |
| 1242 | where(isunordered(__x, __y + __z), __fixup) |
| 1243 | = __quiet_NaN_v<_Tp>; |
| 1244 | where(isinf(__absx) || isinf(__absy) || isinf(__absz), __fixup) |
| 1245 | = __inf; |
              // Instead of __lo == 0, the following could depend on
              // __h1² == __h1² + __lo (i.e. __hi is so much larger than the
              // other two inputs that the result is exactly __hi). While
              // this may improve precision, it is likely to reduce
              // efficiency if the ISA has FMAs (because __h1² + __lo is an
              // FMA, but the intermediate __h1² must be kept).
| 1253 | where(!(__lo == 0 || isunordered(__x, __y + __z) |
| 1254 | || isinf(__absx) || isinf(__absy) || isinf(__absz)), |
| 1255 | __fixup) |
| 1256 | = __r; |
| 1257 | __r = __fixup; |
| 1258 | #endif |
| 1259 | return __r; |
| 1260 | } |
| 1261 | } |
| 1262 | } |
| 1263 | } |
| 1264 | |
| 1265 | template <typename _Tp, typename _Abi> |
| 1266 | _GLIBCXX_SIMD_INTRINSIC simd<_Tp, _Abi> |
| 1267 | hypot(const simd<_Tp, _Abi>& __x, const simd<_Tp, _Abi>& __y, |
| 1268 | const simd<_Tp, _Abi>& __z) |
| 1269 | { |
| 1270 | return __hypot<conditional_t<__is_fixed_size_abi_v<_Abi>, |
| 1271 | const simd<_Tp, _Abi>&, simd<_Tp, _Abi>>>(__x, |
| 1272 | __y, |
| 1273 | __z); |
| 1274 | } |
| 1275 | |
| 1276 | _GLIBCXX_SIMD_CVTING3(hypot) |
| 1277 | |
| 1278 | _GLIBCXX_SIMD_MATH_CALL2_(pow, _Tp) |
| 1279 | |
| 1280 | _GLIBCXX_SIMD_MATH_CALL_(sqrt) |
| 1281 | _GLIBCXX_SIMD_MATH_CALL_(erf) |
| 1282 | _GLIBCXX_SIMD_MATH_CALL_(erfc) |
| 1283 | _GLIBCXX_SIMD_MATH_CALL_(lgamma) |
| 1284 | _GLIBCXX_SIMD_MATH_CALL_(tgamma) |
| 1285 | _GLIBCXX_SIMD_MATH_CALL_(ceil) |
| 1286 | _GLIBCXX_SIMD_MATH_CALL_(floor) |
| 1287 | _GLIBCXX_SIMD_MATH_CALL_(nearbyint) |
| 1288 | _GLIBCXX_SIMD_MATH_CALL_(rint) |
| 1289 | _GLIBCXX_SIMD_MATH_CALL_(lrint) |
| 1290 | _GLIBCXX_SIMD_MATH_CALL_(llrint) |
| 1291 | |
| 1292 | _GLIBCXX_SIMD_MATH_CALL_(round) |
| 1293 | _GLIBCXX_SIMD_MATH_CALL_(lround) |
| 1294 | _GLIBCXX_SIMD_MATH_CALL_(llround) |
| 1295 | |
| 1296 | _GLIBCXX_SIMD_MATH_CALL_(trunc) |
| 1297 | |
| 1298 | _GLIBCXX_SIMD_MATH_CALL2_(fmod, _Tp) |
| 1299 | _GLIBCXX_SIMD_MATH_CALL2_(remainder, _Tp) |
| 1300 | _GLIBCXX_SIMD_MATH_CALL3_(remquo, _Tp, int*) |
| 1301 | |
| 1302 | template <typename _Tp, typename _Abi> |
| 1303 | enable_if_t<is_floating_point_v<_Tp>, simd<_Tp, _Abi>> |
| 1304 | copysign(const simd<_Tp, _Abi>& __x, const simd<_Tp, _Abi>& __y) |
| 1305 | { |
| 1306 | if constexpr (simd_size_v<_Tp, _Abi> == 1) |
| 1307 | return std::copysign(__x[0], __y[0]); |
| 1308 | else if constexpr (is_same_v<_Tp, long double> && sizeof(_Tp) == 12) |
| 1309 | // Remove this case once __bit_cast is implemented via __builtin_bit_cast. |
| 1310 | // It is necessary, because __signmask below cannot be computed at compile |
| 1311 | // time. |
| 1312 | return simd<_Tp, _Abi>( |
| 1313 | [&](auto __i) { return std::copysign(__x[__i], __y[__i]); }); |
| 1314 | else |
| 1315 | { |
| 1316 | using _V = simd<_Tp, _Abi>; |
| 1317 | using namespace std::experimental::__float_bitwise_operators; |
| 1318 | _GLIBCXX_SIMD_USE_CONSTEXPR_API auto __signmask = _V(1) ^ _V(-1); |
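      // _V(1) and _V(-1) differ only in the sign bit, so __signmask selects
      // exactly that bit. __x & (__x ^ __signmask) clears it from __x
      // (equivalent to __x & ~__signmask); __y & __signmask supplies the sign.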
| 1319 | return (__x & (__x ^ __signmask)) | (__y & __signmask); |
| 1320 | } |
| 1321 | } |
| 1322 | |
| 1323 | _GLIBCXX_SIMD_MATH_CALL2_(nextafter, _Tp) |
| 1324 | // not covered in [parallel.simd.math]: |
| 1325 | // _GLIBCXX_SIMD_MATH_CALL2_(nexttoward, long double) |
| 1326 | _GLIBCXX_SIMD_MATH_CALL2_(fdim, _Tp) |
| 1327 | _GLIBCXX_SIMD_MATH_CALL2_(fmax, _Tp) |
| 1328 | _GLIBCXX_SIMD_MATH_CALL2_(fmin, _Tp) |
| 1329 | |
| 1330 | _GLIBCXX_SIMD_MATH_CALL3_(fma, _Tp, _Tp) |
| 1331 | _GLIBCXX_SIMD_MATH_CALL_(fpclassify) |
| 1332 | _GLIBCXX_SIMD_MATH_CALL_(isfinite) |
| 1333 | |
| 1334 | // isnan and isinf require special treatment because old glibc may declare |
| 1335 | // `int isinf(double)`. |
| 1336 | template <typename _Tp, typename _Abi, typename..., |
| 1337 | typename _R = _Math_return_type_t<bool, _Tp, _Abi>> |
| 1338 | enable_if_t<is_floating_point_v<_Tp>, _R> |
| 1339 | isinf(simd<_Tp, _Abi> __x) |
| 1340 | { return {__private_init, _Abi::_SimdImpl::_S_isinf(__data(__x))}; } |
| 1341 | |
| 1342 | template <typename _Tp, typename _Abi, typename..., |
| 1343 | typename _R = _Math_return_type_t<bool, _Tp, _Abi>> |
| 1344 | enable_if_t<is_floating_point_v<_Tp>, _R> |
| 1345 | isnan(simd<_Tp, _Abi> __x) |
| 1346 | { return {__private_init, _Abi::_SimdImpl::_S_isnan(__data(__x))}; } |
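
// Illustrative comment (not part of this header): the simd_mask returned by
// isnan/isinf composes with where() for branch-free blending. A minimal
// sketch, assuming the stdx alias:
//
//   #include <experimental/simd>
//   namespace stdx = std::experimental;
//
//   template <typename T, typename Abi>
//   stdx::simd<T, Abi> zero_nans(stdx::simd<T, Abi> v)
//   {
//     where(isnan(v), v) = T(); // assign 0 only in NaN elements
//     return v;
//   }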
| 1347 | |
| 1348 | _GLIBCXX_SIMD_MATH_CALL_(isnormal) |
| 1349 | |
| 1350 | template <typename..., typename _Tp, typename _Abi> |
| 1351 | simd_mask<_Tp, _Abi> |
| 1352 | signbit(simd<_Tp, _Abi> __x) |
| 1353 | { |
| 1354 | if constexpr (is_integral_v<_Tp>) |
| 1355 | { |
| 1356 | if constexpr (is_unsigned_v<_Tp>) |
| 1357 | return simd_mask<_Tp, _Abi>{}; // false |
| 1358 | else |
| 1359 | return __x < 0; |
| 1360 | } |
| 1361 | else |
| 1362 | return {__private_init, _Abi::_SimdImpl::_S_signbit(__data(__x))}; |
| 1363 | } |
| 1364 | |
| 1365 | _GLIBCXX_SIMD_MATH_CALL2_(isgreater, _Tp) |
| 1366 | _GLIBCXX_SIMD_MATH_CALL2_(isgreaterequal, _Tp) |
| 1367 | _GLIBCXX_SIMD_MATH_CALL2_(isless, _Tp) |
| 1368 | _GLIBCXX_SIMD_MATH_CALL2_(islessequal, _Tp) |
| 1369 | _GLIBCXX_SIMD_MATH_CALL2_(islessgreater, _Tp) |
| 1370 | _GLIBCXX_SIMD_MATH_CALL2_(isunordered, _Tp) |
| 1371 | |
| 1372 | /* not covered in [parallel.simd.math] |
| 1373 | template <typename _Abi> __doublev<_Abi> nan(const char* tagp); |
| 1374 | template <typename _Abi> __floatv<_Abi> nanf(const char* tagp); |
| 1375 | template <typename _Abi> __ldoublev<_Abi> nanl(const char* tagp); |
| 1376 | |
| 1377 | template <typename _V> struct simd_div_t { |
| 1378 | _V quot, rem; |
| 1379 | }; |
| 1380 | |
| 1381 | template <typename _Abi> |
simd_div_t<__scharv<_Abi>> div(__scharv<_Abi> numer,
                               __scharv<_Abi> denom);
| 1384 | template <typename _Abi> |
| 1385 | simd_div_t<__shortv<_Abi>> div(__shortv<_Abi> numer, |
| 1386 | __shortv<_Abi> denom); |
| 1387 | template <typename _Abi> |
| 1388 | simd_div_t<__intv<_Abi>> div(__intv<_Abi> numer, __intv<_Abi> denom); |
| 1389 | template <typename _Abi> |
| 1390 | simd_div_t<__longv<_Abi>> div(__longv<_Abi> numer, |
| 1391 | __longv<_Abi> denom); |
| 1392 | template <typename _Abi> |
| 1393 | simd_div_t<__llongv<_Abi>> div(__llongv<_Abi> numer, |
| 1394 | __llongv<_Abi> denom); |
| 1395 | */ |
| 1396 | |
| 1397 | // special math {{{ |
| 1398 | template <typename _Tp, typename _Abi> |
| 1399 | enable_if_t<is_floating_point_v<_Tp>, simd<_Tp, _Abi>> |
| 1400 | assoc_laguerre(const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __n, |
| 1401 | const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __m, |
| 1402 | const simd<_Tp, _Abi>& __x) |
| 1403 | { |
| 1404 | return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
| 1405 | return std::assoc_laguerre(__n[__i], __m[__i], __x[__i]); |
| 1406 | }); |
| 1407 | } |
| 1408 | |
| 1409 | template <typename _Tp, typename _Abi> |
| 1410 | enable_if_t<is_floating_point_v<_Tp>, simd<_Tp, _Abi>> |
| 1411 | assoc_legendre(const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __n, |
| 1412 | const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __m, |
| 1413 | const simd<_Tp, _Abi>& __x) |
| 1414 | { |
| 1415 | return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
| 1416 | return std::assoc_legendre(__n[__i], __m[__i], __x[__i]); |
| 1417 | }); |
| 1418 | } |
| 1419 | |
| 1420 | _GLIBCXX_SIMD_MATH_CALL2_(beta, _Tp) |
| 1421 | _GLIBCXX_SIMD_MATH_CALL_(comp_ellint_1) |
| 1422 | _GLIBCXX_SIMD_MATH_CALL_(comp_ellint_2) |
| 1423 | _GLIBCXX_SIMD_MATH_CALL2_(comp_ellint_3, _Tp) |
| 1424 | _GLIBCXX_SIMD_MATH_CALL2_(cyl_bessel_i, _Tp) |
| 1425 | _GLIBCXX_SIMD_MATH_CALL2_(cyl_bessel_j, _Tp) |
| 1426 | _GLIBCXX_SIMD_MATH_CALL2_(cyl_bessel_k, _Tp) |
| 1427 | _GLIBCXX_SIMD_MATH_CALL2_(cyl_neumann, _Tp) |
| 1428 | _GLIBCXX_SIMD_MATH_CALL2_(ellint_1, _Tp) |
| 1429 | _GLIBCXX_SIMD_MATH_CALL2_(ellint_2, _Tp) |
| 1430 | _GLIBCXX_SIMD_MATH_CALL3_(ellint_3, _Tp, _Tp) |
| 1431 | _GLIBCXX_SIMD_MATH_CALL_(expint) |
| 1432 | |
| 1433 | template <typename _Tp, typename _Abi> |
| 1434 | enable_if_t<is_floating_point_v<_Tp>, simd<_Tp, _Abi>> |
| 1435 | hermite(const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __n, |
| 1436 | const simd<_Tp, _Abi>& __x) |
| 1437 | { |
| 1438 | return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
| 1439 | return std::hermite(__n[__i], __x[__i]); |
| 1440 | }); |
| 1441 | } |
| 1442 | |
| 1443 | template <typename _Tp, typename _Abi> |
| 1444 | enable_if_t<is_floating_point_v<_Tp>, simd<_Tp, _Abi>> |
| 1445 | laguerre(const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __n, |
| 1446 | const simd<_Tp, _Abi>& __x) |
| 1447 | { |
| 1448 | return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
| 1449 | return std::laguerre(__n[__i], __x[__i]); |
| 1450 | }); |
| 1451 | } |
| 1452 | |
| 1453 | template <typename _Tp, typename _Abi> |
| 1454 | enable_if_t<is_floating_point_v<_Tp>, simd<_Tp, _Abi>> |
| 1455 | legendre(const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __n, |
| 1456 | const simd<_Tp, _Abi>& __x) |
| 1457 | { |
| 1458 | return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
| 1459 | return std::legendre(__n[__i], __x[__i]); |
| 1460 | }); |
| 1461 | } |
| 1462 | |
| 1463 | _GLIBCXX_SIMD_MATH_CALL_(riemann_zeta) |
| 1464 | |
| 1465 | template <typename _Tp, typename _Abi> |
| 1466 | enable_if_t<is_floating_point_v<_Tp>, simd<_Tp, _Abi>> |
| 1467 | sph_bessel(const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __n, |
| 1468 | const simd<_Tp, _Abi>& __x) |
| 1469 | { |
| 1470 | return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
| 1471 | return std::sph_bessel(__n[__i], __x[__i]); |
| 1472 | }); |
| 1473 | } |
| 1474 | |
| 1475 | template <typename _Tp, typename _Abi> |
| 1476 | enable_if_t<is_floating_point_v<_Tp>, simd<_Tp, _Abi>> |
| 1477 | sph_legendre(const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __l, |
| 1478 | const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __m, |
const simd<_Tp, _Abi>& __theta)
| 1480 | { |
| 1481 | return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
return std::sph_legendre(__l[__i], __m[__i], __theta[__i]);
| 1483 | }); |
| 1484 | } |
| 1485 | |
| 1486 | template <typename _Tp, typename _Abi> |
| 1487 | enable_if_t<is_floating_point_v<_Tp>, simd<_Tp, _Abi>> |
| 1488 | sph_neumann(const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __n, |
| 1489 | const simd<_Tp, _Abi>& __x) |
| 1490 | { |
| 1491 | return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { |
| 1492 | return std::sph_neumann(__n[__i], __x[__i]); |
| 1493 | }); |
| 1494 | } |
| 1495 | // }}} |
| 1496 | |
| 1497 | #undef _GLIBCXX_SIMD_MATH_CALL_ |
| 1498 | #undef _GLIBCXX_SIMD_MATH_CALL2_ |
| 1499 | #undef _GLIBCXX_SIMD_MATH_CALL3_ |
| 1500 | |
| 1501 | _GLIBCXX_SIMD_END_NAMESPACE |
| 1502 | |
| 1503 | #endif // __cplusplus >= 201703L |
| 1504 | #endif // _GLIBCXX_EXPERIMENTAL_SIMD_MATH_H_ |
| 1505 | |
| 1506 | // vim: foldmethod=marker sw=2 ts=8 noet sts=2 |
| 1507 | |