| 1 | /* Double-precision vector (SVE) cos function. |
| 2 | |
| 3 | Copyright (C) 2023-2024 Free Software Foundation, Inc. |
| 4 | This file is part of the GNU C Library. |
| 5 | |
| 6 | The GNU C Library is free software; you can redistribute it and/or |
| 7 | modify it under the terms of the GNU Lesser General Public |
| 8 | License as published by the Free Software Foundation; either |
| 9 | version 2.1 of the License, or (at your option) any later version. |
| 10 | |
| 11 | The GNU C Library is distributed in the hope that it will be useful, |
| 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 14 | Lesser General Public License for more details. |
| 15 | |
| 16 | You should have received a copy of the GNU Lesser General Public |
| 17 | License along with the GNU C Library; if not, see |
| 18 | <https://www.gnu.org/licenses/>. */ |
| 19 | |
| 20 | #include "sv_math.h" |
| 21 | |
| 22 | static const struct data |
| 23 | { |
| 24 | double inv_pio2, pio2_1, pio2_2, pio2_3, shift; |
| 25 | } data = { |
| 26 | /* Polynomial coefficients are hardwired in FTMAD instructions. */ |
| 27 | .inv_pio2 = 0x1.45f306dc9c882p-1, |
| 28 | .pio2_1 = 0x1.921fb50000000p+0, |
| 29 | .pio2_2 = 0x1.110b460000000p-26, |
| 30 | .pio2_3 = 0x1.1a62633145c07p-54, |
| 31 | /* Original shift used in AdvSIMD cos, |
| 32 | plus a contribution to set the bit #0 of q |
| 33 | as expected by trigonometric instructions. */ |
| 34 | .shift = 0x1.8000000000001p52 |
| 35 | }; |
| 36 | |
| 37 | #define RangeVal 0x4160000000000000 /* asuint64 (0x1p23). */ |
| 38 | |
| 39 | static svfloat64_t NOINLINE |
| 40 | special_case (svfloat64_t x, svfloat64_t y, svbool_t oob) |
| 41 | { |
| 42 | return sv_call_f64 (f: cos, x, y, cmp: oob); |
| 43 | } |
| 44 | |
| 45 | /* A fast SVE implementation of cos based on trigonometric |
| 46 | instructions (FTMAD, FTSSEL, FTSMUL). |
| 47 | Maximum measured error: 2.108 ULPs. |
| 48 | SV_NAME_D1 (cos)(0x1.9b0ba158c98f3p+7) got -0x1.fddd4c65c7f07p-3 |
| 49 | want -0x1.fddd4c65c7f05p-3. */ |
| 50 | svfloat64_t SV_NAME_D1 (cos) (svfloat64_t x, const svbool_t pg) |
| 51 | { |
| 52 | const struct data *d = ptr_barrier (&data); |
| 53 | |
| 54 | svfloat64_t r = svabs_x (pg, x); |
| 55 | svbool_t oob = svcmpge (pg, svreinterpret_u64 (r), RangeVal); |
| 56 | |
| 57 | /* Load some constants in quad-word chunks to minimise memory access. */ |
| 58 | svbool_t ptrue = svptrue_b64 (); |
| 59 | svfloat64_t invpio2_and_pio2_1 = svld1rq (ptrue, &d->inv_pio2); |
| 60 | svfloat64_t pio2_23 = svld1rq (ptrue, &d->pio2_2); |
| 61 | |
| 62 | /* n = rint(|x|/(pi/2)). */ |
| 63 | svfloat64_t q = svmla_lane (sv_f64 (x: d->shift), r, invpio2_and_pio2_1, 0); |
| 64 | svfloat64_t n = svsub_x (pg, q, d->shift); |
| 65 | |
| 66 | /* r = |x| - n*(pi/2) (range reduction into -pi/4 .. pi/4). */ |
| 67 | r = svmls_lane (r, n, invpio2_and_pio2_1, 1); |
| 68 | r = svmls_lane (r, n, pio2_23, 0); |
| 69 | r = svmls_lane (r, n, pio2_23, 1); |
| 70 | |
| 71 | /* cos(r) poly approx. */ |
| 72 | svfloat64_t r2 = svtsmul (r, svreinterpret_u64 (q)); |
| 73 | svfloat64_t y = sv_f64 (x: 0.0); |
| 74 | y = svtmad (y, r2, 7); |
| 75 | y = svtmad (y, r2, 6); |
| 76 | y = svtmad (y, r2, 5); |
| 77 | y = svtmad (y, r2, 4); |
| 78 | y = svtmad (y, r2, 3); |
| 79 | y = svtmad (y, r2, 2); |
| 80 | y = svtmad (y, r2, 1); |
| 81 | y = svtmad (y, r2, 0); |
| 82 | |
| 83 | /* Final multiplicative factor: 1.0 or x depending on bit #0 of q. */ |
| 84 | svfloat64_t f = svtssel (r, svreinterpret_u64 (q)); |
| 85 | |
| 86 | if (__glibc_unlikely (svptest_any (pg, oob))) |
| 87 | return special_case (x, y: svmul_x (svnot_z (pg, oob), y, f), oob); |
| 88 | |
| 89 | /* Apply factor. */ |
| 90 | return svmul_x (pg, f, y); |
| 91 | } |
| 92 | |