1 | /* Double-precision vector (SVE) cos function. |
2 | |
3 | Copyright (C) 2023-2024 Free Software Foundation, Inc. |
4 | This file is part of the GNU C Library. |
5 | |
6 | The GNU C Library is free software; you can redistribute it and/or |
7 | modify it under the terms of the GNU Lesser General Public |
8 | License as published by the Free Software Foundation; either |
9 | version 2.1 of the License, or (at your option) any later version. |
10 | |
11 | The GNU C Library is distributed in the hope that it will be useful, |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | Lesser General Public License for more details. |
15 | |
16 | You should have received a copy of the GNU Lesser General Public |
17 | License along with the GNU C Library; if not, see |
18 | <https://www.gnu.org/licenses/>. */ |
19 | |
20 | #include "sv_math.h" |
21 | |
22 | static const struct data |
23 | { |
24 | double inv_pio2, pio2_1, pio2_2, pio2_3, shift; |
25 | } data = { |
26 | /* Polynomial coefficients are hardwired in FTMAD instructions. */ |
27 | .inv_pio2 = 0x1.45f306dc9c882p-1, |
28 | .pio2_1 = 0x1.921fb50000000p+0, |
29 | .pio2_2 = 0x1.110b460000000p-26, |
30 | .pio2_3 = 0x1.1a62633145c07p-54, |
31 | /* Original shift used in AdvSIMD cos, |
32 | plus a contribution to set the bit #0 of q |
33 | as expected by trigonometric instructions. */ |
34 | .shift = 0x1.8000000000001p52 |
35 | }; |
36 | |
37 | #define RangeVal 0x4160000000000000 /* asuint64 (0x1p23). */ |
38 | |
39 | static svfloat64_t NOINLINE |
40 | special_case (svfloat64_t x, svfloat64_t y, svbool_t oob) |
41 | { |
42 | return sv_call_f64 (f: cos, x, y, cmp: oob); |
43 | } |
44 | |
45 | /* A fast SVE implementation of cos based on trigonometric |
46 | instructions (FTMAD, FTSSEL, FTSMUL). |
47 | Maximum measured error: 2.108 ULPs. |
48 | SV_NAME_D1 (cos)(0x1.9b0ba158c98f3p+7) got -0x1.fddd4c65c7f07p-3 |
49 | want -0x1.fddd4c65c7f05p-3. */ |
50 | svfloat64_t SV_NAME_D1 (cos) (svfloat64_t x, const svbool_t pg) |
51 | { |
52 | const struct data *d = ptr_barrier (&data); |
53 | |
54 | svfloat64_t r = svabs_x (pg, x); |
55 | svbool_t oob = svcmpge (pg, svreinterpret_u64 (r), RangeVal); |
56 | |
57 | /* Load some constants in quad-word chunks to minimise memory access. */ |
58 | svbool_t ptrue = svptrue_b64 (); |
59 | svfloat64_t invpio2_and_pio2_1 = svld1rq (ptrue, &d->inv_pio2); |
60 | svfloat64_t pio2_23 = svld1rq (ptrue, &d->pio2_2); |
61 | |
62 | /* n = rint(|x|/(pi/2)). */ |
63 | svfloat64_t q = svmla_lane (sv_f64 (x: d->shift), r, invpio2_and_pio2_1, 0); |
64 | svfloat64_t n = svsub_x (pg, q, d->shift); |
65 | |
66 | /* r = |x| - n*(pi/2) (range reduction into -pi/4 .. pi/4). */ |
67 | r = svmls_lane (r, n, invpio2_and_pio2_1, 1); |
68 | r = svmls_lane (r, n, pio2_23, 0); |
69 | r = svmls_lane (r, n, pio2_23, 1); |
70 | |
71 | /* cos(r) poly approx. */ |
72 | svfloat64_t r2 = svtsmul (r, svreinterpret_u64 (q)); |
73 | svfloat64_t y = sv_f64 (x: 0.0); |
74 | y = svtmad (y, r2, 7); |
75 | y = svtmad (y, r2, 6); |
76 | y = svtmad (y, r2, 5); |
77 | y = svtmad (y, r2, 4); |
78 | y = svtmad (y, r2, 3); |
79 | y = svtmad (y, r2, 2); |
80 | y = svtmad (y, r2, 1); |
81 | y = svtmad (y, r2, 0); |
82 | |
83 | /* Final multiplicative factor: 1.0 or x depending on bit #0 of q. */ |
84 | svfloat64_t f = svtssel (r, svreinterpret_u64 (q)); |
85 | |
86 | if (__glibc_unlikely (svptest_any (pg, oob))) |
87 | return special_case (x, y: svmul_x (svnot_z (pg, oob), y, f), oob); |
88 | |
89 | /* Apply factor. */ |
90 | return svmul_x (pg, f, y); |
91 | } |
92 | |