cos_sve.c source code [glibc/sysdeps/aarch64/fpu/cos_sve.c]

1	/ Double-precision vector (SVE) cos function.*
2
3	Copyright (C) 2023-2024 Free Software Foundation, Inc.
4	This file is part of the GNU C Library.
5
6	The GNU C Library is free software; you can redistribute it and/or
7	modify it under the terms of the GNU Lesser General Public
8	License as published by the Free Software Foundation; either
9	version 2.1 of the License, or (at your option) any later version.
10
11	The GNU C Library is distributed in the hope that it will be useful,
12	but WITHOUT ANY WARRANTY; without even the implied warranty of
13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14	Lesser General Public License for more details.
15
16	You should have received a copy of the GNU Lesser General Public
17	License along with the GNU C Library; if not, see
18	<https://www.gnu.org/licenses/>. /*
19
20	#include "sv_math.h"
21
22	static const struct data
23	{
24	double inv_pio2, pio2_1, pio2_2, pio2_3, shift;
25	} data = {
26	/ Polynomial coefficients are hardwired in FTMAD instructions. /
27	.inv_pio2 = `0x1.45f306dc9c882p-1`,
28	.pio2_1 = `0x1.921fb50000000p+0`,
29	.pio2_2 = `0x1.110b460000000p-26`,
30	.pio2_3 = `0x1.1a62633145c07p-54`,
31	/ Original shift used in AdvSIMD cos,*
32	plus a contribution to set the bit #0 of q
33	as expected by trigonometric instructions. /*
34	.shift = `0x1.8000000000001p52`
35	};
36
37	#define RangeVal 0x4160000000000000 /* asuint64 (0x1p23). */
38
39	static svfloat64_t NOINLINE
40	special_case (svfloat64_t x, svfloat64_t y, svbool_t oob)
41	{
42	return sv_call_f64 (f: cos, x, y, cmp: oob);
43	}
44
45	/ A fast SVE implementation of cos based on trigonometric*
46	instructions (FTMAD, FTSSEL, FTSMUL).
47	Maximum measured error: 2.108 ULPs.
48	SV_NAME_D1 (cos)(0x1.9b0ba158c98f3p+7) got -0x1.fddd4c65c7f07p-3
49	want -0x1.fddd4c65c7f05p-3. /*
50	svfloat64_t SV_NAME_D1 (cos) (svfloat64_t x, const svbool_t pg)
51	{
52	const struct data *d = ptr_barrier (&data);
53
54	svfloat64_t r = svabs_x (pg, x);
55	svbool_t oob = svcmpge (pg, svreinterpret_u64 (r), RangeVal);
56
57	/ Load some constants in quad-word chunks to minimise memory access. /
58	svbool_t ptrue = svptrue_b64 ();
59	svfloat64_t invpio2_and_pio2_1 = svld1rq (ptrue, &d->inv_pio2);
60	svfloat64_t pio2_23 = svld1rq (ptrue, &d->pio2_2);
61
62	/ n = rint(\|x\|/(pi/2)). /
63	svfloat64_t q = svmla_lane (sv_f64 (x: d->shift), r, invpio2_and_pio2_1, `0`);
64	svfloat64_t n = svsub_x (pg, q, d->shift);
65
66	/ r = \|x\| - n(pi/2) (range reduction into -pi/4 .. pi/4). /*
67	r = svmls_lane (r, n, invpio2_and_pio2_1, `1`);
68	r = svmls_lane (r, n, pio2_23, `0`);
69	r = svmls_lane (r, n, pio2_23, `1`);
70
71	/ cos(r) poly approx. /
72	svfloat64_t r2 = svtsmul (r, svreinterpret_u64 (q));
73	svfloat64_t y = sv_f64 (x: `0.0`);
74	y = svtmad (y, r2, `7`);
75	y = svtmad (y, r2, `6`);
76	y = svtmad (y, r2, `5`);
77	y = svtmad (y, r2, `4`);
78	y = svtmad (y, r2, `3`);
79	y = svtmad (y, r2, `2`);
80	y = svtmad (y, r2, `1`);
81	y = svtmad (y, r2, `0`);
82
83	/ Final multiplicative factor: 1.0 or x depending on bit #0 of q. /
84	svfloat64_t f = svtssel (r, svreinterpret_u64 (q));
85
86	if (__glibc_unlikely (svptest_any (pg, oob)))
87	return special_case (x, y: svmul_x (svnot_z (pg, oob), y, f), oob);
88
89	/ Apply factor. /
90	return svmul_x (pg, f, y);
91	}
92

source code of glibc/sysdeps/aarch64/fpu/cos_sve.c