cosf_sve.c source code [glibc/sysdeps/aarch64/fpu/cosf_sve.c]

/* Single-precision vector (SVE) cos function.

   Copyright (C) 2023-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
19
20	#include "sv_math.h"
21
/* Constants used by the vector cos routine.

   The first four members are laid out contiguously so that they can be
   fetched with a single quad-word load and then addressed by lane index:
     lane 0: neg_pio2_1  \
     lane 1: neg_pio2_2   > three-part split whose sum approximates -pi/2
     lane 2: neg_pio2_3  /
     lane 3: inv_pio2      approximately 2/pi
   Do not reorder these members without updating the lane indices used
   by the code that consumes them.  */
static const struct data
{
  float neg_pio2_1, neg_pio2_2, neg_pio2_3, inv_pio2, shift;
} data = {
  /* Polynomial coefficients are hard-wired in FTMAD instructions.  */
  .neg_pio2_1 = -0x1.921fb6p+0f,
  .neg_pio2_2 = 0x1.777a5cp-25f,
  .neg_pio2_3 = 0x1.ee59dap-50f,
  .inv_pio2 = 0x1.45f306p-1f,
  /* Original shift used in AdvSIMD cosf,
     plus a contribution to set the bit #0 of q
     as expected by trigonometric instructions.
     Numerically this is 1.5 * 2^23 + 1.  */
  .shift = 0x1.800002p+23f
};

/* Inputs whose absolute value has a bit pattern at or above this
   threshold are handled by the scalar fallback.  */
#define RangeVal 0x49800000 /* asuint32(0x1p20f).  */
38
39	static svfloat32_t NOINLINE
40	special_case (svfloat32_t x, svfloat32_t y, svbool_t oob)
41	{
42	return sv_call_f32 (f: cosf, x, y, cmp: oob);
43	}
44
45	/ A fast SVE implementation of cosf based on trigonometric*
46	instructions (FTMAD, FTSSEL, FTSMUL).
47	Maximum measured error: 2.06 ULPs.
48	SV_NAME_F1 (cos)(0x1.dea2f2p+19) got 0x1.fffe7ap-6
49	want 0x1.fffe76p-6. /*
50	svfloat32_t SV_NAME_F1 (cos) (svfloat32_t x, const svbool_t pg)
51	{
52	const struct data *d = ptr_barrier (&data);
53
54	svfloat32_t r = svabs_x (pg, x);
55	svbool_t oob = svcmpge (pg, svreinterpret_u32 (r), RangeVal);
56
57	/ Load some constants in quad-word chunks to minimise memory access. /
58	svfloat32_t negpio2_and_invpio2 = svld1rq (svptrue_b32 (), &d->neg_pio2_1);
59
60	/ n = rint(\|x\|/(pi/2)). /
61	svfloat32_t q = svmla_lane (sv_f32 (x: d->shift), r, negpio2_and_invpio2, `3`);
62	svfloat32_t n = svsub_x (pg, q, d->shift);
63
64	/ r = \|x\| - n(pi/2) (range reduction into -pi/4 .. pi/4). /*
65	r = svmla_lane (r, n, negpio2_and_invpio2, `0`);
66	r = svmla_lane (r, n, negpio2_and_invpio2, `1`);
67	r = svmla_lane (r, n, negpio2_and_invpio2, `2`);
68
69	/ Final multiplicative factor: 1.0 or x depending on bit #0 of q. /
70	svfloat32_t f = svtssel (r, svreinterpret_u32 (q));
71
72	/ cos(r) poly approx. /
73	svfloat32_t r2 = svtsmul (r, svreinterpret_u32 (q));
74	svfloat32_t y = sv_f32 (x: `0.0f`);
75	y = svtmad (y, r2, `4`);
76	y = svtmad (y, r2, `3`);
77	y = svtmad (y, r2, `2`);
78	y = svtmad (y, r2, `1`);
79	y = svtmad (y, r2, `0`);
80
81	if (__glibc_unlikely (svptest_any (pg, oob)))
82	return special_case (x, y: svmul_x (svnot_z (pg, oob), f, y), oob);
83	/ Apply factor. /
84	return svmul_x (pg, f, y);
85	}
86

source code of glibc/sysdeps/aarch64/fpu/cosf_sve.c