acosf_advsimd.c source code [glibc/sysdeps/aarch64/fpu/acosf_advsimd.c]

/* Single-precision AdvSIMD inverse cos

   Copyright (C) 2023-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
19
20	#include "v_math.h"
21	#include "poly_advsimd_f32.h"
22
23	static const struct data
24	{
25	float32x4_t poly[`5`];
26	float32x4_t pi_over_2f, pif;
27	} data = {
28	/ Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on*
29	[ 0x1p-24 0x1p-2 ] order = 4 rel error: 0x1.00a23bbp-29 . /*
30	.poly = { V4 (`0x1.55555ep-3`), V4 (`0x1.33261ap-4`), V4 (`0x1.70d7dcp-5`),
31	V4 (`0x1.b059dp-6`), V4 (`0x1.3af7d8p-5`) },
32	.pi_over_2f = V4 (`0x1.921fb6p+0f`),
33	.pif = V4 (`0x1.921fb6p+1f`),
34	};
35
/* IEEE-754 binary32 bit patterns used for integer-domain comparisons on
   the absolute value of the input.  */
#define AbsMask 0x7fffffff /* Every bit except the sign bit.  */
#define Half 0x3f000000    /* 0.5.  */
#define One 0x3f800000     /* 1.0.  */
#define Small 0x32800000   /* 2^-26.  */
40
#if WANT_SIMD_EXCEPT
/* Out-of-line fallback, compiled only when WANT_SIMD_EXCEPT is set.
   Forwards X, Y and the lane mask SPECIAL to v_call_f32 together with the
   scalar acosf.  v_call_f32 is defined outside this file; presumably it
   evaluates acosf on the lanes of X selected by SPECIAL and keeps Y in
   the remaining lanes -- confirm against v_math.h.  */
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
{
  return v_call_f32 (acosf, x, y, special);
}
#endif
48
49	/ Single-precision implementation of vector acos(x).*
50
51	For \|x\| < Small, approximate acos(x) by pi/2 - x. Small = 2^-26 for correct
52	rounding.
53	If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the following
54	approximation.
55
56	For \|x\| in [Small, 0.5], use order 4 polynomial P such that the final
57	approximation of asin is an odd polynomial:
58
59	acos(x) ~ pi/2 - (x + x^3 P(x^2)).
60
61	The largest observed error in this region is 1.26 ulps,
62	_ZGVnN4v_acosf (0x1.843bfcp-2) got 0x1.2e934cp+0 want 0x1.2e934ap+0.
63
64	For \|x\| in [0.5, 1.0], use same approximation with a change of variable
65
66	acos(x) = y + y z * P(z), with z = (1-x)/2 and y = sqrt(z).*
67
68	The largest observed error in this region is 1.32 ulps,
69	_ZGVnN4v_acosf (0x1.15ba56p-1) got 0x1.feb33p-1
70	want 0x1.feb32ep-1. /*
71	float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (acos) (float32x4_t x)
72	{
73	const struct data *d = ptr_barrier (&data);
74
75	uint32x4_t ix = vreinterpretq_u32_f32 (x);
76	uint32x4_t ia = vandq_u32 (ix, v_u32 (AbsMask));
77
78	#if WANT_SIMD_EXCEPT
79	/ A single comparison for One, Small and QNaN. /
80	uint32x4_t special
81	= vcgtq_u32 (vsubq_u32 (ia, v_u32 (Small)), v_u32 (One - Small));
82	if (__glibc_unlikely (v_any_u32 (special)))
83	return special_case (x, x, v_u32 (`0xffffffff`));
84	#endif
85
86	float32x4_t ax = vreinterpretq_f32_u32 (ia);
87	uint32x4_t a_le_half = vcleq_u32 (ia, v_u32 (Half));
88
89	/ Evaluate polynomial Q(x) = z + z * z2 * P(z2) with*
90	z2 = x ^ 2 and z = \|x\| , if \|x\| < 0.5
91	z2 = (1 - \|x\|) / 2 and z = sqrt(z2), if \|x\| >= 0.5. /*
92	float32x4_t z2 = vbslq_f32 (a_le_half, vmulq_f32 (x, x),
93	vfmsq_n_f32 (v_f32 (`0.5`), ax, `0.5`));
94	float32x4_t z = vbslq_f32 (a_le_half, ax, vsqrtq_f32 (z2));
95
96	/ Use a single polynomial approximation P for both intervals. /
97	float32x4_t p = v_horner_4_f32 (z2, d->poly);
98	/ Finalize polynomial: z + z * z2 * P(z2). /
99	p = vfmaq_f32 (z, vmulq_f32 (z, z2), p);
100
101	/ acos(\|x\|) = pi/2 - sign(x) * Q(\|x\|), for \|x\| < 0.5*
102	= 2 Q(\|x\|) , for 0.5 < x < 1.0
103	= pi - 2 Q(\|x\|) , for -1.0 < x < -0.5. /*
104	float32x4_t y = vbslq_f32 (v_u32 (AbsMask), p, x);
105
106	uint32x4_t is_neg = vcltzq_f32 (x);
107	float32x4_t off = vreinterpretq_f32_u32 (
108	vandq_u32 (vreinterpretq_u32_f32 (d->pif), is_neg));
109	float32x4_t mul = vbslq_f32 (a_le_half, v_f32 (-`1.0`), v_f32 (`2.0`));
110	float32x4_t add = vbslq_f32 (a_le_half, d->pi_over_2f, off);
111
112	return vfmaq_f32 (add, mul, y);
113	}
114	libmvec_hidden_def (V_NAME_F1(acos))
115	HALF_WIDTH_ALIAS_F1 (acos)
116

source code of glibc/sysdeps/aarch64/fpu/acosf_advsimd.c