expm1f_advsimd.c source code [glibc/sysdeps/aarch64/fpu/expm1f_advsimd.c]

/* Single-precision AdvSIMD expm1

   Copyright (C) 2023-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
19
20	#include "v_math.h"
21	#include "poly_advsimd_f32.h"
22
23	static const struct data
24	{
25	float32x4_t poly[`5`];
26	float invln2_and_ln2[`4`];
27	float32x4_t shift;
28	int32x4_t exponent_bias;
29	#if WANT_SIMD_EXCEPT
30	uint32x4_t thresh;
31	#else
32	float32x4_t oflow_bound;
33	#endif
34	} data = {
35	/ Generated using fpminimax with degree=5 in [-log(2)/2, log(2)/2]. /
36	.poly = { V4 (`0x1.fffffep-2`), V4 (`0x1.5554aep-3`), V4 (`0x1.555736p-5`),
37	V4 (`0x1.12287cp-7`), V4 (`0x1.6b55a2p-10`) },
38	/ Stores constants: invln2, ln2_hi, ln2_lo, 0. /
39	.invln2_and_ln2 = { `0x1.715476p+0f`, `0x1.62e4p-1f`, `0x1.7f7d1cp-20f`, `0` },
40	.shift = V4 (`0x1.8p23f`),
41	.exponent_bias = V4 (`0x3f800000`),
42	#if !WANT_SIMD_EXCEPT
43	/ Value above which expm1f(x) should overflow. Absolute value of the*
44	underflow bound is greater than this, so it catches both cases - there is
45	a small window where fallbacks are triggered unnecessarily. /*
46	.oflow_bound = V4 (`0x1.5ebc4p+6`),
47	#else
48	/ asuint(oflow_bound) - asuint(0x1p-23), shifted left by 1 for absolute*
49	compare. /*
50	.thresh = V4 (`0x1d5ebc40`),
51	#endif
52	};
53
/* asuint(0x1p-23), shifted by 1 for abs compare.  The shift drops the sign
   bit so the value can be compared against (ix + ix) as an unsigned
   magnitude.  */
#define TinyBound v_u32 (0x34000000 << 1)
56
57	static float32x4_t VPCS_ATTR NOINLINE
58	special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
59	{
60	return v_call_f32 (expm1f, x, y, special);
61	}
62
63	/ Single-precision vector exp(x) - 1 function.*
64	The maximum error is 1.51 ULP:
65	_ZGVnN4v_expm1f (0x1.8baa96p-2) got 0x1.e2fb9p-2
66	want 0x1.e2fb94p-2. /*
67	float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x)
68	{
69	const struct data *d = ptr_barrier (&data);
70	uint32x4_t ix = vreinterpretq_u32_f32 (x);
71
72	#if WANT_SIMD_EXCEPT
73	/ If fp exceptions are to be triggered correctly, fall back to scalar for*
74	\|x\| < 2^-23, \|x\| > oflow_bound, Inf & NaN. Add ix to itself for
75	shift-left by 1, and compare with thresh which was left-shifted offline -
76	this is effectively an absolute compare. /*
77	uint32x4_t special
78	= vcgeq_u32 (vsubq_u32 (vaddq_u32 (ix, ix), TinyBound), d->thresh);
79	if (__glibc_unlikely (v_any_u32 (special)))
80	x = v_zerofy_f32 (x, special);
81	#else
82	/ Handles very large values (+ve and -ve), +/-NaN, +/-Inf. /
83	uint32x4_t special = vcagtq_f32 (x, d->oflow_bound);
84	#endif
85
86	/ Reduce argument to smaller range:*
87	Let i = round(x / ln2)
88	and f = x - i ln2, then f is in [-ln2/2, ln2/2].*
89	exp(x) - 1 = 2^i (expm1(f) + 1) - 1*
90	where 2^i is exact because i is an integer. /*
91	float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2);
92	float32x4_t j
93	= vsubq_f32 (vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, `0`), d->shift);
94	int32x4_t i = vcvtq_s32_f32 (j);
95	float32x4_t f = vfmsq_laneq_f32 (x, j, invln2_and_ln2, `1`);
96	f = vfmsq_laneq_f32 (f, j, invln2_and_ln2, `2`);
97
98	/ Approximate expm1(f) using polynomial.*
99	Taylor expansion for expm1(x) has the form:
100	x + ax^2 + bx^3 + cx^4 ....
101	So we calculate the polynomial P(f) = a + bf + cf^2 + ...
102	and assemble the approximation expm1(f) ~= f + f^2 P(f). /
103	float32x4_t p = v_horner_4_f32 (f, d->poly);
104	p = vfmaq_f32 (f, vmulq_f32 (f, f), p);
105
106	/ Assemble the result.*
107	expm1(x) ~= 2^i (p + 1) - 1*
108	Let t = 2^i. /*
109	int32x4_t u = vaddq_s32 (vshlq_n_s32 (i, `23`), d->exponent_bias);
110	float32x4_t t = vreinterpretq_f32_s32 (u);
111
112	if (__glibc_unlikely (v_any_u32 (special)))
113	return special_case (vreinterpretq_f32_u32 (ix),
114	vfmaq_f32 (vsubq_f32 (t, v_f32 (`1.0f`)), p, t),
115	special);
116
117	/ expm1(x) ~= p * t + (t - 1). /
118	return vfmaq_f32 (vsubq_f32 (t, v_f32 (`1.0f`)), p, t);
119	}
120	libmvec_hidden_def (V_NAME_F1 (expm1))
121	HALF_WIDTH_ALIAS_F1 (expm1)
122

source code of glibc/sysdeps/aarch64/fpu/expm1f_advsimd.c