expf_sve.c source code [glibc/sysdeps/aarch64/fpu/expf_sve.c]

/* Single-precision vector (SVE) exp function.

   Copyright (C) 2023-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
19
20	#include "sv_math.h"
21
/* Constants for the SVE expf routine.  inv_ln2, ln2_hi, ln2_lo and shift
   must stay adjacent and in this order: the routine loads the four floats
   starting at inv_ln2 as a single quad-word and selects the first three by
   lane index (0, 1, 2).  */
static const struct data
{
  float poly[5];
  float inv_ln2, ln2_hi, ln2_lo, shift, thres;
} data = {
  /* Coefficients copied from the polynomial in AdvSIMD variant, reversed for
     compatibility with polynomial helpers.  */
  .poly = { 0x1.ffffecp-1f, 0x1.fffdb6p-2f, 0x1.555e66p-3f, 0x1.573e2ep-5f,
	    0x1.0e4020p-7f },
  /* 1/ln2, and ln2 split into a high part and a low correction so that
     x - n*ln2 can be formed in two multiply-subtract steps.  */
  .inv_ln2 = 0x1.715476p+0f,
  .ln2_hi = 0x1.62e4p-1f,
  .ln2_lo = 0x1.7f7d1cp-20f,
  /* Nominally 1.5*2^17 + 127.  NOTE(review): the literal as written equals
     204927 = 1.5*2^17 + 2^13 + 127; confirm the extra 2^13 is intentional
     (it lies above the bit-field documented as consumed by FEXPA).  */
  .shift = 0x1.903f8p17f,
  /* Roughly 87.3.  For x < -Thres, the result is subnormal and not handled
     correctly by FEXPA.  */
  .thres = 0x1.5d5e2ap+6f,
};
40
/* C(i): polynomial coefficient i passed through sv_f32 (presumably a
   broadcast of the scalar to every lane -- confirm in sv_math.h).  The
   expansion uses a pointer named `d', so the macro is only usable where such
   a pointer to the constants table is in scope.  */
#define C(i) sv_f32 (d->poly[i])
/* 0x3f800000 is the IEEE-754 binary32 bit pattern of 1.0f.  Not referenced
   by any code visible in this file.  */
#define ExponentBias 0x3f800000
43
44	static svfloat32_t NOINLINE
45	special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
46	{
47	return sv_call_f32 (f: expf, x, y, cmp: special);
48	}
49
50	/ Optimised single-precision SVE exp function.*
51	Worst-case error is 1.04 ulp:
52	SV_NAME_F1 (exp)(0x1.a8eda4p+1) got 0x1.ba74bcp+4
53	want 0x1.ba74bap+4. /*
54	svfloat32_t SV_NAME_F1 (exp) (svfloat32_t x, const svbool_t pg)
55	{
56	const struct data *d = ptr_barrier (&data);
57
58	/ exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]*
59	x = ln2n + r, with r in [-ln2/2, ln2/2]. /
60
61	/ Load some constants in quad-word chunks to minimise memory access (last*
62	lane is wasted). /*
63	svfloat32_t invln2_and_ln2 = svld1rq (svptrue_b32 (), &d->inv_ln2);
64
65	/ n = round(x/(ln2/N)). /
66	svfloat32_t z = svmla_lane (sv_f32 (x: d->shift), x, invln2_and_ln2, `0`);
67	svfloat32_t n = svsub_x (pg, z, d->shift);
68
69	/ r = x - nln2/N. /*
70	svfloat32_t r = svmls_lane (x, n, invln2_and_ln2, `1`);
71	r = svmls_lane (r, n, invln2_and_ln2, `2`);
72
73	/ scale = 2^(n/N). /
74	svbool_t is_special_case = svacgt (pg, x, d->thres);
75	svfloat32_t scale = svexpa (svreinterpret_u32 (z));
76
77	/ y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. /
78	svfloat32_t p12 = svmla_x (pg, C (`1`), C (`2`), r);
79	svfloat32_t p34 = svmla_x (pg, C (`3`), C (`4`), r);
80	svfloat32_t r2 = svmul_x (pg, r, r);
81	svfloat32_t p14 = svmla_x (pg, p12, p34, r2);
82	svfloat32_t p0 = svmul_x (pg, r, C (`0`));
83	svfloat32_t poly = svmla_x (pg, p0, r2, p14);
84
85	if (__glibc_unlikely (svptest_any (pg, is_special_case)))
86	return special_case (x, y: svmla_x (pg, scale, scale, poly), special: is_special_case);
87
88	return svmla_x (pg, scale, scale, poly);
89	}
90

source code of glibc/sysdeps/aarch64/fpu/expf_sve.c