log1p_sve.c source code [glibc/sysdeps/aarch64/fpu/log1p_sve.c]

/* Double-precision SVE log1p

   Copyright (C) 2023-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
19
20	#include "sv_math.h"
21	#include "poly_sve_f64.h"
22
/* Constants used by the vector log1p routine, kept together in one
   read-only object.  */
static const struct data
{
  /* Polynomial coefficients C0..C18 (see the initialiser below).  */
  double poly[19];
  /* log(2) split into a high part and a low-order correction.  */
  double ln2_hi, ln2_lo;
  /* Bit patterns used for the exponent manipulation and for detecting
     special inputs.  */
  uint64_t hfrt2_top, onemhfrt2_top, inf, mone;
} data = {
  /* Generated using Remez in [ sqrt(2)/2 - 1, sqrt(2) - 1].  Order 20
     polynomial, however first 2 coefficients are 0 and 1 so are not
     stored.  */
  .poly = { -0x1.ffffffffffffbp-2, 0x1.55555555551a9p-2, -0x1.00000000008e3p-2,
            0x1.9999999a32797p-3, -0x1.555555552fecfp-3, 0x1.249248e071e5ap-3,
            -0x1.ffffff8bf8482p-4, 0x1.c71c8f07da57ap-4, -0x1.9999ca4ccb617p-4,
            0x1.7459ad2e1dfa3p-4, -0x1.554d2680a3ff2p-4, 0x1.3b4c54d487455p-4,
            -0x1.2548a9ffe80e6p-4, 0x1.0f389a24b2e07p-4, -0x1.eee4db15db335p-5,
            0x1.e95b494d4a5ddp-5, -0x1.15fdf07cb7c73p-4, 0x1.0310b70800fcfp-4,
            -0x1.cfa7385bdb37ep-6, },
  .ln2_hi = 0x1.62e42fefa3800p-1,
  .ln2_lo = 0x1.ef35793c76730p-45,
  /* top32(asuint64(sqrt(2)/2)) << 32.  */
  .hfrt2_top = 0x3fe6a09e00000000,
  /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) << 32.  */
  .onemhfrt2_top = 0x00095f6200000000,
  /* asuint64(+infinity).  */
  .inf = 0x7ff0000000000000,
  /* asuint64(-1.0).  */
  .mone = 0xbff0000000000000,
};
47
/* Clears the sign bit of a 64-bit floating-point bit pattern.  */
#define AbsMask 0x7fffffffffffffff
/* Selects the low 32 bits of a 64-bit word.  */
#define BottomMask 0xffffffff
50
51	static svfloat64_t NOINLINE
52	special_case (svbool_t special, svfloat64_t x, svfloat64_t y)
53	{
54	return sv_call_f64 (f: log1p, x, y, cmp: special);
55	}
56
57	/ Vector approximation for log1p using polynomial on reduced interval. Maximum*
58	observed error is 2.46 ULP:
59	_ZGVsMxv_log1p(0x1.654a1307242a4p+11) got 0x1.fd5565fb590f4p+2
60	want 0x1.fd5565fb590f6p+2. /*
61	svfloat64_t SV_NAME_D1 (log1p) (svfloat64_t x, svbool_t pg)
62	{
63	const struct data *d = ptr_barrier (&data);
64	svuint64_t ix = svreinterpret_u64 (x);
65	svuint64_t ax = svand_x (pg, ix, AbsMask);
66	svbool_t special
67	= svorr_z (pg, svcmpge (pg, ax, d->inf), svcmpge (pg, ix, d->mone));
68
69	/ With x + 1 = t * 2^k (where t = f + 1 and k is chosen such that f*
70	is in [sqrt(2)/2, sqrt(2)]):
71	log1p(x) = klog(2) + log1p(f).*
72
73	f may not be representable exactly, so we need a correction term:
74	let m = round(1 + x), c = (1 + x) - m.
75	c << m: at very small x, log1p(x) ~ x, hence:
76	log(1+x) - log(m) ~ c/m.
77
78	We therefore calculate log1p(x) by klog2 + log1p(f) + c/m. /
79
80	/ Obtain correctly scaled k by manipulation in the exponent.*
81	The scalar algorithm casts down to 32-bit at this point to calculate k and
82	u_red. We stay in double-width to obtain f and k, using the same constants
83	as the scalar algorithm but shifted left by 32. /*
84	svfloat64_t m = svadd_x (pg, x, `1`);
85	svuint64_t mi = svreinterpret_u64 (m);
86	svuint64_t u = svadd_x (pg, mi, d->onemhfrt2_top);
87
88	svint64_t ki = svsub_x (pg, svreinterpret_s64 (svlsr_x (pg, u, `52`)), `0x3ff`);
89	svfloat64_t k = svcvt_f64_x (pg, ki);
90
91	/ Reduce x to f in [sqrt(2)/2, sqrt(2)]. /
92	svuint64_t utop
93	= svadd_x (pg, svand_x (pg, u, `0x000fffff00000000`), d->hfrt2_top);
94	svuint64_t u_red = svorr_x (pg, utop, svand_x (pg, mi, BottomMask));
95	svfloat64_t f = svsub_x (pg, svreinterpret_f64 (u_red), `1`);
96
97	/ Correction term c/m. /
98	svfloat64_t cm = svdiv_x (pg, svsub_x (pg, x, svsub_x (pg, m, `1`)), m);
99
100	/ Approximate log1p(x) on the reduced input using a polynomial. Because*
101	log1p(0)=0 we choose an approximation of the form:
102	x + C0x^2 + C1x^3 + C2x^4 + ...
103	Hence approximation has the form f + f^2 P(f)*
104	where P(x) = C0 + C1x + C2x^2 + ...*
105	Assembling this all correctly is dealt with at the final step. /*
106	svfloat64_t f2 = svmul_x (pg, f, f), f4 = svmul_x (pg, f2, f2),
107	f8 = svmul_x (pg, f4, f4), f16 = svmul_x (pg, f8, f8);
108	svfloat64_t p = sv_estrin_18_f64_x (pg, x: f, x2: f2, x4: f4, x8: f8, x16: f16, poly: d->poly);
109
110	svfloat64_t ylo = svmla_x (pg, cm, k, d->ln2_lo);
111	svfloat64_t yhi = svmla_x (pg, f, k, d->ln2_hi);
112	svfloat64_t y = svmla_x (pg, svadd_x (pg, ylo, yhi), f2, p);
113
114	if (__glibc_unlikely (svptest_any (pg, special)))
115	return special_case (special, x, y);
116
117	return y;
118	}
119

source code of glibc/sysdeps/aarch64/fpu/log1p_sve.c