1 | /* Single-precision vector (SVE) exp10 function. |
2 | |
3 | Copyright (C) 2023-2024 Free Software Foundation, Inc. |
4 | This file is part of the GNU C Library. |
5 | |
6 | The GNU C Library is free software; you can redistribute it and/or |
7 | modify it under the terms of the GNU Lesser General Public |
8 | License as published by the Free Software Foundation; either |
9 | version 2.1 of the License, or (at your option) any later version. |
10 | |
11 | The GNU C Library is distributed in the hope that it will be useful, |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | Lesser General Public License for more details. |
15 | |
16 | You should have received a copy of the GNU Lesser General Public |
17 | License along with the GNU C Library; if not, see |
18 | <https://www.gnu.org/licenses/>. */ |
19 | |
20 | #include "sv_math.h" |
21 | #include "poly_sve_f32.h" |
22 | |
23 | /* For x < -SpecialBound, the result is subnormal and not handled correctly by |
24 | FEXPA. */ |
25 | #define SpecialBound 37.9 |
26 | |
27 | static const struct data |
28 | { |
29 | float poly[5]; |
30 | float shift, log10_2, log2_10_hi, log2_10_lo, special_bound; |
31 | } data = { |
32 | /* Coefficients generated using Remez algorithm with minimisation of relative |
33 | error. |
34 | rel error: 0x1.89dafa3p-24 |
35 | abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2] |
36 | maxerr: 0.52 +0.5 ulp. */ |
37 | .poly = { 0x1.26bb16p+1f, 0x1.5350d2p+1f, 0x1.04744ap+1f, 0x1.2d8176p+0f, |
38 | 0x1.12b41ap-1f }, |
39 | /* 1.5*2^17 + 127, a shift value suitable for FEXPA. */ |
40 | .shift = 0x1.903f8p17f, |
41 | .log10_2 = 0x1.a934fp+1, |
42 | .log2_10_hi = 0x1.344136p-2, |
43 | .log2_10_lo = -0x1.ec10cp-27, |
44 | .special_bound = SpecialBound, |
45 | }; |
46 | |
47 | static svfloat32_t NOINLINE |
48 | special_case (svfloat32_t x, svfloat32_t y, svbool_t special) |
49 | { |
50 | return sv_call_f32 (exp10f, x, y, special); |
51 | } |
52 | |
53 | /* Single-precision SVE exp10f routine. Implements the same algorithm |
54 | as AdvSIMD exp10f. |
55 | Worst case error is 1.02 ULPs. |
56 | _ZGVsMxv_exp10f(-0x1.040488p-4) got 0x1.ba5f9ep-1 |
57 | want 0x1.ba5f9cp-1. */ |
58 | svfloat32_t SV_NAME_F1 (exp10) (svfloat32_t x, const svbool_t pg) |
59 | { |
60 | const struct data *d = ptr_barrier (&data); |
61 | /* exp10(x) = 2^(n/N) * 10^r = 2^n * (1 + poly (r)), |
62 | with poly(r) in [1/sqrt(2), sqrt(2)] and |
63 | x = r + n * log10(2) / N, with r in [-log10(2)/2N, log10(2)/2N]. */ |
64 | |
65 | /* Load some constants in quad-word chunks to minimise memory access (last |
66 | lane is wasted). */ |
67 | svfloat32_t log10_2_and_inv = svld1rq (svptrue_b32 (), &d->log10_2); |
68 | |
69 | /* n = round(x/(log10(2)/N)). */ |
70 | svfloat32_t shift = sv_f32 (x: d->shift); |
71 | svfloat32_t z = svmla_lane (shift, x, log10_2_and_inv, 0); |
72 | svfloat32_t n = svsub_x (pg, z, shift); |
73 | |
74 | /* r = x - n*log10(2)/N. */ |
75 | svfloat32_t r = svmls_lane (x, n, log10_2_and_inv, 1); |
76 | r = svmls_lane (r, n, log10_2_and_inv, 2); |
77 | |
78 | svbool_t special = svacgt (pg, x, d->special_bound); |
79 | svfloat32_t scale = svexpa (svreinterpret_u32 (z)); |
80 | |
81 | /* Polynomial evaluation: poly(r) ~ exp10(r)-1. */ |
82 | svfloat32_t r2 = svmul_x (pg, r, r); |
83 | svfloat32_t poly |
84 | = svmla_x (pg, svmul_x (pg, r, d->poly[0]), |
85 | sv_pairwise_poly_3_f32_x (pg, x: r, x2: r2, poly: d->poly + 1), r2); |
86 | |
87 | if (__glibc_unlikely (svptest_any (pg, special))) |
88 | return special_case (x, y: svmla_x (pg, scale, scale, poly), special); |
89 | |
90 | return svmla_x (pg, scale, scale, poly); |
91 | } |
92 | |