1 | /* Single-precision vector (AdvSIMD) exp10 function. |
2 | |
3 | Copyright (C) 2023-2024 Free Software Foundation, Inc. |
4 | This file is part of the GNU C Library. |
5 | |
6 | The GNU C Library is free software; you can redistribute it and/or |
7 | modify it under the terms of the GNU Lesser General Public |
8 | License as published by the Free Software Foundation; either |
9 | version 2.1 of the License, or (at your option) any later version. |
10 | |
11 | The GNU C Library is distributed in the hope that it will be useful, |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | Lesser General Public License for more details. |
15 | |
16 | You should have received a copy of the GNU Lesser General Public |
17 | License along with the GNU C Library; if not, see |
18 | <https://www.gnu.org/licenses/>. */ |
19 | |
20 | #include "v_math.h" |
21 | #include "poly_advsimd_f32.h" |
22 | |
/* |n| above this bound forces the scaled-reconstruction special case
   (see special_case below): 2^n can no longer be represented directly.  */
#define ScaleBound 192.0f

static const struct data
{
  /* Degree-5 polynomial (in Horner/pairwise-split form) approximating
     10^r - 1 on [-log10(2)/2, log10(2)/2].  */
  float32x4_t poly[5];
  /* log10_2_and_inv packs four scalars, indexed by vfmaq_laneq_f32 lane:
     lane 0: 1/log10(2), lane 1: log10(2) high part,
     lane 2: log10(2) low part, lane 3: unused (0).  */
  float32x4_t log10_2_and_inv, shift;

#if !WANT_SIMD_EXCEPT
  float32x4_t scale_thresh;
#endif
} data = {
  /* Coefficients generated using Remez algorithm with minimisation of relative
     error.
     rel error: 0x1.89dafa3p-24
     abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2]
     maxerr: 1.85943 +0.5 ulp.  */
  .poly = { V4 (0x1.26bb16p+1f), V4 (0x1.5350d2p+1f), V4 (0x1.04744ap+1f),
	    V4 (0x1.2d8176p+0f), V4 (0x1.12b41ap-1f) },
  /* 1.5 * 2^23: adding this magic constant rounds x/log10(2) to the nearest
     integer in the low mantissa bits (round-to-nearest shift trick).  */
  .shift = V4 (0x1.8p23f),

  /* Stores constants 1/log10(2), log10(2)_high, log10(2)_low, 0.  */
  .log10_2_and_inv = { 0x1.a934fp+1, 0x1.344136p-2, -0x1.ec10cp-27, 0 },
#if !WANT_SIMD_EXCEPT
  .scale_thresh = V4 (ScaleBound)
#endif
};
49 | |
/* asuint (1.0f): adding the shifted integer part n << 23 to this yields
   asuint (2^n) directly.  */
#define ExponentBias v_u32 (0x3f800000)

#if WANT_SIMD_EXCEPT

/* With fenv exceptions enabled, both very small and very large |x| are
   deferred to the scalar routine; the single unsigned compare
   |x| - TinyBound >= Thres catches both ends (and NaN) at once.  */
# define SpecialBound 38.0f /* rint(log10(2^127)).  */
# define TinyBound v_u32 (0x20000000)	   /* asuint (0x1p-63).  */
# define BigBound v_u32 (0x42180000)	   /* asuint (SpecialBound).  */
# define Thres v_u32 (0x22180000)	   /* BigBound - TinyBound.  */
58 | |
59 | static float32x4_t VPCS_ATTR NOINLINE |
60 | special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) |
61 | { |
62 | /* If fenv exceptions are to be triggered correctly, fall back to the scalar |
63 | routine to special lanes. */ |
64 | return v_call_f32 (exp10f, x, y, cmp); |
65 | } |
66 | |
#else

/* Without fenv correctness, only the output range matters; beyond this
   bound the result needs the scaled reconstruction in special_case.  */
# define SpecialBound 126.0f /* rint (log2 (2^127 / (1 + sqrt (2)))).  */
/* Exponent-bit offset used to split 2^n into two representable factors.  */
# define SpecialOffset v_u32 (0x82000000)
/* asuint (2^127): upper half of the split scale factor.  */
# define SpecialBias v_u32 (0x7f000000)
72 | |
/* Reconstruct exp10 for lanes where 2^n is not directly representable.
   poly approximates 10^r - 1, n is the reduced exponent, e = n << 23,
   cmp1 flags the special lanes and scale = 2^n (possibly denormal/inf).  */
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
	      float32x4_t scale, const struct data *d)
{
  /* 2^n may overflow, break it up into s1*s2.  */
  /* b = SpecialOffset on lanes with n <= 0, else 0.  */
  uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
  /* s1 = 2^127 for n > 0; for n <= 0 the addition wraps mod 2^32 to
     asuint (2^-126), keeping s1 finite in either direction.  */
  float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
  /* s2 compensates in the exponent bits so that s1 * s2 == 2^n.  */
  float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
  /* |n| > ScaleBound: result saturates to s1*s1 (inf or 0).  */
  uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
  float32x4_t r2 = vmulq_f32 (s1, s1);
  /* r1 = s1 * s2 * (1 + poly) = 2^n * 10^r for the special lanes.  */
  float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1);
  /* Similar to r1 but avoids double rounding in the subnormal range.  */
  float32x4_t r0 = vfmaq_f32 (scale, poly, scale);
  float32x4_t r = vbslq_f32 (cmp1, r1, r0);
  return vbslq_f32 (cmp2, r2, r);
}
89 | |
90 | #endif |
91 | |
/* Fast vector implementation of single-precision exp10.
   Algorithm is accurate to 2.36 ULP.
   _ZGVnN4v_exp10f(0x1.be2b36p+1) got 0x1.7e79c4p+11
				 want 0x1.7e79cp+11.  */
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10) (float32x4_t x)
{
  const struct data *d = ptr_barrier (&data);
#if WANT_SIMD_EXCEPT
  /* asuint(x) - TinyBound >= BigBound - TinyBound.  */
  uint32x4_t cmp = vcgeq_u32 (
      vsubq_u32 (vreinterpretq_u32_f32 (vabsq_f32 (x)), TinyBound), Thres);
  float32x4_t xm = x;
  /* If any lanes are special, mask them with 1 and retain a copy of x to allow
     special case handler to fix special lanes later.  This is only necessary
     if fenv exceptions are to be triggered correctly.  */
  if (__glibc_unlikely (v_any_u32 (cmp)))
    x = v_zerofy_f32 (x, cmp);
#endif

  /* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)),
     with poly(r) in [1/sqrt(2), sqrt(2)] and
     x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2].  */
  /* Shift trick: z = x/log10(2) + 1.5*2^23 rounds n into z's low mantissa
     bits; subtracting shift back recovers n = rint (x / log10 (2)).  */
  float32x4_t z = vfmaq_laneq_f32 (d->shift, x, d->log10_2_and_inv, 0);
  float32x4_t n = vsubq_f32 (z, d->shift);
  /* r = x - n*log10(2), computed with a hi/lo split of log10(2) to keep
     the reduction exact.  */
  float32x4_t r = vfmsq_laneq_f32 (x, n, d->log10_2_and_inv, 1);
  r = vfmsq_laneq_f32 (r, n, d->log10_2_and_inv, 2);
  /* n sits in z's low bits; shifting left by 23 puts it in the exponent
     field.  */
  uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);

  /* scale = 2^n.  */
  float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias));

#if !WANT_SIMD_EXCEPT
  uint32x4_t cmp = vcagtq_f32 (n, v_f32 (SpecialBound));
#endif

  /* poly(r) ~= 10^r - 1, evaluated with a pairwise split to shorten the
     dependency chain.  */
  float32x4_t r2 = vmulq_f32 (r, r);
  float32x4_t poly
      = vfmaq_f32 (vmulq_f32 (r, d->poly[0]),
		   v_pairwise_poly_3_f32 (r, r2, d->poly + 1), r2);

  if (__glibc_unlikely (v_any_u32 (cmp)))
#if WANT_SIMD_EXCEPT
    return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp);
#else
    return special_case (poly, n, e, cmp, scale, d);
#endif

  /* scale * (1 + poly) = 2^n * 10^r.  */
  return vfmaq_f32 (scale, poly, scale);
}
/* Internal (hidden) definition plus the half-width vector alias expected by
   the libmvec ABI — presumably the two-lane _ZGVnN2v variant; see the macro
   definitions for the exact symbol names.  */
libmvec_hidden_def (V_NAME_F1 (exp10))
HALF_WIDTH_ALIAS_F1 (exp10)
142 | |