1 | /* Single-precision vector (Advanced SIMD) exp function. |
2 | |
3 | Copyright (C) 2023-2024 Free Software Foundation, Inc. |
4 | This file is part of the GNU C Library. |
5 | |
6 | The GNU C Library is free software; you can redistribute it and/or |
7 | modify it under the terms of the GNU Lesser General Public |
8 | License as published by the Free Software Foundation; either |
9 | version 2.1 of the License, or (at your option) any later version. |
10 | |
11 | The GNU C Library is distributed in the hope that it will be useful, |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | Lesser General Public License for more details. |
15 | |
16 | You should have received a copy of the GNU Lesser General Public |
17 | License along with the GNU C Library; if not, see |
18 | <https://www.gnu.org/licenses/>. */ |
19 | |
20 | #include "v_math.h" |
21 | |
22 | static const struct data |
23 | { |
24 | float32x4_t poly[5]; |
25 | float32x4_t shift, inv_ln2, ln2_hi, ln2_lo; |
26 | uint32x4_t exponent_bias; |
27 | #if !WANT_SIMD_EXCEPT |
28 | float32x4_t special_bound, scale_thresh; |
29 | #endif |
30 | } data = { |
31 | /* maxerr: 1.45358 +0.5 ulp. */ |
32 | .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f), |
33 | V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) }, |
34 | .shift = V4 (0x1.8p23f), |
35 | .inv_ln2 = V4 (0x1.715476p+0f), |
36 | .ln2_hi = V4 (0x1.62e4p-1f), |
37 | .ln2_lo = V4 (0x1.7f7d1cp-20f), |
38 | .exponent_bias = V4 (0x3f800000), |
39 | #if !WANT_SIMD_EXCEPT |
40 | .special_bound = V4 (126.0f), |
41 | .scale_thresh = V4 (192.0f), |
42 | #endif |
43 | }; |
44 | |
45 | #define C(i) d->poly[i] |
46 | |
47 | #if WANT_SIMD_EXCEPT |
48 | |
49 | # define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */ |
50 | # define BigBound v_u32 (0x42800000) /* asuint (0x1p6). */ |
51 | # define SpecialBound v_u32 (0x22800000) /* BigBound - TinyBound. */ |
52 | |
53 | static float32x4_t VPCS_ATTR NOINLINE |
54 | special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) |
55 | { |
56 | /* If fenv exceptions are to be triggered correctly, fall back to the scalar |
57 | routine to special lanes. */ |
58 | return v_call_f32 (expf, x, y, cmp); |
59 | } |
60 | |
61 | #else |
62 | |
63 | # define SpecialOffset v_u32 (0x82000000) |
64 | # define SpecialBias v_u32 (0x7f000000) |
65 | |
66 | static float32x4_t VPCS_ATTR NOINLINE |
67 | special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, |
68 | float32x4_t scale, const struct data *d) |
69 | { |
70 | /* 2^n may overflow, break it up into s1*s2. */ |
71 | uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset); |
72 | float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias)); |
73 | float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); |
74 | uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); |
75 | float32x4_t r2 = vmulq_f32 (s1, s1); |
76 | float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1); |
77 | /* Similar to r1 but avoids double rounding in the subnormal range. */ |
78 | float32x4_t r0 = vfmaq_f32 (scale, poly, scale); |
79 | float32x4_t r = vbslq_f32 (cmp1, r1, r0); |
80 | return vbslq_f32 (cmp2, r2, r); |
81 | } |
82 | |
83 | #endif |
84 | |
85 | float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x) |
86 | { |
87 | const struct data *d = ptr_barrier (&data); |
88 | float32x4_t n, r, r2, scale, p, q, poly, z; |
89 | uint32x4_t cmp, e; |
90 | |
91 | #if WANT_SIMD_EXCEPT |
92 | /* asuint(x) - TinyBound >= BigBound - TinyBound. */ |
93 | cmp = vcgeq_u32 ( |
94 | vsubq_u32 (vandq_u32 (vreinterpretq_u32_f32 (x), v_u32 (0x7fffffff)), |
95 | TinyBound), |
96 | SpecialBound); |
97 | float32x4_t xm = x; |
98 | /* If any lanes are special, mask them with 1 and retain a copy of x to allow |
99 | special case handler to fix special lanes later. This is only necessary if |
100 | fenv exceptions are to be triggered correctly. */ |
101 | if (__glibc_unlikely (v_any_u32 (cmp))) |
102 | x = vbslq_f32 (cmp, v_f32 (1), x); |
103 | #endif |
104 | |
105 | /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] |
106 | x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ |
107 | z = vfmaq_f32 (d->shift, x, d->inv_ln2); |
108 | n = vsubq_f32 (z, d->shift); |
109 | r = vfmsq_f32 (x, n, d->ln2_hi); |
110 | r = vfmsq_f32 (r, n, d->ln2_lo); |
111 | e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23); |
112 | scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); |
113 | |
114 | #if !WANT_SIMD_EXCEPT |
115 | cmp = vcagtq_f32 (n, d->special_bound); |
116 | #endif |
117 | |
118 | r2 = vmulq_f32 (r, r); |
119 | p = vfmaq_f32 (C (1), C (0), r); |
120 | q = vfmaq_f32 (C (3), C (2), r); |
121 | q = vfmaq_f32 (q, p, r2); |
122 | p = vmulq_f32 (C (4), r); |
123 | poly = vfmaq_f32 (p, q, r2); |
124 | |
125 | if (__glibc_unlikely (v_any_u32 (cmp))) |
126 | #if WANT_SIMD_EXCEPT |
127 | return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp); |
128 | #else |
129 | return special_case (poly, n, e, cmp, scale, d); |
130 | #endif |
131 | |
132 | return vfmaq_f32 (scale, poly, scale); |
133 | } |
134 | libmvec_hidden_def (V_NAME_F1 (exp)) |
135 | HALF_WIDTH_ALIAS_F1 (exp) |
136 | |