1 | /* Single-precision vector (AdvSIMD) exp2 function |
2 | |
3 | Copyright (C) 2023-2024 Free Software Foundation, Inc. |
4 | This file is part of the GNU C Library. |
5 | |
6 | The GNU C Library is free software; you can redistribute it and/or |
7 | modify it under the terms of the GNU Lesser General Public |
8 | License as published by the Free Software Foundation; either |
9 | version 2.1 of the License, or (at your option) any later version. |
10 | |
11 | The GNU C Library is distributed in the hope that it will be useful, |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | Lesser General Public License for more details. |
15 | |
16 | You should have received a copy of the GNU Lesser General Public |
17 | License along with the GNU C Library; if not, see |
18 | <https://www.gnu.org/licenses/>. */ |
19 | |
20 | #include "v_math.h" |
21 | |
22 | static const struct data |
23 | { |
24 | float32x4_t poly[5]; |
25 | uint32x4_t exponent_bias; |
26 | #if !WANT_SIMD_EXCEPT |
27 | float32x4_t special_bound, scale_thresh; |
28 | #endif |
29 | } data = { |
30 | /* maxerr: 1.962 ulp. */ |
31 | .poly = { V4 (0x1.59977ap-10f), V4 (0x1.3ce9e4p-7f), V4 (0x1.c6bd32p-5f), |
32 | V4 (0x1.ebf9bcp-3f), V4 (0x1.62e422p-1f) }, |
33 | .exponent_bias = V4 (0x3f800000), |
34 | #if !WANT_SIMD_EXCEPT |
35 | .special_bound = V4 (126.0f), |
36 | .scale_thresh = V4 (192.0f), |
37 | #endif |
38 | }; |
39 | |
40 | #define C(i) d->poly[i] |
41 | |
42 | #if WANT_SIMD_EXCEPT |
43 | |
44 | # define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */ |
45 | # define BigBound v_u32 (0x42800000) /* asuint (0x1p6). */ |
46 | # define SpecialBound v_u32 (0x22800000) /* BigBound - TinyBound. */ |
47 | |
48 | static float32x4_t VPCS_ATTR NOINLINE |
49 | special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) |
50 | { |
51 | /* If fenv exceptions are to be triggered correctly, fall back to the scalar |
52 | routine for special lanes. */ |
53 | return v_call_f32 (exp2f, x, y, cmp); |
54 | } |
55 | |
56 | #else |
57 | |
58 | # define SpecialOffset v_u32 (0x82000000) |
59 | # define SpecialBias v_u32 (0x7f000000) |
60 | |
61 | static float32x4_t VPCS_ATTR NOINLINE |
62 | special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, |
63 | float32x4_t scale, const struct data *d) |
64 | { |
65 | /* 2^n may overflow, break it up into s1*s2. */ |
66 | uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset); |
67 | float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias)); |
68 | float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); |
69 | uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); |
70 | float32x4_t r2 = vmulq_f32 (s1, s1); |
71 | float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1); |
72 | /* Similar to r1 but avoids double rounding in the subnormal range. */ |
73 | float32x4_t r0 = vfmaq_f32 (scale, poly, scale); |
74 | float32x4_t r = vbslq_f32 (cmp1, r1, r0); |
75 | return vbslq_f32 (cmp2, r2, r); |
76 | } |
77 | |
78 | #endif |
79 | |
80 | float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp2) (float32x4_t x) |
81 | { |
82 | const struct data *d = ptr_barrier (&data); |
83 | float32x4_t n, r, r2, scale, p, q, poly; |
84 | uint32x4_t cmp, e; |
85 | |
86 | #if WANT_SIMD_EXCEPT |
87 | /* asuint(|x|) - TinyBound >= BigBound - TinyBound. */ |
88 | uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x)); |
89 | cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound); |
90 | float32x4_t xm = x; |
91 | /* If any lanes are special, mask them with 1 and retain a copy of x to allow |
92 | special_case to fix special lanes later. This is only necessary if fenv |
93 | exceptions are to be triggered correctly. */ |
94 | if (__glibc_unlikely (v_any_u32 (cmp))) |
95 | x = vbslq_f32 (cmp, v_f32 (1), x); |
96 | #endif |
97 | |
98 | /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] |
99 | x = n + r, with r in [-1/2, 1/2]. */ |
100 | n = vrndaq_f32 (x); |
101 | r = vsubq_f32 (x, n); |
102 | e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23); |
103 | scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); |
104 | |
105 | #if !WANT_SIMD_EXCEPT |
106 | cmp = vcagtq_f32 (n, d->special_bound); |
107 | #endif |
108 | |
109 | r2 = vmulq_f32 (r, r); |
110 | p = vfmaq_f32 (C (1), C (0), r); |
111 | q = vfmaq_f32 (C (3), C (2), r); |
112 | q = vfmaq_f32 (q, p, r2); |
113 | p = vmulq_f32 (C (4), r); |
114 | poly = vfmaq_f32 (p, q, r2); |
115 | |
116 | if (__glibc_unlikely (v_any_u32 (cmp))) |
117 | #if WANT_SIMD_EXCEPT |
118 | return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp); |
119 | #else |
120 | return special_case (poly, n, e, cmp, scale, d); |
121 | #endif |
122 | |
123 | return vfmaq_f32 (scale, poly, scale); |
124 | } |
125 | libmvec_hidden_def (V_NAME_F1 (exp2)) |
126 | HALF_WIDTH_ALIAS_F1 (exp2) |
127 | |