| 1 | /* Single-precision vector (AdvSIMD) exp10 function. |
| 2 | |
| 3 | Copyright (C) 2023-2024 Free Software Foundation, Inc. |
| 4 | This file is part of the GNU C Library. |
| 5 | |
| 6 | The GNU C Library is free software; you can redistribute it and/or |
| 7 | modify it under the terms of the GNU Lesser General Public |
| 8 | License as published by the Free Software Foundation; either |
| 9 | version 2.1 of the License, or (at your option) any later version. |
| 10 | |
| 11 | The GNU C Library is distributed in the hope that it will be useful, |
| 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 14 | Lesser General Public License for more details. |
| 15 | |
| 16 | You should have received a copy of the GNU Lesser General Public |
| 17 | License along with the GNU C Library; if not, see |
| 18 | <https://www.gnu.org/licenses/>. */ |
| 19 | |
| 20 | #include "v_math.h" |
| 21 | #include "poly_advsimd_f32.h" |
| 22 | |
| 23 | #define ScaleBound 192.0f |
| 24 | |
| 25 | static const struct data |
| 26 | { |
| 27 | float32x4_t poly[5]; |
| 28 | float log10_2_and_inv[4]; |
| 29 | float32x4_t shift; |
| 30 | |
| 31 | #if !WANT_SIMD_EXCEPT |
| 32 | float32x4_t scale_thresh; |
| 33 | #endif |
| 34 | } data = { |
| 35 | /* Coefficients generated using Remez algorithm with minimisation of relative |
| 36 | error. |
| 37 | rel error: 0x1.89dafa3p-24 |
| 38 | abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2] |
| 39 | maxerr: 1.85943 +0.5 ulp. */ |
| 40 | .poly = { V4 (0x1.26bb16p+1f), V4 (0x1.5350d2p+1f), V4 (0x1.04744ap+1f), |
| 41 | V4 (0x1.2d8176p+0f), V4 (0x1.12b41ap-1f) }, |
| 42 | .shift = V4 (0x1.8p23f), |
| 43 | |
| 44 | /* Stores constants 1/log10(2), log10(2)_high, log10(2)_low, 0. */ |
| 45 | .log10_2_and_inv = { 0x1.a934fp+1, 0x1.344136p-2, -0x1.ec10cp-27, 0 }, |
| 46 | #if !WANT_SIMD_EXCEPT |
| 47 | .scale_thresh = V4 (ScaleBound) |
| 48 | #endif |
| 49 | }; |
| 50 | |
| 51 | #define ExponentBias v_u32 (0x3f800000) |
| 52 | |
| 53 | #if WANT_SIMD_EXCEPT |
| 54 | |
| 55 | # define SpecialBound 38.0f /* rint(log10(2^127)). */ |
| 56 | # define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */ |
| 57 | # define BigBound v_u32 (0x42180000) /* asuint (SpecialBound). */ |
| 58 | # define Thres v_u32 (0x22180000) /* BigBound - TinyBound. */ |
| 59 | |
| 60 | static float32x4_t VPCS_ATTR NOINLINE |
| 61 | special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) |
| 62 | { |
| 63 | /* If fenv exceptions are to be triggered correctly, fall back to the scalar |
| 64 | routine to special lanes. */ |
| 65 | return v_call_f32 (exp10f, x, y, cmp); |
| 66 | } |
| 67 | |
| 68 | #else |
| 69 | |
| 70 | # define SpecialBound 126.0f /* rint (log2 (2^127 / (1 + sqrt (2)))). */ |
| 71 | # define SpecialOffset v_u32 (0x82000000) |
| 72 | # define SpecialBias v_u32 (0x7f000000) |
| 73 | |
| 74 | static float32x4_t VPCS_ATTR NOINLINE |
| 75 | special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, |
| 76 | float32x4_t scale, const struct data *d) |
| 77 | { |
| 78 | /* 2^n may overflow, break it up into s1*s2. */ |
| 79 | uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset); |
| 80 | float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias)); |
| 81 | float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); |
| 82 | uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); |
| 83 | float32x4_t r2 = vmulq_f32 (s1, s1); |
| 84 | float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1); |
| 85 | /* Similar to r1 but avoids double rounding in the subnormal range. */ |
| 86 | float32x4_t r0 = vfmaq_f32 (scale, poly, scale); |
| 87 | float32x4_t r = vbslq_f32 (cmp1, r1, r0); |
| 88 | return vbslq_f32 (cmp2, r2, r); |
| 89 | } |
| 90 | |
| 91 | #endif |
| 92 | |
| 93 | /* Fast vector implementation of single-precision exp10. |
| 94 | Algorithm is accurate to 2.36 ULP. |
| 95 | _ZGVnN4v_exp10f(0x1.be2b36p+1) got 0x1.7e79c4p+11 |
| 96 | want 0x1.7e79cp+11. */ |
| 97 | float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10) (float32x4_t x) |
| 98 | { |
| 99 | const struct data *d = ptr_barrier (&data); |
| 100 | #if WANT_SIMD_EXCEPT |
| 101 | /* asuint(x) - TinyBound >= BigBound - TinyBound. */ |
| 102 | uint32x4_t cmp = vcgeq_u32 ( |
| 103 | vsubq_u32 (vreinterpretq_u32_f32 (vabsq_f32 (x)), TinyBound), Thres); |
| 104 | float32x4_t xm = x; |
| 105 | /* If any lanes are special, mask them with 1 and retain a copy of x to allow |
| 106 | special case handler to fix special lanes later. This is only necessary if |
| 107 | fenv exceptions are to be triggered correctly. */ |
| 108 | if (__glibc_unlikely (v_any_u32 (cmp))) |
| 109 | x = v_zerofy_f32 (x, cmp); |
| 110 | #endif |
| 111 | |
| 112 | /* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)), |
| 113 | with poly(r) in [1/sqrt(2), sqrt(2)] and |
| 114 | x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2]. */ |
| 115 | float32x4_t log10_2_and_inv = vld1q_f32 (d->log10_2_and_inv); |
| 116 | float32x4_t z = vfmaq_laneq_f32 (d->shift, x, log10_2_and_inv, 0); |
| 117 | float32x4_t n = vsubq_f32 (z, d->shift); |
| 118 | float32x4_t r = vfmsq_laneq_f32 (x, n, log10_2_and_inv, 1); |
| 119 | r = vfmsq_laneq_f32 (r, n, log10_2_and_inv, 2); |
| 120 | uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23); |
| 121 | |
| 122 | float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias)); |
| 123 | |
| 124 | #if !WANT_SIMD_EXCEPT |
| 125 | uint32x4_t cmp = vcagtq_f32 (n, v_f32 (SpecialBound)); |
| 126 | #endif |
| 127 | |
| 128 | float32x4_t r2 = vmulq_f32 (r, r); |
| 129 | float32x4_t poly |
| 130 | = vfmaq_f32 (vmulq_f32 (r, d->poly[0]), |
| 131 | v_pairwise_poly_3_f32 (r, r2, d->poly + 1), r2); |
| 132 | |
| 133 | if (__glibc_unlikely (v_any_u32 (cmp))) |
| 134 | #if WANT_SIMD_EXCEPT |
| 135 | return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp); |
| 136 | #else |
| 137 | return special_case (poly, n, e, cmp, scale, d); |
| 138 | #endif |
| 139 | |
| 140 | return vfmaq_f32 (scale, poly, scale); |
| 141 | } |
| 142 | libmvec_hidden_def (V_NAME_F1 (exp10)) |
| 143 | HALF_WIDTH_ALIAS_F1 (exp10) |
| 144 | |