| 1 | //===-- Common utilities for half-precision exponential functions ---------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | |
| 9 | #ifndef LLVM_LIBC_SRC_MATH_GENERIC_EXPXF16_H |
| 10 | #define LLVM_LIBC_SRC_MATH_GENERIC_EXPXF16_H |
| 11 | |
| 12 | #include "src/__support/FPUtil/FPBits.h" |
| 13 | #include "src/__support/FPUtil/cast.h" |
| 14 | #include "src/__support/FPUtil/multiply_add.h" |
| 15 | #include "src/__support/FPUtil/nearest_integer.h" |
| 16 | #include "src/__support/macros/attributes.h" |
| 17 | #include "src/__support/macros/config.h" |
| 18 | #include <stdint.h> |
| 19 | |
| 20 | #include "src/__support/math/expf16_utils.h" |
| 21 | |
| 22 | namespace LIBC_NAMESPACE_DECL { |
| 23 | |
| 24 | // Generated by Sollya with the following commands: |
| 25 | // > display = hexadecimal; |
| 26 | // > for i from 0 to 7 do printsingle(round(2^(i * 2^-3), SG, RN)); |
| 27 | constexpr cpp::array<uint32_t, 8> EXP2_MID_BITS = { |
| 28 | 0x3f80'0000U, 0x3f8b'95c2U, 0x3f98'37f0U, 0x3fa5'fed7U, |
| 29 | 0x3fb5'04f3U, 0x3fc5'672aU, 0x3fd7'44fdU, 0x3fea'c0c7U, |
| 30 | }; |
| 31 | |
| 32 | LIBC_INLINE ExpRangeReduction exp2_range_reduction(float16 x) { |
| 33 | // For -25 < x < 16, to compute 2^x, we perform the following range reduction: |
| 34 | // find hi, mid, lo, such that: |
| 35 | // x = hi + mid + lo, in which |
| 36 | // hi is an integer, |
| 37 | // mid * 2^3 is an integer, |
| 38 | // -2^(-4) <= lo < 2^(-4). |
| 39 | // In particular, |
| 40 | // hi + mid = round(x * 2^3) * 2^(-3). |
| 41 | // Then, |
| 42 | // 2^x = 2^(hi + mid + lo) = 2^hi * 2^mid * 2^lo. |
| 43 | // We store 2^mid in the lookup table EXP2_MID_BITS, and compute 2^hi * 2^mid |
| 44 | // by adding hi to the exponent field of 2^mid. 2^lo is computed using a |
| 45 | // degree-3 minimax polynomial generated by Sollya. |
| 46 | |
| 47 | float xf = x; |
| 48 | float kf = fputil::nearest_integer(xf * 0x1.0p+3f); |
| 49 | int x_hi_mid = static_cast<int>(kf); |
| 50 | unsigned x_hi = static_cast<unsigned>(x_hi_mid) >> 3; |
| 51 | unsigned x_mid = static_cast<unsigned>(x_hi_mid) & 0x7; |
| 52 | // lo = x - (hi + mid) = round(x * 2^3) * (-2^(-3)) + x |
| 53 | float lo = fputil::multiply_add(kf, -0x1.0p-3f, xf); |
| 54 | |
| 55 | uint32_t exp2_hi_mid_bits = |
| 56 | EXP2_MID_BITS[x_mid] + |
| 57 | static_cast<uint32_t>(x_hi << fputil::FPBits<float>::FRACTION_LEN); |
| 58 | float exp2_hi_mid = fputil::FPBits<float>(exp2_hi_mid_bits).get_val(); |
| 59 | // Degree-3 minimax polynomial generated by Sollya with the following |
| 60 | // commands: |
| 61 | // > display = hexadecimal; |
| 62 | // > P = fpminimax((2^x - 1)/x, 2, [|SG...|], [-2^-4, 2^-4]); |
| 63 | // > 1 + x * P; |
| 64 | float exp2_lo = fputil::polyeval(lo, 0x1p+0f, 0x1.62e43p-1f, 0x1.ec0aa6p-3f, |
| 65 | 0x1.c6b4a6p-5f); |
| 66 | return {exp2_hi_mid, exp2_lo}; |
| 67 | } |
| 68 | |
| 69 | // Generated by Sollya with the following commands: |
| 70 | // > display = hexadecimal; |
| 71 | // > round(log2(10), SG, RN); |
| 72 | static constexpr float LOG2F_10 = 0x1.a934fp+1f; |
| 73 | |
| 74 | // Generated by Sollya with the following commands: |
| 75 | // > display = hexadecimal; |
| 76 | // > round(log10(2), SG, RN); |
| 77 | static constexpr float LOG10F_2 = 0x1.344136p-2f; |
| 78 | |
| 79 | LIBC_INLINE ExpRangeReduction exp10_range_reduction(float16 x) { |
| 80 | // For -8 < x < 5, to compute 10^x, we perform the following range reduction: |
| 81 | // find hi, mid, lo, such that: |
| 82 | // x = (hi + mid) * log2(10) + lo, in which |
| 83 | // hi is an integer, |
| 84 | // mid * 2^3 is an integer, |
| 85 | // -2^(-4) <= lo < 2^(-4). |
| 86 | // In particular, |
| 87 | // hi + mid = round(x * 2^3) * 2^(-3). |
| 88 | // Then, |
| 89 | // 10^x = 10^(hi + mid + lo) = 2^((hi + mid) * log2(10)) + 10^lo |
| 90 | // We store 2^mid in the lookup table EXP2_MID_BITS, and compute 2^hi * 2^mid |
| 91 | // by adding hi to the exponent field of 2^mid. 10^lo is computed using a |
| 92 | // degree-4 minimax polynomial generated by Sollya. |
| 93 | |
| 94 | float xf = x; |
| 95 | float kf = fputil::nearest_integer(xf * (LOG2F_10 * 0x1.0p+3f)); |
| 96 | int x_hi_mid = static_cast<int>(kf); |
| 97 | unsigned x_hi = static_cast<unsigned>(x_hi_mid) >> 3; |
| 98 | unsigned x_mid = static_cast<unsigned>(x_hi_mid) & 0x7; |
| 99 | // lo = x - (hi + mid) = round(x * 2^3 * log2(10)) * log10(2) * (-2^(-3)) + x |
| 100 | float lo = fputil::multiply_add(kf, LOG10F_2 * -0x1.0p-3f, xf); |
| 101 | |
| 102 | uint32_t exp2_hi_mid_bits = |
| 103 | EXP2_MID_BITS[x_mid] + |
| 104 | static_cast<uint32_t>(x_hi << fputil::FPBits<float>::FRACTION_LEN); |
| 105 | float exp2_hi_mid = fputil::FPBits<float>(exp2_hi_mid_bits).get_val(); |
| 106 | // Degree-4 minimax polynomial generated by Sollya with the following |
| 107 | // commands: |
| 108 | // > display = hexadecimal; |
| 109 | // > P = fpminimax((10^x - 1)/x, 3, [|SG...|], [-2^-4, 2^-4]); |
| 110 | // > 1 + x * P; |
| 111 | float exp10_lo = fputil::polyeval(lo, 0x1p+0f, 0x1.26bb14p+1f, 0x1.53526p+1f, |
| 112 | 0x1.04b434p+1f, 0x1.2bcf9ep+0f); |
| 113 | return {exp2_hi_mid, exp10_lo}; |
| 114 | } |
| 115 | |
| 116 | // Generated by Sollya with the following commands: |
| 117 | // > display = hexadecimal; |
| 118 | // > round(log2(exp(1)), SG, RN); |
| 119 | static constexpr float LOG2F_E = 0x1.715476p+0f; |
| 120 | |
| 121 | // Generated by Sollya with the following commands: |
| 122 | // > display = hexadecimal; |
| 123 | // > round(log(2), SG, RN); |
| 124 | static constexpr float LOGF_2 = 0x1.62e43p-1f; |
| 125 | |
| 126 | // Generated by Sollya with the following commands: |
| 127 | // > display = hexadecimal; |
| 128 | // > for i from 0 to 31 do printsingle(round(2^(i * 2^-5), SG, RN)); |
| 129 | static constexpr cpp::array<uint32_t, 32> EXP2_MID_5_BITS = { |
| 130 | 0x3f80'0000U, 0x3f82'cd87U, 0x3f85'aac3U, 0x3f88'980fU, 0x3f8b'95c2U, |
| 131 | 0x3f8e'a43aU, 0x3f91'c3d3U, 0x3f94'f4f0U, 0x3f98'37f0U, 0x3f9b'8d3aU, |
| 132 | 0x3f9e'f532U, 0x3fa2'7043U, 0x3fa5'fed7U, 0x3fa9'a15bU, 0x3fad'583fU, |
| 133 | 0x3fb1'23f6U, 0x3fb5'04f3U, 0x3fb8'fbafU, 0x3fbd'08a4U, 0x3fc1'2c4dU, |
| 134 | 0x3fc5'672aU, 0x3fc9'b9beU, 0x3fce'248cU, 0x3fd2'a81eU, 0x3fd7'44fdU, |
| 135 | 0x3fdb'fbb8U, 0x3fe0'ccdfU, 0x3fe5'b907U, 0x3fea'c0c7U, 0x3fef'e4baU, |
| 136 | 0x3ff5'257dU, 0x3ffa'83b3U, |
| 137 | }; |
| 138 | |
| 139 | // This function correctly calculates sinh(x) and cosh(x) by calculating exp(x) |
| 140 | // and exp(-x) simultaneously. |
| 141 | // To compute e^x, we perform the following range reduction: |
| 142 | // find hi, mid, lo such that: |
| 143 | // x = (hi + mid) * log(2) + lo, in which |
| 144 | // hi is an integer, |
| 145 | // 0 <= mid * 2^5 < 32 is an integer |
| 146 | // -2^(-5) <= lo * log2(e) <= 2^-5. |
| 147 | // In particular, |
| 148 | // hi + mid = round(x * log2(e) * 2^5) * 2^(-5). |
| 149 | // Then, |
| 150 | // e^x = 2^(hi + mid) * e^lo = 2^hi * 2^mid * e^lo. |
| 151 | // We store 2^mid in the lookup table EXP2_MID_5_BITS, and compute 2^hi * 2^mid |
| 152 | // by adding hi to the exponent field of 2^mid. |
| 153 | // e^lo is computed using a degree-3 minimax polynomial generated by Sollya: |
| 154 | // e^lo ~ P(lo) |
| 155 | // = 1 + lo + c2 * lo^2 + ... + c5 * lo^5 |
| 156 | // = (1 + c2*lo^2 + c4*lo^4) + lo * (1 + c3*lo^2 + c5*lo^4) |
| 157 | // = P_even + lo * P_odd |
| 158 | // To compute e^(-x), notice that: |
| 159 | // e^(-x) = 2^(-(hi + mid)) * e^(-lo) |
| 160 | // ~ 2^(-(hi + mid)) * P(-lo) |
| 161 | // = 2^(-(hi + mid)) * (P_even - lo * P_odd) |
| 162 | // So: |
| 163 | // sinh(x) = (e^x - e^(-x)) / 2 |
| 164 | // ~ 0.5 * (2^(hi + mid) * (P_even + lo * P_odd) - |
| 165 | // 2^(-(hi + mid)) * (P_even - lo * P_odd)) |
| 166 | // = 0.5 * (P_even * (2^(hi + mid) - 2^(-(hi + mid))) + |
| 167 | // lo * P_odd * (2^(hi + mid) + 2^(-(hi + mid)))) |
| 168 | // And similarly: |
| 169 | // cosh(x) = (e^x + e^(-x)) / 2 |
| 170 | // ~ 0.5 * (P_even * (2^(hi + mid) + 2^(-(hi + mid))) + |
| 171 | // lo * P_odd * (2^(hi + mid) - 2^(-(hi + mid)))) |
| 172 | // The main point of these formulas is that the expensive part of calculating |
| 173 | // the polynomials approximating lower parts of e^x and e^(-x) is shared and |
| 174 | // only done once. |
| 175 | template <bool IsSinh> LIBC_INLINE float16 eval_sinh_or_cosh(float16 x) { |
| 176 | float xf = x; |
| 177 | float kf = fputil::nearest_integer(xf * (LOG2F_E * 0x1.0p+5f)); |
| 178 | int x_hi_mid_p = static_cast<int>(kf); |
| 179 | int x_hi_mid_m = -x_hi_mid_p; |
| 180 | |
| 181 | unsigned x_hi_p = static_cast<unsigned>(x_hi_mid_p) >> 5; |
| 182 | unsigned x_hi_m = static_cast<unsigned>(x_hi_mid_m) >> 5; |
| 183 | unsigned x_mid_p = static_cast<unsigned>(x_hi_mid_p) & 0x1f; |
| 184 | unsigned x_mid_m = static_cast<unsigned>(x_hi_mid_m) & 0x1f; |
| 185 | |
| 186 | uint32_t exp2_hi_mid_bits_p = |
| 187 | EXP2_MID_5_BITS[x_mid_p] + |
| 188 | static_cast<uint32_t>(x_hi_p << fputil::FPBits<float>::FRACTION_LEN); |
| 189 | uint32_t exp2_hi_mid_bits_m = |
| 190 | EXP2_MID_5_BITS[x_mid_m] + |
| 191 | static_cast<uint32_t>(x_hi_m << fputil::FPBits<float>::FRACTION_LEN); |
| 192 | // exp2_hi_mid_p = 2^(hi + mid) |
| 193 | float exp2_hi_mid_p = fputil::FPBits<float>(exp2_hi_mid_bits_p).get_val(); |
| 194 | // exp2_hi_mid_m = 2^(-(hi + mid)) |
| 195 | float exp2_hi_mid_m = fputil::FPBits<float>(exp2_hi_mid_bits_m).get_val(); |
| 196 | |
| 197 | // exp2_hi_mid_sum = 2^(hi + mid) + 2^(-(hi + mid)) |
| 198 | float exp2_hi_mid_sum = exp2_hi_mid_p + exp2_hi_mid_m; |
| 199 | // exp2_hi_mid_diff = 2^(hi + mid) - 2^(-(hi + mid)) |
| 200 | float exp2_hi_mid_diff = exp2_hi_mid_p - exp2_hi_mid_m; |
| 201 | |
| 202 | // lo = x - (hi + mid) = round(x * log2(e) * 2^5) * log(2) * (-2^(-5)) + x |
| 203 | float lo = fputil::multiply_add(kf, LOGF_2 * -0x1.0p-5f, xf); |
| 204 | float lo_sq = lo * lo; |
| 205 | |
| 206 | // Degree-3 minimax polynomial generated by Sollya with the following |
| 207 | // commands: |
| 208 | // > display = hexadecimal; |
| 209 | // > P = fpminimax(expm1(x)/x, 2, [|SG...|], [-2^-5, 2^-5]); |
| 210 | // > 1 + x * P; |
| 211 | constexpr cpp::array<float, 4> COEFFS = {0x1p+0f, 0x1p+0f, 0x1.0004p-1f, |
| 212 | 0x1.555778p-3f}; |
| 213 | float half_p_odd = |
| 214 | fputil::polyeval(lo_sq, COEFFS[1] * 0.5f, COEFFS[3] * 0.5f); |
| 215 | float half_p_even = |
| 216 | fputil::polyeval(lo_sq, COEFFS[0] * 0.5f, COEFFS[2] * 0.5f); |
| 217 | |
| 218 | // sinh(x) = lo * (0.5 * P_odd * (2^(hi + mid) + 2^(-(hi + mid)))) + |
| 219 | // (0.5 * P_even * (2^(hi + mid) - 2^(-(hi + mid)))) |
| 220 | if constexpr (IsSinh) |
| 221 | return fputil::cast<float16>(fputil::multiply_add( |
| 222 | lo, half_p_odd * exp2_hi_mid_sum, half_p_even * exp2_hi_mid_diff)); |
| 223 | // cosh(x) = lo * (0.5 * P_odd * (2^(hi + mid) - 2^(-(hi + mid)))) + |
| 224 | // (0.5 * P_even * (2^(hi + mid) + 2^(-(hi + mid)))) |
| 225 | return fputil::cast<float16>(fputil::multiply_add( |
| 226 | lo, half_p_odd * exp2_hi_mid_diff, half_p_even * exp2_hi_mid_sum)); |
| 227 | } |
| 228 | |
| 229 | // Generated by Sollya with the following commands: |
| 230 | // > display = hexadecimal; |
| 231 | // > for i from 0 to 31 do print(round(log(1 + i * 2^-5), SG, RN)); |
| 232 | constexpr cpp::array<float, 32> LOGF_F = { |
| 233 | 0x0p+0f, 0x1.f829bp-6f, 0x1.f0a30cp-5f, 0x1.6f0d28p-4f, |
| 234 | 0x1.e27076p-4f, 0x1.29553p-3f, 0x1.5ff308p-3f, 0x1.9525aap-3f, |
| 235 | 0x1.c8ff7cp-3f, 0x1.fb9186p-3f, 0x1.1675cap-2f, 0x1.2e8e2cp-2f, |
| 236 | 0x1.4618bcp-2f, 0x1.5d1bdcp-2f, 0x1.739d8p-2f, 0x1.89a338p-2f, |
| 237 | 0x1.9f323ep-2f, 0x1.b44f78p-2f, 0x1.c8ff7cp-2f, 0x1.dd46ap-2f, |
| 238 | 0x1.f128f6p-2f, 0x1.02552ap-1f, 0x1.0be72ep-1f, 0x1.154c3ep-1f, |
| 239 | 0x1.1e85f6p-1f, 0x1.2795e2p-1f, 0x1.307d74p-1f, 0x1.393e0ep-1f, |
| 240 | 0x1.41d8fep-1f, 0x1.4a4f86p-1f, 0x1.52a2d2p-1f, 0x1.5ad404p-1f, |
| 241 | }; |
| 242 | |
| 243 | // Generated by Sollya with the following commands: |
| 244 | // > display = hexadecimal; |
| 245 | // > for i from 0 to 31 do print(round(log2(1 + i * 2^-5), SG, RN)); |
| 246 | constexpr cpp::array<float, 32> LOG2F_F = { |
| 247 | 0x0p+0f, 0x1.6bad38p-5f, 0x1.663f7p-4f, 0x1.08c588p-3f, |
| 248 | 0x1.5c01a4p-3f, 0x1.acf5e2p-3f, 0x1.fbc16cp-3f, 0x1.24407ap-2f, |
| 249 | 0x1.49a784p-2f, 0x1.6e221cp-2f, 0x1.91bba8p-2f, 0x1.b47ecp-2f, |
| 250 | 0x1.d6753ep-2f, 0x1.f7a856p-2f, 0x1.0c105p-1f, 0x1.1bf312p-1f, |
| 251 | 0x1.2b8034p-1f, 0x1.3abb4p-1f, 0x1.49a784p-1f, 0x1.584822p-1f, |
| 252 | 0x1.66a008p-1f, 0x1.74b1fep-1f, 0x1.82809ep-1f, 0x1.900e62p-1f, |
| 253 | 0x1.9d5dap-1f, 0x1.aa709p-1f, 0x1.b74948p-1f, 0x1.c3e9cap-1f, |
| 254 | 0x1.d053f6p-1f, 0x1.dc899ap-1f, 0x1.e88c6cp-1f, 0x1.f45e08p-1f, |
| 255 | }; |
| 256 | |
| 257 | // Generated by Sollya with the following commands: |
| 258 | // > display = hexadecimal; |
| 259 | // > for i from 0 to 31 do print(round(log10(1 + i * 2^-5), SG, RN)); |
| 260 | constexpr cpp::array<float, 32> LOG10F_F = { |
| 261 | 0x0p+0f, 0x1.b5e908p-7f, 0x1.af5f92p-6f, 0x1.3ed11ap-5f, |
| 262 | 0x1.a30a9ep-5f, 0x1.02428cp-4f, 0x1.31b306p-4f, 0x1.5fe804p-4f, |
| 263 | 0x1.8cf184p-4f, 0x1.b8de4ep-4f, 0x1.e3bc1ap-4f, 0x1.06cbd6p-3f, |
| 264 | 0x1.1b3e72p-3f, 0x1.2f3b6ap-3f, 0x1.42c7e8p-3f, 0x1.55e8c6p-3f, |
| 265 | 0x1.68a288p-3f, 0x1.7af974p-3f, 0x1.8cf184p-3f, 0x1.9e8e7cp-3f, |
| 266 | 0x1.afd3e4p-3f, 0x1.c0c514p-3f, 0x1.d1653p-3f, 0x1.e1b734p-3f, |
| 267 | 0x1.f1bdeep-3f, 0x1.00be06p-2f, 0x1.087a08p-2f, 0x1.101432p-2f, |
| 268 | 0x1.178da6p-2f, 0x1.1ee778p-2f, 0x1.2622bp-2f, 0x1.2d404cp-2f, |
| 269 | }; |
| 270 | |
| 271 | // Generated by Sollya with the following commands: |
| 272 | // > display = hexadecimal; |
| 273 | // > for i from 0 to 31 do print(round(1 / (1 + i * 2^-5), SG, RN)); |
| 274 | constexpr cpp::array<float, 32> ONE_OVER_F_F = { |
| 275 | 0x1p+0f, 0x1.f07c2p-1f, 0x1.e1e1e2p-1f, 0x1.d41d42p-1f, |
| 276 | 0x1.c71c72p-1f, 0x1.bacf92p-1f, 0x1.af286cp-1f, 0x1.a41a42p-1f, |
| 277 | 0x1.99999ap-1f, 0x1.8f9c18p-1f, 0x1.861862p-1f, 0x1.7d05f4p-1f, |
| 278 | 0x1.745d18p-1f, 0x1.6c16c2p-1f, 0x1.642c86p-1f, 0x1.5c9882p-1f, |
| 279 | 0x1.555556p-1f, 0x1.4e5e0ap-1f, 0x1.47ae14p-1f, 0x1.414142p-1f, |
| 280 | 0x1.3b13b2p-1f, 0x1.3521dp-1f, 0x1.2f684cp-1f, 0x1.29e412p-1f, |
| 281 | 0x1.24924ap-1f, 0x1.1f7048p-1f, 0x1.1a7b96p-1f, 0x1.15b1e6p-1f, |
| 282 | 0x1.111112p-1f, 0x1.0c9714p-1f, 0x1.08421p-1f, 0x1.041042p-1f, |
| 283 | }; |
| 284 | |
| 285 | } // namespace LIBC_NAMESPACE_DECL |
| 286 | |
| 287 | #endif // LLVM_LIBC_SRC_MATH_GENERIC_EXPXF16_H |
| 288 | |