Warning: This file is not a C or C++ file. It does not have highlighting.
| 1 | //===-- Calculate square root of fixed point numbers. -----*- C++ -*-=========// |
|---|---|
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | |
| 9 | #ifndef LLVM_LIBC_SRC___SUPPORT_FIXEDPOINT_SQRT_H |
| 10 | #define LLVM_LIBC_SRC___SUPPORT_FIXEDPOINT_SQRT_H |
| 11 | |
| 12 | #include "include/llvm-libc-macros/stdfix-macros.h" |
| 13 | #include "src/__support/CPP/bit.h" |
| 14 | #include "src/__support/CPP/limits.h" // CHAR_BIT |
| 15 | #include "src/__support/CPP/type_traits.h" |
| 16 | #include "src/__support/macros/attributes.h" // LIBC_INLINE |
| 17 | #include "src/__support/macros/config.h" |
| 18 | #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY |
| 19 | |
| 20 | #include "fx_rep.h" |
| 21 | |
| 22 | #ifdef LIBC_COMPILER_HAS_FIXED_POINT |
| 23 | |
| 24 | namespace LIBC_NAMESPACE_DECL { |
| 25 | namespace fixed_point { |
| 26 | |
| 27 | namespace internal { |
| 28 | |
| 29 | template <typename T> struct SqrtConfig; |
| 30 | |
| 31 | template <> struct SqrtConfig<unsigned short fract> { |
| 32 | using Type = unsigned short fract; |
| 33 | static constexpr int EXTRA_STEPS = 0; |
| 34 | |
| 35 | // Linear approximation for the initial values, with errors bounded by: |
| 36 | // max(1.5 * 2^-11, eps) |
| 37 | // Generated with Sollya: |
| 38 | // > for i from 4 to 15 do { |
| 39 | // P = fpminimax(sqrt(x), 1, [|8, 8|], [i * 2^-4, (i + 1)*2^-4], |
| 40 | // fixed, absolute); |
| 41 | // print("{", coeff(P, 1), "uhr,", coeff(P, 0), "uhr},"); |
| 42 | // }; |
| 43 | static constexpr Type FIRST_APPROX[12][2] = { |
| 44 | {0x1.e8p-1uhr, 0x1.0cp-2uhr}, {0x1.bap-1uhr, 0x1.28p-2uhr}, |
| 45 | {0x1.94p-1uhr, 0x1.44p-2uhr}, {0x1.74p-1uhr, 0x1.6p-2uhr}, |
| 46 | {0x1.6p-1uhr, 0x1.74p-2uhr}, {0x1.4ep-1uhr, 0x1.88p-2uhr}, |
| 47 | {0x1.3ep-1uhr, 0x1.9cp-2uhr}, {0x1.32p-1uhr, 0x1.acp-2uhr}, |
| 48 | {0x1.22p-1uhr, 0x1.c4p-2uhr}, {0x1.18p-1uhr, 0x1.d4p-2uhr}, |
| 49 | {0x1.08p-1uhr, 0x1.fp-2uhr}, {0x1.04p-1uhr, 0x1.f8p-2uhr}, |
| 50 | }; |
| 51 | }; |
| 52 | |
| 53 | template <> struct SqrtConfig<unsigned fract> { |
| 54 | using Type = unsigned fract; |
| 55 | static constexpr int EXTRA_STEPS = 1; |
| 56 | |
| 57 | // Linear approximation for the initial values, with errors bounded by: |
| 58 | // max(1.5 * 2^-11, eps) |
| 59 | // Generated with Sollya: |
| 60 | // > for i from 4 to 14 do { |
| 61 | // P = fpminimax(sqrt(x), 1, [|16, 16|], [i * 2^-4, (i + 1)*2^-4], |
| 62 | // fixed, absolute); |
| 63 | // print("{", coeff(P, 1), "ur,", coeff(P, 0), "ur},"); |
| 64 | // }; |
| 65 | // For the last interval [15/16, 1), we choose the linear function Q such that |
| 66 | // Q(1) = 1 and Q(15/16) = P(15/16), |
| 67 | // where P is the polynomial generated by Sollya above for [14/16, 15/16]. |
| 68 | // This is to prevent overflow in the last interval [15/16, 1). |
| 69 | static constexpr Type FIRST_APPROX[12][2] = { |
| 70 | {0x1.e378p-1ur, 0x1.0ebp-2ur}, {0x1.b512p-1ur, 0x1.2b94p-2ur}, |
| 71 | {0x1.91fp-1ur, 0x1.45dcp-2ur}, {0x1.7622p-1ur, 0x1.5e24p-2ur}, |
| 72 | {0x1.5f5ap-1ur, 0x1.74e4p-2ur}, {0x1.4c58p-1ur, 0x1.8a4p-2ur}, |
| 73 | {0x1.3c1ep-1ur, 0x1.9e84p-2ur}, {0x1.2e0cp-1ur, 0x1.b1d8p-2ur}, |
| 74 | {0x1.21aap-1ur, 0x1.c468p-2ur}, {0x1.16bap-1ur, 0x1.d62cp-2ur}, |
| 75 | {0x1.0cfp-1ur, 0x1.e74cp-2ur}, {0x1.039p-1ur, 0x1.f8ep-2ur}, |
| 76 | }; |
| 77 | }; |
| 78 | |
| 79 | template <> struct SqrtConfig<unsigned long fract> { |
| 80 | using Type = unsigned long fract; |
| 81 | static constexpr int EXTRA_STEPS = 2; |
| 82 | |
| 83 | // Linear approximation for the initial values, with errors bounded by: |
| 84 | // max(1.5 * 2^-11, eps) |
| 85 | // Generated with Sollya: |
| 86 | // > for i from 4 to 14 do { |
| 87 | // P = fpminimax(sqrt(x), 1, [|32, 32|], [i * 2^-4, (i + 1)*2^-4], |
| 88 | // fixed, absolute); |
| 89 | // print("{", coeff(P, 1), "ulr,", coeff(P, 0), "ulr},"); |
| 90 | // }; |
| 91 | // For the last interval [15/16, 1), we choose the linear function Q such that |
| 92 | // Q(1) = 1 and Q(15/16) = P(15/16), |
| 93 | // where P is the polynomial generated by Sollya above for [14/16, 15/16]. |
| 94 | // This is to prevent overflow in the last interval [15/16, 1). |
| 95 | static constexpr Type FIRST_APPROX[12][2] = { |
| 96 | {0x1.e3779b98p-1ulr, 0x1.0eaff788p-2ulr}, |
| 97 | {0x1.b5167872p-1ulr, 0x1.2b908ad4p-2ulr}, |
| 98 | {0x1.91f195cap-1ulr, 0x1.45da800cp-2ulr}, |
| 99 | {0x1.761ebcb4p-1ulr, 0x1.5e27004cp-2ulr}, |
| 100 | {0x1.5f619986p-1ulr, 0x1.74db933cp-2ulr}, |
| 101 | {0x1.4c583adep-1ulr, 0x1.8a3fbfccp-2ulr}, |
| 102 | {0x1.3c1a591cp-1ulr, 0x1.9e88373cp-2ulr}, |
| 103 | {0x1.2e08545ap-1ulr, 0x1.b1dd2534p-2ulr}, |
| 104 | {0x1.21b05c0ap-1ulr, 0x1.c45e023p-2ulr}, |
| 105 | {0x1.16becd02p-1ulr, 0x1.d624031p-2ulr}, |
| 106 | {0x1.0cf49fep-1ulr, 0x1.e743b844p-2ulr}, |
| 107 | {0x1.038cdfcp-1ulr, 0x1.f8e6408p-2ulr}, |
| 108 | }; |
| 109 | }; |
| 110 | |
| 111 | template <> |
| 112 | struct SqrtConfig<unsigned short accum> : SqrtConfig<unsigned fract> {}; |
| 113 | |
| 114 | template <> |
| 115 | struct SqrtConfig<unsigned accum> : SqrtConfig<unsigned long fract> {}; |
| 116 | |
| 117 | // Integer square root |
| 118 | template <> struct SqrtConfig<unsigned short> { |
| 119 | using OutType = unsigned short accum; |
| 120 | using FracType = unsigned fract; |
| 121 | // For fast-but-less-accurate version |
| 122 | using FastFracType = unsigned short fract; |
| 123 | using HalfType = unsigned char; |
| 124 | }; |
| 125 | |
| 126 | template <> struct SqrtConfig<unsigned int> { |
| 127 | using OutType = unsigned accum; |
| 128 | using FracType = unsigned long fract; |
| 129 | // For fast-but-less-accurate version |
| 130 | using FastFracType = unsigned fract; |
| 131 | using HalfType = unsigned short; |
| 132 | }; |
| 133 | |
| 134 | // TODO: unsigned long accum type is 64-bit, and will need 64-bit fract type. |
| 135 | // Probably we will use DyadicFloat<64> for intermediate computations instead. |
| 136 | |
| 137 | } // namespace internal |
| 138 | |
| 139 | // Core computation for sqrt with normalized inputs (0.25 <= x < 1). |
| 140 | template <typename Config> |
| 141 | LIBC_INLINE constexpr typename Config::Type |
| 142 | sqrt_core(typename Config::Type x_frac) { |
| 143 | using FracType = typename Config::Type; |
| 144 | using FXRep = FXRep<FracType>; |
| 145 | using StorageType = typename FXRep::StorageType; |
| 146 | // Exact case: |
| 147 | if (x_frac == FXRep::ONE_FOURTH()) |
| 148 | return FXRep::ONE_HALF(); |
| 149 | |
| 150 | // Use use Newton method to approximate sqrt(a): |
| 151 | // x_{n + 1} = 1/2 (x_n + a / x_n) |
| 152 | // For the initial values, we choose x_0 |
| 153 | |
| 154 | // Use the leading 4 bits to do look up for sqrt(x). |
| 155 | // After normalization, 0.25 <= x_frac < 1, so the leading 4 bits of x_frac |
| 156 | // are between 0b0100 and 0b1111. Hence the lookup table only needs 12 |
| 157 | // entries, and we can get the index by subtracting the leading 4 bits of |
| 158 | // x_frac by 4 = 0b0100. |
| 159 | StorageType x_bit = cpp::bit_cast<StorageType>(x_frac); |
| 160 | int index = (static_cast<int>(x_bit >> (FXRep::TOTAL_LEN - 4))) - 4; |
| 161 | FracType a = Config::FIRST_APPROX[index][0]; |
| 162 | FracType b = Config::FIRST_APPROX[index][1]; |
| 163 | |
| 164 | // Initial approximation step. |
| 165 | // Estimated error bounds: | r - sqrt(x_frac) | < max(1.5 * 2^-11, eps). |
| 166 | FracType r = a * x_frac + b; |
| 167 | |
| 168 | // Further Newton-method iterations for square-root: |
| 169 | // x_{n + 1} = 0.5 * (x_n + a / x_n) |
| 170 | // We distribute and do the multiplication by 0.5 first to avoid overflow. |
| 171 | // TODO: Investigate the performance and accuracy of using division-free |
| 172 | // iterations from: |
| 173 | // Blanchard, J. D. and Chamberland, M., "Newton's Method Without Division", |
| 174 | // The American Mathematical Monthly (2023). |
| 175 | // https://chamberland.math.grinnell.edu/papers/newton.pdf |
| 176 | for (int i = 0; i < Config::EXTRA_STEPS; ++i) |
| 177 | r = (r >> 1) + (x_frac >> 1) / r; |
| 178 | |
| 179 | return r; |
| 180 | } |
| 181 | |
| 182 | template <typename T> |
| 183 | LIBC_INLINE constexpr cpp::enable_if_t<cpp::is_fixed_point_v<T>, T> sqrt(T x) { |
| 184 | using BitType = typename FXRep<T>::StorageType; |
| 185 | BitType x_bit = cpp::bit_cast<BitType>(x); |
| 186 | |
| 187 | if (LIBC_UNLIKELY(x_bit == 0)) |
| 188 | return FXRep<T>::ZERO(); |
| 189 | |
| 190 | int leading_zeros = cpp::countl_zero(x_bit); |
| 191 | constexpr int STORAGE_LENGTH = sizeof(BitType) * CHAR_BIT; |
| 192 | constexpr int EXP_ADJUSTMENT = STORAGE_LENGTH - FXRep<T>::FRACTION_LEN - 1; |
| 193 | // x_exp is the real exponent of the leading bit of x. |
| 194 | int x_exp = EXP_ADJUSTMENT - leading_zeros; |
| 195 | int shift = EXP_ADJUSTMENT - 1 - (x_exp & (~1)); |
| 196 | // Normalize. |
| 197 | x_bit <<= shift; |
| 198 | using FracType = typename internal::SqrtConfig<T>::Type; |
| 199 | FracType x_frac = cpp::bit_cast<FracType>(x_bit); |
| 200 | |
| 201 | // Compute sqrt(x_frac) using Newton-method. |
| 202 | FracType r = sqrt_core<internal::SqrtConfig<T>>(x_frac); |
| 203 | |
| 204 | // Re-scaling |
| 205 | r >>= EXP_ADJUSTMENT - (x_exp >> 1); |
| 206 | |
| 207 | // Return result. |
| 208 | return cpp::bit_cast<T>(r); |
| 209 | } |
| 210 | |
| 211 | // Integer square root - Accurate version: |
| 212 | // Absolute errors < 2^(-fraction length). |
| 213 | template <typename T> |
| 214 | LIBC_INLINE constexpr typename internal::SqrtConfig<T>::OutType isqrt(T x) { |
| 215 | using OutType = typename internal::SqrtConfig<T>::OutType; |
| 216 | using FracType = typename internal::SqrtConfig<T>::FracType; |
| 217 | |
| 218 | if (x == 0) |
| 219 | return FXRep<OutType>::ZERO(); |
| 220 | |
| 221 | // Normalize the leading bits to the first two bits. |
| 222 | // Shift and then Bit cast x to x_frac gives us: |
| 223 | // x = 2^(FRACTION_LEN + 1 - shift) * x_frac; |
| 224 | int leading_zeros = cpp::countl_zero(x); |
| 225 | int shift = ((leading_zeros >> 1) << 1); |
| 226 | x <<= shift; |
| 227 | // Convert to frac type and compute square root. |
| 228 | FracType x_frac = cpp::bit_cast<FracType>(x); |
| 229 | FracType r = sqrt_core<internal::SqrtConfig<FracType>>(x_frac); |
| 230 | // To rescale back to the OutType (Accum) |
| 231 | r >>= (shift >> 1); |
| 232 | |
| 233 | return cpp::bit_cast<OutType>(r); |
| 234 | } |
| 235 | |
| 236 | // Integer square root - Fast but less accurate version: |
| 237 | // Relative errors < 2^(-fraction length). |
| 238 | template <typename T> |
| 239 | LIBC_INLINE constexpr typename internal::SqrtConfig<T>::OutType |
| 240 | isqrt_fast(T x) { |
| 241 | using OutType = typename internal::SqrtConfig<T>::OutType; |
| 242 | using FracType = typename internal::SqrtConfig<T>::FastFracType; |
| 243 | using StorageType = typename FXRep<FracType>::StorageType; |
| 244 | |
| 245 | if (x == 0) |
| 246 | return FXRep<OutType>::ZERO(); |
| 247 | |
| 248 | // Normalize the leading bits to the first two bits. |
| 249 | // Shift and then Bit cast x to x_frac gives us: |
| 250 | // x = 2^(FRACTION_LEN + 1 - shift) * x_frac; |
| 251 | int leading_zeros = cpp::countl_zero(x); |
| 252 | int shift = (leading_zeros & (~1)); |
| 253 | x <<= shift; |
| 254 | // Convert to frac type and compute square root. |
| 255 | FracType x_frac = cpp::bit_cast<FracType>( |
| 256 | static_cast<StorageType>(x >> FXRep<FracType>::FRACTION_LEN)); |
| 257 | OutType r = |
| 258 | static_cast<OutType>(sqrt_core<internal::SqrtConfig<FracType>>(x_frac)); |
| 259 | // To rescale back to the OutType (Accum) |
| 260 | r <<= (FXRep<OutType>::INTEGRAL_LEN - (shift >> 1)); |
| 261 | return cpp::bit_cast<OutType>(r); |
| 262 | } |
| 263 | |
| 264 | } // namespace fixed_point |
| 265 | } // namespace LIBC_NAMESPACE_DECL |
| 266 | |
| 267 | #endif // LIBC_COMPILER_HAS_FIXED_POINT |
| 268 | |
| 269 | #endif // LLVM_LIBC_SRC___SUPPORT_FIXEDPOINT_SQRT_H |
| 270 |
Warning: This file is not a C or C++ file. It does not have highlighting.
