1 | //===-- Calculate square root of fixed point numbers. -----*- C++ -*-=========// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #ifndef LLVM_LIBC_SRC___SUPPORT_FIXEDPOINT_SQRT_H |
10 | #define LLVM_LIBC_SRC___SUPPORT_FIXEDPOINT_SQRT_H |
11 | |
12 | #include "include/llvm-libc-macros/stdfix-macros.h" |
13 | #include "src/__support/CPP/bit.h" |
14 | #include "src/__support/CPP/limits.h" // CHAR_BIT |
15 | #include "src/__support/CPP/type_traits.h" |
16 | #include "src/__support/macros/attributes.h" // LIBC_INLINE |
17 | #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY |
18 | |
19 | #include "fx_rep.h" |
20 | |
21 | #ifdef LIBC_COMPILER_HAS_FIXED_POINT |
22 | |
23 | namespace LIBC_NAMESPACE::fixed_point { |
24 | |
25 | namespace internal { |
26 | |
27 | template <typename T> struct SqrtConfig; |
28 | |
29 | template <> struct SqrtConfig<unsigned short fract> { |
30 | using Type = unsigned short fract; |
31 | static constexpr int EXTRA_STEPS = 0; |
32 | |
33 | // Linear approximation for the initial values, with errors bounded by: |
34 | // max(1.5 * 2^-11, eps) |
35 | // Generated with Sollya: |
36 | // > for i from 4 to 15 do { |
37 | // P = fpminimax(sqrt(x), 1, [|8, 8|], [i * 2^-4, (i + 1)*2^-4], |
38 | // fixed, absolute); |
39 | // print("{", coeff(P, 1), "uhr,", coeff(P, 0), "uhr},"); |
40 | // }; |
41 | static constexpr Type FIRST_APPROX[12][2] = { |
42 | {0x1.e8p-1uhr, 0x1.0cp-2uhr}, {0x1.bap-1uhr, 0x1.28p-2uhr}, |
43 | {0x1.94p-1uhr, 0x1.44p-2uhr}, {0x1.74p-1uhr, 0x1.6p-2uhr}, |
44 | {0x1.6p-1uhr, 0x1.74p-2uhr}, {0x1.4ep-1uhr, 0x1.88p-2uhr}, |
45 | {0x1.3ep-1uhr, 0x1.9cp-2uhr}, {0x1.32p-1uhr, 0x1.acp-2uhr}, |
46 | {0x1.22p-1uhr, 0x1.c4p-2uhr}, {0x1.18p-1uhr, 0x1.d4p-2uhr}, |
47 | {0x1.08p-1uhr, 0x1.fp-2uhr}, {0x1.04p-1uhr, 0x1.f8p-2uhr}, |
48 | }; |
49 | }; |
50 | |
51 | template <> struct SqrtConfig<unsigned fract> { |
52 | using Type = unsigned fract; |
53 | static constexpr int EXTRA_STEPS = 1; |
54 | |
55 | // Linear approximation for the initial values, with errors bounded by: |
56 | // max(1.5 * 2^-11, eps) |
57 | // Generated with Sollya: |
58 | // > for i from 4 to 15 do { |
59 | // P = fpminimax(sqrt(x), 1, [|16, 16|], [i * 2^-4, (i + 1)*2^-4], |
60 | // fixed, absolute); |
61 | // print("{", coeff(P, 1), "ur,", coeff(P, 0), "ur},"); |
62 | // }; |
63 | static constexpr Type FIRST_APPROX[12][2] = { |
64 | {0x1.e378p-1ur, 0x1.0ebp-2ur}, {0x1.b512p-1ur, 0x1.2b94p-2ur}, |
65 | {0x1.91fp-1ur, 0x1.45dcp-2ur}, {0x1.7622p-1ur, 0x1.5e24p-2ur}, |
66 | {0x1.5f5ap-1ur, 0x1.74e4p-2ur}, {0x1.4c58p-1ur, 0x1.8a4p-2ur}, |
67 | {0x1.3c1ep-1ur, 0x1.9e84p-2ur}, {0x1.2e0cp-1ur, 0x1.b1d8p-2ur}, |
68 | {0x1.21aap-1ur, 0x1.c468p-2ur}, {0x1.16bap-1ur, 0x1.d62cp-2ur}, |
69 | {0x1.0cfp-1ur, 0x1.e74cp-2ur}, {0x1.0418p-1ur, 0x1.f7ep-2ur}, |
70 | }; |
71 | }; |
72 | |
73 | template <> struct SqrtConfig<unsigned long fract> { |
74 | using Type = unsigned long fract; |
75 | static constexpr int EXTRA_STEPS = 2; |
76 | |
77 | // Linear approximation for the initial values, with errors bounded by: |
78 | // max(1.5 * 2^-11, eps) |
79 | // Generated with Sollya: |
80 | // > for i from 4 to 15 do { |
81 | // P = fpminimax(sqrt(x), 1, [|32, 32|], [i * 2^-4, (i + 1)*2^-4], |
82 | // fixed, absolute); |
83 | // print("{", coeff(P, 1), "ulr,", coeff(P, 0), "ulr},"); |
84 | // }; |
85 | static constexpr Type FIRST_APPROX[12][2] = { |
86 | {0x1.e3779b98p-1ulr, 0x1.0eaff788p-2ulr}, |
87 | {0x1.b5167872p-1ulr, 0x1.2b908ad4p-2ulr}, |
88 | {0x1.91f195cap-1ulr, 0x1.45da800cp-2ulr}, |
89 | {0x1.761ebcb4p-1ulr, 0x1.5e27004cp-2ulr}, |
90 | {0x1.5f619986p-1ulr, 0x1.74db933cp-2ulr}, |
91 | {0x1.4c583adep-1ulr, 0x1.8a3fbfccp-2ulr}, |
92 | {0x1.3c1a591cp-1ulr, 0x1.9e88373cp-2ulr}, |
93 | {0x1.2e08545ap-1ulr, 0x1.b1dd2534p-2ulr}, |
94 | {0x1.21b05c0ap-1ulr, 0x1.c45e023p-2ulr}, |
95 | {0x1.16becd02p-1ulr, 0x1.d624031p-2ulr}, |
96 | {0x1.0cf49fep-1ulr, 0x1.e743b844p-2ulr}, |
97 | {0x1.04214e9cp-1ulr, 0x1.f7ce2c3cp-2ulr}, |
98 | }; |
99 | }; |
100 | |
101 | template <> |
102 | struct SqrtConfig<unsigned short accum> : SqrtConfig<unsigned fract> {}; |
103 | |
104 | template <> |
105 | struct SqrtConfig<unsigned accum> : SqrtConfig<unsigned long fract> {}; |
106 | |
107 | // Integer square root |
108 | template <> struct SqrtConfig<unsigned short> { |
109 | using OutType = unsigned short accum; |
110 | using FracType = unsigned fract; |
111 | // For fast-but-less-accurate version |
112 | using FastFracType = unsigned short fract; |
113 | using HalfType = unsigned char; |
114 | }; |
115 | |
116 | template <> struct SqrtConfig<unsigned int> { |
117 | using OutType = unsigned accum; |
118 | using FracType = unsigned long fract; |
119 | // For fast-but-less-accurate version |
120 | using FastFracType = unsigned fract; |
121 | using HalfType = unsigned short; |
122 | }; |
123 | |
124 | // TODO: unsigned long accum type is 64-bit, and will need 64-bit fract type. |
125 | // Probably we will use DyadicFloat<64> for intermediate computations instead. |
126 | |
127 | } // namespace internal |
128 | |
129 | // Core computation for sqrt with normalized inputs (0.25 <= x < 1). |
130 | template <typename Config> |
131 | LIBC_INLINE constexpr typename Config::Type |
132 | sqrt_core(typename Config::Type x_frac) { |
133 | using FracType = typename Config::Type; |
134 | using FXRep = FXRep<FracType>; |
135 | using StorageType = typename FXRep::StorageType; |
136 | // Exact case: |
137 | if (x_frac == FXRep::ONE_FOURTH()) |
138 | return FXRep::ONE_HALF(); |
139 | |
140 | // Use use Newton method to approximate sqrt(a): |
141 | // x_{n + 1} = 1/2 (x_n + a / x_n) |
142 | // For the initial values, we choose x_0 |
143 | |
144 | // Use the leading 4 bits to do look up for sqrt(x). |
145 | // After normalization, 0.25 <= x_frac < 1, so the leading 4 bits of x_frac |
146 | // are between 0b0100 and 0b1111. Hence the lookup table only needs 12 |
147 | // entries, and we can get the index by subtracting the leading 4 bits of |
148 | // x_frac by 4 = 0b0100. |
149 | StorageType x_bit = cpp::bit_cast<StorageType>(x_frac); |
150 | int index = (static_cast<int>(x_bit >> (FXRep::TOTAL_LEN - 4))) - 4; |
151 | FracType a = Config::FIRST_APPROX[index][0]; |
152 | FracType b = Config::FIRST_APPROX[index][1]; |
153 | |
154 | // Initial approximation step. |
155 | // Estimated error bounds: | r - sqrt(x_frac) | < max(1.5 * 2^-11, eps). |
156 | FracType r = a * x_frac + b; |
157 | |
158 | // Further Newton-method iterations for square-root: |
159 | // x_{n + 1} = 0.5 * (x_n + a / x_n) |
160 | // We distribute and do the multiplication by 0.5 first to avoid overflow. |
161 | // TODO: Investigate the performance and accuracy of using division-free |
162 | // iterations from: |
163 | // Blanchard, J. D. and Chamberland, M., "Newton's Method Without Division", |
164 | // The American Mathematical Monthly (2023). |
165 | // https://chamberland.math.grinnell.edu/papers/newton.pdf |
166 | for (int i = 0; i < Config::EXTRA_STEPS; ++i) |
167 | r = (r >> 1) + (x_frac >> 1) / r; |
168 | |
169 | return r; |
170 | } |
171 | |
172 | template <typename T> |
173 | LIBC_INLINE constexpr cpp::enable_if_t<cpp::is_fixed_point_v<T>, T> sqrt(T x) { |
174 | using BitType = typename FXRep<T>::StorageType; |
175 | BitType x_bit = cpp::bit_cast<BitType>(x); |
176 | |
177 | if (LIBC_UNLIKELY(x_bit == 0)) |
178 | return FXRep<T>::ZERO(); |
179 | |
180 | int leading_zeros = cpp::countl_zero(x_bit); |
181 | constexpr int STORAGE_LENGTH = sizeof(BitType) * CHAR_BIT; |
182 | constexpr int EXP_ADJUSTMENT = STORAGE_LENGTH - FXRep<T>::FRACTION_LEN - 1; |
183 | // x_exp is the real exponent of the leading bit of x. |
184 | int x_exp = EXP_ADJUSTMENT - leading_zeros; |
185 | int shift = EXP_ADJUSTMENT - 1 - (x_exp & (~1)); |
186 | // Normalize. |
187 | x_bit <<= shift; |
188 | using FracType = typename internal::SqrtConfig<T>::Type; |
189 | FracType x_frac = cpp::bit_cast<FracType>(x_bit); |
190 | |
191 | // Compute sqrt(x_frac) using Newton-method. |
192 | FracType r = sqrt_core<internal::SqrtConfig<T>>(x_frac); |
193 | |
194 | // Re-scaling |
195 | r >>= EXP_ADJUSTMENT - (x_exp >> 1); |
196 | |
197 | // Return result. |
198 | return cpp::bit_cast<T>(r); |
199 | } |
200 | |
201 | // Integer square root - Accurate version: |
202 | // Absolute errors < 2^(-fraction length). |
203 | template <typename T> |
204 | LIBC_INLINE constexpr typename internal::SqrtConfig<T>::OutType isqrt(T x) { |
205 | using OutType = typename internal::SqrtConfig<T>::OutType; |
206 | using FracType = typename internal::SqrtConfig<T>::FracType; |
207 | |
208 | if (x == 0) |
209 | return FXRep<OutType>::ZERO(); |
210 | |
211 | // Normalize the leading bits to the first two bits. |
212 | // Shift and then Bit cast x to x_frac gives us: |
213 | // x = 2^(FRACTION_LEN + 1 - shift) * x_frac; |
214 | int leading_zeros = cpp::countl_zero(x); |
215 | int shift = ((leading_zeros >> 1) << 1); |
216 | x <<= shift; |
217 | // Convert to frac type and compute square root. |
218 | FracType x_frac = cpp::bit_cast<FracType>(x); |
219 | FracType r = sqrt_core<internal::SqrtConfig<FracType>>(x_frac); |
220 | // To rescale back to the OutType (Accum) |
221 | r >>= (shift >> 1); |
222 | |
223 | return cpp::bit_cast<OutType>(r); |
224 | } |
225 | |
226 | // Integer square root - Fast but less accurate version: |
227 | // Relative errors < 2^(-fraction length). |
228 | template <typename T> |
229 | LIBC_INLINE constexpr typename internal::SqrtConfig<T>::OutType |
230 | isqrt_fast(T x) { |
231 | using OutType = typename internal::SqrtConfig<T>::OutType; |
232 | using FracType = typename internal::SqrtConfig<T>::FastFracType; |
233 | using StorageType = typename FXRep<FracType>::StorageType; |
234 | |
235 | if (x == 0) |
236 | return FXRep<OutType>::ZERO(); |
237 | |
238 | // Normalize the leading bits to the first two bits. |
239 | // Shift and then Bit cast x to x_frac gives us: |
240 | // x = 2^(FRACTION_LEN + 1 - shift) * x_frac; |
241 | int leading_zeros = cpp::countl_zero(x); |
242 | int shift = (leading_zeros & (~1)); |
243 | x <<= shift; |
244 | // Convert to frac type and compute square root. |
245 | FracType x_frac = cpp::bit_cast<FracType>( |
246 | static_cast<StorageType>(x >> FXRep<FracType>::FRACTION_LEN)); |
247 | OutType r = |
248 | static_cast<OutType>(sqrt_core<internal::SqrtConfig<FracType>>(x_frac)); |
249 | // To rescale back to the OutType (Accum) |
250 | r <<= (FXRep<OutType>::INTEGRAL_LEN - (shift >> 1)); |
251 | return cpp::bit_cast<OutType>(r); |
252 | } |
253 | |
254 | } // namespace LIBC_NAMESPACE::fixed_point |
255 | |
256 | #endif // LIBC_COMPILER_HAS_FIXED_POINT |
257 | |
258 | #endif // LLVM_LIBC_SRC___SUPPORT_FIXEDPOINT_SQRT_H |
259 | |