1//===-- Calculate square root of fixed point numbers. -----*- C++ -*-=========//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#ifndef LLVM_LIBC_SRC___SUPPORT_FIXEDPOINT_SQRT_H
10#define LLVM_LIBC_SRC___SUPPORT_FIXEDPOINT_SQRT_H
11
12#include "include/llvm-libc-macros/stdfix-macros.h"
13#include "src/__support/CPP/bit.h"
14#include "src/__support/CPP/limits.h" // CHAR_BIT
15#include "src/__support/CPP/type_traits.h"
16#include "src/__support/macros/attributes.h" // LIBC_INLINE
17#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
18
19#include "fx_rep.h"
20
21#ifdef LIBC_COMPILER_HAS_FIXED_POINT
22
23namespace LIBC_NAMESPACE::fixed_point {
24
25namespace internal {
26
27template <typename T> struct SqrtConfig;
28
29template <> struct SqrtConfig<unsigned short fract> {
30 using Type = unsigned short fract;
31 static constexpr int EXTRA_STEPS = 0;
32
33 // Linear approximation for the initial values, with errors bounded by:
34 // max(1.5 * 2^-11, eps)
35 // Generated with Sollya:
36 // > for i from 4 to 15 do {
37 // P = fpminimax(sqrt(x), 1, [|8, 8|], [i * 2^-4, (i + 1)*2^-4],
38 // fixed, absolute);
39 // print("{", coeff(P, 1), "uhr,", coeff(P, 0), "uhr},");
40 // };
41 static constexpr Type FIRST_APPROX[12][2] = {
42 {0x1.e8p-1uhr, 0x1.0cp-2uhr}, {0x1.bap-1uhr, 0x1.28p-2uhr},
43 {0x1.94p-1uhr, 0x1.44p-2uhr}, {0x1.74p-1uhr, 0x1.6p-2uhr},
44 {0x1.6p-1uhr, 0x1.74p-2uhr}, {0x1.4ep-1uhr, 0x1.88p-2uhr},
45 {0x1.3ep-1uhr, 0x1.9cp-2uhr}, {0x1.32p-1uhr, 0x1.acp-2uhr},
46 {0x1.22p-1uhr, 0x1.c4p-2uhr}, {0x1.18p-1uhr, 0x1.d4p-2uhr},
47 {0x1.08p-1uhr, 0x1.fp-2uhr}, {0x1.04p-1uhr, 0x1.f8p-2uhr},
48 };
49};
50
51template <> struct SqrtConfig<unsigned fract> {
52 using Type = unsigned fract;
53 static constexpr int EXTRA_STEPS = 1;
54
55 // Linear approximation for the initial values, with errors bounded by:
56 // max(1.5 * 2^-11, eps)
57 // Generated with Sollya:
58 // > for i from 4 to 15 do {
59 // P = fpminimax(sqrt(x), 1, [|16, 16|], [i * 2^-4, (i + 1)*2^-4],
60 // fixed, absolute);
61 // print("{", coeff(P, 1), "ur,", coeff(P, 0), "ur},");
62 // };
63 static constexpr Type FIRST_APPROX[12][2] = {
64 {0x1.e378p-1ur, 0x1.0ebp-2ur}, {0x1.b512p-1ur, 0x1.2b94p-2ur},
65 {0x1.91fp-1ur, 0x1.45dcp-2ur}, {0x1.7622p-1ur, 0x1.5e24p-2ur},
66 {0x1.5f5ap-1ur, 0x1.74e4p-2ur}, {0x1.4c58p-1ur, 0x1.8a4p-2ur},
67 {0x1.3c1ep-1ur, 0x1.9e84p-2ur}, {0x1.2e0cp-1ur, 0x1.b1d8p-2ur},
68 {0x1.21aap-1ur, 0x1.c468p-2ur}, {0x1.16bap-1ur, 0x1.d62cp-2ur},
69 {0x1.0cfp-1ur, 0x1.e74cp-2ur}, {0x1.0418p-1ur, 0x1.f7ep-2ur},
70 };
71};
72
73template <> struct SqrtConfig<unsigned long fract> {
74 using Type = unsigned long fract;
75 static constexpr int EXTRA_STEPS = 2;
76
77 // Linear approximation for the initial values, with errors bounded by:
78 // max(1.5 * 2^-11, eps)
79 // Generated with Sollya:
80 // > for i from 4 to 15 do {
81 // P = fpminimax(sqrt(x), 1, [|32, 32|], [i * 2^-4, (i + 1)*2^-4],
82 // fixed, absolute);
83 // print("{", coeff(P, 1), "ulr,", coeff(P, 0), "ulr},");
84 // };
85 static constexpr Type FIRST_APPROX[12][2] = {
86 {0x1.e3779b98p-1ulr, 0x1.0eaff788p-2ulr},
87 {0x1.b5167872p-1ulr, 0x1.2b908ad4p-2ulr},
88 {0x1.91f195cap-1ulr, 0x1.45da800cp-2ulr},
89 {0x1.761ebcb4p-1ulr, 0x1.5e27004cp-2ulr},
90 {0x1.5f619986p-1ulr, 0x1.74db933cp-2ulr},
91 {0x1.4c583adep-1ulr, 0x1.8a3fbfccp-2ulr},
92 {0x1.3c1a591cp-1ulr, 0x1.9e88373cp-2ulr},
93 {0x1.2e08545ap-1ulr, 0x1.b1dd2534p-2ulr},
94 {0x1.21b05c0ap-1ulr, 0x1.c45e023p-2ulr},
95 {0x1.16becd02p-1ulr, 0x1.d624031p-2ulr},
96 {0x1.0cf49fep-1ulr, 0x1.e743b844p-2ulr},
97 {0x1.04214e9cp-1ulr, 0x1.f7ce2c3cp-2ulr},
98 };
99};
100
101template <>
102struct SqrtConfig<unsigned short accum> : SqrtConfig<unsigned fract> {};
103
104template <>
105struct SqrtConfig<unsigned accum> : SqrtConfig<unsigned long fract> {};
106
107// Integer square root
108template <> struct SqrtConfig<unsigned short> {
109 using OutType = unsigned short accum;
110 using FracType = unsigned fract;
111 // For fast-but-less-accurate version
112 using FastFracType = unsigned short fract;
113 using HalfType = unsigned char;
114};
115
116template <> struct SqrtConfig<unsigned int> {
117 using OutType = unsigned accum;
118 using FracType = unsigned long fract;
119 // For fast-but-less-accurate version
120 using FastFracType = unsigned fract;
121 using HalfType = unsigned short;
122};
123
124// TODO: unsigned long accum type is 64-bit, and will need 64-bit fract type.
125// Probably we will use DyadicFloat<64> for intermediate computations instead.
126
127} // namespace internal
128
129// Core computation for sqrt with normalized inputs (0.25 <= x < 1).
130template <typename Config>
131LIBC_INLINE constexpr typename Config::Type
132sqrt_core(typename Config::Type x_frac) {
133 using FracType = typename Config::Type;
134 using FXRep = FXRep<FracType>;
135 using StorageType = typename FXRep::StorageType;
136 // Exact case:
137 if (x_frac == FXRep::ONE_FOURTH())
138 return FXRep::ONE_HALF();
139
140 // Use use Newton method to approximate sqrt(a):
141 // x_{n + 1} = 1/2 (x_n + a / x_n)
142 // For the initial values, we choose x_0
143
144 // Use the leading 4 bits to do look up for sqrt(x).
145 // After normalization, 0.25 <= x_frac < 1, so the leading 4 bits of x_frac
146 // are between 0b0100 and 0b1111. Hence the lookup table only needs 12
147 // entries, and we can get the index by subtracting the leading 4 bits of
148 // x_frac by 4 = 0b0100.
149 StorageType x_bit = cpp::bit_cast<StorageType>(x_frac);
150 int index = (static_cast<int>(x_bit >> (FXRep::TOTAL_LEN - 4))) - 4;
151 FracType a = Config::FIRST_APPROX[index][0];
152 FracType b = Config::FIRST_APPROX[index][1];
153
154 // Initial approximation step.
155 // Estimated error bounds: | r - sqrt(x_frac) | < max(1.5 * 2^-11, eps).
156 FracType r = a * x_frac + b;
157
158 // Further Newton-method iterations for square-root:
159 // x_{n + 1} = 0.5 * (x_n + a / x_n)
160 // We distribute and do the multiplication by 0.5 first to avoid overflow.
161 // TODO: Investigate the performance and accuracy of using division-free
162 // iterations from:
163 // Blanchard, J. D. and Chamberland, M., "Newton's Method Without Division",
164 // The American Mathematical Monthly (2023).
165 // https://chamberland.math.grinnell.edu/papers/newton.pdf
166 for (int i = 0; i < Config::EXTRA_STEPS; ++i)
167 r = (r >> 1) + (x_frac >> 1) / r;
168
169 return r;
170}
171
172template <typename T>
173LIBC_INLINE constexpr cpp::enable_if_t<cpp::is_fixed_point_v<T>, T> sqrt(T x) {
174 using BitType = typename FXRep<T>::StorageType;
175 BitType x_bit = cpp::bit_cast<BitType>(x);
176
177 if (LIBC_UNLIKELY(x_bit == 0))
178 return FXRep<T>::ZERO();
179
180 int leading_zeros = cpp::countl_zero(x_bit);
181 constexpr int STORAGE_LENGTH = sizeof(BitType) * CHAR_BIT;
182 constexpr int EXP_ADJUSTMENT = STORAGE_LENGTH - FXRep<T>::FRACTION_LEN - 1;
183 // x_exp is the real exponent of the leading bit of x.
184 int x_exp = EXP_ADJUSTMENT - leading_zeros;
185 int shift = EXP_ADJUSTMENT - 1 - (x_exp & (~1));
186 // Normalize.
187 x_bit <<= shift;
188 using FracType = typename internal::SqrtConfig<T>::Type;
189 FracType x_frac = cpp::bit_cast<FracType>(x_bit);
190
191 // Compute sqrt(x_frac) using Newton-method.
192 FracType r = sqrt_core<internal::SqrtConfig<T>>(x_frac);
193
194 // Re-scaling
195 r >>= EXP_ADJUSTMENT - (x_exp >> 1);
196
197 // Return result.
198 return cpp::bit_cast<T>(r);
199}
200
201// Integer square root - Accurate version:
202// Absolute errors < 2^(-fraction length).
203template <typename T>
204LIBC_INLINE constexpr typename internal::SqrtConfig<T>::OutType isqrt(T x) {
205 using OutType = typename internal::SqrtConfig<T>::OutType;
206 using FracType = typename internal::SqrtConfig<T>::FracType;
207
208 if (x == 0)
209 return FXRep<OutType>::ZERO();
210
211 // Normalize the leading bits to the first two bits.
212 // Shift and then Bit cast x to x_frac gives us:
213 // x = 2^(FRACTION_LEN + 1 - shift) * x_frac;
214 int leading_zeros = cpp::countl_zero(x);
215 int shift = ((leading_zeros >> 1) << 1);
216 x <<= shift;
217 // Convert to frac type and compute square root.
218 FracType x_frac = cpp::bit_cast<FracType>(x);
219 FracType r = sqrt_core<internal::SqrtConfig<FracType>>(x_frac);
220 // To rescale back to the OutType (Accum)
221 r >>= (shift >> 1);
222
223 return cpp::bit_cast<OutType>(r);
224}
225
226// Integer square root - Fast but less accurate version:
227// Relative errors < 2^(-fraction length).
228template <typename T>
229LIBC_INLINE constexpr typename internal::SqrtConfig<T>::OutType
230isqrt_fast(T x) {
231 using OutType = typename internal::SqrtConfig<T>::OutType;
232 using FracType = typename internal::SqrtConfig<T>::FastFracType;
233 using StorageType = typename FXRep<FracType>::StorageType;
234
235 if (x == 0)
236 return FXRep<OutType>::ZERO();
237
238 // Normalize the leading bits to the first two bits.
239 // Shift and then Bit cast x to x_frac gives us:
240 // x = 2^(FRACTION_LEN + 1 - shift) * x_frac;
241 int leading_zeros = cpp::countl_zero(x);
242 int shift = (leading_zeros & (~1));
243 x <<= shift;
244 // Convert to frac type and compute square root.
245 FracType x_frac = cpp::bit_cast<FracType>(
246 static_cast<StorageType>(x >> FXRep<FracType>::FRACTION_LEN));
247 OutType r =
248 static_cast<OutType>(sqrt_core<internal::SqrtConfig<FracType>>(x_frac));
249 // To rescale back to the OutType (Accum)
250 r <<= (FXRep<OutType>::INTEGRAL_LEN - (shift >> 1));
251 return cpp::bit_cast<OutType>(r);
252}
253
254} // namespace LIBC_NAMESPACE::fixed_point
255
256#endif // LIBC_COMPILER_HAS_FIXED_POINT
257
258#endif // LLVM_LIBC_SRC___SUPPORT_FIXEDPOINT_SQRT_H
259

source code of libc/src/__support/fixed_point/sqrt.h