Warning: This file is not a C or C++ file. It does not have highlighting.
1 | //===-- A class to store high precision floating point numbers --*- C++ -*-===// |
---|---|
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #ifndef LLVM_LIBC_SRC___SUPPORT_FPUTIL_DYADIC_FLOAT_H |
10 | #define LLVM_LIBC_SRC___SUPPORT_FPUTIL_DYADIC_FLOAT_H |
11 | |
12 | #include "FEnvImpl.h" |
13 | #include "FPBits.h" |
14 | #include "hdr/errno_macros.h" |
15 | #include "hdr/fenv_macros.h" |
16 | #include "multiply_add.h" |
17 | #include "rounding_mode.h" |
18 | #include "src/__support/CPP/type_traits.h" |
19 | #include "src/__support/big_int.h" |
20 | #include "src/__support/macros/config.h" |
21 | #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY |
22 | #include "src/__support/macros/properties/types.h" |
23 | |
24 | #include <stddef.h> |
25 | |
26 | namespace LIBC_NAMESPACE_DECL { |
27 | namespace fputil { |
28 | |
29 | // Decide whether to round a UInt up, down or not at all at a given bit |
30 | // position, based on the current rounding mode. The assumption is that the |
31 | // caller is going to make the integer `value >> rshift`, and then might need |
32 | // to round it up by 1 depending on the value of the bits shifted off the |
33 | // bottom. |
34 | // |
35 | // `logical_sign` causes the behavior of FE_DOWNWARD and FE_UPWARD to |
36 | // be reversed, which is what you'd want if this is the mantissa of a |
37 | // negative floating-point number. |
38 | // |
39 | // Return value is +1 if the value should be rounded up; -1 if it should be |
40 | // rounded down; 0 if it's exact and needs no rounding. |
41 | template <size_t Bits> |
42 | LIBC_INLINE constexpr int |
43 | rounding_direction(const LIBC_NAMESPACE::UInt<Bits> &value, size_t rshift, |
44 | Sign logical_sign) { |
45 | if (rshift == 0 || (rshift < Bits && (value << (Bits - rshift)) == 0) || |
46 | (rshift >= Bits && value == 0)) |
47 | return 0; // exact |
48 | |
49 | switch (quick_get_round()) { |
50 | case FE_TONEAREST: |
51 | if (rshift > 0 && rshift <= Bits && value.get_bit(rshift - 1)) { |
52 | // We round up, unless the value is an exact halfway case and |
53 | // the bit that will end up in the units place is 0, in which |
54 | // case tie-break-to-even says round down. |
55 | bool round_bit = rshift < Bits ? value.get_bit(rshift) : 0; |
56 | return round_bit != 0 || (value << (Bits - rshift + 1)) != 0 ? +1 : -1; |
57 | } else { |
58 | return -1; |
59 | } |
60 | case FE_TOWARDZERO: |
61 | return -1; |
62 | case FE_DOWNWARD: |
63 | return logical_sign.is_neg() && |
64 | (rshift < Bits && (value << (Bits - rshift)) != 0) |
65 | ? +1 |
66 | : -1; |
67 | case FE_UPWARD: |
68 | return logical_sign.is_pos() && |
69 | (rshift < Bits && (value << (Bits - rshift)) != 0) |
70 | ? +1 |
71 | : -1; |
72 | default: |
73 | __builtin_unreachable(); |
74 | } |
75 | } |
76 | |
77 | // A generic class to perform computations of high precision floating points. |
78 | // We store the value in dyadic format, including 3 fields: |
79 | // sign : boolean value - false means positive, true means negative |
80 | // exponent: the exponent value of the least significant bit of the mantissa. |
81 | // mantissa: unsigned integer of length `Bits`. |
82 | // So the real value that is stored is: |
83 | // real value = (-1)^sign * 2^exponent * (mantissa as unsigned integer) |
84 | // The stored data is normal if for non-zero mantissa, the leading bit is 1. |
85 | // The outputs of the constructors and most functions will be normalized. |
86 | // To simplify and improve the efficiency, many functions will assume that the |
87 | // inputs are normal. |
template <size_t Bits> struct DyadicFloat {
  using MantissaType = LIBC_NAMESPACE::UInt<Bits>;

  Sign sign = Sign::POS;                   // logical sign of the value
  int exponent = 0;                        // exponent of the mantissa's LSB
  MantissaType mantissa = MantissaType(0); // unsigned fixed-width mantissa

  LIBC_INLINE constexpr DyadicFloat() = default;

  // Construct from a native floating-point value, then normalize.
  // Inf/NaN inputs are not meaningful here: only sign, explicit exponent and
  // explicit mantissa are transferred.
  template <typename T, cpp::enable_if_t<cpp::is_floating_point_v<T>, int> = 0>
  LIBC_INLINE constexpr DyadicFloat(T x) {
    static_assert(FPBits<T>::FRACTION_LEN < Bits);
    FPBits<T> x_bits(x);
    sign = x_bits.sign();
    // Rebase the exponent so it refers to the mantissa's LSB rather than
    // its implicit leading bit.
    exponent = x_bits.get_explicit_exponent() - FPBits<T>::FRACTION_LEN;
    mantissa = MantissaType(x_bits.get_explicit_mantissa());
    normalize();
  }

  // Construct from raw fields, then normalize.
  LIBC_INLINE constexpr DyadicFloat(Sign s, int e, const MantissaType &m)
      : sign(s), exponent(e), mantissa(m) {
    normalize();
  }

  // Normalizing the mantissa, bringing the leading 1 bit to the most
  // significant bit.
  LIBC_INLINE constexpr DyadicFloat &normalize() {
    if (!mantissa.is_zero()) {
      int shift_length = cpp::countl_zero(mantissa);
      // Shifting the mantissa up requires shifting the exponent down by the
      // same amount to keep the represented value unchanged.
      exponent -= shift_length;
      mantissa <<= static_cast<size_t>(shift_length);
    }
    return *this;
  }

  // Used for aligning exponents. Output might not be normalized.
  LIBC_INLINE constexpr DyadicFloat &shift_left(unsigned shift_length) {
    if (shift_length < Bits) {
      exponent -= static_cast<int>(shift_length);
      mantissa <<= shift_length;
    } else {
      // Shifting by >= Bits would discard the whole mantissa.
      exponent = 0;
      mantissa = MantissaType(0);
    }
    return *this;
  }

  // Used for aligning exponents. Output might not be normalized.
  LIBC_INLINE constexpr DyadicFloat &shift_right(unsigned shift_length) {
    if (shift_length < Bits) {
      exponent += static_cast<int>(shift_length);
      mantissa >>= shift_length;
    } else {
      // Shifting by >= Bits would discard the whole mantissa.
      exponent = 0;
      mantissa = MantissaType(0);
    }
    return *this;
  }

  // Assume that it is already normalized. Output the unbiased exponent.
  // (Bits - 1) converts from LSB-relative to leading-bit-relative exponent.
  LIBC_INLINE constexpr int get_unbiased_exponent() const {
    return exponent + (Bits - 1);
  }

  // Produce a correctly rounded DyadicFloat from a too-large mantissa,
  // by shifting it down and rounding if necessary.
  // `result_exponent` is interpreted relative to the LSB of the *shifted*
  // mantissa; rounding direction follows the current rounding mode.
  template <size_t MantissaBits>
  LIBC_INLINE constexpr static DyadicFloat<Bits>
  round(Sign result_sign, int result_exponent,
        const LIBC_NAMESPACE::UInt<MantissaBits> &input_mantissa,
        size_t rshift) {
    MantissaType result_mantissa(input_mantissa >> rshift);
    if (rounding_direction(input_mantissa, rshift, result_sign) > 0) {
      ++result_mantissa;
      if (result_mantissa == 0) {
        // Rounding up made the mantissa integer wrap round to 0,
        // carrying a bit off the top. So we've rounded up to the next
        // exponent.
        result_mantissa.set_bit(Bits - 1);
        ++result_exponent;
      }
    }
    return DyadicFloat(result_sign, result_exponent, result_mantissa);
  }

#ifdef LIBC_TYPES_HAS_FLOAT16
  // Convert to T via explicit bit manipulation, handling overflow,
  // subnormals and the current rounding mode directly in integer
  // arithmetic. Used by as() for float16 when native conversion
  // instructions are unavailable (see __LIBC_USE_FLOAT16_CONVERSION).
  template <typename T, bool ShouldSignalExceptions>
  LIBC_INLINE constexpr cpp::enable_if_t<
      cpp::is_floating_point_v<T> && (FPBits<T>::FRACTION_LEN < Bits), T>
  generic_as() const {
    using FPBits = FPBits<T>;
    using StorageType = typename FPBits::StorageType;

    // Number of extra low-order mantissa bits beyond T's fraction field.
    constexpr int EXTRA_FRACTION_LEN = Bits - 1 - FPBits::FRACTION_LEN;

    if (mantissa == 0)
      return FPBits::zero(sign).get_val();

    int unbiased_exp = get_unbiased_exponent();

    if (unbiased_exp + FPBits::EXP_BIAS >= FPBits::MAX_BIASED_EXPONENT) {
      // Overflow: pick infinity or max_normal per the rounding mode.
      if constexpr (ShouldSignalExceptions) {
        set_errno_if_required(ERANGE);
        raise_except_if_required(FE_OVERFLOW | FE_INEXACT);
      }

      switch (quick_get_round()) {
      case FE_TONEAREST:
        return FPBits::inf(sign).get_val();
      case FE_TOWARDZERO:
        return FPBits::max_normal(sign).get_val();
      case FE_DOWNWARD:
        if (sign.is_pos())
          return FPBits::max_normal(Sign::POS).get_val();
        return FPBits::inf(Sign::NEG).get_val();
      case FE_UPWARD:
        if (sign.is_neg())
          return FPBits::max_normal(Sign::NEG).get_val();
        return FPBits::inf(Sign::POS).get_val();
      default:
        __builtin_unreachable();
      }
    }

    StorageType out_biased_exp = 0;
    StorageType out_mantissa = 0;
    bool round = false;    // first discarded bit
    bool sticky = false;   // OR of all lower discarded bits
    bool underflow = false;

    if (unbiased_exp < -FPBits::EXP_BIAS - FPBits::FRACTION_LEN) {
      // Too small even to round up into the smallest subnormal's round bit:
      // everything is sticky.
      sticky = true;
      underflow = true;
    } else if (unbiased_exp == -FPBits::EXP_BIAS - FPBits::FRACTION_LEN) {
      // The leading mantissa bit lands exactly in the round position.
      round = true;
      MantissaType sticky_mask = (MantissaType(1) << (Bits - 1)) - 1;
      sticky = (mantissa & sticky_mask) != 0;
    } else {
      int extra_fraction_len = EXTRA_FRACTION_LEN;

      if (unbiased_exp < 1 - FPBits::EXP_BIAS) {
        // Subnormal result: discard additional low bits to align with the
        // fixed subnormal exponent.
        underflow = true;
        extra_fraction_len += 1 - FPBits::EXP_BIAS - unbiased_exp;
      } else {
        out_biased_exp =
            static_cast<StorageType>(unbiased_exp + FPBits::EXP_BIAS);
      }

      MantissaType round_mask = MantissaType(1) << (extra_fraction_len - 1);
      round = (mantissa & round_mask) != 0;
      MantissaType sticky_mask = round_mask - 1;
      sticky = (mantissa & sticky_mask) != 0;

      out_mantissa = static_cast<StorageType>(mantissa >> extra_fraction_len);
    }

    bool lsb = (out_mantissa & 1) != 0;

    StorageType result =
        FPBits::create_value(sign, out_biased_exp, out_mantissa).uintval();

    // Rounding up is done by incrementing the raw encoding: a mantissa
    // carry-out naturally increments the exponent field (and can produce
    // infinity), which is the correct IEEE-754 behavior.
    switch (quick_get_round()) {
    case FE_TONEAREST:
      if (round && (lsb || sticky))
        ++result;
      break;
    case FE_DOWNWARD:
      if (sign.is_neg() && (round || sticky))
        ++result;
      break;
    case FE_UPWARD:
      if (sign.is_pos() && (round || sticky))
        ++result;
      break;
    default:
      break;
    }

    if (ShouldSignalExceptions && (round || sticky)) {
      int excepts = FE_INEXACT;
      if (FPBits(result).is_inf()) {
        // Rounding up pushed the result to infinity.
        set_errno_if_required(ERANGE);
        excepts |= FE_OVERFLOW;
      } else if (underflow) {
        set_errno_if_required(ERANGE);
        excepts |= FE_UNDERFLOW;
      }
      raise_except_if_required(excepts);
    }

    return FPBits(result).get_val();
  }
#endif // LIBC_TYPES_HAS_FLOAT16

  // Convert to T by splitting into a high part (truncated mantissa) and a
  // low part carrying the round/sticky information, then letting the
  // hardware's floating-point addition perform the rounding in the current
  // rounding mode.
  template <typename T, bool ShouldSignalExceptions,
            typename = cpp::enable_if_t<cpp::is_floating_point_v<T> &&
                                            (FPBits<T>::FRACTION_LEN < Bits),
                                        void>>
  LIBC_INLINE constexpr T fast_as() const {
    if (LIBC_UNLIKELY(mantissa.is_zero()))
      return FPBits<T>::zero(sign).get_val();

    // Assume that it is normalized, and output is also normal.
    constexpr uint32_t PRECISION = FPBits<T>::FRACTION_LEN + 1;
    using output_bits_t = typename FPBits<T>::StorageType;
    constexpr output_bits_t IMPLICIT_MASK =
        FPBits<T>::SIG_MASK - FPBits<T>::FRACTION_MASK;

    int exp_hi = exponent + static_cast<int>((Bits - 1) + FPBits<T>::EXP_BIAS);

    if (LIBC_UNLIKELY(exp_hi > 2 * FPBits<T>::EXP_BIAS)) {
      // Results overflow.
      T d_hi =
          FPBits<T>::create_value(sign, 2 * FPBits<T>::EXP_BIAS, IMPLICIT_MASK)
              .get_val();
      // volatile prevents constant propagation that would result in infinity
      // always being returned no matter the current rounding mode.
      volatile T two = static_cast<T>(2.0);
      T r = two * d_hi;

      // TODO: Whether rounding down the absolute value to max_normal should
      // also raise FE_OVERFLOW and set ERANGE is debatable.
      if (ShouldSignalExceptions && FPBits<T>(r).is_inf())
        set_errno_if_required(ERANGE);

      return r;
    }

    bool denorm = false;
    uint32_t shift = Bits - PRECISION;
    if (LIBC_UNLIKELY(exp_hi <= 0)) {
      // Output is denormal.
      denorm = true;
      // Discard extra low bits so the mantissa aligns with the fixed
      // subnormal exponent; the exponent field is cleared again below.
      shift = (Bits - PRECISION) + static_cast<uint32_t>(1 - exp_hi);

      exp_hi = FPBits<T>::EXP_BIAS;
    }

    int exp_lo = exp_hi - static_cast<int>(PRECISION) - 1;

    MantissaType m_hi =
        shift >= MantissaType::BITS ? MantissaType(0) : mantissa >> shift;

    T d_hi = FPBits<T>::create_value(
                 sign, static_cast<output_bits_t>(exp_hi),
                 (static_cast<output_bits_t>(m_hi) & FPBits<T>::SIG_MASK) |
                     IMPLICIT_MASK)
                 .get_val();

    MantissaType round_mask =
        shift - 1 >= MantissaType::BITS ? 0 : MantissaType(1) << (shift - 1);
    MantissaType sticky_mask = round_mask - MantissaType(1);

    bool round_bit = !(mantissa & round_mask).is_zero();
    bool sticky_bit = !(mantissa & sticky_mask).is_zero();
    // Encode round and sticky as a small integer scaling d_lo, so the
    // subsequent multiply_add reproduces correct rounding in any mode.
    int round_and_sticky = int(round_bit) * 2 + int(sticky_bit);

    T d_lo;

    if (LIBC_UNLIKELY(exp_lo <= 0)) {
      // d_lo is denormal, but the output is normal.
      // Scale up so d_lo is representable as a normal number, do the
      // rounding addition, then scale back down.
      int scale_up_exponent = 1 - exp_lo;
      T scale_up_factor =
          FPBits<T>::create_value(Sign::POS,
                                  static_cast<output_bits_t>(
                                      FPBits<T>::EXP_BIAS + scale_up_exponent),
                                  IMPLICIT_MASK)
              .get_val();
      T scale_down_factor =
          FPBits<T>::create_value(Sign::POS,
                                  static_cast<output_bits_t>(
                                      FPBits<T>::EXP_BIAS - scale_up_exponent),
                                  IMPLICIT_MASK)
              .get_val();

      d_lo = FPBits<T>::create_value(
                 sign, static_cast<output_bits_t>(exp_lo + scale_up_exponent),
                 IMPLICIT_MASK)
                 .get_val();

      return multiply_add(d_lo, T(round_and_sticky), d_hi * scale_up_factor) *
             scale_down_factor;
    }

    d_lo = FPBits<T>::create_value(sign, static_cast<output_bits_t>(exp_lo),
                                   IMPLICIT_MASK)
               .get_val();

    // Still correct without FMA instructions if `d_lo` is not underflow.
    T r = multiply_add(d_lo, T(round_and_sticky), d_hi);

    if (LIBC_UNLIKELY(denorm)) {
      // Exponent before rounding is in denormal range, simply clear the
      // exponent field.
      output_bits_t clear_exp = static_cast<output_bits_t>(
          output_bits_t(exp_hi) << FPBits<T>::SIG_LEN);
      output_bits_t r_bits = FPBits<T>(r).uintval() - clear_exp;

      if (!(r_bits & FPBits<T>::EXP_MASK)) {
        // Output is denormal after rounding, clear the implicit bit for 80-bit
        // long double.
        r_bits -= IMPLICIT_MASK;

        // TODO: IEEE Std 754-2019 lets implementers choose whether to check for
        // "tininess" before or after rounding for base-2 formats, as long as
        // the same choice is made for all operations. Our choice to check after
        // rounding might not be the same as the hardware's.
        if (ShouldSignalExceptions && round_and_sticky) {
          set_errno_if_required(ERANGE);
          raise_except_if_required(FE_UNDERFLOW);
        }
      }

      return FPBits<T>(r_bits).get_val();
    }

    return r;
  }

  // Assume that it is already normalized.
  // Output is rounded correctly with respect to the current rounding mode.
  template <typename T, bool ShouldSignalExceptions,
            typename = cpp::enable_if_t<cpp::is_floating_point_v<T> &&
                                            (FPBits<T>::FRACTION_LEN < Bits),
                                        void>>
  LIBC_INLINE constexpr T as() const {
#if defined(LIBC_TYPES_HAS_FLOAT16) && !defined(__LIBC_USE_FLOAT16_CONVERSION)
    // Without native float16 conversion support, fast_as()'s arithmetic
    // would be done in a wider type, so use the pure-integer path instead.
    if constexpr (cpp::is_same_v<T, float16>)
      return generic_as<T, ShouldSignalExceptions>();
#endif
    return fast_as<T, ShouldSignalExceptions>();
  }

  // Explicit conversion that never sets errno or raises FP exceptions.
  template <typename T,
            typename = cpp::enable_if_t<cpp::is_floating_point_v<T> &&
                                            (FPBits<T>::FRACTION_LEN < Bits),
                                        void>>
  LIBC_INLINE explicit constexpr operator T() const {
    return as<T, /*ShouldSignalExceptions=*/false>();
  }

  // Convert to a two's-complement integer of MantissaType's width,
  // truncating any fractional bits (no rounding).
  LIBC_INLINE constexpr MantissaType as_mantissa_type() const {
    if (mantissa.is_zero())
      return 0;

    MantissaType new_mant = mantissa;
    if (exponent > 0) {
      new_mant <<= exponent;
    } else {
      // Cast the exponent to size_t before negating it, rather than after,
      // to avoid undefined behavior negating INT_MIN as an integer (although
      // exponents coming in to this function _shouldn't_ be that large). The
      // result should always end up as a positive size_t.
      size_t shift = -static_cast<size_t>(exponent);
      new_mant >>= shift;
    }

    if (sign.is_neg()) {
      // Two's-complement negation for negative values.
      new_mant = (~new_mant) + 1;
    }

    return new_mant;
  }

  // Like as_mantissa_type(), but rounds the discarded fraction according to
  // the current rounding mode. If `round_dir_out` is non-null, it receives
  // the rounding direction used (+1 up, -1 down, 0 exact).
  LIBC_INLINE constexpr MantissaType
  as_mantissa_type_rounded(int *round_dir_out = nullptr) const {
    int round_dir = 0;
    MantissaType new_mant;
    if (mantissa.is_zero()) {
      new_mant = 0;
    } else {
      new_mant = mantissa;
      if (exponent > 0) {
        new_mant <<= exponent;
      } else if (exponent < 0) {
        // Cast the exponent to size_t before negating it, rather than after,
        // to avoid undefined behavior negating INT_MIN as an integer (although
        // exponents coming in to this function _shouldn't_ be that large). The
        // result should always end up as a positive size_t.
        size_t shift = -static_cast<size_t>(exponent);
        new_mant >>= shift;
        round_dir = rounding_direction(mantissa, shift, sign);
        if (round_dir > 0)
          ++new_mant;
      }

      if (sign.is_neg()) {
        // Two's-complement negation for negative values.
        new_mant = (~new_mant) + 1;
      }
    }

    if (round_dir_out)
      *round_dir_out = round_dir;

    return new_mant;
  }

  // Unary negation: flips the sign only; exponent and mantissa are reused.
  LIBC_INLINE constexpr DyadicFloat operator-() const {
    return DyadicFloat(sign.negate(), exponent, mantissa);
  }
};
489 | |
490 | // Quick add - Add 2 dyadic floats with rounding toward 0 and then normalize the |
491 | // output: |
492 | // - Align the exponents so that: |
493 | // new a.exponent = new b.exponent = max(a.exponent, b.exponent) |
494 | // - Add or subtract the mantissas depending on the signs. |
495 | // - Normalize the result. |
496 | // The absolute errors compared to the mathematical sum is bounded by: |
497 | // | quick_add(a, b) - (a + b) | < MSB(a + b) * 2^(-Bits + 2), |
498 | // i.e., errors are up to 2 ULPs. |
499 | // Assume inputs are normalized (by constructors or other functions) so that we |
500 | // don't need to normalize the inputs again in this function. If the inputs are |
501 | // not normalized, the results might lose precision significantly. |
template <size_t Bits>
LIBC_INLINE constexpr DyadicFloat<Bits> quick_add(DyadicFloat<Bits> a,
                                                  DyadicFloat<Bits> b) {
  // Zero operands short-circuit: the other operand is already the sum.
  if (LIBC_UNLIKELY(a.mantissa.is_zero()))
    return b;
  if (LIBC_UNLIKELY(b.mantissa.is_zero()))
    return a;

  // Align exponents by shifting the smaller operand's mantissa down
  // (discarding its low bits, i.e. rounding toward zero).
  if (a.exponent > b.exponent)
    b.shift_right(static_cast<unsigned>(a.exponent - b.exponent));
  else if (b.exponent > a.exponent)
    a.shift_right(static_cast<unsigned>(b.exponent - a.exponent));

  DyadicFloat<Bits> result;

  if (a.sign == b.sign) {
    // Addition
    result.sign = a.sign;
    result.exponent = a.exponent;
    result.mantissa = a.mantissa;
    if (result.mantissa.add_overflow(b.mantissa)) {
      // Mantissa addition overflow.
      // Shift down by 1 and restore the carried-out bit as the new MSB.
      // NOTE(review): writing the top word directly assumes 64-bit BigInt
      // words — confirm against big_int.h.
      result.shift_right(1);
      result.mantissa.val[DyadicFloat<Bits>::MantissaType::WORD_COUNT - 1] |=
          (uint64_t(1) << 63);
    }
    // Result is already normalized.
    return result;
  }

  // Subtraction
  // Subtract the smaller magnitude from the larger so the mantissa
  // difference is non-negative; the result takes the larger operand's sign.
  if (a.mantissa >= b.mantissa) {
    result.sign = a.sign;
    result.exponent = a.exponent;
    result.mantissa = a.mantissa - b.mantissa;
  } else {
    result.sign = b.sign;
    result.exponent = b.exponent;
    result.mantissa = b.mantissa - a.mantissa;
  }

  // Cancellation may have cleared high bits, so renormalize.
  return result.normalize();
}
546 | |
547 | template <size_t Bits> |
548 | LIBC_INLINE constexpr DyadicFloat<Bits> quick_sub(DyadicFloat<Bits> a, |
549 | DyadicFloat<Bits> b) { |
550 | return quick_add(a, -b); |
551 | } |
552 | |
553 | // Quick Mul - Slightly less accurate but efficient multiplication of 2 dyadic |
554 | // floats with rounding toward 0 and then normalize the output: |
555 | // result.exponent = a.exponent + b.exponent + Bits, |
556 | // result.mantissa = quick_mul_hi(a.mantissa + b.mantissa) |
557 | // ~ (full product a.mantissa * b.mantissa) >> Bits. |
558 | // The errors compared to the mathematical product is bounded by: |
559 | // 2 * errors of quick_mul_hi = 2 * (UInt<Bits>::WORD_COUNT - 1) in ULPs. |
560 | // Assume inputs are normalized (by constructors or other functions) so that we |
561 | // don't need to normalize the inputs again in this function. If the inputs are |
562 | // not normalized, the results might lose precision significantly. |
template <size_t Bits>
LIBC_INLINE constexpr DyadicFloat<Bits> quick_mul(const DyadicFloat<Bits> &a,
                                                  const DyadicFloat<Bits> &b) {
  DyadicFloat<Bits> result;
  // Sign of a product: negative iff the operand signs differ.
  result.sign = (a.sign != b.sign) ? Sign::NEG : Sign::POS;
  // quick_mul_hi keeps only the top Bits of the 2*Bits-bit product, so the
  // exponent must be raised by Bits to compensate.
  result.exponent = a.exponent + b.exponent + static_cast<int>(Bits);

  if (!(a.mantissa.is_zero() || b.mantissa.is_zero())) {
    result.mantissa = a.mantissa.quick_mul_hi(b.mantissa);
    // Check the leading bit directly, should be faster than using clz in
    // normalize(). Two normalized mantissas multiply to a product whose top
    // bit is in one of the top two positions, so at most one shift is needed.
    // NOTE(review): reading the top word with `>> 63` assumes 64-bit BigInt
    // words — confirm against big_int.h.
    if (result.mantissa.val[DyadicFloat<Bits>::MantissaType::WORD_COUNT - 1] >>
            63 ==
        0)
      result.shift_left(1);
  } else {
    // Either factor being zero makes the product zero.
    result.mantissa = (typename DyadicFloat<Bits>::MantissaType)(0);
  }
  return result;
}
583 | |
584 | // Correctly rounded multiplication of 2 dyadic floats, assuming the |
585 | // exponent remains within range. |
template <size_t Bits>
LIBC_INLINE constexpr DyadicFloat<Bits>
rounded_mul(const DyadicFloat<Bits> &a, const DyadicFloat<Bits> &b) {
  // Compute the full 2*Bits-bit product so no precision is lost before the
  // final rounding step.
  using DblMant = LIBC_NAMESPACE::UInt<(2 * Bits)>;
  Sign result_sign = (a.sign != b.sign) ? Sign::NEG : Sign::POS;
  int result_exponent = a.exponent + b.exponent + static_cast<int>(Bits);
  auto product = DblMant(a.mantissa) * DblMant(b.mantissa);
  // As in quick_mul(), renormalize by 1 bit manually rather than countl_zero
  if (product.get_bit(2 * Bits - 1) == 0) {
    product <<= 1;
    result_exponent -= 1;
  }

  // Shift the exact product back down to Bits bits with correct rounding in
  // the current rounding mode.
  return DyadicFloat<Bits>::round(result_sign, result_exponent, product, Bits);
}
601 | |
602 | // Approximate reciprocal - given a nonzero a, make a good approximation to 1/a. |
603 | // The method is Newton-Raphson iteration, based on quick_mul. |
template <size_t Bits, typename = cpp::enable_if_t<(Bits >= 32)>>
LIBC_INLINE constexpr DyadicFloat<Bits>
approx_reciprocal(const DyadicFloat<Bits> &a) {
  // Given an approximation x to 1/a, a better one is x' = x(2-ax).
  //
  // You can derive this by using the Newton-Raphson formula with the function
  // f(x) = 1/x - a. But another way to see that it works is to say: suppose
  // that ax = 1-e for some small error e. Then ax' = ax(2-ax) = (1-e)(1+e) =
  // 1-e^2. So the error in x' is the square of the error in x, i.e. the number
  // of correct bits in x' is double the number in x.

  // An initial approximation to the reciprocal, computed by a 64/32-bit
  // integer division using the top 32 bits of a's mantissa. The exponent
  // accounts for both the 32-bit scaling of the division and a's own scale.
  DyadicFloat<Bits> x(Sign::POS, -32 - a.exponent - int(Bits),
                      uint64_t(0xFFFFFFFFFFFFFFFF) /
                          static_cast<uint64_t>(a.mantissa >> (Bits - 32)));

  // The constant 2, which we'll need in every iteration
  DyadicFloat<Bits> two(Sign::POS, 1, 1);

  // We expect at least 31 correct bits from our 32-bit starting approximation
  size_t ok_bits = 31;

  // The number of good bits doubles in each iteration, except that rounding
  // errors introduce a little extra each time. Subtract a bit from our
  // accuracy assessment to account for that.
  while (ok_bits < Bits) {
    x = quick_mul(x, quick_sub(two, quick_mul(a, x)));
    ok_bits = 2 * ok_bits - 1;
  }

  return x;
}
636 | |
637 | // Correctly rounded division of 2 dyadic floats, assuming the |
638 | // exponent remains within range. |
template <size_t Bits>
LIBC_INLINE constexpr DyadicFloat<Bits>
rounded_div(const DyadicFloat<Bits> &af, const DyadicFloat<Bits> &bf) {
  // Wide integer type: enough room for the shifted dividend plus slack for
  // the exponent-alignment shift below.
  using DblMant = LIBC_NAMESPACE::UInt<(Bits * 2 + 64)>;

  // Make an approximation to the quotient as a * (1/b). Both the
  // multiplication and the reciprocal are a bit sloppy, which doesn't
  // matter, because we're going to correct for that below.
  auto qf = fputil::quick_mul(af, fputil::approx_reciprocal(bf));

  // Switch to BigInt and stop using quick_add and quick_mul: now
  // we're working in exact integers so as to get the true remainder.
  DblMant a = af.mantissa, b = bf.mantissa, q = qf.mantissa;
  q <<= 2; // leave room for a round bit, even if exponent decreases
  // Scale `a` so that a, b and q share a common fixed-point scale.
  a <<= af.exponent - bf.exponent - qf.exponent + 2;
  DblMant qb = q * b;
  if (qb < a) {
    // q is too small: bump it up until q*b is within one b of a.
    DblMant too_small = a - b;
    while (qb <= too_small) {
      qb += b;
      ++q;
    }
  } else {
    // q is too large (or exact): walk it back down until q*b <= a.
    while (qb > a) {
      qb -= b;
      --q;
    }
  }

  // Renormalize the corrected quotient at double width, then round it down
  // to Bits bits in the current rounding mode. The -2 undoes the two extra
  // scale bits added above.
  DyadicFloat<(Bits * 2)> qbig(qf.sign, qf.exponent - 2, q);
  return DyadicFloat<Bits>::round(qbig.sign, qbig.exponent + Bits,
                                  qbig.mantissa, Bits);
}
672 | |
673 | // Simple polynomial approximation. |
674 | template <size_t Bits> |
675 | LIBC_INLINE constexpr DyadicFloat<Bits> |
676 | multiply_add(const DyadicFloat<Bits> &a, const DyadicFloat<Bits> &b, |
677 | const DyadicFloat<Bits> &c) { |
678 | return quick_add(c, quick_mul(a, b)); |
679 | } |
680 | |
681 | // Simple exponentiation implementation for printf. Only handles positive |
682 | // exponents, since division isn't implemented. |
683 | template <size_t Bits> |
684 | LIBC_INLINE constexpr DyadicFloat<Bits> pow_n(const DyadicFloat<Bits> &a, |
685 | uint32_t power) { |
686 | DyadicFloat<Bits> result = 1.0; |
687 | DyadicFloat<Bits> cur_power = a; |
688 | |
689 | while (power > 0) { |
690 | if ((power % 2) > 0) { |
691 | result = quick_mul(result, cur_power); |
692 | } |
693 | power = power >> 1; |
694 | cur_power = quick_mul(cur_power, cur_power); |
695 | } |
696 | return result; |
697 | } |
698 | |
699 | template <size_t Bits> |
700 | LIBC_INLINE constexpr DyadicFloat<Bits> mul_pow_2(const DyadicFloat<Bits> &a, |
701 | int32_t pow_2) { |
702 | DyadicFloat<Bits> result = a; |
703 | result.exponent += pow_2; |
704 | return result; |
705 | } |
706 | |
707 | } // namespace fputil |
708 | } // namespace LIBC_NAMESPACE_DECL |
709 | |
710 | #endif // LLVM_LIBC_SRC___SUPPORT_FPUTIL_DYADIC_FLOAT_H |
711 |
Warning: This file is not a C or C++ file. It does not have highlighting.