//===-- A class to store high precision floating point numbers --*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC___SUPPORT_FPUTIL_DYADIC_FLOAT_H
#define LLVM_LIBC_SRC___SUPPORT_FPUTIL_DYADIC_FLOAT_H

#include "FEnvImpl.h"
#include "FPBits.h"
#include "hdr/errno_macros.h"
#include "hdr/fenv_macros.h"
#include "multiply_add.h"
#include "rounding_mode.h"
#include "src/__support/CPP/type_traits.h"
#include "src/__support/big_int.h"
#include "src/__support/macros/config.h"
#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
#include "src/__support/macros/properties/types.h"

#include <stddef.h>

namespace LIBC_NAMESPACE_DECL {
namespace fputil {

// Decide whether to round a UInt up, down or not at all at a given bit
// position, based on the current rounding mode. The assumption is that the
// caller is going to make the integer `value >> rshift`, and then might need
// to round it up by 1 depending on the value of the bits shifted off the
// bottom.
//
// `logical_sign` causes the behavior of FE_DOWNWARD and FE_UPWARD to
// be reversed, which is what you'd want if this is the mantissa of a
// negative floating-point number.
//
// Return value is +1 if the value should be rounded up; -1 if it should be
// rounded down; 0 if it's exact and needs no rounding.
template <size_t Bits>
LIBC_INLINE constexpr int
rounding_direction(const LIBC_NAMESPACE::UInt<Bits> &value, size_t rshift,
                   Sign logical_sign) {
  if (rshift == 0 || (rshift < Bits && (value << (Bits - rshift)) == 0) ||
      (rshift >= Bits && value == 0))
    return 0; // exact

  switch (quick_get_round()) {
  case FE_TONEAREST:
    if (rshift > 0 && rshift <= Bits && value.get_bit(rshift - 1)) {
      // We round up, unless the value is an exact halfway case and
      // the bit that will end up in the units place is 0, in which
      // case tie-break-to-even says round down.
      bool round_bit = rshift < Bits ? value.get_bit(rshift) : 0;
      return round_bit != 0 || (value << (Bits - rshift + 1)) != 0 ? +1 : -1;
    } else {
      return -1;
    }
  case FE_TOWARDZERO:
    return -1;
  case FE_DOWNWARD:
    return logical_sign.is_neg() &&
                   (rshift < Bits && (value << (Bits - rshift)) != 0)
               ? +1
               : -1;
  case FE_UPWARD:
    return logical_sign.is_pos() &&
                   (rshift < Bits && (value << (Bits - rshift)) != 0)
               ? +1
               : -1;
  default:
    __builtin_unreachable();
  }
}
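
// Worked example (illustrative only, using a deliberately tiny width for
// readability): with Bits == 8, value == 0b10110100 and rshift == 3, the
// discarded low bits are 0b100, i.e. exactly half an ULP of the shifted
// result. Under FE_TONEAREST the tie-break-to-even rule applies: the would-be
// units bit (bit 3) is 0, so the function returns -1 and `value >> 3 ==
// 0b10110` is kept as-is. Changing the low bits to 0b110 makes the remainder
// more than half an ULP, so the function returns +1 instead.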

// A generic class to perform computations with high precision floating point
// numbers. We store the value in dyadic format, including 3 fields:
//   sign    : boolean value - false means positive, true means negative
//   exponent: the exponent value of the least significant bit of the mantissa.
//   mantissa: unsigned integer of length `Bits`.
// So the real value that is stored is:
//   real value = (-1)^sign * 2^exponent * (mantissa as unsigned integer)
// The stored value is normal if, for a non-zero mantissa, the leading bit is
// 1. The outputs of the constructors and most functions will be normalized.
// To simplify and improve efficiency, many functions will assume that the
// inputs are normal.
template <size_t Bits> struct DyadicFloat {
  using MantissaType = LIBC_NAMESPACE::UInt<Bits>;

  Sign sign = Sign::POS;
  int exponent = 0;
  MantissaType mantissa = MantissaType(0);

  LIBC_INLINE constexpr DyadicFloat() = default;

  template <typename T, cpp::enable_if_t<cpp::is_floating_point_v<T>, int> = 0>
  LIBC_INLINE constexpr DyadicFloat(T x) {
    static_assert(FPBits<T>::FRACTION_LEN < Bits);
    FPBits<T> x_bits(x);
    sign = x_bits.sign();
    exponent = x_bits.get_explicit_exponent() - FPBits<T>::FRACTION_LEN;
    mantissa = MantissaType(x_bits.get_explicit_mantissa());
    normalize();
  }
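
  // Illustrative example (a sketch, assuming Bits == 128): constructing
  // DyadicFloat<128>(1.0) starts from the explicit double mantissa 2^52 and
  // exponent 0 - 52 == -52; normalize() then shifts the mantissa up by 75
  // bits so that bit 127 is set, leaving exponent == -127 and
  // mantissa == 2^127, i.e. the stored value is 2^-127 * 2^127 == 1.0.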

  LIBC_INLINE constexpr DyadicFloat(Sign s, int e, const MantissaType &m)
      : sign(s), exponent(e), mantissa(m) {
    normalize();
  }

  // Normalizing the mantissa, bringing the leading 1 bit to the most
  // significant bit.
  LIBC_INLINE constexpr DyadicFloat &normalize() {
    if (!mantissa.is_zero()) {
      int shift_length = cpp::countl_zero(mantissa);
      exponent -= shift_length;
      mantissa <<= static_cast<size_t>(shift_length);
    }
    return *this;
  }

  // Used for aligning exponents. Output might not be normalized.
  LIBC_INLINE constexpr DyadicFloat &shift_left(unsigned shift_length) {
    if (shift_length < Bits) {
      exponent -= static_cast<int>(shift_length);
      mantissa <<= shift_length;
    } else {
      exponent = 0;
      mantissa = MantissaType(0);
    }
    return *this;
  }

  // Used for aligning exponents. Output might not be normalized.
  LIBC_INLINE constexpr DyadicFloat &shift_right(unsigned shift_length) {
    if (shift_length < Bits) {
      exponent += static_cast<int>(shift_length);
      mantissa >>= shift_length;
    } else {
      exponent = 0;
      mantissa = MantissaType(0);
    }
    return *this;
  }

  // Assume that it is already normalized. Output the unbiased exponent.
  LIBC_INLINE constexpr int get_unbiased_exponent() const {
    return exponent + (Bits - 1);
  }
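
  // For example (informal): a normalized mantissa has its leading 1 in bit
  // Bits - 1, so the value equals 2^(exponent + Bits - 1) * 1.xxx...; for the
  // DyadicFloat<128> holding 1.0 above (exponent == -127), this returns
  // -127 + 127 == 0, the exponent of the leading bit.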

  // Produce a correctly rounded DyadicFloat from a too-large mantissa,
  // by shifting it down and rounding if necessary.
  template <size_t MantissaBits>
  LIBC_INLINE constexpr static DyadicFloat<Bits>
  round(Sign result_sign, int result_exponent,
        const LIBC_NAMESPACE::UInt<MantissaBits> &input_mantissa,
        size_t rshift) {
    MantissaType result_mantissa(input_mantissa >> rshift);
    if (rounding_direction(input_mantissa, rshift, result_sign) > 0) {
      ++result_mantissa;
      if (result_mantissa == 0) {
        // Rounding up made the mantissa integer wrap round to 0,
        // carrying a bit off the top. So we've rounded up to the next
        // exponent.
        result_mantissa.set_bit(Bits - 1);
        ++result_exponent;
      }
    }
    return DyadicFloat(result_sign, result_exponent, result_mantissa);
  }
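
  // Worked example (illustrative, using tiny widths for readability): with
  // Bits == 8, rounding the 16-bit mantissa 0xFFC0 down by rshift == 8 under
  // FE_TONEAREST first truncates to 0xFF; the discarded byte 0xC0 is more
  // than half an ULP, so the mantissa is incremented, wraps to 0, and the
  // wrap is repaired by setting bit 7 and bumping the exponent, giving
  // mantissa 0x80 with exponent + 1, i.e. 256 * 2^result_exponent, the
  // correctly rounded value of 255.75 * 2^result_exponent.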

#ifdef LIBC_TYPES_HAS_FLOAT16
  template <typename T, bool ShouldSignalExceptions>
  LIBC_INLINE constexpr cpp::enable_if_t<
      cpp::is_floating_point_v<T> && (FPBits<T>::FRACTION_LEN < Bits), T>
  generic_as() const {
    using FPBits = FPBits<T>;
    using StorageType = typename FPBits::StorageType;

    constexpr int EXTRA_FRACTION_LEN = Bits - 1 - FPBits::FRACTION_LEN;

    if (mantissa == 0)
      return FPBits::zero(sign).get_val();

    int unbiased_exp = get_unbiased_exponent();

    if (unbiased_exp + FPBits::EXP_BIAS >= FPBits::MAX_BIASED_EXPONENT) {
      if constexpr (ShouldSignalExceptions) {
        set_errno_if_required(ERANGE);
        raise_except_if_required(FE_OVERFLOW | FE_INEXACT);
      }

      switch (quick_get_round()) {
      case FE_TONEAREST:
        return FPBits::inf(sign).get_val();
      case FE_TOWARDZERO:
        return FPBits::max_normal(sign).get_val();
      case FE_DOWNWARD:
        if (sign.is_pos())
          return FPBits::max_normal(Sign::POS).get_val();
        return FPBits::inf(Sign::NEG).get_val();
      case FE_UPWARD:
        if (sign.is_neg())
          return FPBits::max_normal(Sign::NEG).get_val();
        return FPBits::inf(Sign::POS).get_val();
      default:
        __builtin_unreachable();
      }
    }

    StorageType out_biased_exp = 0;
    StorageType out_mantissa = 0;
    bool round = false;
    bool sticky = false;
    bool underflow = false;

    if (unbiased_exp < -FPBits::EXP_BIAS - FPBits::FRACTION_LEN) {
      sticky = true;
      underflow = true;
    } else if (unbiased_exp == -FPBits::EXP_BIAS - FPBits::FRACTION_LEN) {
      round = true;
      MantissaType sticky_mask = (MantissaType(1) << (Bits - 1)) - 1;
      sticky = (mantissa & sticky_mask) != 0;
    } else {
      int extra_fraction_len = EXTRA_FRACTION_LEN;

      if (unbiased_exp < 1 - FPBits::EXP_BIAS) {
        underflow = true;
        extra_fraction_len += 1 - FPBits::EXP_BIAS - unbiased_exp;
      } else {
        out_biased_exp =
            static_cast<StorageType>(unbiased_exp + FPBits::EXP_BIAS);
      }

      MantissaType round_mask = MantissaType(1) << (extra_fraction_len - 1);
      round = (mantissa & round_mask) != 0;
      MantissaType sticky_mask = round_mask - 1;
      sticky = (mantissa & sticky_mask) != 0;

      out_mantissa = static_cast<StorageType>(mantissa >> extra_fraction_len);
    }

    bool lsb = (out_mantissa & 1) != 0;

    StorageType result =
        FPBits::create_value(sign, out_biased_exp, out_mantissa).uintval();

    switch (quick_get_round()) {
    case FE_TONEAREST:
      if (round && (lsb || sticky))
        ++result;
      break;
    case FE_DOWNWARD:
      if (sign.is_neg() && (round || sticky))
        ++result;
      break;
    case FE_UPWARD:
      if (sign.is_pos() && (round || sticky))
        ++result;
      break;
    default:
      break;
    }

    if (ShouldSignalExceptions && (round || sticky)) {
      int excepts = FE_INEXACT;
      if (FPBits(result).is_inf()) {
        set_errno_if_required(ERANGE);
        excepts |= FE_OVERFLOW;
      } else if (underflow) {
        set_errno_if_required(ERANGE);
        excepts |= FE_UNDERFLOW;
      }
      raise_except_if_required(excepts);
    }

    return FPBits(result).get_val();
  }
#endif // LIBC_TYPES_HAS_FLOAT16

  template <typename T, bool ShouldSignalExceptions,
            typename = cpp::enable_if_t<cpp::is_floating_point_v<T> &&
                                            (FPBits<T>::FRACTION_LEN < Bits),
                                        void>>
  LIBC_INLINE constexpr T fast_as() const {
    if (LIBC_UNLIKELY(mantissa.is_zero()))
      return FPBits<T>::zero(sign).get_val();

    // Assume that it is normalized, and output is also normal.
    constexpr uint32_t PRECISION = FPBits<T>::FRACTION_LEN + 1;
    using output_bits_t = typename FPBits<T>::StorageType;
    constexpr output_bits_t IMPLICIT_MASK =
        FPBits<T>::SIG_MASK - FPBits<T>::FRACTION_MASK;

    int exp_hi = exponent + static_cast<int>((Bits - 1) + FPBits<T>::EXP_BIAS);

    if (LIBC_UNLIKELY(exp_hi > 2 * FPBits<T>::EXP_BIAS)) {
      // Results overflow.
      T d_hi =
          FPBits<T>::create_value(sign, 2 * FPBits<T>::EXP_BIAS, IMPLICIT_MASK)
              .get_val();
      // volatile prevents constant propagation that would result in infinity
      // always being returned no matter the current rounding mode.
      volatile T two = static_cast<T>(2.0);
      T r = two * d_hi;

      // TODO: Whether rounding down the absolute value to max_normal should
      // also raise FE_OVERFLOW and set ERANGE is debatable.
      if (ShouldSignalExceptions && FPBits<T>(r).is_inf())
        set_errno_if_required(ERANGE);

      return r;
    }

    bool denorm = false;
    uint32_t shift = Bits - PRECISION;
    if (LIBC_UNLIKELY(exp_hi <= 0)) {
      // Output is denormal.
      denorm = true;
      shift = (Bits - PRECISION) + static_cast<uint32_t>(1 - exp_hi);

      exp_hi = FPBits<T>::EXP_BIAS;
    }

    int exp_lo = exp_hi - static_cast<int>(PRECISION) - 1;

    MantissaType m_hi =
        shift >= MantissaType::BITS ? MantissaType(0) : mantissa >> shift;

    T d_hi = FPBits<T>::create_value(
                 sign, static_cast<output_bits_t>(exp_hi),
                 (static_cast<output_bits_t>(m_hi) & FPBits<T>::SIG_MASK) |
                     IMPLICIT_MASK)
                 .get_val();

    MantissaType round_mask =
        shift - 1 >= MantissaType::BITS ? 0 : MantissaType(1) << (shift - 1);
    MantissaType sticky_mask = round_mask - MantissaType(1);

    bool round_bit = !(mantissa & round_mask).is_zero();
    bool sticky_bit = !(mantissa & sticky_mask).is_zero();
    int round_and_sticky = int(round_bit) * 2 + int(sticky_bit);

    T d_lo;

    if (LIBC_UNLIKELY(exp_lo <= 0)) {
      // d_lo is denormal, but the output is normal.
      int scale_up_exponent = 1 - exp_lo;
      T scale_up_factor =
          FPBits<T>::create_value(Sign::POS,
                                  static_cast<output_bits_t>(
                                      FPBits<T>::EXP_BIAS + scale_up_exponent),
                                  IMPLICIT_MASK)
              .get_val();
      T scale_down_factor =
          FPBits<T>::create_value(Sign::POS,
                                  static_cast<output_bits_t>(
                                      FPBits<T>::EXP_BIAS - scale_up_exponent),
                                  IMPLICIT_MASK)
              .get_val();

      d_lo = FPBits<T>::create_value(
                 sign, static_cast<output_bits_t>(exp_lo + scale_up_exponent),
                 IMPLICIT_MASK)
                 .get_val();

      return multiply_add(d_lo, T(round_and_sticky), d_hi * scale_up_factor) *
             scale_down_factor;
    }

    d_lo = FPBits<T>::create_value(sign, static_cast<output_bits_t>(exp_lo),
                                   IMPLICIT_MASK)
               .get_val();

    // Still correct without FMA instructions if `d_lo` does not underflow.
    T r = multiply_add(d_lo, T(round_and_sticky), d_hi);

    if (LIBC_UNLIKELY(denorm)) {
      // Exponent before rounding is in denormal range, simply clear the
      // exponent field.
      output_bits_t clear_exp = static_cast<output_bits_t>(
          output_bits_t(exp_hi) << FPBits<T>::SIG_LEN);
      output_bits_t r_bits = FPBits<T>(r).uintval() - clear_exp;

      if (!(r_bits & FPBits<T>::EXP_MASK)) {
        // Output is denormal after rounding, clear the implicit bit for 80-bit
        // long double.
        r_bits -= IMPLICIT_MASK;

        // TODO: IEEE Std 754-2019 lets implementers choose whether to check
        // for "tininess" before or after rounding for base-2 formats, as long
        // as the same choice is made for all operations. Our choice to check
        // after rounding might not be the same as the hardware's.
        if (ShouldSignalExceptions && round_and_sticky) {
          set_errno_if_required(ERANGE);
          raise_except_if_required(FE_UNDERFLOW);
        }
      }

      return FPBits<T>(r_bits).get_val();
    }

    return r;
  }

  // Assume that it is already normalized.
  // Output is rounded correctly with respect to the current rounding mode.
  template <typename T, bool ShouldSignalExceptions,
            typename = cpp::enable_if_t<cpp::is_floating_point_v<T> &&
                                            (FPBits<T>::FRACTION_LEN < Bits),
                                        void>>
  LIBC_INLINE constexpr T as() const {
#if defined(LIBC_TYPES_HAS_FLOAT16) && !defined(__LIBC_USE_FLOAT16_CONVERSION)
    if constexpr (cpp::is_same_v<T, float16>)
      return generic_as<T, ShouldSignalExceptions>();
#endif
    return fast_as<T, ShouldSignalExceptions>();
  }
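
  // Illustrative usage (a sketch, not a prescribed pattern): the quiet path
  // goes through the explicit conversion operator defined just below, while
  // callers that need errno / FE_* side effects call as<>() directly.
  //
  //   DyadicFloat<128> d = ...;
  //   double quiet = static_cast<double>(d);  // no errno, no FE_* flags
  //   double loud = d.as<double, /*ShouldSignalExceptions=*/true>();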

  template <typename T,
            typename = cpp::enable_if_t<cpp::is_floating_point_v<T> &&
                                            (FPBits<T>::FRACTION_LEN < Bits),
                                        void>>
  LIBC_INLINE explicit constexpr operator T() const {
    return as<T, /*ShouldSignalExceptions=*/false>();
  }

  LIBC_INLINE constexpr MantissaType as_mantissa_type() const {
    if (mantissa.is_zero())
      return 0;

    MantissaType new_mant = mantissa;
    if (exponent > 0) {
      new_mant <<= exponent;
    } else {
      // Cast the exponent to size_t before negating it, rather than after,
      // to avoid undefined behavior negating INT_MIN as an integer (although
      // exponents coming in to this function _shouldn't_ be that large). The
      // result should always end up as a positive size_t.
      size_t shift = -static_cast<size_t>(exponent);
      new_mant >>= shift;
    }

    if (sign.is_neg()) {
      new_mant = (~new_mant) + 1;
    }

    return new_mant;
  }
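
  // Worked example (illustrative): for a DyadicFloat<128> holding -3.25, the
  // normalized mantissa shifted right by -exponent yields the integer 3 (the
  // fractional bits are simply dropped, i.e. the magnitude is truncated
  // toward zero), and the negative sign is then applied as a two's complement
  // negation of the 128-bit result. as_mantissa_type_rounded() below is the
  // variant that consults the current rounding mode instead of truncating.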

  LIBC_INLINE constexpr MantissaType
  as_mantissa_type_rounded(int *round_dir_out = nullptr) const {
    int round_dir = 0;
    MantissaType new_mant;
    if (mantissa.is_zero()) {
      new_mant = 0;
    } else {
      new_mant = mantissa;
      if (exponent > 0) {
        new_mant <<= exponent;
      } else if (exponent < 0) {
        // Cast the exponent to size_t before negating it, rather than after,
        // to avoid undefined behavior negating INT_MIN as an integer (although
        // exponents coming in to this function _shouldn't_ be that large). The
        // result should always end up as a positive size_t.
        size_t shift = -static_cast<size_t>(exponent);
        new_mant >>= shift;
        round_dir = rounding_direction(mantissa, shift, sign);
        if (round_dir > 0)
          ++new_mant;
      }

      if (sign.is_neg()) {
        new_mant = (~new_mant) + 1;
      }
    }

    if (round_dir_out)
      *round_dir_out = round_dir;

    return new_mant;
  }

  LIBC_INLINE constexpr DyadicFloat operator-() const {
    return DyadicFloat(sign.negate(), exponent, mantissa);
  }
};

// Quick add - Add 2 dyadic floats with rounding toward 0 and then normalize
// the output:
//   - Align the exponents so that:
//       new a.exponent = new b.exponent = max(a.exponent, b.exponent)
//   - Add or subtract the mantissas depending on the signs.
//   - Normalize the result.
// The absolute error compared to the mathematical sum is bounded by:
//   | quick_add(a, b) - (a + b) | < MSB(a + b) * 2^(-Bits + 2),
// i.e., errors are up to 2 ULPs.
// Assume inputs are normalized (by constructors or other functions) so that we
// don't need to normalize the inputs again in this function. If the inputs are
// not normalized, the results might lose precision significantly.
template <size_t Bits>
LIBC_INLINE constexpr DyadicFloat<Bits> quick_add(DyadicFloat<Bits> a,
                                                  DyadicFloat<Bits> b) {
  if (LIBC_UNLIKELY(a.mantissa.is_zero()))
    return b;
  if (LIBC_UNLIKELY(b.mantissa.is_zero()))
    return a;

  // Align exponents
  if (a.exponent > b.exponent)
    b.shift_right(static_cast<unsigned>(a.exponent - b.exponent));
  else if (b.exponent > a.exponent)
    a.shift_right(static_cast<unsigned>(b.exponent - a.exponent));

  DyadicFloat<Bits> result;

  if (a.sign == b.sign) {
    // Addition
    result.sign = a.sign;
    result.exponent = a.exponent;
    result.mantissa = a.mantissa;
    if (result.mantissa.add_overflow(b.mantissa)) {
      // Mantissa addition overflow.
      result.shift_right(1);
      result.mantissa.val[DyadicFloat<Bits>::MantissaType::WORD_COUNT - 1] |=
          (uint64_t(1) << 63);
    }
    // Result is already normalized.
    return result;
  }

  // Subtraction
  if (a.mantissa >= b.mantissa) {
    result.sign = a.sign;
    result.exponent = a.exponent;
    result.mantissa = a.mantissa - b.mantissa;
  } else {
    result.sign = b.sign;
    result.exponent = b.exponent;
    result.mantissa = b.mantissa - a.mantissa;
  }

  return result.normalize();
}
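
// Illustrative usage (a sketch, assuming Bits == 128): summing a handful of
// doubles exactly, since each aligned term still fits inside the 128-bit
// mantissa.
//
//   DyadicFloat<128> acc(0.0);
//   for (double t : {0x1.0p0, 0x1.0p-60, -0x1.0p-120})
//     acc = quick_add(acc, DyadicFloat<128>(t));
//   double s = static_cast<double>(acc);  // rounded per the current mode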

template <size_t Bits>
LIBC_INLINE constexpr DyadicFloat<Bits> quick_sub(DyadicFloat<Bits> a,
                                                  DyadicFloat<Bits> b) {
  return quick_add(a, -b);
}

// Quick Mul - Slightly less accurate but efficient multiplication of 2 dyadic
// floats with rounding toward 0 and then normalize the output:
//   result.exponent = a.exponent + b.exponent + Bits,
//   result.mantissa = quick_mul_hi(a.mantissa, b.mantissa)
//                   ~ (full product a.mantissa * b.mantissa) >> Bits.
// The error compared to the mathematical product is bounded by:
//   2 * errors of quick_mul_hi = 2 * (UInt<Bits>::WORD_COUNT - 1) in ULPs.
// Assume inputs are normalized (by constructors or other functions) so that we
// don't need to normalize the inputs again in this function. If the inputs are
// not normalized, the results might lose precision significantly.
template <size_t Bits>
LIBC_INLINE constexpr DyadicFloat<Bits> quick_mul(const DyadicFloat<Bits> &a,
                                                  const DyadicFloat<Bits> &b) {
  DyadicFloat<Bits> result;
  result.sign = (a.sign != b.sign) ? Sign::NEG : Sign::POS;
  result.exponent = a.exponent + b.exponent + static_cast<int>(Bits);

  if (!(a.mantissa.is_zero() || b.mantissa.is_zero())) {
    result.mantissa = a.mantissa.quick_mul_hi(b.mantissa);
    // Check the leading bit directly, should be faster than using clz in
    // normalize().
    if (result.mantissa.val[DyadicFloat<Bits>::MantissaType::WORD_COUNT - 1] >>
            63 ==
        0)
      result.shift_left(1);
  } else {
    result.mantissa = (typename DyadicFloat<Bits>::MantissaType)(0);
  }
  return result;
}
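
// Worked example (illustrative, assuming Bits == 128): multiplying 1.0 by
// 1.0, both stored as mantissa 2^127 with exponent -127, gives
//   result.exponent = -127 + -127 + 128 = -126 and
//   quick_mul_hi    = (2^127 * 2^127) >> 128 = 2^126.
// The leading bit is then 0, so shift_left(1) restores normalization, leaving
// mantissa 2^127 and exponent -127, i.e. exactly 1.0 again.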

// Correctly rounded multiplication of 2 dyadic floats, assuming the
// exponent remains within range.
template <size_t Bits>
LIBC_INLINE constexpr DyadicFloat<Bits>
rounded_mul(const DyadicFloat<Bits> &a, const DyadicFloat<Bits> &b) {
  using DblMant = LIBC_NAMESPACE::UInt<(2 * Bits)>;
  Sign result_sign = (a.sign != b.sign) ? Sign::NEG : Sign::POS;
  int result_exponent = a.exponent + b.exponent + static_cast<int>(Bits);
  auto product = DblMant(a.mantissa) * DblMant(b.mantissa);
  // As in quick_mul(), renormalize by 1 bit manually rather than countl_zero
  if (product.get_bit(2 * Bits - 1) == 0) {
    product <<= 1;
    result_exponent -= 1;
  }

  return DyadicFloat<Bits>::round(result_sign, result_exponent, product, Bits);
}

// Approximate reciprocal - given a nonzero a, make a good approximation to
// 1/a. The method is Newton-Raphson iteration, based on quick_mul.
template <size_t Bits, typename = cpp::enable_if_t<(Bits >= 32)>>
LIBC_INLINE constexpr DyadicFloat<Bits>
approx_reciprocal(const DyadicFloat<Bits> &a) {
  // Given an approximation x to 1/a, a better one is x' = x(2-ax).
  //
  // You can derive this by using the Newton-Raphson formula with the function
  // f(x) = 1/x - a. But another way to see that it works is to say: suppose
  // that ax = 1-e for some small error e. Then ax' = ax(2-ax) = (1-e)(1+e) =
  // 1-e^2. So the error in x' is the square of the error in x, i.e. the
  // number of correct bits in x' is double the number in x.

  // An initial approximation to the reciprocal
  DyadicFloat<Bits> x(Sign::POS, -32 - a.exponent - int(Bits),
                      uint64_t(0xFFFFFFFFFFFFFFFF) /
                          static_cast<uint64_t>(a.mantissa >> (Bits - 32)));

  // The constant 2, which we'll need in every iteration
  DyadicFloat<Bits> two(Sign::POS, 1, 1);

  // We expect at least 31 correct bits from our 32-bit starting approximation
  size_t ok_bits = 31;

  // The number of good bits doubles in each iteration, except that rounding
  // errors introduce a little extra each time. Subtract a bit from our
  // accuracy assessment to account for that.
  while (ok_bits < Bits) {
    x = quick_mul(x, quick_sub(two, quick_mul(a, x)));
    ok_bits = 2 * ok_bits - 1;
  }

  return x;
}
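
// Illustrative accounting (informal): for Bits == 128 the loop above runs
// three times, with ok_bits going 31 -> 61 -> 121 -> 241, so the iteration
// count grows only logarithmically with the requested precision; the "- 1" in
// the update is the slack reserved for the quick_mul / quick_sub rounding
// errors.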

// Correctly rounded division of 2 dyadic floats, assuming the
// exponent remains within range.
template <size_t Bits>
LIBC_INLINE constexpr DyadicFloat<Bits>
rounded_div(const DyadicFloat<Bits> &af, const DyadicFloat<Bits> &bf) {
  using DblMant = LIBC_NAMESPACE::UInt<(Bits * 2 + 64)>;

  // Make an approximation to the quotient as a * (1/b). Both the
  // multiplication and the reciprocal are a bit sloppy, which doesn't
  // matter, because we're going to correct for that below.
  auto qf = fputil::quick_mul(af, fputil::approx_reciprocal(bf));

  // Switch to BigInt and stop using quick_add and quick_mul: now
  // we're working in exact integers so as to get the true remainder.
  DblMant a = af.mantissa, b = bf.mantissa, q = qf.mantissa;
  q <<= 2; // leave room for a round bit, even if exponent decreases
  a <<= af.exponent - bf.exponent - qf.exponent + 2;
  DblMant qb = q * b;
  if (qb < a) {
    DblMant too_small = a - b;
    while (qb <= too_small) {
      qb += b;
      ++q;
    }
  } else {
    while (qb > a) {
      qb -= b;
      --q;
    }
  }

  DyadicFloat<(Bits * 2)> qbig(qf.sign, qf.exponent - 2, q);
  return DyadicFloat<Bits>::round(qbig.sign, qbig.exponent + Bits,
                                  qbig.mantissa, Bits);
}
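
// Illustrative usage (a sketch): computing a correctly rounded 128-bit
// quotient of two doubles.
//
//   DyadicFloat<128> q =
//       rounded_div(DyadicFloat<128>(1.0), DyadicFloat<128>(3.0));
//   // q now holds 1/3 rounded correctly to a 128-bit mantissa; convert with
//   // static_cast<double>(q) if a double result is needed.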

// Simple polynomial approximation.
template <size_t Bits>
LIBC_INLINE constexpr DyadicFloat<Bits>
multiply_add(const DyadicFloat<Bits> &a, const DyadicFloat<Bits> &b,
             const DyadicFloat<Bits> &c) {
  return quick_add(c, quick_mul(a, b));
}

// Simple exponentiation implementation for printf. Only handles positive
// exponents, since division isn't implemented.
template <size_t Bits>
LIBC_INLINE constexpr DyadicFloat<Bits> pow_n(const DyadicFloat<Bits> &a,
                                              uint32_t power) {
  DyadicFloat<Bits> result = 1.0;
  DyadicFloat<Bits> cur_power = a;

  while (power > 0) {
    if ((power % 2) > 0) {
      result = quick_mul(result, cur_power);
    }
    power = power >> 1;
    cur_power = quick_mul(cur_power, cur_power);
  }
  return result;
}
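
// Worked example (informal): pow_n(a, 13) walks the bits of 13 == 0b1101 and
// multiplies the squares a^1, a^4 and a^8 into the accumulator, i.e. a^13
// costs four squarings plus three multiplications into the result (all via
// quick_mul) instead of twelve successive multiplications by a.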

template <size_t Bits>
LIBC_INLINE constexpr DyadicFloat<Bits> mul_pow_2(const DyadicFloat<Bits> &a,
                                                  int32_t pow_2) {
  DyadicFloat<Bits> result = a;
  result.exponent += pow_2;
  return result;
}

} // namespace fputil
} // namespace LIBC_NAMESPACE_DECL

#endif // LLVM_LIBC_SRC___SUPPORT_FPUTIL_DYADIC_FLOAT_H