Warning: This file is not a C or C++ file. It does not have highlighting.
1 | //===-- Utilities for double-double data type. ------------------*- C++ -*-===// |
---|---|
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #ifndef LLVM_LIBC_SRC___SUPPORT_FPUTIL_DOUBLE_DOUBLE_H |
10 | #define LLVM_LIBC_SRC___SUPPORT_FPUTIL_DOUBLE_DOUBLE_H |
11 | |
12 | #include "multiply_add.h" |
13 | #include "src/__support/common.h" |
14 | #include "src/__support/macros/config.h" |
15 | #include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA |
16 | #include "src/__support/number_pair.h" |
17 | |
18 | namespace LIBC_NAMESPACE_DECL { |
19 | namespace fputil { |
20 | |
21 | template <typename T> struct DefaultSplit; |
22 | template <> struct DefaultSplit<float> { |
23 | static constexpr size_t VALUE = 12; |
24 | }; |
25 | template <> struct DefaultSplit<double> { |
26 | static constexpr size_t VALUE = 27; |
27 | }; |
28 | |
29 | using DoubleDouble = NumberPair<double>; |
30 | using FloatFloat = NumberPair<float>; |
31 | |
32 | // The output of Dekker's FastTwoSum algorithm is correct, i.e.: |
33 | // r.hi + r.lo = a + b exactly |
34 | // and |r.lo| < eps(r.lo) |
35 | // Assumption: |a| >= |b|, or a = 0. |
36 | template <bool FAST2SUM = true, typename T = double> |
37 | LIBC_INLINE constexpr NumberPair<T> exact_add(T a, T b) { |
38 | NumberPair<T> r{0.0, 0.0}; |
39 | if constexpr (FAST2SUM) { |
40 | r.hi = a + b; |
41 | T t = r.hi - a; |
42 | r.lo = b - t; |
43 | } else { |
44 | r.hi = a + b; |
45 | T t1 = r.hi - a; |
46 | T t2 = r.hi - t1; |
47 | T t3 = b - t1; |
48 | T t4 = a - t2; |
49 | r.lo = t3 + t4; |
50 | } |
51 | return r; |
52 | } |
53 | |
54 | // Assumption: |a.hi| >= |b.hi| |
55 | template <typename T> |
56 | LIBC_INLINE constexpr NumberPair<T> add(const NumberPair<T> &a, |
57 | const NumberPair<T> &b) { |
58 | NumberPair<T> r = exact_add(a.hi, b.hi); |
59 | T lo = a.lo + b.lo; |
60 | return exact_add(r.hi, r.lo + lo); |
61 | } |
62 | |
63 | // Assumption: |a.hi| >= |b| |
64 | template <typename T> |
65 | LIBC_INLINE constexpr NumberPair<T> add(const NumberPair<T> &a, T b) { |
66 | NumberPair<T> r = exact_add<false>(a.hi, b); |
67 | return exact_add(r.hi, r.lo + a.lo); |
68 | } |
69 | |
70 | // Veltkamp's Splitting for double precision. |
71 | // Note: This is proved to be correct for all rounding modes: |
72 | // Zimmermann, P., "Note on the Veltkamp/Dekker Algorithms with Directed |
73 | // Roundings," https://inria.hal.science/hal-04480440. |
74 | // Default splitting constant = 2^ceil(prec(double)/2) + 1 = 2^27 + 1. |
75 | template <typename T = double, size_t N = DefaultSplit<T>::VALUE> |
76 | LIBC_INLINE constexpr NumberPair<T> split(T a) { |
77 | NumberPair<T> r{0.0, 0.0}; |
78 | // CN = 2^N. |
79 | constexpr T CN = static_cast<T>(1 << N); |
80 | constexpr T C = CN + 1.0; |
81 | double t1 = C * a; |
82 | double t2 = a - t1; |
83 | r.hi = t1 + t2; |
84 | r.lo = a - r.hi; |
85 | return r; |
86 | } |
87 | |
88 | // Helper for non-fma exact mult where the first number is already split. |
89 | template <typename T = double, size_t SPLIT_B = DefaultSplit<T>::VALUE> |
90 | LIBC_INLINE NumberPair<T> exact_mult(const NumberPair<T> &as, T a, T b) { |
91 | NumberPair<T> bs = split<T, SPLIT_B>(b); |
92 | NumberPair<T> r{0.0, 0.0}; |
93 | |
94 | r.hi = a * b; |
95 | T t1 = as.hi * bs.hi - r.hi; |
96 | T t2 = as.hi * bs.lo + t1; |
97 | T t3 = as.lo * bs.hi + t2; |
98 | r.lo = as.lo * bs.lo + t3; |
99 | |
100 | return r; |
101 | } |
102 | |
103 | // The templated exact multiplication needs template version of |
104 | // LIBC_TARGET_CPU_HAS_FMA_* macro to correctly select the implementation. |
105 | // These can be moved to "src/__support/macros/properties/cpu_features.h" if |
106 | // other part of libc needed. |
107 | template <typename T> struct TargetHasFmaInstruction { |
108 | static constexpr bool VALUE = false; |
109 | }; |
110 | |
111 | #ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT |
112 | template <> struct TargetHasFmaInstruction<float> { |
113 | static constexpr bool VALUE = true; |
114 | }; |
115 | #endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT |
116 | |
117 | #ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE |
118 | template <> struct TargetHasFmaInstruction<double> { |
119 | static constexpr bool VALUE = true; |
120 | }; |
121 | #endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE |
122 | |
123 | // Note: When FMA instruction is not available, the `exact_mult` function is |
124 | // only correct for round-to-nearest mode. See: |
125 | // Zimmermann, P., "Note on the Veltkamp/Dekker Algorithms with Directed |
126 | // Roundings," https://inria.hal.science/hal-04480440. |
127 | // Using Theorem 1 in the paper above, without FMA instruction, if we restrict |
128 | // the generated constants to precision <= 51, and splitting it by 2^28 + 1, |
129 | // then a * b = r.hi + r.lo is exact for all rounding modes. |
130 | template <typename T = double, size_t SPLIT_B = DefaultSplit<T>::VALUE> |
131 | LIBC_INLINE NumberPair<T> exact_mult(T a, T b) { |
132 | NumberPair<T> r{0.0, 0.0}; |
133 | |
134 | if constexpr (TargetHasFmaInstruction<T>::VALUE) { |
135 | r.hi = a * b; |
136 | r.lo = fputil::multiply_add(a, b, -r.hi); |
137 | } else { |
138 | // Dekker's Product. |
139 | NumberPair<T> as = split(a); |
140 | |
141 | r = exact_mult<T, SPLIT_B>(as, a, b); |
142 | } |
143 | |
144 | return r; |
145 | } |
146 | |
147 | LIBC_INLINE DoubleDouble quick_mult(double a, const DoubleDouble &b) { |
148 | DoubleDouble r = exact_mult(a, b.hi); |
149 | r.lo = multiply_add(a, b.lo, r.lo); |
150 | return r; |
151 | } |
152 | |
153 | template <size_t SPLIT_B = 27> |
154 | LIBC_INLINE DoubleDouble quick_mult(const DoubleDouble &a, |
155 | const DoubleDouble &b) { |
156 | DoubleDouble r = exact_mult<double, SPLIT_B>(a.hi, b.hi); |
157 | double t1 = multiply_add(a.hi, b.lo, r.lo); |
158 | double t2 = multiply_add(a.lo, b.hi, t1); |
159 | r.lo = t2; |
160 | return r; |
161 | } |
162 | |
163 | // Assuming |c| >= |a * b|. |
164 | template <> |
165 | LIBC_INLINE DoubleDouble multiply_add<DoubleDouble>(const DoubleDouble &a, |
166 | const DoubleDouble &b, |
167 | const DoubleDouble &c) { |
168 | return add(c, quick_mult(a, b)); |
169 | } |
170 | |
171 | // Accurate double-double division, following Karp-Markstein's trick for |
172 | // division, implemented in the CORE-MATH project at: |
173 | // https://gitlab.inria.fr/core-math/core-math/-/blob/master/src/binary64/tan/tan.c#L1855 |
174 | // |
175 | // Error bounds: |
176 | // Let a = ah + al, b = bh + bl. |
177 | // Let r = rh + rl be the approximation of (ah + al) / (bh + bl). |
178 | // Then: |
179 | // (ah + al) / (bh + bl) - rh = |
180 | // = ((ah - bh * rh) + (al - bl * rh)) / (bh + bl) |
181 | // = (1 + O(bl/bh)) * ((ah - bh * rh) + (al - bl * rh)) / bh |
182 | // Let q = round(1/bh), then the above expressions are approximately: |
183 | // = (1 + O(bl / bh)) * (1 + O(2^-52)) * q * ((ah - bh * rh) + (al - bl * rh)) |
184 | // So we can compute: |
185 | // rl = q * (ah - bh * rh) + q * (al - bl * rh) |
186 | // as accurate as possible, then the error is bounded by: |
187 | // |(ah + al) / (bh + bl) - (rh + rl)| < O(bl/bh) * (2^-52 + al/ah + bl/bh) |
188 | template <typename T> |
189 | LIBC_INLINE NumberPair<T> div(const NumberPair<T> &a, const NumberPair<T> &b) { |
190 | NumberPair<T> r; |
191 | T q = T(1) / b.hi; |
192 | r.hi = a.hi * q; |
193 | |
194 | #ifdef LIBC_TARGET_CPU_HAS_FMA |
195 | T e_hi = fputil::multiply_add(b.hi, -r.hi, a.hi); |
196 | T e_lo = fputil::multiply_add(b.lo, -r.hi, a.lo); |
197 | #else |
198 | NumberPair<T> b_hi_r_hi = fputil::exact_mult(b.hi, -r.hi); |
199 | NumberPair<T> b_lo_r_hi = fputil::exact_mult(b.lo, -r.hi); |
200 | T e_hi = (a.hi + b_hi_r_hi.hi) + b_hi_r_hi.lo; |
201 | T e_lo = (a.lo + b_lo_r_hi.hi) + b_lo_r_hi.lo; |
202 | #endif // LIBC_TARGET_CPU_HAS_FMA |
203 | |
204 | r.lo = q * (e_hi + e_lo); |
205 | return r; |
206 | } |
207 | |
208 | } // namespace fputil |
209 | } // namespace LIBC_NAMESPACE_DECL |
210 | |
211 | #endif // LLVM_LIBC_SRC___SUPPORT_FPUTIL_DOUBLE_DOUBLE_H |
212 |
Warning: This file is not a C or C++ file. It does not have highlighting.