1// Adapted from https://github.com/Alexhuszagh/rust-lexical.
2
3//! Estimate the error in an 80-bit approximation of a float.
4//!
5//! This estimates the error in a floating-point representation.
6//!
7//! This implementation is loosely based off the Golang implementation,
8//! found here: <https://golang.org/src/strconv/atof.go>
9
10use super::float::*;
11use super::num::*;
12use super::rounding::*;
13
14pub(crate) trait FloatErrors {
15 /// Get the full error scale.
16 fn error_scale() -> u32;
17 /// Get the half error scale.
18 fn error_halfscale() -> u32;
19 /// Determine if the number of errors is tolerable for float precision.
20 fn error_is_accurate<F: Float>(count: u32, fp: &ExtendedFloat) -> bool;
21}
22
23/// Check if the error is accurate with a round-nearest rounding scheme.
24#[inline]
25fn nearest_error_is_accurate(errors: u64, fp: &ExtendedFloat, extrabits: u64) -> bool {
26 // Round-to-nearest, need to use the halfway point.
27 if extrabits == 65 {
28 // Underflow, we have a shift larger than the mantissa.
29 // Representation is valid **only** if the value is close enough
30 // overflow to the next bit within errors. If it overflows,
31 // the representation is **not** valid.
32 !fp.mant.overflowing_add(errors).1
33 } else {
34 let mask: u64 = lower_n_mask(extrabits);
35 let extra: u64 = fp.mant & mask;
36
37 // Round-to-nearest, need to check if we're close to halfway.
38 // IE, b10100 | 100000, where `|` signifies the truncation point.
39 let halfway: u64 = lower_n_halfway(extrabits);
40 let cmp1 = halfway.wrapping_sub(errors) < extra;
41 let cmp2 = extra < halfway.wrapping_add(errors);
42
43 // If both comparisons are true, we have significant rounding error,
44 // and the value cannot be exactly represented. Otherwise, the
45 // representation is valid.
46 !(cmp1 && cmp2)
47 }
48}
49
50impl FloatErrors for u64 {
51 #[inline]
52 fn error_scale() -> u32 {
53 8
54 }
55
56 #[inline]
57 fn error_halfscale() -> u32 {
58 u64::error_scale() / 2
59 }
60
61 #[inline]
62 fn error_is_accurate<F: Float>(count: u32, fp: &ExtendedFloat) -> bool {
63 // Determine if extended-precision float is a good approximation.
64 // If the error has affected too many units, the float will be
65 // inaccurate, or if the representation is too close to halfway
66 // that any operations could affect this halfway representation.
67 // See the documentation for dtoa for more information.
68 let bias = -(F::EXPONENT_BIAS - F::MANTISSA_SIZE);
69 let denormal_exp = bias - 63;
70 // This is always a valid u32, since (denormal_exp - fp.exp)
71 // will always be positive and the significand size is {23, 52}.
72 let extrabits = if fp.exp <= denormal_exp {
73 64 - F::MANTISSA_SIZE + denormal_exp - fp.exp
74 } else {
75 63 - F::MANTISSA_SIZE
76 };
77
78 // Our logic is as follows: we want to determine if the actual
79 // mantissa and the errors during calculation differ significantly
80 // from the rounding point. The rounding point for round-nearest
81 // is the halfway point, IE, this when the truncated bits start
82 // with b1000..., while the rounding point for the round-toward
83 // is when the truncated bits are equal to 0.
84 // To do so, we can check whether the rounding point +/- the error
85 // are >/< the actual lower n bits.
86 //
87 // For whether we need to use signed or unsigned types for this
88 // analysis, see this example, using u8 rather than u64 to simplify
89 // things.
90 //
91 // # Comparisons
92 // cmp1 = (halfway - errors) < extra
93 // cmp1 = extra < (halfway + errors)
94 //
95 // # Large Extrabits, Low Errors
96 //
97 // extrabits = 8
98 // halfway = 0b10000000
99 // extra = 0b10000010
100 // errors = 0b00000100
101 // halfway - errors = 0b01111100
102 // halfway + errors = 0b10000100
103 //
104 // Unsigned:
105 // halfway - errors = 124
106 // halfway + errors = 132
107 // extra = 130
108 // cmp1 = true
109 // cmp2 = true
110 // Signed:
111 // halfway - errors = 124
112 // halfway + errors = -124
113 // extra = -126
114 // cmp1 = false
115 // cmp2 = true
116 //
117 // # Conclusion
118 //
119 // Since errors will always be small, and since we want to detect
120 // if the representation is accurate, we need to use an **unsigned**
121 // type for comparisons.
122
123 let extrabits = extrabits as u64;
124 let errors = count as u64;
125 if extrabits > 65 {
126 // Underflow, we have a literal 0.
127 return true;
128 }
129
130 nearest_error_is_accurate(errors, fp, extrabits)
131 }
132}
133