1 | // Adapted from https://github.com/Alexhuszagh/rust-lexical. |
2 | |
3 | //! Estimate the error in an 80-bit approximation of a float. |
4 | //! |
5 | //! This estimates the error in a floating-point representation. |
6 | //! |
7 | //! This implementation is loosely based off the Golang implementation, |
8 | //! found here: <https://golang.org/src/strconv/atof.go> |
9 | |
10 | use super::float::*; |
11 | use super::num::*; |
12 | use super::rounding::*; |
13 | |
14 | pub(crate) trait FloatErrors { |
15 | /// Get the full error scale. |
16 | fn error_scale() -> u32; |
17 | /// Get the half error scale. |
18 | fn error_halfscale() -> u32; |
19 | /// Determine if the number of errors is tolerable for float precision. |
20 | fn error_is_accurate<F: Float>(count: u32, fp: &ExtendedFloat) -> bool; |
21 | } |
22 | |
23 | /// Check if the error is accurate with a round-nearest rounding scheme. |
24 | #[inline ] |
25 | fn nearest_error_is_accurate(errors: u64, fp: &ExtendedFloat, extrabits: u64) -> bool { |
26 | // Round-to-nearest, need to use the halfway point. |
27 | if extrabits == 65 { |
28 | // Underflow, we have a shift larger than the mantissa. |
29 | // Representation is valid **only** if the value is close enough |
30 | // overflow to the next bit within errors. If it overflows, |
31 | // the representation is **not** valid. |
32 | !fp.mant.overflowing_add(errors).1 |
33 | } else { |
34 | let mask: u64 = lower_n_mask(extrabits); |
35 | let extra: u64 = fp.mant & mask; |
36 | |
37 | // Round-to-nearest, need to check if we're close to halfway. |
38 | // IE, b10100 | 100000, where `|` signifies the truncation point. |
39 | let halfway: u64 = lower_n_halfway(extrabits); |
40 | let cmp1 = halfway.wrapping_sub(errors) < extra; |
41 | let cmp2 = extra < halfway.wrapping_add(errors); |
42 | |
43 | // If both comparisons are true, we have significant rounding error, |
44 | // and the value cannot be exactly represented. Otherwise, the |
45 | // representation is valid. |
46 | !(cmp1 && cmp2) |
47 | } |
48 | } |
49 | |
50 | impl FloatErrors for u64 { |
51 | #[inline ] |
52 | fn error_scale() -> u32 { |
53 | 8 |
54 | } |
55 | |
56 | #[inline ] |
57 | fn error_halfscale() -> u32 { |
58 | u64::error_scale() / 2 |
59 | } |
60 | |
61 | #[inline ] |
62 | fn error_is_accurate<F: Float>(count: u32, fp: &ExtendedFloat) -> bool { |
63 | // Determine if extended-precision float is a good approximation. |
64 | // If the error has affected too many units, the float will be |
65 | // inaccurate, or if the representation is too close to halfway |
66 | // that any operations could affect this halfway representation. |
67 | // See the documentation for dtoa for more information. |
68 | let bias = -(F::EXPONENT_BIAS - F::MANTISSA_SIZE); |
69 | let denormal_exp = bias - 63; |
70 | // This is always a valid u32, since (denormal_exp - fp.exp) |
71 | // will always be positive and the significand size is {23, 52}. |
72 | let extrabits = if fp.exp <= denormal_exp { |
73 | 64 - F::MANTISSA_SIZE + denormal_exp - fp.exp |
74 | } else { |
75 | 63 - F::MANTISSA_SIZE |
76 | }; |
77 | |
78 | // Our logic is as follows: we want to determine if the actual |
79 | // mantissa and the errors during calculation differ significantly |
80 | // from the rounding point. The rounding point for round-nearest |
81 | // is the halfway point, IE, this when the truncated bits start |
82 | // with b1000..., while the rounding point for the round-toward |
83 | // is when the truncated bits are equal to 0. |
84 | // To do so, we can check whether the rounding point +/- the error |
85 | // are >/< the actual lower n bits. |
86 | // |
87 | // For whether we need to use signed or unsigned types for this |
88 | // analysis, see this example, using u8 rather than u64 to simplify |
89 | // things. |
90 | // |
91 | // # Comparisons |
92 | // cmp1 = (halfway - errors) < extra |
93 | // cmp1 = extra < (halfway + errors) |
94 | // |
95 | // # Large Extrabits, Low Errors |
96 | // |
97 | // extrabits = 8 |
98 | // halfway = 0b10000000 |
99 | // extra = 0b10000010 |
100 | // errors = 0b00000100 |
101 | // halfway - errors = 0b01111100 |
102 | // halfway + errors = 0b10000100 |
103 | // |
104 | // Unsigned: |
105 | // halfway - errors = 124 |
106 | // halfway + errors = 132 |
107 | // extra = 130 |
108 | // cmp1 = true |
109 | // cmp2 = true |
110 | // Signed: |
111 | // halfway - errors = 124 |
112 | // halfway + errors = -124 |
113 | // extra = -126 |
114 | // cmp1 = false |
115 | // cmp2 = true |
116 | // |
117 | // # Conclusion |
118 | // |
119 | // Since errors will always be small, and since we want to detect |
120 | // if the representation is accurate, we need to use an **unsigned** |
121 | // type for comparisons. |
122 | |
123 | let extrabits = extrabits as u64; |
124 | let errors = count as u64; |
125 | if extrabits > 65 { |
126 | // Underflow, we have a literal 0. |
127 | return true; |
128 | } |
129 | |
130 | nearest_error_is_accurate(errors, fp, extrabits) |
131 | } |
132 | } |
133 | |