| 1 | // Adapted from https://github.com/Alexhuszagh/rust-lexical. |
| 2 | |
| 3 | //! Estimate the error in an 80-bit approximation of a float. |
| 4 | //! |
| 5 | //! This estimates the error in a floating-point representation. |
| 6 | //! |
| 7 | //! This implementation is loosely based off the Golang implementation, |
| 8 | //! found here: <https://golang.org/src/strconv/atof.go> |
| 9 | |
| 10 | use super::float::*; |
| 11 | use super::num::*; |
| 12 | use super::rounding::*; |
| 13 | |
| 14 | pub(crate) trait FloatErrors { |
| 15 | /// Get the full error scale. |
| 16 | fn error_scale() -> u32; |
| 17 | /// Get the half error scale. |
| 18 | fn error_halfscale() -> u32; |
| 19 | /// Determine if the number of errors is tolerable for float precision. |
| 20 | fn error_is_accurate<F: Float>(count: u32, fp: &ExtendedFloat) -> bool; |
| 21 | } |
| 22 | |
| 23 | /// Check if the error is accurate with a round-nearest rounding scheme. |
| 24 | #[inline ] |
| 25 | fn nearest_error_is_accurate(errors: u64, fp: &ExtendedFloat, extrabits: u64) -> bool { |
| 26 | // Round-to-nearest, need to use the halfway point. |
| 27 | if extrabits == 65 { |
| 28 | // Underflow, we have a shift larger than the mantissa. |
| 29 | // Representation is valid **only** if the value is close enough |
| 30 | // overflow to the next bit within errors. If it overflows, |
| 31 | // the representation is **not** valid. |
| 32 | !fp.mant.overflowing_add(errors).1 |
| 33 | } else { |
| 34 | let mask: u64 = lower_n_mask(extrabits); |
| 35 | let extra: u64 = fp.mant & mask; |
| 36 | |
| 37 | // Round-to-nearest, need to check if we're close to halfway. |
| 38 | // IE, b10100 | 100000, where `|` signifies the truncation point. |
| 39 | let halfway: u64 = lower_n_halfway(extrabits); |
| 40 | let cmp1 = halfway.wrapping_sub(errors) < extra; |
| 41 | let cmp2 = extra < halfway.wrapping_add(errors); |
| 42 | |
| 43 | // If both comparisons are true, we have significant rounding error, |
| 44 | // and the value cannot be exactly represented. Otherwise, the |
| 45 | // representation is valid. |
| 46 | !(cmp1 && cmp2) |
| 47 | } |
| 48 | } |
| 49 | |
| 50 | impl FloatErrors for u64 { |
| 51 | #[inline ] |
| 52 | fn error_scale() -> u32 { |
| 53 | 8 |
| 54 | } |
| 55 | |
| 56 | #[inline ] |
| 57 | fn error_halfscale() -> u32 { |
| 58 | u64::error_scale() / 2 |
| 59 | } |
| 60 | |
| 61 | #[inline ] |
| 62 | fn error_is_accurate<F: Float>(count: u32, fp: &ExtendedFloat) -> bool { |
| 63 | // Determine if extended-precision float is a good approximation. |
| 64 | // If the error has affected too many units, the float will be |
| 65 | // inaccurate, or if the representation is too close to halfway |
| 66 | // that any operations could affect this halfway representation. |
| 67 | // See the documentation for dtoa for more information. |
| 68 | let bias = -(F::EXPONENT_BIAS - F::MANTISSA_SIZE); |
| 69 | let denormal_exp = bias - 63; |
| 70 | // This is always a valid u32, since (denormal_exp - fp.exp) |
| 71 | // will always be positive and the significand size is {23, 52}. |
| 72 | let extrabits = if fp.exp <= denormal_exp { |
| 73 | 64 - F::MANTISSA_SIZE + denormal_exp - fp.exp |
| 74 | } else { |
| 75 | 63 - F::MANTISSA_SIZE |
| 76 | }; |
| 77 | |
| 78 | // Our logic is as follows: we want to determine if the actual |
| 79 | // mantissa and the errors during calculation differ significantly |
| 80 | // from the rounding point. The rounding point for round-nearest |
| 81 | // is the halfway point, IE, this when the truncated bits start |
| 82 | // with b1000..., while the rounding point for the round-toward |
| 83 | // is when the truncated bits are equal to 0. |
| 84 | // To do so, we can check whether the rounding point +/- the error |
| 85 | // are >/< the actual lower n bits. |
| 86 | // |
| 87 | // For whether we need to use signed or unsigned types for this |
| 88 | // analysis, see this example, using u8 rather than u64 to simplify |
| 89 | // things. |
| 90 | // |
| 91 | // # Comparisons |
| 92 | // cmp1 = (halfway - errors) < extra |
| 93 | // cmp1 = extra < (halfway + errors) |
| 94 | // |
| 95 | // # Large Extrabits, Low Errors |
| 96 | // |
| 97 | // extrabits = 8 |
| 98 | // halfway = 0b10000000 |
| 99 | // extra = 0b10000010 |
| 100 | // errors = 0b00000100 |
| 101 | // halfway - errors = 0b01111100 |
| 102 | // halfway + errors = 0b10000100 |
| 103 | // |
| 104 | // Unsigned: |
| 105 | // halfway - errors = 124 |
| 106 | // halfway + errors = 132 |
| 107 | // extra = 130 |
| 108 | // cmp1 = true |
| 109 | // cmp2 = true |
| 110 | // Signed: |
| 111 | // halfway - errors = 124 |
| 112 | // halfway + errors = -124 |
| 113 | // extra = -126 |
| 114 | // cmp1 = false |
| 115 | // cmp2 = true |
| 116 | // |
| 117 | // # Conclusion |
| 118 | // |
| 119 | // Since errors will always be small, and since we want to detect |
| 120 | // if the representation is accurate, we need to use an **unsigned** |
| 121 | // type for comparisons. |
| 122 | |
| 123 | let extrabits = extrabits as u64; |
| 124 | let errors = count as u64; |
| 125 | if extrabits > 65 { |
| 126 | // Underflow, we have a literal 0. |
| 127 | return true; |
| 128 | } |
| 129 | |
| 130 | nearest_error_is_accurate(errors, fp, extrabits) |
| 131 | } |
| 132 | } |
| 133 | |