1 | //! Defines rounding schemes for floating-point numbers. |
2 | |
3 | #![doc (hidden)] |
4 | |
5 | use crate::extended_float::ExtendedFloat; |
6 | use crate::mask::{lower_n_halfway, lower_n_mask}; |
7 | use crate::num::Float; |
8 | |
9 | // ROUNDING |
10 | // -------- |
11 | |
12 | /// Round an extended-precision float to the nearest machine float. |
13 | /// |
14 | /// Shifts the significant digits into place, adjusts the exponent, |
15 | /// so it can be easily converted to a native float. |
16 | #[cfg_attr (not(feature = "compact" ), inline)] |
17 | pub fn round<F, Cb>(fp: &mut ExtendedFloat, cb: Cb) |
18 | where |
19 | F: Float, |
20 | Cb: Fn(&mut ExtendedFloat, i32), |
21 | { |
22 | let fp_inf = ExtendedFloat { |
23 | mant: 0, |
24 | exp: F::INFINITE_POWER, |
25 | }; |
26 | |
27 | // Calculate our shift in significant digits. |
28 | let mantissa_shift = 64 - F::MANTISSA_SIZE - 1; |
29 | |
30 | // Check for a denormal float, if after the shift the exponent is negative. |
31 | if -fp.exp >= mantissa_shift { |
32 | // Have a denormal float that isn't a literal 0. |
33 | // The extra 1 is to adjust for the denormal float, which is |
34 | // `1 - F::EXPONENT_BIAS`. This works as before, because our |
35 | // old logic rounded to `F::DENORMAL_EXPONENT` (now 1), and then |
36 | // checked if `exp == F::DENORMAL_EXPONENT` and no hidden mask |
37 | // bit was set. Here, we handle that here, rather than later. |
38 | // |
39 | // This might round-down to 0, but shift will be at **max** 65, |
40 | // for halfway cases rounding towards 0. |
41 | let shift = -fp.exp + 1; |
42 | debug_assert!(shift <= 65); |
43 | cb(fp, shift.min(64)); |
44 | // Check for round-up: if rounding-nearest carried us to the hidden bit. |
45 | fp.exp = (fp.mant >= F::HIDDEN_BIT_MASK) as i32; |
46 | return; |
47 | } |
48 | |
49 | // The float is normal, round to the hidden bit. |
50 | cb(fp, mantissa_shift); |
51 | |
52 | // Check if we carried, and if so, shift the bit to the hidden bit. |
53 | let carry_mask = F::CARRY_MASK; |
54 | if fp.mant & carry_mask == carry_mask { |
55 | fp.mant >>= 1; |
56 | fp.exp += 1; |
57 | } |
58 | |
59 | // Handle if we carried and check for overflow again. |
60 | if fp.exp >= F::INFINITE_POWER { |
61 | // Exponent is above largest normal value, must be infinite. |
62 | *fp = fp_inf; |
63 | return; |
64 | } |
65 | |
66 | // Remove the hidden bit. |
67 | fp.mant &= F::MANTISSA_MASK; |
68 | } |
69 | |
70 | /// Shift right N-bytes and round towards a direction. |
71 | /// |
72 | /// Callback should take the following parameters: |
73 | /// 1. is_odd |
74 | /// 1. is_halfway |
75 | /// 1. is_above |
76 | #[cfg_attr (not(feature = "compact" ), inline)] |
77 | pub fn round_nearest_tie_even<Cb>(fp: &mut ExtendedFloat, shift: i32, cb: Cb) |
78 | where |
79 | // is_odd, is_halfway, is_above |
80 | Cb: Fn(bool, bool, bool) -> bool, |
81 | { |
82 | // Ensure we've already handled denormal values that underflow. |
83 | debug_assert!(shift <= 64); |
84 | |
85 | // Extract the truncated bits using mask. |
86 | // Calculate if the value of the truncated bits are either above |
87 | // the mid-way point, or equal to it. |
88 | // |
89 | // For example, for 4 truncated bytes, the mask would be 0b1111 |
90 | // and the midway point would be 0b1000. |
91 | let mask = lower_n_mask(shift as u64); |
92 | let halfway = lower_n_halfway(shift as u64); |
93 | let truncated_bits = fp.mant & mask; |
94 | let is_above = truncated_bits > halfway; |
95 | let is_halfway = truncated_bits == halfway; |
96 | |
97 | // Bit shift so the leading bit is in the hidden bit. |
98 | // This optimixes pretty well: |
99 | // ```text |
100 | // mov ecx, esi |
101 | // shr rdi, cl |
102 | // xor eax, eax |
103 | // cmp esi, 64 |
104 | // cmovne rax, rdi |
105 | // ret |
106 | // ``` |
107 | fp.mant = match shift == 64 { |
108 | true => 0, |
109 | false => fp.mant >> shift, |
110 | }; |
111 | fp.exp += shift; |
112 | |
113 | // Extract the last bit after shifting (and determine if it is odd). |
114 | let is_odd = fp.mant & 1 == 1; |
115 | |
116 | // Calculate if we need to roundup. |
117 | // We need to roundup if we are above halfway, or if we are odd |
118 | // and at half-way (need to tie-to-even). Avoid the branch here. |
119 | fp.mant += cb(is_odd, is_halfway, is_above) as u64; |
120 | } |
121 | |
122 | /// Round our significant digits into place, truncating them. |
123 | #[cfg_attr (not(feature = "compact" ), inline)] |
124 | pub fn round_down(fp: &mut ExtendedFloat, shift: i32) { |
125 | // Might have a shift greater than 64 if we have an error. |
126 | fp.mant = match shift == 64 { |
127 | true => 0, |
128 | false => fp.mant >> shift, |
129 | }; |
130 | fp.exp += shift; |
131 | } |
132 | |