1//! Defines rounding schemes for floating-point numbers.
2
3#![doc(hidden)]
4
5use crate::extended_float::ExtendedFloat;
6use crate::mask::{lower_n_halfway, lower_n_mask};
7use crate::num::Float;
8
9// ROUNDING
10// --------
11
12/// Round an extended-precision float to the nearest machine float.
13///
14/// Shifts the significant digits into place, adjusts the exponent,
15/// so it can be easily converted to a native float.
16#[cfg_attr(not(feature = "compact"), inline)]
17pub fn round<F, Cb>(fp: &mut ExtendedFloat, cb: Cb)
18where
19 F: Float,
20 Cb: Fn(&mut ExtendedFloat, i32),
21{
22 let fp_inf = ExtendedFloat {
23 mant: 0,
24 exp: F::INFINITE_POWER,
25 };
26
27 // Calculate our shift in significant digits.
28 let mantissa_shift = 64 - F::MANTISSA_SIZE - 1;
29
30 // Check for a denormal float, if after the shift the exponent is negative.
31 if -fp.exp >= mantissa_shift {
32 // Have a denormal float that isn't a literal 0.
33 // The extra 1 is to adjust for the denormal float, which is
34 // `1 - F::EXPONENT_BIAS`. This works as before, because our
35 // old logic rounded to `F::DENORMAL_EXPONENT` (now 1), and then
36 // checked if `exp == F::DENORMAL_EXPONENT` and no hidden mask
37 // bit was set. Here, we handle that here, rather than later.
38 //
39 // This might round-down to 0, but shift will be at **max** 65,
40 // for halfway cases rounding towards 0.
41 let shift = -fp.exp + 1;
42 debug_assert!(shift <= 65);
43 cb(fp, shift.min(64));
44 // Check for round-up: if rounding-nearest carried us to the hidden bit.
45 fp.exp = (fp.mant >= F::HIDDEN_BIT_MASK) as i32;
46 return;
47 }
48
49 // The float is normal, round to the hidden bit.
50 cb(fp, mantissa_shift);
51
52 // Check if we carried, and if so, shift the bit to the hidden bit.
53 let carry_mask = F::CARRY_MASK;
54 if fp.mant & carry_mask == carry_mask {
55 fp.mant >>= 1;
56 fp.exp += 1;
57 }
58
59 // Handle if we carried and check for overflow again.
60 if fp.exp >= F::INFINITE_POWER {
61 // Exponent is above largest normal value, must be infinite.
62 *fp = fp_inf;
63 return;
64 }
65
66 // Remove the hidden bit.
67 fp.mant &= F::MANTISSA_MASK;
68}
69
70/// Shift right N-bytes and round towards a direction.
71///
72/// Callback should take the following parameters:
73/// 1. is_odd
74/// 1. is_halfway
75/// 1. is_above
76#[cfg_attr(not(feature = "compact"), inline)]
77pub fn round_nearest_tie_even<Cb>(fp: &mut ExtendedFloat, shift: i32, cb: Cb)
78where
79 // is_odd, is_halfway, is_above
80 Cb: Fn(bool, bool, bool) -> bool,
81{
82 // Ensure we've already handled denormal values that underflow.
83 debug_assert!(shift <= 64);
84
85 // Extract the truncated bits using mask.
86 // Calculate if the value of the truncated bits are either above
87 // the mid-way point, or equal to it.
88 //
89 // For example, for 4 truncated bytes, the mask would be 0b1111
90 // and the midway point would be 0b1000.
91 let mask = lower_n_mask(shift as u64);
92 let halfway = lower_n_halfway(shift as u64);
93 let truncated_bits = fp.mant & mask;
94 let is_above = truncated_bits > halfway;
95 let is_halfway = truncated_bits == halfway;
96
97 // Bit shift so the leading bit is in the hidden bit.
98 // This optimixes pretty well:
99 // ```text
100 // mov ecx, esi
101 // shr rdi, cl
102 // xor eax, eax
103 // cmp esi, 64
104 // cmovne rax, rdi
105 // ret
106 // ```
107 fp.mant = match shift == 64 {
108 true => 0,
109 false => fp.mant >> shift,
110 };
111 fp.exp += shift;
112
113 // Extract the last bit after shifting (and determine if it is odd).
114 let is_odd = fp.mant & 1 == 1;
115
116 // Calculate if we need to roundup.
117 // We need to roundup if we are above halfway, or if we are odd
118 // and at half-way (need to tie-to-even). Avoid the branch here.
119 fp.mant += cb(is_odd, is_halfway, is_above) as u64;
120}
121
122/// Round our significant digits into place, truncating them.
123#[cfg_attr(not(feature = "compact"), inline)]
124pub fn round_down(fp: &mut ExtendedFloat, shift: i32) {
125 // Might have a shift greater than 64 if we have an error.
126 fp.mant = match shift == 64 {
127 true => 0,
128 false => fp.mant >> shift,
129 };
130 fp.exp += shift;
131}
132