rounding.rs source code [crates/minimal-lexical/src/rounding.rs]

1	//! Defines rounding schemes for floating-point numbers.
2
3	#![doc(hidden)]
4
5	use crate::extended_float::ExtendedFloat;
6	use crate::mask::{lower_n_halfway, lower_n_mask};
7	use crate::num::Float;
8
9	// ROUNDING
10	// --------
11
12	/// Round an extended-precision float to the nearest machine float.
13	///
14	/// Shifts the significant digits into place, adjusts the exponent,
15	/// so it can be easily converted to a native float.
16	#[cfg_attr(not(feature = "compact"), inline)]
17	pub fn round<F, Cb>(fp: &mut ExtendedFloat, cb: Cb)
18	where
19	F: Float,
20	Cb: Fn(&mut ExtendedFloat, i32),
21	{
22	let fp_inf = ExtendedFloat {
23	mant: `0`,
24	exp: F::INFINITE_POWER,
25	};
26
27	// Calculate our shift in significant digits.
28	let mantissa_shift = `64` - F::MANTISSA_SIZE - `1`;
29
30	// Check for a denormal float, if after the shift the exponent is negative.
31	if -fp.exp >= mantissa_shift {
32	// Have a denormal float that isn't a literal 0.
33	// The extra 1 is to adjust for the denormal float, which is
34	// `1 - F::EXPONENT_BIAS`. This works as before, because our
35	// old logic rounded to `F::DENORMAL_EXPONENT` (now 1), and then
36	// checked if `exp == F::DENORMAL_EXPONENT` and no hidden mask
37	// bit was set. Here, we handle that here, rather than later.
38	//
39	// This might round-down to 0, but shift will be at max* 65,*
40	// for halfway cases rounding towards 0.
41	let shift = -fp.exp + `1`;
42	debug_assert!(shift <= `65`);
43	cb(fp, shift.min(`64`));
44	// Check for round-up: if rounding-nearest carried us to the hidden bit.
45	fp.exp = (fp.mant >= F::HIDDEN_BIT_MASK) as i32;
46	return;
47	}
48
49	// The float is normal, round to the hidden bit.
50	cb(fp, mantissa_shift);
51
52	// Check if we carried, and if so, shift the bit to the hidden bit.
53	let carry_mask = F::CARRY_MASK;
54	if fp.mant & carry_mask == carry_mask {
55	fp.mant >>= `1`;
56	fp.exp += `1`;
57	}
58
59	// Handle if we carried and check for overflow again.
60	if fp.exp >= F::INFINITE_POWER {
61	// Exponent is above largest normal value, must be infinite.
62	*fp = fp_inf;
63	return;
64	}
65
66	// Remove the hidden bit.
67	fp.mant &= F::MANTISSA_MASK;
68	}
69
70	/// Shift right N-bytes and round towards a direction.
71	///
72	/// Callback should take the following parameters:
73	/// 1. is_odd
74	/// 1. is_halfway
75	/// 1. is_above
76	#[cfg_attr(not(feature = "compact"), inline)]
77	pub fn round_nearest_tie_even<Cb>(fp: &mut ExtendedFloat, shift: i32, cb: Cb)
78	where
79	// is_odd, is_halfway, is_above
80	Cb: Fn(bool, bool, bool) -> bool,
81	{
82	// Ensure we've already handled denormal values that underflow.
83	debug_assert!(shift <= `64`);
84
85	// Extract the truncated bits using mask.
86	// Calculate if the value of the truncated bits are either above
87	// the mid-way point, or equal to it.
88	//
89	// For example, for 4 truncated bytes, the mask would be 0b1111
90	// and the midway point would be 0b1000.
91	let mask = lower_n_mask(shift as u64);
92	let halfway = lower_n_halfway(shift as u64);
93	let truncated_bits = fp.mant & mask;
94	let is_above = truncated_bits > halfway;
95	let is_halfway = truncated_bits == halfway;
96
97	// Bit shift so the leading bit is in the hidden bit.
98	// This optimixes pretty well:
99	// ```text
100	// mov ecx, esi
101	// shr rdi, cl
102	// xor eax, eax
103	// cmp esi, 64
104	// cmovne rax, rdi
105	// ret
106	// ```
107	fp.mant = match shift == `64` {
108	`true` => `0`,
109	`false` => fp.mant >> shift,
110	};
111	fp.exp += shift;
112
113	// Extract the last bit after shifting (and determine if it is odd).
114	let is_odd = fp.mant & `1` == `1`;
115
116	// Calculate if we need to roundup.
117	// We need to roundup if we are above halfway, or if we are odd
118	// and at half-way (need to tie-to-even). Avoid the branch here.
119	fp.mant += cb(is_odd, is_halfway, is_above) as u64;
120	}
121
122	/// Round our significant digits into place, truncating them.
123	#[cfg_attr(not(feature = "compact"), inline)]
124	pub fn round_down(fp: &mut ExtendedFloat, shift: i32) {
125	// Might have a shift greater than 64 if we have an error.
126	fp.mant = match shift == `64` {
127	`true` => `0`,
128	`false` => fp.mant >> shift,
129	};
130	fp.exp += shift;
131	}
132