scalbn.rs source code [crates/libm/src/math/generic/scalbn.rs]

1	use crate::support::{CastFrom, CastInto, Float, IntTy, MinInt};
2
3	/// Scale the exponent.
4	///
5	/// From N3220:
6	///
7	/// > The scalbn and scalbln functions compute `x b^n`, where `b = FLT_RADIX` if the return type*
8	/// > of the function is a standard floating type, or `b = 10` if the return type of the function
9	/// > is a decimal floating type. A range error occurs for some finite x, depending on n.
10	/// >
11	/// > [...]
12	/// >
13	/// > `scalbn(±0, n)` returns `±0`.*
14	/// > `scalbn(x, 0)` returns `x`.*
15	/// > `scalbn(±∞, n)` returns `±∞`.*
16	/// >
17	/// > If the calculation does not overflow or underflow, the returned value is exact and
18	/// > independent of the current rounding direction mode.
19	#[inline]
20	pub fn scalbn<F: Float>(mut x: F, mut n: i32) -> F
21	where
22	u32: CastInto<F::Int>,
23	F::Int: CastFrom<i32>,
24	F::Int: CastFrom<u32>,
25	{
26	let zero = IntTy::<F>::ZERO;
27
28	// Bits including the implicit bit
29	let sig_total_bits = F::SIG_BITS + `1`;
30
31	// Maximum and minimum values when biased
32	let exp_max = F::EXP_MAX;
33	let exp_min = F::EXP_MIN;
34
35	// 2 ^ Emax, maximum positive with null significand (0x1p1023 for f64)
36	let f_exp_max = F::from_parts(`false`, F::EXP_BIAS << `1`, zero);
37
38	// 2 ^ Emin, minimum positive normal with null significand (0x1p-1022 for f64)
39	let f_exp_min = F::from_parts(`false`, `1`, zero);
40
41	// 2 ^ sig_total_bits, moltiplier to normalize subnormals (0x1p53 for f64)
42	let f_pow_subnorm = F::from_parts(`false`, sig_total_bits + F::EXP_BIAS, zero);
43
44	/*
45	* The goal is to multiply `x` by a scale factor that applies `n`. However, there are cases
46	* where `2^n` is not representable by `F` but the result should be, e.g. `x = 2^Emin` with
47	* `n = -EMin + 2` (one out of range of 2^Emax). To get around this, reduce the magnitude of
48	* the final scale operation by prescaling by the max/min power representable by `F`.
49	*/
50
51	if n > exp_max {
52	// Worse case positive `n`: `x` is the minimum subnormal value, the result is `F::MAX`.
53	// This can be reached by three scaling multiplications (two here and one final).
54	debug_assert!(-exp_min + F::SIG_BITS as i32 + exp_max <= exp_max * `3`);
55
56	x *= f_exp_max;
57	n -= exp_max;
58	if n > exp_max {
59	x *= f_exp_max;
60	n -= exp_max;
61	if n > exp_max {
62	n = exp_max;
63	}
64	}
65	} else if n < exp_min {
66	// When scaling toward 0, the prescaling is limited to a value that does not allow `x` to
67	// go subnormal. This avoids double rounding.
68	if F::BITS > `16` {
69	// `mul` s.t. `!(x mul).is_subnormal() ∀ x`*
70	let mul = f_exp_min * f_pow_subnorm;
71	let add = -exp_min - sig_total_bits as i32;
72
73	// Worse case negative `n`: `x` is the maximum positive value, the result is `F::MIN`.
74	// This must be reachable by three scaling multiplications (two here and one final).
75	debug_assert!(-exp_min + F::SIG_BITS as i32 + exp_max <= add * `2` + -exp_min);
76
77	x *= mul;
78	n += add;
79
80	if n < exp_min {
81	x *= mul;
82	n += add;
83
84	if n < exp_min {
85	n = exp_min;
86	}
87	}
88	} else {
89	// `f16` is unique compared to other float types in that the difference between the
90	// minimum exponent and the significand bits (`add = -exp_min - sig_total_bits`) is
91	// small, only three. The above method depend on decrementing `n` by `add` two times;
92	// for other float types this works out because `add` is a substantial fraction of
93	// the exponent range. For `f16`, however, 3 is relatively small compared to the
94	// exponent range (which is 39), so that requires ~10 prescale rounds rather than two.
95	//
96	// Work aroudn this by using a different algorithm that calculates the prescale
97	// dynamically based on the maximum possible value. This adds more operations per round
98	// since it needs to construct the scale, but works better in the general case.
99	let add = -(n + sig_total_bits as i32).clamp(exp_min, sig_total_bits as i32);
100	let mul = F::from_parts(`false`, (F::EXP_BIAS as i32 - add) as u32, zero);
101
102	x *= mul;
103	n += add;
104
105	if n < exp_min {
106	let add = -(n + sig_total_bits as i32).clamp(exp_min, sig_total_bits as i32);
107	let mul = F::from_parts(`false`, (F::EXP_BIAS as i32 - add) as u32, zero);
108
109	x *= mul;
110	n += add;
111
112	if n < exp_min {
113	n = exp_min;
114	}
115	}
116	}
117	}
118
119	let scale = F::from_parts(`false`, (F::EXP_BIAS as i32 + n) as u32, zero);
120	x * scale
121	}
122