1use super::super::{CastFrom, CastInto, Float, IntTy, MinInt};
2
3/// Scale the exponent.
4///
5/// From N3220:
6///
7/// > The scalbn and scalbln functions compute `x * b^n`, where `b = FLT_RADIX` if the return type
8/// > of the function is a standard floating type, or `b = 10` if the return type of the function
9/// > is a decimal floating type. A range error occurs for some finite x, depending on n.
10/// >
11/// > [...]
12/// >
13/// > * `scalbn(±0, n)` returns `±0`.
14/// > * `scalbn(x, 0)` returns `x`.
15/// > * `scalbn(±∞, n)` returns `±∞`.
16/// >
17/// > If the calculation does not overflow or underflow, the returned value is exact and
18/// > independent of the current rounding direction mode.
19pub fn scalbn<F: Float>(mut x: F, mut n: i32) -> F
20where
21 u32: CastInto<F::Int>,
22 F::Int: CastFrom<i32>,
23 F::Int: CastFrom<u32>,
24{
25 let zero = IntTy::<F>::ZERO;
26
27 // Bits including the implicit bit
28 let sig_total_bits = F::SIG_BITS + 1;
29
30 // Maximum and minimum values when biased
31 let exp_max = F::EXP_MAX;
32 let exp_min = F::EXP_MIN;
33
34 // 2 ^ Emax, maximum positive with null significand (0x1p1023 for f64)
35 let f_exp_max = F::from_parts(false, F::EXP_BIAS << 1, zero);
36
37 // 2 ^ Emin, minimum positive normal with null significand (0x1p-1022 for f64)
38 let f_exp_min = F::from_parts(false, 1, zero);
39
40 // 2 ^ sig_total_bits, moltiplier to normalize subnormals (0x1p53 for f64)
41 let f_pow_subnorm = F::from_parts(false, sig_total_bits + F::EXP_BIAS, zero);
42
43 /*
44 * The goal is to multiply `x` by a scale factor that applies `n`. However, there are cases
45 * where `2^n` is not representable by `F` but the result should be, e.g. `x = 2^Emin` with
46 * `n = -EMin + 2` (one out of range of 2^Emax). To get around this, reduce the magnitude of
47 * the final scale operation by prescaling by the max/min power representable by `F`.
48 */
49
50 if n > exp_max {
51 // Worse case positive `n`: `x` is the minimum subnormal value, the result is `F::MAX`.
52 // This can be reached by three scaling multiplications (two here and one final).
53 debug_assert!(-exp_min + F::SIG_BITS as i32 + exp_max <= exp_max * 3);
54
55 x *= f_exp_max;
56 n -= exp_max;
57 if n > exp_max {
58 x *= f_exp_max;
59 n -= exp_max;
60 if n > exp_max {
61 n = exp_max;
62 }
63 }
64 } else if n < exp_min {
65 // When scaling toward 0, the prescaling is limited to a value that does not allow `x` to
66 // go subnormal. This avoids double rounding.
67 if F::BITS > 16 {
68 // `mul` s.t. `!(x * mul).is_subnormal() ∀ x`
69 let mul = f_exp_min * f_pow_subnorm;
70 let add = -exp_min - sig_total_bits as i32;
71
72 // Worse case negative `n`: `x` is the maximum positive value, the result is `F::MIN`.
73 // This must be reachable by three scaling multiplications (two here and one final).
74 debug_assert!(-exp_min + F::SIG_BITS as i32 + exp_max <= add * 2 + -exp_min);
75
76 x *= mul;
77 n += add;
78
79 if n < exp_min {
80 x *= mul;
81 n += add;
82
83 if n < exp_min {
84 n = exp_min;
85 }
86 }
87 } else {
88 // `f16` is unique compared to other float types in that the difference between the
89 // minimum exponent and the significand bits (`add = -exp_min - sig_total_bits`) is
90 // small, only three. The above method depend on decrementing `n` by `add` two times;
91 // for other float types this works out because `add` is a substantial fraction of
92 // the exponent range. For `f16`, however, 3 is relatively small compared to the
93 // exponent range (which is 39), so that requires ~10 prescale rounds rather than two.
94 //
95 // Work aroudn this by using a different algorithm that calculates the prescale
96 // dynamically based on the maximum possible value. This adds more operations per round
97 // since it needs to construct the scale, but works better in the general case.
98 let add = -(n + sig_total_bits as i32).clamp(exp_min, sig_total_bits as i32);
99 let mul = F::from_parts(false, (F::EXP_BIAS as i32 - add) as u32, zero);
100
101 x *= mul;
102 n += add;
103
104 if n < exp_min {
105 let add = -(n + sig_total_bits as i32).clamp(exp_min, sig_total_bits as i32);
106 let mul = F::from_parts(false, (F::EXP_BIAS as i32 - add) as u32, zero);
107
108 x *= mul;
109 n += add;
110
111 if n < exp_min {
112 n = exp_min;
113 }
114 }
115 }
116 }
117
118 let scale = F::from_parts(false, (F::EXP_BIAS as i32 + n) as u32, zero);
119 x * scale
120}
121
122#[cfg(test)]
123mod tests {
124 use super::super::super::Int;
125 use super::*;
126
127 // Tests against N3220
128 fn spec_test<F: Float>()
129 where
130 u32: CastInto<F::Int>,
131 F::Int: CastFrom<i32>,
132 F::Int: CastFrom<u32>,
133 {
134 // `scalbn(±0, n)` returns `±0`.
135 assert_biteq!(scalbn(F::NEG_ZERO, 10), F::NEG_ZERO);
136 assert_biteq!(scalbn(F::NEG_ZERO, 0), F::NEG_ZERO);
137 assert_biteq!(scalbn(F::NEG_ZERO, -10), F::NEG_ZERO);
138 assert_biteq!(scalbn(F::ZERO, 10), F::ZERO);
139 assert_biteq!(scalbn(F::ZERO, 0), F::ZERO);
140 assert_biteq!(scalbn(F::ZERO, -10), F::ZERO);
141
142 // `scalbn(x, 0)` returns `x`.
143 assert_biteq!(scalbn(F::MIN, 0), F::MIN);
144 assert_biteq!(scalbn(F::MAX, 0), F::MAX);
145 assert_biteq!(scalbn(F::INFINITY, 0), F::INFINITY);
146 assert_biteq!(scalbn(F::NEG_INFINITY, 0), F::NEG_INFINITY);
147 assert_biteq!(scalbn(F::ZERO, 0), F::ZERO);
148 assert_biteq!(scalbn(F::NEG_ZERO, 0), F::NEG_ZERO);
149
150 // `scalbn(±∞, n)` returns `±∞`.
151 assert_biteq!(scalbn(F::INFINITY, 10), F::INFINITY);
152 assert_biteq!(scalbn(F::INFINITY, -10), F::INFINITY);
153 assert_biteq!(scalbn(F::NEG_INFINITY, 10), F::NEG_INFINITY);
154 assert_biteq!(scalbn(F::NEG_INFINITY, -10), F::NEG_INFINITY);
155
156 // NaN should remain NaNs.
157 assert!(scalbn(F::NAN, 10).is_nan());
158 assert!(scalbn(F::NAN, 0).is_nan());
159 assert!(scalbn(F::NAN, -10).is_nan());
160 assert!(scalbn(-F::NAN, 10).is_nan());
161 assert!(scalbn(-F::NAN, 0).is_nan());
162 assert!(scalbn(-F::NAN, -10).is_nan());
163 }
164
165 #[test]
166 #[cfg(f16_enabled)]
167 fn spec_test_f16() {
168 spec_test::<f16>();
169 }
170
171 #[test]
172 fn spec_test_f32() {
173 spec_test::<f32>();
174 }
175
176 #[test]
177 fn spec_test_f64() {
178 spec_test::<f64>();
179 }
180
181 #[test]
182 #[cfg(f128_enabled)]
183 fn spec_test_f128() {
184 spec_test::<f128>();
185 }
186}
187