1 | use super::super::{CastFrom, CastInto, Float, IntTy, MinInt}; |
2 | |
3 | /// Scale the exponent. |
4 | /// |
5 | /// From N3220: |
6 | /// |
7 | /// > The scalbn and scalbln functions compute `x * b^n`, where `b = FLT_RADIX` if the return type |
8 | /// > of the function is a standard floating type, or `b = 10` if the return type of the function |
9 | /// > is a decimal floating type. A range error occurs for some finite x, depending on n. |
10 | /// > |
11 | /// > [...] |
12 | /// > |
13 | /// > * `scalbn(±0, n)` returns `±0`. |
14 | /// > * `scalbn(x, 0)` returns `x`. |
15 | /// > * `scalbn(±∞, n)` returns `±∞`. |
16 | /// > |
17 | /// > If the calculation does not overflow or underflow, the returned value is exact and |
18 | /// > independent of the current rounding direction mode. |
19 | pub fn scalbn<F: Float>(mut x: F, mut n: i32) -> F |
20 | where |
21 | u32: CastInto<F::Int>, |
22 | F::Int: CastFrom<i32>, |
23 | F::Int: CastFrom<u32>, |
24 | { |
25 | let zero = IntTy::<F>::ZERO; |
26 | |
27 | // Bits including the implicit bit |
28 | let sig_total_bits = F::SIG_BITS + 1; |
29 | |
30 | // Maximum and minimum values when biased |
31 | let exp_max = F::EXP_MAX; |
32 | let exp_min = F::EXP_MIN; |
33 | |
34 | // 2 ^ Emax, maximum positive with null significand (0x1p1023 for f64) |
35 | let f_exp_max = F::from_parts(false, F::EXP_BIAS << 1, zero); |
36 | |
37 | // 2 ^ Emin, minimum positive normal with null significand (0x1p-1022 for f64) |
38 | let f_exp_min = F::from_parts(false, 1, zero); |
39 | |
40 | // 2 ^ sig_total_bits, moltiplier to normalize subnormals (0x1p53 for f64) |
41 | let f_pow_subnorm = F::from_parts(false, sig_total_bits + F::EXP_BIAS, zero); |
42 | |
43 | /* |
44 | * The goal is to multiply `x` by a scale factor that applies `n`. However, there are cases |
45 | * where `2^n` is not representable by `F` but the result should be, e.g. `x = 2^Emin` with |
46 | * `n = -EMin + 2` (one out of range of 2^Emax). To get around this, reduce the magnitude of |
47 | * the final scale operation by prescaling by the max/min power representable by `F`. |
48 | */ |
49 | |
50 | if n > exp_max { |
51 | // Worse case positive `n`: `x` is the minimum subnormal value, the result is `F::MAX`. |
52 | // This can be reached by three scaling multiplications (two here and one final). |
53 | debug_assert!(-exp_min + F::SIG_BITS as i32 + exp_max <= exp_max * 3); |
54 | |
55 | x *= f_exp_max; |
56 | n -= exp_max; |
57 | if n > exp_max { |
58 | x *= f_exp_max; |
59 | n -= exp_max; |
60 | if n > exp_max { |
61 | n = exp_max; |
62 | } |
63 | } |
64 | } else if n < exp_min { |
65 | // When scaling toward 0, the prescaling is limited to a value that does not allow `x` to |
66 | // go subnormal. This avoids double rounding. |
67 | if F::BITS > 16 { |
68 | // `mul` s.t. `!(x * mul).is_subnormal() ∀ x` |
69 | let mul = f_exp_min * f_pow_subnorm; |
70 | let add = -exp_min - sig_total_bits as i32; |
71 | |
72 | // Worse case negative `n`: `x` is the maximum positive value, the result is `F::MIN`. |
73 | // This must be reachable by three scaling multiplications (two here and one final). |
74 | debug_assert!(-exp_min + F::SIG_BITS as i32 + exp_max <= add * 2 + -exp_min); |
75 | |
76 | x *= mul; |
77 | n += add; |
78 | |
79 | if n < exp_min { |
80 | x *= mul; |
81 | n += add; |
82 | |
83 | if n < exp_min { |
84 | n = exp_min; |
85 | } |
86 | } |
87 | } else { |
88 | // `f16` is unique compared to other float types in that the difference between the |
89 | // minimum exponent and the significand bits (`add = -exp_min - sig_total_bits`) is |
90 | // small, only three. The above method depend on decrementing `n` by `add` two times; |
91 | // for other float types this works out because `add` is a substantial fraction of |
92 | // the exponent range. For `f16`, however, 3 is relatively small compared to the |
93 | // exponent range (which is 39), so that requires ~10 prescale rounds rather than two. |
94 | // |
95 | // Work aroudn this by using a different algorithm that calculates the prescale |
96 | // dynamically based on the maximum possible value. This adds more operations per round |
97 | // since it needs to construct the scale, but works better in the general case. |
98 | let add = -(n + sig_total_bits as i32).clamp(exp_min, sig_total_bits as i32); |
99 | let mul = F::from_parts(false, (F::EXP_BIAS as i32 - add) as u32, zero); |
100 | |
101 | x *= mul; |
102 | n += add; |
103 | |
104 | if n < exp_min { |
105 | let add = -(n + sig_total_bits as i32).clamp(exp_min, sig_total_bits as i32); |
106 | let mul = F::from_parts(false, (F::EXP_BIAS as i32 - add) as u32, zero); |
107 | |
108 | x *= mul; |
109 | n += add; |
110 | |
111 | if n < exp_min { |
112 | n = exp_min; |
113 | } |
114 | } |
115 | } |
116 | } |
117 | |
118 | let scale = F::from_parts(false, (F::EXP_BIAS as i32 + n) as u32, zero); |
119 | x * scale |
120 | } |
121 | |
122 | #[cfg (test)] |
123 | mod tests { |
124 | use super::super::super::Int; |
125 | use super::*; |
126 | |
127 | // Tests against N3220 |
128 | fn spec_test<F: Float>() |
129 | where |
130 | u32: CastInto<F::Int>, |
131 | F::Int: CastFrom<i32>, |
132 | F::Int: CastFrom<u32>, |
133 | { |
134 | // `scalbn(±0, n)` returns `±0`. |
135 | assert_biteq!(scalbn(F::NEG_ZERO, 10), F::NEG_ZERO); |
136 | assert_biteq!(scalbn(F::NEG_ZERO, 0), F::NEG_ZERO); |
137 | assert_biteq!(scalbn(F::NEG_ZERO, -10), F::NEG_ZERO); |
138 | assert_biteq!(scalbn(F::ZERO, 10), F::ZERO); |
139 | assert_biteq!(scalbn(F::ZERO, 0), F::ZERO); |
140 | assert_biteq!(scalbn(F::ZERO, -10), F::ZERO); |
141 | |
142 | // `scalbn(x, 0)` returns `x`. |
143 | assert_biteq!(scalbn(F::MIN, 0), F::MIN); |
144 | assert_biteq!(scalbn(F::MAX, 0), F::MAX); |
145 | assert_biteq!(scalbn(F::INFINITY, 0), F::INFINITY); |
146 | assert_biteq!(scalbn(F::NEG_INFINITY, 0), F::NEG_INFINITY); |
147 | assert_biteq!(scalbn(F::ZERO, 0), F::ZERO); |
148 | assert_biteq!(scalbn(F::NEG_ZERO, 0), F::NEG_ZERO); |
149 | |
150 | // `scalbn(±∞, n)` returns `±∞`. |
151 | assert_biteq!(scalbn(F::INFINITY, 10), F::INFINITY); |
152 | assert_biteq!(scalbn(F::INFINITY, -10), F::INFINITY); |
153 | assert_biteq!(scalbn(F::NEG_INFINITY, 10), F::NEG_INFINITY); |
154 | assert_biteq!(scalbn(F::NEG_INFINITY, -10), F::NEG_INFINITY); |
155 | |
156 | // NaN should remain NaNs. |
157 | assert!(scalbn(F::NAN, 10).is_nan()); |
158 | assert!(scalbn(F::NAN, 0).is_nan()); |
159 | assert!(scalbn(F::NAN, -10).is_nan()); |
160 | assert!(scalbn(-F::NAN, 10).is_nan()); |
161 | assert!(scalbn(-F::NAN, 0).is_nan()); |
162 | assert!(scalbn(-F::NAN, -10).is_nan()); |
163 | } |
164 | |
165 | #[test ] |
166 | #[cfg (f16_enabled)] |
167 | fn spec_test_f16() { |
168 | spec_test::<f16>(); |
169 | } |
170 | |
171 | #[test ] |
172 | fn spec_test_f32() { |
173 | spec_test::<f32>(); |
174 | } |
175 | |
176 | #[test ] |
177 | fn spec_test_f64() { |
178 | spec_test::<f64>(); |
179 | } |
180 | |
181 | #[test ] |
182 | #[cfg (f128_enabled)] |
183 | fn spec_test_f128() { |
184 | spec_test::<f128>(); |
185 | } |
186 | } |
187 | |