| 1 | use super::super::{CastFrom, CastInto, Float, IntTy, MinInt}; |
| 2 | |
| 3 | /// Scale the exponent. |
| 4 | /// |
| 5 | /// From N3220: |
| 6 | /// |
| 7 | /// > The scalbn and scalbln functions compute `x * b^n`, where `b = FLT_RADIX` if the return type |
| 8 | /// > of the function is a standard floating type, or `b = 10` if the return type of the function |
| 9 | /// > is a decimal floating type. A range error occurs for some finite x, depending on n. |
| 10 | /// > |
| 11 | /// > [...] |
| 12 | /// > |
| 13 | /// > * `scalbn(±0, n)` returns `±0`. |
| 14 | /// > * `scalbn(x, 0)` returns `x`. |
| 15 | /// > * `scalbn(±∞, n)` returns `±∞`. |
| 16 | /// > |
| 17 | /// > If the calculation does not overflow or underflow, the returned value is exact and |
| 18 | /// > independent of the current rounding direction mode. |
| 19 | pub fn scalbn<F: Float>(mut x: F, mut n: i32) -> F |
| 20 | where |
| 21 | u32: CastInto<F::Int>, |
| 22 | F::Int: CastFrom<i32>, |
| 23 | F::Int: CastFrom<u32>, |
| 24 | { |
| 25 | let zero = IntTy::<F>::ZERO; |
| 26 | |
| 27 | // Bits including the implicit bit |
| 28 | let sig_total_bits = F::SIG_BITS + 1; |
| 29 | |
| 30 | // Maximum and minimum values when biased |
| 31 | let exp_max = F::EXP_MAX; |
| 32 | let exp_min = F::EXP_MIN; |
| 33 | |
| 34 | // 2 ^ Emax, maximum positive with null significand (0x1p1023 for f64) |
| 35 | let f_exp_max = F::from_parts(false, F::EXP_BIAS << 1, zero); |
| 36 | |
| 37 | // 2 ^ Emin, minimum positive normal with null significand (0x1p-1022 for f64) |
| 38 | let f_exp_min = F::from_parts(false, 1, zero); |
| 39 | |
| 40 | // 2 ^ sig_total_bits, moltiplier to normalize subnormals (0x1p53 for f64) |
| 41 | let f_pow_subnorm = F::from_parts(false, sig_total_bits + F::EXP_BIAS, zero); |
| 42 | |
| 43 | /* |
| 44 | * The goal is to multiply `x` by a scale factor that applies `n`. However, there are cases |
| 45 | * where `2^n` is not representable by `F` but the result should be, e.g. `x = 2^Emin` with |
| 46 | * `n = -EMin + 2` (one out of range of 2^Emax). To get around this, reduce the magnitude of |
| 47 | * the final scale operation by prescaling by the max/min power representable by `F`. |
| 48 | */ |
| 49 | |
| 50 | if n > exp_max { |
| 51 | // Worse case positive `n`: `x` is the minimum subnormal value, the result is `F::MAX`. |
| 52 | // This can be reached by three scaling multiplications (two here and one final). |
| 53 | debug_assert!(-exp_min + F::SIG_BITS as i32 + exp_max <= exp_max * 3); |
| 54 | |
| 55 | x *= f_exp_max; |
| 56 | n -= exp_max; |
| 57 | if n > exp_max { |
| 58 | x *= f_exp_max; |
| 59 | n -= exp_max; |
| 60 | if n > exp_max { |
| 61 | n = exp_max; |
| 62 | } |
| 63 | } |
| 64 | } else if n < exp_min { |
| 65 | // When scaling toward 0, the prescaling is limited to a value that does not allow `x` to |
| 66 | // go subnormal. This avoids double rounding. |
| 67 | if F::BITS > 16 { |
| 68 | // `mul` s.t. `!(x * mul).is_subnormal() ∀ x` |
| 69 | let mul = f_exp_min * f_pow_subnorm; |
| 70 | let add = -exp_min - sig_total_bits as i32; |
| 71 | |
| 72 | // Worse case negative `n`: `x` is the maximum positive value, the result is `F::MIN`. |
| 73 | // This must be reachable by three scaling multiplications (two here and one final). |
| 74 | debug_assert!(-exp_min + F::SIG_BITS as i32 + exp_max <= add * 2 + -exp_min); |
| 75 | |
| 76 | x *= mul; |
| 77 | n += add; |
| 78 | |
| 79 | if n < exp_min { |
| 80 | x *= mul; |
| 81 | n += add; |
| 82 | |
| 83 | if n < exp_min { |
| 84 | n = exp_min; |
| 85 | } |
| 86 | } |
| 87 | } else { |
| 88 | // `f16` is unique compared to other float types in that the difference between the |
| 89 | // minimum exponent and the significand bits (`add = -exp_min - sig_total_bits`) is |
| 90 | // small, only three. The above method depend on decrementing `n` by `add` two times; |
| 91 | // for other float types this works out because `add` is a substantial fraction of |
| 92 | // the exponent range. For `f16`, however, 3 is relatively small compared to the |
| 93 | // exponent range (which is 39), so that requires ~10 prescale rounds rather than two. |
| 94 | // |
| 95 | // Work aroudn this by using a different algorithm that calculates the prescale |
| 96 | // dynamically based on the maximum possible value. This adds more operations per round |
| 97 | // since it needs to construct the scale, but works better in the general case. |
| 98 | let add = -(n + sig_total_bits as i32).clamp(exp_min, sig_total_bits as i32); |
| 99 | let mul = F::from_parts(false, (F::EXP_BIAS as i32 - add) as u32, zero); |
| 100 | |
| 101 | x *= mul; |
| 102 | n += add; |
| 103 | |
| 104 | if n < exp_min { |
| 105 | let add = -(n + sig_total_bits as i32).clamp(exp_min, sig_total_bits as i32); |
| 106 | let mul = F::from_parts(false, (F::EXP_BIAS as i32 - add) as u32, zero); |
| 107 | |
| 108 | x *= mul; |
| 109 | n += add; |
| 110 | |
| 111 | if n < exp_min { |
| 112 | n = exp_min; |
| 113 | } |
| 114 | } |
| 115 | } |
| 116 | } |
| 117 | |
| 118 | let scale = F::from_parts(false, (F::EXP_BIAS as i32 + n) as u32, zero); |
| 119 | x * scale |
| 120 | } |
| 121 | |
| 122 | #[cfg (test)] |
| 123 | mod tests { |
| 124 | use super::super::super::Int; |
| 125 | use super::*; |
| 126 | |
| 127 | // Tests against N3220 |
| 128 | fn spec_test<F: Float>() |
| 129 | where |
| 130 | u32: CastInto<F::Int>, |
| 131 | F::Int: CastFrom<i32>, |
| 132 | F::Int: CastFrom<u32>, |
| 133 | { |
| 134 | // `scalbn(±0, n)` returns `±0`. |
| 135 | assert_biteq!(scalbn(F::NEG_ZERO, 10), F::NEG_ZERO); |
| 136 | assert_biteq!(scalbn(F::NEG_ZERO, 0), F::NEG_ZERO); |
| 137 | assert_biteq!(scalbn(F::NEG_ZERO, -10), F::NEG_ZERO); |
| 138 | assert_biteq!(scalbn(F::ZERO, 10), F::ZERO); |
| 139 | assert_biteq!(scalbn(F::ZERO, 0), F::ZERO); |
| 140 | assert_biteq!(scalbn(F::ZERO, -10), F::ZERO); |
| 141 | |
| 142 | // `scalbn(x, 0)` returns `x`. |
| 143 | assert_biteq!(scalbn(F::MIN, 0), F::MIN); |
| 144 | assert_biteq!(scalbn(F::MAX, 0), F::MAX); |
| 145 | assert_biteq!(scalbn(F::INFINITY, 0), F::INFINITY); |
| 146 | assert_biteq!(scalbn(F::NEG_INFINITY, 0), F::NEG_INFINITY); |
| 147 | assert_biteq!(scalbn(F::ZERO, 0), F::ZERO); |
| 148 | assert_biteq!(scalbn(F::NEG_ZERO, 0), F::NEG_ZERO); |
| 149 | |
| 150 | // `scalbn(±∞, n)` returns `±∞`. |
| 151 | assert_biteq!(scalbn(F::INFINITY, 10), F::INFINITY); |
| 152 | assert_biteq!(scalbn(F::INFINITY, -10), F::INFINITY); |
| 153 | assert_biteq!(scalbn(F::NEG_INFINITY, 10), F::NEG_INFINITY); |
| 154 | assert_biteq!(scalbn(F::NEG_INFINITY, -10), F::NEG_INFINITY); |
| 155 | |
| 156 | // NaN should remain NaNs. |
| 157 | assert!(scalbn(F::NAN, 10).is_nan()); |
| 158 | assert!(scalbn(F::NAN, 0).is_nan()); |
| 159 | assert!(scalbn(F::NAN, -10).is_nan()); |
| 160 | assert!(scalbn(-F::NAN, 10).is_nan()); |
| 161 | assert!(scalbn(-F::NAN, 0).is_nan()); |
| 162 | assert!(scalbn(-F::NAN, -10).is_nan()); |
| 163 | } |
| 164 | |
| 165 | #[test ] |
| 166 | #[cfg (f16_enabled)] |
| 167 | fn spec_test_f16() { |
| 168 | spec_test::<f16>(); |
| 169 | } |
| 170 | |
| 171 | #[test ] |
| 172 | fn spec_test_f32() { |
| 173 | spec_test::<f32>(); |
| 174 | } |
| 175 | |
| 176 | #[test ] |
| 177 | fn spec_test_f64() { |
| 178 | spec_test::<f64>(); |
| 179 | } |
| 180 | |
| 181 | #[test ] |
| 182 | #[cfg (f128_enabled)] |
| 183 | fn spec_test_f128() { |
| 184 | spec_test::<f128>(); |
| 185 | } |
| 186 | } |
| 187 | |