1 | //! Support for floating point types compatible with IEEE 754. |
---|---|
2 | |
3 | use crate::{Category, ExpInt, IEK_INF, IEK_NAN, IEK_ZERO}; |
4 | use crate::{Float, FloatConvert, ParseError, Round, Status, StatusAnd}; |
5 | |
6 | use core::cmp::{self, Ordering}; |
7 | use core::convert::TryFrom; |
8 | use core::fmt::{self, Write}; |
9 | use core::marker::PhantomData; |
10 | use core::mem; |
11 | use core::ops::Neg; |
12 | |
13 | /// A floating point number that uses IEEE semantics. |
14 | /// |
15 | /// Usually you will want to use the available type aliases of this type |
16 | /// (e.g., [`Single`], [`Double`]) rather than referencing it directly. |
17 | /// |
18 | /// If `S` implements [`Semantics`], this type will implement [`Float`]. |
19 | #[must_use] |
20 | pub struct IeeeFloat<S> { |
21 | /// Absolute significand value (including the integer bit). |
22 | sig: [Limb; 1], |
23 | |
24 | /// The signed unbiased exponent of the value. |
25 | exp: ExpInt, |
26 | |
27 | /// What kind of floating point number this is. |
28 | // |
29 | // HACK(eddyb) because mutating this without accounting for `exp`/`sig` |
30 | // can break some subtle edge cases, it should be only read through the |
31 | // `.category()` method, and only set during initialization, either for |
32 | // one of the special value constants, or for conversion from bits. |
33 | read_only_category_do_not_mutate: Category, |
34 | |
35 | /// Sign bit of the number. |
36 | // |
37 | // HACK(eddyb) because mutating this without accounting for `category` |
38 | // can break some subtle edge cases, it should be only read through the |
39 | // `.is_negative()` method, and only set through negation (which can be |
40 | // more easily used through e.g. `copy_sign`/`negate_if`/`with_sign`). |
41 | read_only_sign_do_not_mutate: bool, |
42 | |
43 | marker: PhantomData<S>, |
44 | } |
45 | |
46 | /// Fundamental unit of big integer arithmetic, but also |
47 | /// large to store the largest significands by itself. |
48 | type Limb = u128; |
49 | const LIMB_BITS: usize = 128; |
50 | #[inline(always)] |
51 | fn limbs_for_bits(bits: usize) -> usize { |
52 | (bits + LIMB_BITS - 1) / LIMB_BITS |
53 | } |
54 | |
55 | /// Growable `[Limb]` (i.e. heap-allocated and typically `Vec`/`SmallVec`/etc.), |
56 | /// used only by algorithms that may require dynamically arbitrary precision, |
57 | /// i.e. conversions from/to decimal strings. |
58 | /// |
59 | /// Note: the specific type was chosen by starting with `SmallVec<[_; 1]>` and |
60 | /// increasing the inline length as long as benchmarks were showing improvements, |
61 | /// or at least the `Double::from_str` ones, which roughly had these behaviors: |
62 | /// * `Vec<_>` -> `SmallVec<[_; 1]>`: ~15% speedup, but only for shorter inputs |
63 | /// * `SmallVec<[_; 1]>` -> `SmallVec<[_; 2]>`: ~10% speedup for longer inputs |
64 | /// * `SmallVec<[_; 2]>` -> `SmallVec<[_; 3]>`: noise and/or diminishing returns |
65 | /// |
66 | /// Note: the choice of type described above, and the factors in its decision, |
67 | /// are specific to `Limb` being `u128`, so if `Limb` changes, this should too. |
68 | type DynPrecisionLimbVec = smallvec::SmallVec<[Limb; 2]>; |
69 | |
70 | /// Enum that represents what fraction of the LSB truncated bits of an fp number |
71 | /// represent. |
72 | /// |
73 | /// This essentially combines the roles of guard and sticky bits. |
74 | #[must_use] |
75 | #[derive(Copy, Clone, PartialEq, Eq, Debug)] |
76 | enum Loss { |
77 | // Example of truncated bits: |
78 | ExactlyZero, // 000000 |
79 | LessThanHalf, // 0xxxxx x's not all zero |
80 | ExactlyHalf, // 100000 |
81 | MoreThanHalf, // 1xxxxx x's not all zero |
82 | } |
83 | |
84 | /// How the nonfinite values Inf and NaN are represented. |
85 | #[derive(Copy, Clone, PartialEq, Eq)] |
86 | pub enum NonfiniteBehavior { |
87 | /// Represents standard IEEE 754 behavior. A value is nonfinite if the |
88 | /// exponent field is all 1s. In such cases, a value is Inf if the |
89 | /// significand bits are all zero, and NaN otherwise |
90 | IEEE754, |
91 | |
92 | /// Only the Float8E5M2 has this behavior. There is no Inf representation. A |
93 | /// value is NaN if the exponent field and the mantissa field are all 1s. |
94 | /// This behavior matches the FP8 E4M3 type described in |
95 | /// <https://arxiv.org/abs/2209.05433>. We treat both signed and unsigned NaNs |
96 | /// as non-signalling, although the paper does not state whether the NaN |
97 | /// values are signalling or not. |
98 | NanOnly, |
99 | } |
100 | |
101 | // HACK(eddyb) extension method flipping/changing the sign based on `bool`s. |
102 | trait NegExt: Neg<Output = Self> + Sized { |
103 | fn negate_if(self, negate: bool) -> Self { |
104 | if negate { |
105 | -self |
106 | } else { |
107 | self |
108 | } |
109 | } |
110 | |
111 | fn with_sign(self, sign: bool) -> Self |
112 | where |
113 | Self: Float, |
114 | { |
115 | self.negate_if(self.is_negative() != sign) |
116 | } |
117 | } |
118 | impl<T: Neg<Output = Self>> NegExt for T {} |
119 | |
120 | /// Represents floating point arithmetic semantics. |
121 | pub trait Semantics: Sized { |
122 | /// Total number of bits in the interchange format. |
123 | const BITS: usize; |
124 | |
125 | /// Number of exponent bits in the interchange format. |
126 | const EXP_BITS: usize; |
127 | |
128 | /// Number of bits in the significand. This includes the integer bit. |
129 | const PRECISION: usize = (Self::BITS - 1 - Self::EXP_BITS) + 1; |
130 | |
131 | /// How the nonfinite values Inf and NaN are represented. |
132 | const NONFINITE_BEHAVIOR: NonfiniteBehavior = NonfiniteBehavior::IEEE754; |
133 | |
134 | /// The largest E such that 2^E is representable; this matches the |
135 | /// definition of IEEE 754. |
136 | const MAX_EXP: ExpInt = { |
137 | let ieee_inf_and_nan_replaced_with_extra_normals = match Self::NONFINITE_BEHAVIOR { |
138 | NonfiniteBehavior::IEEE754 => false, |
139 | NonfiniteBehavior::NanOnly => true, |
140 | }; |
141 | Self::IEEE_MAX_EXP |
142 | + (Self::MIN_EXP - Self::IEEE_MIN_EXP) |
143 | + (ieee_inf_and_nan_replaced_with_extra_normals as ExpInt) |
144 | }; |
145 | const IEEE_MAX_EXP: ExpInt = -Self::IEEE_MIN_EXP + 1; |
146 | |
147 | /// The smallest E such that 2^E is a normalized number; this |
148 | /// matches the definition of IEEE 754. |
149 | const MIN_EXP: ExpInt = Self::IEEE_MIN_EXP; |
150 | const IEEE_MIN_EXP: ExpInt = -(1 << (Self::EXP_BITS - 1)) + 2; |
151 | |
152 | /// The base significand bitpattern of NaNs, i.e. the bits that must always |
153 | /// be set in all NaNs, with other significand bits being either used for |
154 | /// payload bits (if `NAN_PAYLOAD_MASK` covers them) or always unset. |
155 | const NAN_SIGNIFICAND_BASE: Limb = match Self::NONFINITE_BEHAVIOR { |
156 | NonfiniteBehavior::IEEE754 => 0, |
157 | NonfiniteBehavior::NanOnly => (1 << (Self::PRECISION - 1)) - 1, |
158 | }; |
159 | |
160 | /// The significand bitmask for the payload of a NaN (if supported), |
161 | /// including the "quiet bit" (telling QNaNs apart from SNaNs). |
162 | const NAN_PAYLOAD_MASK: Limb = match Self::NONFINITE_BEHAVIOR { |
163 | NonfiniteBehavior::IEEE754 => (1 << (Self::PRECISION - 1)) - 1, |
164 | NonfiniteBehavior::NanOnly => 0, |
165 | }; |
166 | |
167 | /// The significand bitpattern to mark a NaN as quiet (if supported). |
168 | /// |
169 | /// NOTE: for X87DoubleExtended we need to set two bits instead of one. |
170 | /// |
171 | /// NOTE: all NaNs are quiet if unsupported (see `NonfiniteBehavior::NanOnly`). |
172 | const QNAN_SIGNIFICAND: Limb = match Self::NONFINITE_BEHAVIOR { |
173 | NonfiniteBehavior::IEEE754 => 1 << (Self::PRECISION - 2), |
174 | NonfiniteBehavior::NanOnly => 0, |
175 | }; |
176 | |
177 | fn from_bits(bits: u128) -> IeeeFloat<Self> { |
178 | assert!(Self::BITS > Self::PRECISION); |
179 | |
180 | let sign = bits & (1 << (Self::BITS - 1)); |
181 | let exponent = ((bits & !sign) >> (Self::BITS - 1 - Self::EXP_BITS)) & ((1 << Self::EXP_BITS) - 1); |
182 | let mut r = IeeeFloat { |
183 | sig: [bits & ((1 << (Self::PRECISION - 1)) - 1)], |
184 | // Convert the exponent from its bias representation to a signed integer. |
185 | exp: (exponent as ExpInt) + (Self::MIN_EXP - 1), |
186 | read_only_category_do_not_mutate: Category::Zero, |
187 | read_only_sign_do_not_mutate: sign != 0, |
188 | marker: PhantomData, |
189 | }; |
190 | |
191 | // NOTE(eddyb) unlike the original C++ code, this doesn't check for |
192 | // specific exponent/significand combinations, but instead relies on |
193 | // being able to construct known-good special values to compare to. |
194 | let try_category_from_special = |mut special: IeeeFloat<Self>| { |
195 | special = special.copy_sign(r); |
196 | |
197 | // Ignore NaN payload to avoid needing a separate NaN check. |
198 | let sig_mask = if special.is_nan() { !Self::NAN_PAYLOAD_MASK } else { !0 }; |
199 | |
200 | if special.is_negative() == r.is_negative() |
201 | && special.exp == r.exp |
202 | && special.sig[0] & sig_mask == r.sig[0] & sig_mask |
203 | { |
204 | Some(special.category()) |
205 | } else { |
206 | None |
207 | } |
208 | }; |
209 | |
210 | // NOTE(eddyb) the order here matters, i.e. `NAN` needs to be last, as |
211 | // its relaxed check (see above) overlaps `INFINITY`, for IEEE NaNs. |
212 | let specials = [ |
213 | IeeeFloat::<Self>::ZERO, |
214 | IeeeFloat::<Self>::INFINITY, |
215 | IeeeFloat::<Self>::NAN, |
216 | ]; |
217 | |
218 | let category = specials |
219 | .into_iter() |
220 | .find_map(try_category_from_special) |
221 | .unwrap_or_else(|| { |
222 | if r.exp == Self::MIN_EXP - 1 { |
223 | // Denormal. |
224 | r.exp = Self::MIN_EXP; |
225 | } else { |
226 | // Set integer bit. |
227 | sig::set_bit(&mut r.sig, Self::PRECISION - 1); |
228 | } |
229 | Category::Normal |
230 | }); |
231 | |
232 | r.read_only_category_do_not_mutate = category; |
233 | |
234 | r |
235 | } |
236 | |
237 | fn to_bits(x: IeeeFloat<Self>) -> u128 { |
238 | assert!(Self::BITS > Self::PRECISION); |
239 | |
240 | // Split integer bit from significand. |
241 | let integer_bit = sig::get_bit(&x.sig, Self::PRECISION - 1); |
242 | let mut significand = x.sig[0] & ((1 << (Self::PRECISION - 1)) - 1); |
243 | let mut exponent = x.exp; |
244 | match x.category() { |
245 | Category::Normal => { |
246 | if exponent == Self::MIN_EXP && !integer_bit { |
247 | // Denormal. |
248 | exponent -= 1; |
249 | } |
250 | } |
251 | Category::Zero => { |
252 | // FIXME(eddyb) Maybe we should guarantee an invariant instead? |
253 | IeeeFloat::<Self> { |
254 | sig: [significand], |
255 | exp: exponent, |
256 | .. |
257 | } = Float::ZERO; |
258 | } |
259 | Category::Infinity => { |
260 | // FIXME(eddyb) Maybe we should guarantee an invariant instead? |
261 | IeeeFloat::<Self> { |
262 | sig: [significand], |
263 | exp: exponent, |
264 | .. |
265 | } = Float::INFINITY; |
266 | } |
267 | Category::NaN => { |
268 | IeeeFloat::<Self> { exp: exponent, .. } = Float::NAN; |
269 | } |
270 | } |
271 | |
272 | // Convert the exponent from a signed integer to its bias representation. |
273 | let exponent = (exponent - (Self::MIN_EXP - 1)) as u128; |
274 | |
275 | ((x.is_negative() as u128) << (Self::BITS - 1)) | (exponent << (Self::PRECISION - 1)) | significand |
276 | } |
277 | } |
278 | |
279 | impl<S> Copy for IeeeFloat<S> {} |
280 | impl<S> Clone for IeeeFloat<S> { |
281 | fn clone(&self) -> Self { |
282 | *self |
283 | } |
284 | } |
285 | |
286 | macro_rules! ieee_semantics { |
287 | ($( |
288 | $(#[$meta:meta])* |
289 | $name:ident = $sem:ident($bits:tt : $exp_bits:tt) $({ $($extra:tt)* })? |
290 | ),* $(,)?) => { |
291 | $( |
292 | #[doc = concat!("Floating point semantics for [`", stringify!($name), "`].")] |
293 | /// |
294 | /// See that type for more details. |
295 | pub struct $sem; |
296 | |
297 | $(#[$meta])* |
298 | pub type $name = IeeeFloat<$sem>; |
299 | |
300 | impl Semantics for $sem { |
301 | const BITS: usize = $bits; |
302 | const EXP_BITS: usize = $exp_bits; |
303 | |
304 | $($($extra)*)? |
305 | } |
306 | )* |
307 | } |
308 | } |
309 | |
310 | ieee_semantics! { |
311 | /// IEEE binary16 half-precision (16-bit) floating point number. |
312 | Half = HalfS(16:5), |
313 | |
314 | /// IEEE binary32 single-precision (32-bit) floating point number. |
315 | Single = SingleS(32:8), |
316 | |
317 | /// IEEE binary64 double-precision (64-bit) floating point number. |
318 | Double = DoubleS(64:11), |
319 | |
320 | /// IEEE binary128 quadruple-precision (128-bit) floating point number. |
321 | Quad = QuadS(128:15), |
322 | |
323 | /// 16-bit brain floating point number. |
324 | /// |
325 | /// This is not an IEEE kind but uses the same semantics. |
326 | BFloat = BFloatS(16:8), |
327 | |
328 | /// 8-bit floating point number with S1E5M2 bit layout. |
329 | /// |
330 | /// Follows IEEE-754 conventions with S1E5M2 bit layout as described in |
331 | /// <https://arxiv.org/abs/2209.05433>. |
332 | Float8E5M2 = Float8E5M2S(8:5), |
333 | |
334 | /// 8-bit floating point number with S1E4M3 bit layout. |
335 | /// |
336 | /// This type mostly follows IEEE-754 conventions with a |
337 | /// bit layout S1E4M3 as described in <https://arxiv.org/abs/2209.05433>. |
338 | /// Unlike IEEE-754 types, there are no infinity values, and NaN is |
339 | /// represented with the exponent and mantissa bits set to all 1s. |
340 | Float8E4M3FN = Float8E4M3FNS(8:4) { |
341 | const NONFINITE_BEHAVIOR: NonfiniteBehavior = NonfiniteBehavior::NanOnly; |
342 | }, |
343 | } |
344 | |
345 | // FIXME(eddyb) consider moving X87-specific logic to a "has explicit integer bit" |
346 | // associated `const` on `Semantics` itself. |
347 | /// Floating point semantics for [`X87DoubleExtended`]. |
348 | /// |
349 | /// See that type for more details. |
350 | pub struct X87DoubleExtendedS; |
351 | |
352 | /// 80-bit floating point number that uses IEEE extended precision semantics, as used |
353 | /// by x87 `long double`. |
354 | pub type X87DoubleExtended = IeeeFloat<X87DoubleExtendedS>; |
355 | |
356 | impl Semantics for X87DoubleExtendedS { |
357 | const BITS: usize = 80; |
358 | const EXP_BITS: usize = 15; |
359 | |
360 | // HACK(eddyb) overwriting `EXP_BITS` because its default is incorrect. |
361 | // FIMXE(eddyb) get rid of this by allowing `Semantics` to generically have |
362 | // the concept of "explicitly included integer bit", which is the main way |
363 | // in which the 80-bit X87 format differs from standard IEEE encodings. |
364 | const PRECISION: usize = 64; |
365 | |
366 | /// For x87 extended precision, we want to make a NaN, not a |
367 | /// pseudo-NaN. Maybe we should expose the ability to make |
368 | /// pseudo-NaNs? |
369 | const QNAN_SIGNIFICAND: Limb = 0b11 << (Self::PRECISION - 2); |
370 | |
371 | /// Integer bit is explicit in this format. Intel hardware (387 and later) |
372 | /// does not support these bit patterns: |
373 | /// exponent = all 1's, integer bit 0, significand 0 ("pseudoinfinity") |
374 | /// exponent = all 1's, integer bit 0, significand nonzero ("pseudoNaN") |
375 | /// exponent!=0 nor all 1's, integer bit 0 ("unnormal") |
376 | /// exponent = 0, integer bit 1 ("pseudodenormal") |
377 | /// At the moment, the first three are treated as NaNs, the last one as Normal. |
378 | #[inline] |
379 | fn from_bits(bits: u128) -> IeeeFloat<Self> { |
380 | let sign = bits & (1 << (Self::BITS - 1)); |
381 | let exponent = ((bits & !sign) >> (Self::BITS - 1 - Self::EXP_BITS)) & ((1 << Self::EXP_BITS) - 1); |
382 | let mut r = IeeeFloat { |
383 | sig: [bits & ((1 << Self::PRECISION) - 1)], |
384 | // Convert the exponent from its bias representation to a signed integer. |
385 | exp: (exponent as ExpInt) + (Self::MIN_EXP - 1), |
386 | read_only_category_do_not_mutate: Category::Zero, |
387 | read_only_sign_do_not_mutate: sign != 0, |
388 | marker: PhantomData, |
389 | }; |
390 | |
391 | let integer_bit = r.sig[0] >> (Self::PRECISION - 1); |
392 | |
393 | let category = if r.exp == Self::MIN_EXP - 1 && r.sig == [0] { |
394 | Category::Zero |
395 | } else if r.exp == Self::MAX_EXP + 1 && r.sig == [1 << (Self::PRECISION - 1)] { |
396 | Category::Infinity |
397 | } else if r.exp == Self::MAX_EXP + 1 && r.sig != [1 << (Self::PRECISION - 1)] |
398 | || r.exp != Self::MAX_EXP + 1 && r.exp != Self::MIN_EXP - 1 && integer_bit == 0 |
399 | { |
400 | r.exp = Self::MAX_EXP + 1; |
401 | Category::NaN |
402 | } else { |
403 | if r.exp == Self::MIN_EXP - 1 { |
404 | // Denormal. |
405 | r.exp = Self::MIN_EXP; |
406 | } |
407 | Category::Normal |
408 | }; |
409 | |
410 | r.read_only_category_do_not_mutate = category; |
411 | |
412 | r |
413 | } |
414 | |
415 | #[inline] |
416 | fn to_bits(x: IeeeFloat<Self>) -> u128 { |
417 | // Get integer bit from significand. |
418 | let integer_bit = sig::get_bit(&x.sig, Self::PRECISION - 1); |
419 | let mut significand = x.sig[0] & ((1 << Self::PRECISION) - 1); |
420 | let exponent = match x.category() { |
421 | Category::Normal => { |
422 | if x.exp == Self::MIN_EXP && !integer_bit { |
423 | // Denormal. |
424 | Self::MIN_EXP - 1 |
425 | } else { |
426 | x.exp |
427 | } |
428 | } |
429 | Category::Zero => { |
430 | // FIXME(eddyb) Maybe we should guarantee an invariant instead? |
431 | significand = 0; |
432 | Self::MIN_EXP - 1 |
433 | } |
434 | Category::Infinity => { |
435 | // FIXME(eddyb) Maybe we should guarantee an invariant instead? |
436 | significand = 1 << (Self::PRECISION - 1); |
437 | Self::MAX_EXP + 1 |
438 | } |
439 | Category::NaN => Self::MAX_EXP + 1, |
440 | }; |
441 | |
442 | // Convert the exponent from a signed integer to its bias representation. |
443 | let exponent = (exponent - (Self::MIN_EXP - 1)) as u128; |
444 | |
445 | ((x.is_negative() as u128) << (Self::BITS - 1)) | (exponent << Self::PRECISION) | significand |
446 | } |
447 | } |
448 | |
449 | float_common_impls!(IeeeFloat<S>); |
450 | |
451 | impl<S: Semantics> PartialEq for IeeeFloat<S> { |
452 | fn eq(&self, rhs: &Self) -> bool { |
453 | self.partial_cmp(rhs) == Some(Ordering::Equal) |
454 | } |
455 | } |
456 | |
457 | impl<S: Semantics> PartialOrd for IeeeFloat<S> { |
458 | fn partial_cmp(&self, rhs: &Self) -> Option<Ordering> { |
459 | match (self.category(), rhs.category()) { |
460 | (Category::NaN, _) | (_, Category::NaN) => None, |
461 | |
462 | (Category::Infinity, Category::Infinity) => Some((!self.is_negative()).cmp(&(!rhs.is_negative()))), |
463 | |
464 | (Category::Zero, Category::Zero) => Some(Ordering::Equal), |
465 | |
466 | (Category::Infinity, _) | (Category::Normal, Category::Zero) => { |
467 | Some((!self.is_negative()).cmp(&self.is_negative())) |
468 | } |
469 | |
470 | (_, Category::Infinity) | (Category::Zero, Category::Normal) => { |
471 | Some(rhs.is_negative().cmp(&(!rhs.is_negative()))) |
472 | } |
473 | |
474 | (Category::Normal, Category::Normal) => { |
475 | // Two normal numbers. Do they have the same sign? |
476 | Some((!self.is_negative()).cmp(&(!rhs.is_negative())).then_with(|| { |
477 | // Compare absolute values; invert result if negative. |
478 | let result = self.cmp_abs_normal(*rhs); |
479 | |
480 | if self.is_negative() { |
481 | result.reverse() |
482 | } else { |
483 | result |
484 | } |
485 | })) |
486 | } |
487 | } |
488 | } |
489 | } |
490 | |
491 | impl<S: Semantics> Neg for IeeeFloat<S> { |
492 | type Output = Self; |
493 | fn neg(mut self) -> Self { |
494 | self.read_only_sign_do_not_mutate = !self.is_negative(); |
495 | self |
496 | } |
497 | } |
498 | |
499 | /// Prints this value as a decimal string. |
500 | /// |
501 | /// \param precision The maximum number of digits of |
502 | /// precision to output. If there are fewer digits available, |
503 | /// zero padding will not be used unless the value is |
504 | /// integral and small enough to be expressed in |
505 | /// precision digits. 0 means to use the natural |
506 | /// precision of the number. |
507 | /// \param width The maximum number of zeros to |
508 | /// consider inserting before falling back to scientific |
509 | /// notation. 0 means to always use scientific notation. |
510 | /// |
511 | /// \param alternate Indicate whether to remove the trailing zero in |
512 | /// fraction part or not. Also setting this parameter to true forces |
513 | /// producing of output more similar to default printf behavior. |
514 | /// Specifically the lower e is used as exponent delimiter and exponent |
515 | /// always contains no less than two digits. |
516 | /// |
517 | /// Number precision width Result |
518 | /// ------ --------- ----- ------ |
519 | /// 1.01E+4 5 2 10100 |
520 | /// 1.01E+4 4 2 1.01E+4 |
521 | /// 1.01E+4 5 1 1.01E+4 |
522 | /// 1.01E-2 5 2 0.0101 |
523 | /// 1.01E-2 4 2 0.0101 |
524 | /// 1.01E-2 4 1 1.01E-2 |
525 | impl<S: Semantics> fmt::Display for IeeeFloat<S> { |
526 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
527 | let width = f.width().unwrap_or(3); |
528 | let alternate = f.alternate(); |
529 | |
530 | match self.category() { |
531 | Category::Infinity => { |
532 | if self.is_negative() { |
533 | return f.write_str("-Inf"); |
534 | } else { |
535 | return f.write_str("+Inf"); |
536 | } |
537 | } |
538 | |
539 | Category::NaN => return f.write_str("NaN"), |
540 | |
541 | Category::Zero => { |
542 | if self.is_negative() { |
543 | f.write_char('-')?; |
544 | } |
545 | |
546 | if width == 0 { |
547 | if alternate { |
548 | f.write_str("0.0")?; |
549 | if let Some(n) = f.precision() { |
550 | for _ in 1..n { |
551 | f.write_char('0')?; |
552 | } |
553 | } |
554 | f.write_str("e+00")?; |
555 | } else { |
556 | f.write_str("0.0E+0")?; |
557 | } |
558 | } else { |
559 | f.write_char('0')?; |
560 | } |
561 | return Ok(()); |
562 | } |
563 | |
564 | Category::Normal => {} |
565 | } |
566 | |
567 | if self.is_negative() { |
568 | f.write_char('-')?; |
569 | } |
570 | |
571 | // We use enough digits so the number can be round-tripped back to an |
572 | // APFloat. The formula comes from "How to Print Floating-Point Numbers |
573 | // Accurately" by Steele and White. |
574 | // FIXME: Using a formula based purely on the precision is conservative; |
575 | // we can print fewer digits depending on the actual value being printed. |
576 | |
577 | // precision = 2 + floor(S::PRECISION / lg_2(10)) |
578 | let precision = f.precision().unwrap_or(2 + S::PRECISION * 59 / 196); |
579 | |
580 | // Decompose the number into an APInt and an exponent. |
581 | let mut exp = self.exp - (S::PRECISION as ExpInt - 1); |
582 | let mut sig: DynPrecisionLimbVec = [self.sig[0]].into_iter().collect(); |
583 | |
584 | // Ignore trailing binary zeros. |
585 | let trailing_zeros = sig[0].trailing_zeros(); |
586 | let _: Loss = sig::shift_right(&mut sig, &mut exp, trailing_zeros as usize); |
587 | |
588 | // Change the exponent from 2^e to 10^e. |
589 | if exp == 0 { |
590 | // Nothing to do. |
591 | } else if exp > 0 { |
592 | // Just shift left. |
593 | let shift = exp as usize; |
594 | sig.resize(limbs_for_bits(S::PRECISION + shift), 0); |
595 | sig::shift_left(&mut sig, &mut exp, shift); |
596 | } else { |
597 | // exp < 0 |
598 | let mut texp = -exp as usize; |
599 | |
600 | // We transform this using the identity: |
601 | // (N)(2^-e) == (N)(5^e)(10^-e) |
602 | |
603 | // Multiply significand by 5^e. |
604 | // N * 5^0101 == N * 5^(1*1) * 5^(0*2) * 5^(1*4) * 5^(0*8) |
605 | let mut sig_scratch = DynPrecisionLimbVec::new(); |
606 | let mut p5 = DynPrecisionLimbVec::new(); |
607 | let mut p5_scratch = DynPrecisionLimbVec::new(); |
608 | while texp != 0 { |
609 | if p5.is_empty() { |
610 | p5.push(5); |
611 | } else { |
612 | p5_scratch.resize(p5.len() * 2, 0); |
613 | let _: Loss = sig::mul(&mut p5_scratch, &mut 0, &p5, &p5, p5.len() * 2 * LIMB_BITS); |
614 | while p5_scratch.last() == Some(&0) { |
615 | p5_scratch.pop(); |
616 | } |
617 | mem::swap(&mut p5, &mut p5_scratch); |
618 | } |
619 | if texp & 1 != 0 { |
620 | sig_scratch.resize(sig.len() + p5.len(), 0); |
621 | let _: Loss = sig::mul(&mut sig_scratch, &mut 0, &sig, &p5, (sig.len() + p5.len()) * LIMB_BITS); |
622 | while sig_scratch.last() == Some(&0) { |
623 | sig_scratch.pop(); |
624 | } |
625 | mem::swap(&mut sig, &mut sig_scratch); |
626 | } |
627 | texp >>= 1; |
628 | } |
629 | } |
630 | |
631 | // Fill the buffer. |
632 | let mut buffer = smallvec::SmallVec::<[u8; 64]>::new(); |
633 | |
634 | // Ignore digits from the significand until it is no more |
635 | // precise than is required for the desired precision. |
636 | // 196/59 is a very slight overestimate of lg_2(10). |
637 | let required = (precision * 196 + 58) / 59; |
638 | let mut discard_digits = sig::omsb(&sig).saturating_sub(required) * 59 / 196; |
639 | let mut in_trail = true; |
640 | while !sig.is_empty() { |
641 | // Perform short division by 10 to extract the rightmost digit. |
642 | // rem <- sig % 10 |
643 | // sig <- sig / 10 |
644 | let mut rem = 0; |
645 | |
646 | // Use 64-bit division and remainder, with 32-bit chunks from sig. |
647 | sig::each_chunk(&mut sig, 32, |chunk| { |
648 | let chunk = chunk as u32; |
649 | let combined = ((rem as u64) << 32) | (chunk as u64); |
650 | rem = (combined % 10) as u8; |
651 | (combined / 10) as u32 as Limb |
652 | }); |
653 | |
654 | // Reduce the sigificand to avoid wasting time dividing 0's. |
655 | while sig.last() == Some(&0) { |
656 | sig.pop(); |
657 | } |
658 | |
659 | let digit = rem; |
660 | |
661 | // Ignore digits we don't need. |
662 | if discard_digits > 0 { |
663 | discard_digits -= 1; |
664 | exp += 1; |
665 | continue; |
666 | } |
667 | |
668 | // Drop trailing zeros. |
669 | if in_trail && digit == 0 { |
670 | exp += 1; |
671 | } else { |
672 | in_trail = false; |
673 | buffer.push(b'0'+ digit); |
674 | } |
675 | } |
676 | |
677 | assert!(!buffer.is_empty(), "no characters in buffer!"); |
678 | |
679 | // Drop down to precision. |
680 | // FIXME: don't do more precise calculations above than are required. |
681 | if buffer.len() > precision { |
682 | // The most significant figures are the last ones in the buffer. |
683 | let mut first_sig = buffer.len() - precision; |
684 | |
685 | // Round. |
686 | // FIXME: this probably shouldn't use 'round half up'. |
687 | |
688 | // Rounding down is just a truncation, except we also want to drop |
689 | // trailing zeros from the new result. |
690 | if buffer[first_sig - 1] < b'5'{ |
691 | while first_sig < buffer.len() && buffer[first_sig] == b'0'{ |
692 | first_sig += 1; |
693 | } |
694 | } else { |
695 | // Rounding up requires a decimal add-with-carry. If we continue |
696 | // the carry, the newly-introduced zeros will just be truncated. |
697 | for x in &mut buffer[first_sig..] { |
698 | if *x == b'9'{ |
699 | first_sig += 1; |
700 | } else { |
701 | *x += 1; |
702 | break; |
703 | } |
704 | } |
705 | } |
706 | |
707 | exp += first_sig as ExpInt; |
708 | buffer.drain(..first_sig); |
709 | |
710 | // If we carried through, we have exactly one digit of precision. |
711 | if buffer.is_empty() { |
712 | buffer.push(b'1'); |
713 | } |
714 | } |
715 | |
716 | let digits = buffer.len(); |
717 | |
718 | // Check whether we should use scientific notation. |
719 | let scientific = if width == 0 { |
720 | true |
721 | } else { |
722 | if exp >= 0 { |
723 | // 765e3 --> 765000 |
724 | // ^^^ |
725 | // But we shouldn't make the number look more precise than it is. |
726 | exp as usize > width || digits + exp as usize > precision |
727 | } else { |
728 | // Power of the most significant digit. |
729 | let msd = exp + (digits - 1) as ExpInt; |
730 | if msd >= 0 { |
731 | // 765e-2 == 7.65 |
732 | false |
733 | } else { |
734 | // 765e-5 == 0.00765 |
735 | // ^ ^^ |
736 | -msd as usize > width |
737 | } |
738 | } |
739 | }; |
740 | |
741 | // Scientific formatting is pretty straightforward. |
742 | if scientific { |
743 | exp += digits as ExpInt - 1; |
744 | |
745 | f.write_char(buffer[digits - 1] as char)?; |
746 | f.write_char('.')?; |
747 | let truncate_zero = !alternate; |
748 | if digits == 1 && truncate_zero { |
749 | f.write_char('0')?; |
750 | } else { |
751 | for &d in buffer[..digits - 1].iter().rev() { |
752 | f.write_char(d as char)?; |
753 | } |
754 | } |
755 | // Fill with zeros up to precision. |
756 | if !truncate_zero && precision > digits - 1 { |
757 | for _ in 0..precision - digits + 1 { |
758 | f.write_char('0')?; |
759 | } |
760 | } |
761 | // For alternate we use lower 'e'. |
762 | f.write_char(if alternate { 'e'} else { 'E'})?; |
763 | |
764 | // Exponent always at least two digits if we do not truncate zeros. |
765 | if truncate_zero { |
766 | write!(f, "{:+} ", exp)?; |
767 | } else { |
768 | write!(f, "{:+03} ", exp)?; |
769 | } |
770 | |
771 | return Ok(()); |
772 | } |
773 | |
774 | // Non-scientific, positive exponents. |
775 | if exp >= 0 { |
776 | for &d in buffer.iter().rev() { |
777 | f.write_char(d as char)?; |
778 | } |
779 | for _ in 0..exp { |
780 | f.write_char('0')?; |
781 | } |
782 | return Ok(()); |
783 | } |
784 | |
785 | // Non-scientific, negative exponents. |
786 | let unit_place = -exp as usize; |
787 | if unit_place < digits { |
788 | for &d in buffer[unit_place..].iter().rev() { |
789 | f.write_char(d as char)?; |
790 | } |
791 | f.write_char('.')?; |
792 | for &d in buffer[..unit_place].iter().rev() { |
793 | f.write_char(d as char)?; |
794 | } |
795 | } else { |
796 | f.write_str("0.")?; |
797 | for _ in digits..unit_place { |
798 | f.write_char('0')?; |
799 | } |
800 | for &d in buffer.iter().rev() { |
801 | f.write_char(d as char)?; |
802 | } |
803 | } |
804 | |
805 | Ok(()) |
806 | } |
807 | } |
808 | |
809 | impl<S: Semantics> fmt::Debug for IeeeFloat<S> { |
810 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
811 | write!( |
812 | f, |
813 | "{} ({:?} |{}{:?} * 2^{} )", |
814 | self, |
815 | self.category(), |
816 | if self.is_negative() { "-"} else { "+"}, |
817 | self.sig, |
818 | self.exp |
819 | ) |
820 | } |
821 | } |
822 | |
823 | // HACK(eddyb) this logic is duplicated throughout the original C++ code, |
824 | // but it's a bit too long to keep repeating in the Rust port for all ops. |
825 | // FIXME(eddyb) find a better name/organization for all of this functionality |
826 | // (`IeeeDefaultExceptionHandling` doesn't have a counterpart in the C++ code). |
827 | struct IeeeDefaultExceptionHandling; |
828 | impl IeeeDefaultExceptionHandling { |
829 | fn result_from_nan<S: Semantics>(mut r: IeeeFloat<S>) -> StatusAnd<IeeeFloat<S>> { |
830 | assert!(r.is_nan()); |
831 | |
832 | let status = if r.is_signaling() { |
833 | // [IEEE Std 754-2008 6.2]: |
834 | // Under default exception handling, any operation signaling an invalid |
835 | // operation exception and for which a floating-point result is to be |
836 | // delivered shall deliver a quiet NaN. |
837 | let [sig] = &mut r.sig; |
838 | *sig |= if S::QNAN_SIGNIFICAND == X87DoubleExtendedS::QNAN_SIGNIFICAND { |
839 | // HACK(eddyb) remain bug-compatible with the original C++ code |
840 | // which doesn't appear to attempt avoiding creating pseudo-NaNs |
841 | // (see https://github.com/llvm/llvm-project/issues/63938). |
842 | S::QNAN_SIGNIFICAND & S::NAN_PAYLOAD_MASK |
843 | } else { |
844 | S::QNAN_SIGNIFICAND |
845 | }; |
846 | |
847 | // [IEEE Std 754-2008 6.2]: |
848 | // Signaling NaNs shall be reserved operands that, under default exception |
849 | // handling, signal the invalid operation exception(see 7.2) for every |
850 | // general-computational and signaling-computational operation except for |
851 | // the conversions described in 5.12. |
852 | Status::INVALID_OP |
853 | } else { |
854 | // [IEEE Std 754-2008 6.2]: |
855 | // For an operation with quiet NaN inputs, other than maximum and minimum |
856 | // operations, if a floating-point result is to be delivered the result |
857 | // shall be a quiet NaN which should be one of the input NaNs. |
858 | // ... |
859 | // Every general-computational and quiet-computational operation involving |
860 | // one or more input NaNs, none of them signaling, shall signal no |
861 | // exception, except fusedMultiplyAdd might signal the invalid operation |
862 | // exception(see 7.2). |
863 | Status::OK |
864 | }; |
865 | status.and(r) |
866 | } |
867 | |
868 | fn binop_result_from_either_nan<S: Semantics>(a: IeeeFloat<S>, b: IeeeFloat<S>) -> StatusAnd<IeeeFloat<S>> { |
869 | let r = match (a.category(), b.category()) { |
870 | (Category::NaN, _) => a, |
871 | (_, Category::NaN) => b, |
872 | _ => unreachable!(), |
873 | }; |
874 | let mut status_and_r = Self::result_from_nan(r); |
875 | if b.is_signaling() { |
876 | status_and_r.status |= Status::INVALID_OP; |
877 | } |
878 | status_and_r |
879 | } |
880 | } |
881 | |
882 | impl<S: Semantics> IeeeFloat<S> { |
883 | // HACK(eddyb) allow `Self::qnan` to be used from `IeeeFloat::NAN`. |
884 | // FIXME(eddyb) move back to the trait impl when that can be `const fn`. |
885 | const fn qnan(payload: Option<u128>) -> Self { |
886 | let sig = [S::NAN_SIGNIFICAND_BASE |
887 | | S::QNAN_SIGNIFICAND |
888 | | match payload { |
889 | // Zero out the excess bits of the significand. |
890 | Some(payload) => payload & S::NAN_PAYLOAD_MASK, |
891 | None => 0, |
892 | }]; |
893 | |
894 | let exp = match S::NONFINITE_BEHAVIOR { |
895 | NonfiniteBehavior::IEEE754 => S::MAX_EXP + 1, |
896 | NonfiniteBehavior::NanOnly => S::MAX_EXP, |
897 | }; |
898 | |
899 | IeeeFloat { |
900 | sig, |
901 | exp, |
902 | read_only_category_do_not_mutate: Category::NaN, |
903 | read_only_sign_do_not_mutate: false, |
904 | marker: PhantomData, |
905 | } |
906 | } |
907 | } |
908 | |
909 | impl<S: Semantics> Float for IeeeFloat<S> { |
910 | const BITS: usize = S::BITS; |
911 | const PRECISION: usize = S::PRECISION; |
912 | const MAX_EXP: ExpInt = S::MAX_EXP; |
913 | const MIN_EXP: ExpInt = S::MIN_EXP; |
914 | |
915 | const ZERO: Self = IeeeFloat { |
916 | sig: [0], |
917 | exp: S::MIN_EXP - 1, |
918 | read_only_category_do_not_mutate: Category::Zero, |
919 | read_only_sign_do_not_mutate: false, |
920 | marker: PhantomData, |
921 | }; |
922 | |
923 | const INFINITY: Self = match S::NONFINITE_BEHAVIOR { |
924 | NonfiniteBehavior::IEEE754 => IeeeFloat { |
925 | sig: [0], |
926 | exp: S::MAX_EXP + 1, |
927 | read_only_category_do_not_mutate: Category::Infinity, |
928 | read_only_sign_do_not_mutate: false, |
929 | marker: PhantomData, |
930 | }, |
931 | |
932 | // There is no Inf, so make NaN instead. |
933 | NonfiniteBehavior::NanOnly => Self::NAN, |
934 | }; |
935 | |
936 | const NAN: Self = Self::qnan(None); |
937 | |
938 | fn qnan(payload: Option<u128>) -> Self { |
939 | // NOTE(eddyb) this is not a recursive self-call, but rather it calls |
940 | // the `const fn` inherent mehtod (see above). |
941 | Self::qnan(payload) |
942 | } |
943 | |
944 | fn snan(payload: Option<u128>) -> Self { |
945 | let mut snan = Self::qnan(payload); |
946 | |
947 | let [sig] = &mut snan.sig; |
948 | |
949 | // We always have to clear the QNaN bit to make it an SNaN. |
950 | *sig &= !(S::QNAN_SIGNIFICAND & S::NAN_PAYLOAD_MASK); |
951 | |
952 | // If there are no bits set in the payload, we have to set |
953 | // *something* to make it a NaN instead of an infinity; |
954 | // conventionally, this is the next bit down from the QNaN bit. |
955 | if *sig & S::NAN_PAYLOAD_MASK == 0 { |
956 | *sig |= (S::QNAN_SIGNIFICAND & S::NAN_PAYLOAD_MASK) >> 1; |
957 | } |
958 | |
959 | snan |
960 | } |
961 | |
962 | fn largest() -> Self { |
963 | // We want (in interchange format): |
964 | // exponent = 1..10 |
965 | // significand = 1..1 |
966 | IeeeFloat { |
967 | sig: [((1 << S::PRECISION) - 1) |
968 | & match S::NONFINITE_BEHAVIOR { |
969 | // The largest number by magnitude in our format will be the floating point |
970 | // number with maximum exponent and with significand that is all ones. |
971 | NonfiniteBehavior::IEEE754 => !0, |
972 | |
973 | // The largest number by magnitude in our format will be the floating point |
974 | // number with maximum exponent and with significand that is all ones except |
975 | // the LSB. |
976 | NonfiniteBehavior::NanOnly => !1, |
977 | }], |
978 | exp: S::MAX_EXP, |
979 | read_only_category_do_not_mutate: Category::Normal, |
980 | read_only_sign_do_not_mutate: false, |
981 | marker: PhantomData, |
982 | } |
983 | } |
984 | |
985 | // We want (in interchange format): |
986 | // exponent = 0..0 |
987 | // significand = 0..01 |
988 | const SMALLEST: Self = IeeeFloat { |
989 | sig: [1], |
990 | exp: S::MIN_EXP, |
991 | read_only_category_do_not_mutate: Category::Normal, |
992 | read_only_sign_do_not_mutate: false, |
993 | marker: PhantomData, |
994 | }; |
995 | |
996 | fn smallest_normalized() -> Self { |
997 | // We want (in interchange format): |
998 | // exponent = 0..0 |
999 | // significand = 10..0 |
1000 | IeeeFloat { |
1001 | sig: [1 << (S::PRECISION - 1)], |
1002 | exp: S::MIN_EXP, |
1003 | read_only_category_do_not_mutate: Category::Normal, |
1004 | read_only_sign_do_not_mutate: false, |
1005 | marker: PhantomData, |
1006 | } |
1007 | } |
1008 | |
1009 | fn add_r(mut self, rhs: Self, round: Round) -> StatusAnd<Self> { |
1010 | let status = match (self.category(), rhs.category()) { |
1011 | (Category::NaN, _) | (_, Category::NaN) => { |
1012 | return IeeeDefaultExceptionHandling::binop_result_from_either_nan(self, rhs); |
1013 | } |
1014 | |
1015 | (Category::Infinity, Category::Infinity) => { |
1016 | // Differently signed infinities can only be validly |
1017 | // subtracted. |
1018 | if self.is_negative() != rhs.is_negative() { |
1019 | self = Self::NAN; |
1020 | Status::INVALID_OP |
1021 | } else { |
1022 | Status::OK |
1023 | } |
1024 | } |
1025 | |
1026 | // Sign may depend on rounding mode; handled below. |
1027 | (_, Category::Zero) | (Category::Infinity, Category::Normal) => Status::OK, |
1028 | |
1029 | (Category::Zero, _) | (_, Category::Infinity) => { |
1030 | self = rhs; |
1031 | Status::OK |
1032 | } |
1033 | |
1034 | (Category::Normal, Category::Normal) => { |
1035 | let mut sign = self.is_negative(); |
1036 | let loss = sig::add_or_sub( |
1037 | &mut self.sig, |
1038 | &mut self.exp, |
1039 | &mut sign, |
1040 | &mut [rhs.sig[0]], |
1041 | rhs.exp, |
1042 | rhs.is_negative(), |
1043 | ); |
1044 | self = self.with_sign(sign); |
1045 | |
1046 | let status; |
1047 | self = unpack!(status=, self.normalize(round, loss)); |
1048 | |
1049 | // Can only be zero if we lost no fraction. |
1050 | assert!(self.category() != Category::Zero || loss == Loss::ExactlyZero); |
1051 | |
1052 | status |
1053 | } |
1054 | }; |
1055 | |
1056 | // If two numbers add (exactly) to zero, IEEE 754 decrees it is a |
1057 | // positive zero unless rounding to minus infinity, except that |
1058 | // adding two like-signed zeroes gives that zero. |
1059 | if self.category() == Category::Zero |
1060 | && (rhs.category() != Category::Zero || self.is_negative() != rhs.is_negative()) |
1061 | { |
1062 | self = self.with_sign(round == Round::TowardNegative); |
1063 | } |
1064 | |
1065 | status.and(self) |
1066 | } |
1067 | |
1068 | // NOTE(eddyb) we can't rely on the `sub_r` method default implementation |
1069 | // because NaN handling needs the original `rhs` (i.e. without negation). |
1070 | fn sub_r(self, rhs: Self, round: Round) -> StatusAnd<Self> { |
1071 | match (self.category(), rhs.category()) { |
1072 | (Category::NaN, _) | (_, Category::NaN) => { |
1073 | IeeeDefaultExceptionHandling::binop_result_from_either_nan(self, rhs) |
1074 | } |
1075 | |
1076 | _ => self.add_r(-rhs, round), |
1077 | } |
1078 | } |
1079 | |
1080 | fn mul_r(mut self, rhs: Self, round: Round) -> StatusAnd<Self> { |
1081 | self = self.negate_if(rhs.is_negative()); |
1082 | |
1083 | match (self.category(), rhs.category()) { |
1084 | (Category::NaN, _) | (_, Category::NaN) => { |
1085 | self = self.negate_if(rhs.is_negative()); // restore the original sign |
1086 | |
1087 | IeeeDefaultExceptionHandling::binop_result_from_either_nan(self, rhs) |
1088 | } |
1089 | |
1090 | (Category::Zero, Category::Infinity) | (Category::Infinity, Category::Zero) => { |
1091 | Status::INVALID_OP.and(Self::NAN) |
1092 | } |
1093 | |
1094 | (Category::Infinity, _) | (_, Category::Infinity) => Status::OK.and(Self::INFINITY.copy_sign(self)), |
1095 | |
1096 | (Category::Zero, _) | (_, Category::Zero) => Status::OK.and(Self::ZERO.copy_sign(self)), |
1097 | |
1098 | (Category::Normal, Category::Normal) => { |
1099 | self.exp += rhs.exp; |
1100 | let mut wide_sig = [0; 2]; |
1101 | let loss = sig::mul(&mut wide_sig, &mut self.exp, &self.sig, &rhs.sig, S::PRECISION); |
1102 | self.sig = [wide_sig[0]]; |
1103 | let mut status; |
1104 | self = unpack!(status=, self.normalize(round, loss)); |
1105 | if loss != Loss::ExactlyZero { |
1106 | status |= Status::INEXACT; |
1107 | } |
1108 | status.and(self) |
1109 | } |
1110 | } |
1111 | } |
1112 | |
1113 | fn mul_add_r(mut self, multiplicand: Self, addend: Self, round: Round) -> StatusAnd<Self> { |
1114 | // If and only if all arguments are normal do we need to do an |
1115 | // extended-precision calculation. |
1116 | if !self.is_finite_non_zero() || !multiplicand.is_finite_non_zero() || !addend.is_finite() { |
1117 | let mut status; |
1118 | if self.is_finite_non_zero() && multiplicand.is_finite_non_zero() { |
1119 | // HACK(eddyb) this corresponds to the case where the C++ code |
1120 | // (`multiplySpecials`) is effectively a noop: no multiplication |
1121 | // is actually performed, because we're really only here to handle |
1122 | // the "infinite/NaN `addend`" special-case, which needs to ignore |
1123 | // the "finite * finite" multiplication entirely, instead of letting |
1124 | // it e.g. overflow into infinity (and trample over `addend`). |
1125 | assert!(!addend.is_finite()); |
1126 | status = Status::OK; |
1127 | } else { |
1128 | self = unpack!(status=, self.mul_r(multiplicand, round)); |
1129 | } |
1130 | |
1131 | // FS can only be Status::OK or Status::INVALID_OP. There is no more work |
1132 | // to do in the latter case. The IEEE-754R standard says it is |
1133 | // implementation-defined in this case whether, if ADDEND is a |
1134 | // quiet NaN, we raise invalid op; this implementation does so. |
1135 | // |
1136 | // If we need to do the addition we can do so with normal |
1137 | // precision. |
1138 | if status == Status::OK { |
1139 | self = unpack!(status=, self.add_r(addend, round)); |
1140 | } |
1141 | return status.and(self); |
1142 | } |
1143 | |
1144 | // Post-multiplication sign, before addition. |
1145 | self = self.negate_if(multiplicand.is_negative()); |
1146 | |
1147 | // Allocate space for twice as many bits as the original significand, plus one |
1148 | // extra bit for the addition to overflow into. |
1149 | assert!(limbs_for_bits(S::PRECISION * 2 + 1) <= 2); |
1150 | let mut wide_sig = sig::widening_mul(self.sig[0], multiplicand.sig[0]); |
1151 | |
1152 | let mut loss = Loss::ExactlyZero; |
1153 | let mut omsb = sig::omsb(&wide_sig); |
1154 | self.exp += multiplicand.exp; |
1155 | |
1156 | // Assume the operands involved in the multiplication are single-precision |
1157 | // FP, and the two multiplicants are: |
1158 | // lhs = a23 . a22 ... a0 * 2^e1 |
1159 | // rhs = b23 . b22 ... b0 * 2^e2 |
1160 | // the result of multiplication is: |
1161 | // lhs = c48 c47 c46 . c45 ... c0 * 2^(e1+e2) |
1162 | // Note that there are three significant bits at the left-hand side of the |
1163 | // radix point: two for the multiplication, and an overflow bit for the |
1164 | // addition (that will always be zero at this point). Move the radix point |
1165 | // toward left by two bits, and adjust exponent accordingly. |
1166 | self.exp += 2; |
1167 | |
1168 | if addend.is_non_zero() { |
1169 | // Normalize our MSB to one below the top bit to allow for overflow. |
1170 | let ext_precision = 2 * S::PRECISION + 1; |
1171 | if omsb != ext_precision - 1 { |
1172 | assert!(ext_precision > omsb); |
1173 | sig::shift_left(&mut wide_sig, &mut self.exp, (ext_precision - 1) - omsb); |
1174 | } |
1175 | |
1176 | // The intermediate result of the multiplication has "2 * S::PRECISION" |
1177 | // signicant bit; adjust the addend to be consistent with mul result. |
1178 | let mut ext_addend_sig = [addend.sig[0], 0]; |
1179 | |
1180 | // Extend the addend significand to ext_precision - 1. This guarantees |
1181 | // that the high bit of the significand is zero (same as wide_sig), |
1182 | // so the addition will overflow (if it does overflow at all) into the top bit. |
1183 | sig::shift_left(&mut ext_addend_sig, &mut 0, ext_precision - 1 - S::PRECISION); |
1184 | |
1185 | let mut sign = self.is_negative(); |
1186 | loss = sig::add_or_sub( |
1187 | &mut wide_sig, |
1188 | &mut self.exp, |
1189 | &mut sign, |
1190 | &mut ext_addend_sig, |
1191 | addend.exp + 1, |
1192 | addend.is_negative(), |
1193 | ); |
1194 | self = self.with_sign(sign); |
1195 | |
1196 | omsb = sig::omsb(&wide_sig); |
1197 | } |
1198 | |
1199 | // Convert the result having "2 * S::PRECISION" significant-bits back to the one |
1200 | // having "S::PRECISION" significant-bits. First, move the radix point from |
1201 | // poision "2*S::PRECISION - 1" to "S::PRECISION - 1". The exponent need to be |
1202 | // adjusted by "2*S::PRECISION - 1" - "S::PRECISION - 1" = "S::PRECISION". |
1203 | self.exp -= S::PRECISION as ExpInt + 1; |
1204 | |
1205 | // In case MSB resides at the left-hand side of radix point, shift the |
1206 | // mantissa right by some amount to make sure the MSB reside right before |
1207 | // the radix point (i.e. "MSB . rest-significant-bits"). |
1208 | if omsb > S::PRECISION { |
1209 | let bits = omsb - S::PRECISION; |
1210 | loss = sig::shift_right(&mut wide_sig, &mut self.exp, bits).combine(loss); |
1211 | } |
1212 | |
1213 | self.sig[0] = wide_sig[0]; |
1214 | |
1215 | let mut status; |
1216 | self = unpack!(status=, self.normalize(round, loss)); |
1217 | if loss != Loss::ExactlyZero { |
1218 | status |= Status::INEXACT; |
1219 | } |
1220 | |
1221 | // If two numbers add (exactly) to zero, IEEE 754 decrees it is a |
1222 | // positive zero unless rounding to minus infinity, except that |
1223 | // adding two like-signed zeroes gives that zero. |
1224 | if self.category() == Category::Zero |
1225 | && !status.intersects(Status::UNDERFLOW) |
1226 | && self.is_negative() != addend.is_negative() |
1227 | { |
1228 | self = self.with_sign(round == Round::TowardNegative); |
1229 | } |
1230 | |
1231 | status.and(self) |
1232 | } |
1233 | |
1234 | fn div_r(mut self, rhs: Self, round: Round) -> StatusAnd<Self> { |
1235 | self = self.negate_if(rhs.is_negative()); |
1236 | |
1237 | match (self.category(), rhs.category()) { |
1238 | (Category::NaN, _) | (_, Category::NaN) => { |
1239 | self = self.negate_if(rhs.is_negative()); // restore the original sign |
1240 | |
1241 | IeeeDefaultExceptionHandling::binop_result_from_either_nan(self, rhs) |
1242 | } |
1243 | |
1244 | (Category::Infinity, Category::Infinity) | (Category::Zero, Category::Zero) => { |
1245 | Status::INVALID_OP.and(Self::NAN) |
1246 | } |
1247 | |
1248 | (Category::Infinity | Category::Zero, _) => Status::OK.and(self), |
1249 | |
1250 | (_, Category::Infinity) => Status::OK.and(Self::ZERO.copy_sign(self)), |
1251 | |
1252 | (_, Category::Zero) => Status::DIV_BY_ZERO.and(Self::INFINITY.copy_sign(self)), |
1253 | |
1254 | (Category::Normal, Category::Normal) => { |
1255 | self.exp -= rhs.exp; |
1256 | let dividend = self.sig[0]; |
1257 | let loss = sig::div(&mut self.sig, &mut self.exp, &mut [dividend], &mut [rhs.sig[0]], S::PRECISION); |
1258 | let mut status; |
1259 | self = unpack!(status=, self.normalize(round, loss)); |
1260 | if loss != Loss::ExactlyZero { |
1261 | status |= Status::INEXACT; |
1262 | } |
1263 | status.and(self) |
1264 | } |
1265 | } |
1266 | } |
1267 | |
1268 | fn ieee_rem(self, rhs: Self) -> StatusAnd<Self> { |
1269 | match (self.category(), rhs.category()) { |
1270 | (Category::NaN, _) | (_, Category::NaN) => { |
1271 | IeeeDefaultExceptionHandling::binop_result_from_either_nan(self, rhs) |
1272 | } |
1273 | |
1274 | (Category::Infinity, _) | (_, Category::Zero) => Status::INVALID_OP.and(Self::NAN), |
1275 | |
1276 | (Category::Zero, _) | (_, Category::Infinity) => Status::OK.and(self), |
1277 | |
1278 | (Category::Normal, Category::Normal) => { |
1279 | let mut status; |
1280 | |
1281 | let mut x = self; |
1282 | let mut p = rhs; |
1283 | |
1284 | // Make sure the current value is less than twice the denom. If the addition |
1285 | // did not succeed (an overflow has happened), which means that the finite |
1286 | // value we currently posses must be less than twice the denom (as we are |
1287 | // using the same semantics). |
1288 | let p2 = unpack!(status=, p + p); |
1289 | if status == Status::OK { |
1290 | x = unpack!(status=, x.c_fmod(p2)); |
1291 | assert_eq!(status, Status::OK); |
1292 | } |
1293 | |
1294 | // Lets work with absolute numbers. |
1295 | p = p.abs(); |
1296 | x = x.abs(); |
1297 | |
1298 | // |
1299 | // To calculate the remainder we use the following scheme. |
1300 | // |
1301 | // The remainder is defained as follows: |
1302 | // |
1303 | // remainder = numer - rquot * denom = x - r * p |
1304 | // |
1305 | // Where r is the result of: x/p, rounded toward the nearest integral value |
1306 | // (with halfway cases rounded toward the even number). |
1307 | // |
1308 | // Currently, (after x mod 2p): |
1309 | // r is the number of 2p's present inside x, which is inherently, an even |
1310 | // number of p's. |
1311 | // |
1312 | // We may split the remaining calculation into 4 options: |
1313 | // - if x < 0.5p then we round to the nearest number with is 0, and are done. |
1314 | // - if x == 0.5p then we round to the nearest even number which is 0, and we |
1315 | // are done as well. |
1316 | // - if 0.5p < x < p then we round to nearest number which is 1, and we have |
1317 | // to subtract 1p at least once. |
1318 | // - if x >= p then we must subtract p at least once, as x must be a |
1319 | // remainder. |
1320 | // |
1321 | // By now, we were done, or we added 1 to r, which in turn, now an odd number. |
1322 | // |
1323 | // We can now split the remaining calculation to the following 3 options: |
1324 | // - if x < 0.5p then we round to the nearest number with is 0, and are done. |
1325 | // - if x == 0.5p then we round to the nearest even number. As r is odd, we |
1326 | // must round up to the next even number. so we must subtract p once more. |
1327 | // - if x > 0.5p (and inherently x < p) then we must round r up to the next |
1328 | // integral, and subtract p once more. |
1329 | // |
1330 | |
1331 | // Return `x * 2` at no loss of precision (i.e. no overflow). |
1332 | // |
1333 | // HACK(eddyb) this may seem a bit sketchy because it can return |
1334 | // values that `normalize` would've replaced with `overflow_result` |
1335 | // (e.g. overflowing to infinity), but the result is only used for |
1336 | // comparisons, where both sides of such comparison can be seen |
1337 | // as transiently having a larger *effective* exponent range. |
1338 | let lossless_2x = |mut x: Self| { |
1339 | x.exp += 1; |
1340 | |
1341 | if x.exp >= Self::MAX_EXP { |
1342 | // HACK(eddyb) skip lossy `normalize` (see above). |
1343 | } else { |
1344 | let status; |
1345 | x = unpack!(status=, x.normalize(Round::NearestTiesToEven, Loss::ExactlyZero)); |
1346 | assert_eq!(status, Status::OK); |
1347 | } |
1348 | |
1349 | x |
1350 | }; |
1351 | |
1352 | if lossless_2x(x) > p { |
1353 | x = unpack!(status=, x - p); |
1354 | assert_eq!(status, Status::OK); |
1355 | |
1356 | if lossless_2x(x) >= p { |
1357 | x = unpack!(status=, x - p); |
1358 | assert_eq!(status, Status::OK); |
1359 | } |
1360 | } |
1361 | |
1362 | if x.is_zero() { |
1363 | Status::OK.and(x.copy_sign(self)) // IEEE754 requires this |
1364 | } else { |
1365 | Status::OK.and(x.negate_if(self.is_negative())) |
1366 | } |
1367 | } |
1368 | } |
1369 | } |
1370 | |
1371 | fn c_fmod(mut self, rhs: Self) -> StatusAnd<Self> { |
1372 | match (self.category(), rhs.category()) { |
1373 | (Category::NaN, _) | (_, Category::NaN) => { |
1374 | IeeeDefaultExceptionHandling::binop_result_from_either_nan(self, rhs) |
1375 | } |
1376 | |
1377 | (Category::Infinity, _) | (_, Category::Zero) => Status::INVALID_OP.and(Self::NAN), |
1378 | |
1379 | (Category::Zero, _) | (_, Category::Infinity) => Status::OK.and(self), |
1380 | |
1381 | (Category::Normal, Category::Normal) => { |
1382 | let orig = self; |
1383 | |
1384 | while self.is_finite_non_zero() |
1385 | && rhs.is_finite_non_zero() |
1386 | && self.cmp_abs_normal(rhs) != Ordering::Less |
1387 | { |
1388 | let exp = self.ilogb() - rhs.ilogb(); |
1389 | let mut v = rhs.scalbn(exp); |
1390 | // `v` can overflow to NaN with `NonfiniteBehavior::NanOnly`, so explicitly |
1391 | // check for it. |
1392 | if v.is_nan() || self.cmp_abs_normal(v) == Ordering::Less { |
1393 | v = rhs.scalbn(exp - 1); |
1394 | } |
1395 | v = v.copy_sign(self); |
1396 | |
1397 | let status; |
1398 | self = unpack!(status=, self - v); |
1399 | assert_eq!(status, Status::OK); |
1400 | } |
1401 | if self.is_zero() { |
1402 | self = self.copy_sign(orig); |
1403 | } |
1404 | Status::OK.and(self) |
1405 | } |
1406 | } |
1407 | } |
1408 | |
1409 | fn round_to_integral(self, round: Round) -> StatusAnd<Self> { |
1410 | match self.category() { |
1411 | Category::NaN => IeeeDefaultExceptionHandling::result_from_nan(self), |
1412 | |
1413 | // [IEEE Std 754-2008 6.1]: |
1414 | // The behavior of infinity in floating-point arithmetic is derived from the |
1415 | // limiting cases of real arithmetic with operands of arbitrarily |
1416 | // large magnitude, when such a limit exists. |
1417 | // ... |
1418 | // Operations on infinite operands are usually exact and therefore signal no |
1419 | // exceptions ... |
1420 | Category::Infinity => Status::OK.and(self), |
1421 | |
1422 | // [IEEE Std 754-2008 6.3]: |
1423 | // ... the sign of the result of conversions, the quantize operation, the |
1424 | // roundToIntegral operations, and the roundToIntegralExact(see 5.3.1) is |
1425 | // the sign of the first or only operand. |
1426 | Category::Zero => Status::OK.and(self), |
1427 | |
1428 | Category::Normal => { |
1429 | // If the exponent is large enough, we know that this value is already |
1430 | // integral, and the arithmetic below would potentially cause it to saturate |
1431 | // to +/-Inf. Bail out early instead. |
1432 | if self.exp + 1 >= S::PRECISION as ExpInt { |
1433 | return Status::OK.and(self); |
1434 | } |
1435 | |
1436 | // The algorithm here is quite simple: we add 2^(p-1), where p is the |
1437 | // precision of our format, and then subtract it back off again. The choice |
1438 | // of rounding modes for the addition/subtraction determines the rounding mode |
1439 | // for our integral rounding as well. |
1440 | // NOTE: When the input value is negative, we do subtraction followed by |
1441 | // addition instead. |
1442 | assert!(S::PRECISION <= 128); |
1443 | let mut status; |
1444 | let magic_const = unpack!(status=, Self::from_u128(1 << (S::PRECISION - 1))); |
1445 | assert_eq!(status, Status::OK); |
1446 | let magic_const = magic_const.copy_sign(self); |
1447 | |
1448 | let mut r = self; |
1449 | r = unpack!(status=, r.add_r(magic_const, round)); |
1450 | |
1451 | // Current value and 'MagicConstant' are both integers, so the result of the |
1452 | // subtraction is always exact according to Sterbenz' lemma. |
1453 | r = r.sub_r(magic_const, round).value; |
1454 | |
1455 | // Restore the input sign to handle the case of zero result |
1456 | // correctly. |
1457 | status.and(r.copy_sign(self)) |
1458 | } |
1459 | } |
1460 | } |
1461 | |
1462 | fn next_up(mut self) -> StatusAnd<Self> { |
1463 | // Compute nextUp(x), handling each float category separately. |
1464 | match self.category() { |
1465 | Category::Infinity => { |
1466 | if self.is_negative() { |
1467 | // nextUp(-inf) = -largest |
1468 | Status::OK.and(-Self::largest()) |
1469 | } else { |
1470 | // nextUp(+inf) = +inf |
1471 | Status::OK.and(self) |
1472 | } |
1473 | } |
1474 | Category::NaN => { |
1475 | // IEEE-754R 2008 6.2 Par 2: nextUp(sNaN) = qNaN. Set Invalid flag. |
1476 | // IEEE-754R 2008 6.2: nextUp(qNaN) = qNaN. Must be identity so we do not |
1477 | // change the payload. |
1478 | if self.is_signaling() { |
1479 | // For consistency, propagate the sign of the sNaN to the qNaN. |
1480 | Status::INVALID_OP.and(Self::NAN.copy_sign(self)) |
1481 | } else { |
1482 | Status::OK.and(self) |
1483 | } |
1484 | } |
1485 | Category::Zero => { |
1486 | // nextUp(pm 0) = +smallest |
1487 | Status::OK.and(Self::SMALLEST) |
1488 | } |
1489 | Category::Normal => { |
1490 | // nextUp(-smallest) = -0 |
1491 | if self.is_smallest() && self.is_negative() { |
1492 | return Status::OK.and(-Self::ZERO); |
1493 | } |
1494 | |
1495 | // nextUp(largest) == INFINITY |
1496 | if self.is_largest() && !self.is_negative() { |
1497 | return Status::OK.and(Self::INFINITY); |
1498 | } |
1499 | |
1500 | // Excluding the integral bit. This allows us to test for binade boundaries. |
1501 | let sig_mask = (1 << (S::PRECISION - 1)) - 1; |
1502 | |
1503 | // nextUp(normal) == normal + inc. |
1504 | if self.is_negative() { |
1505 | // If we are negative, we need to decrement the significand. |
1506 | |
1507 | // We only cross a binade boundary that requires adjusting the exponent |
1508 | // if: |
1509 | // 1. exponent != S::MIN_EXP. This implies we are not in the |
1510 | // smallest binade or are dealing with denormals. |
1511 | // 2. Our significand excluding the integral bit is all zeros. |
1512 | let crossing_binade_boundary = self.exp != S::MIN_EXP && self.sig[0] & sig_mask == 0; |
1513 | |
1514 | // Decrement the significand. |
1515 | // |
1516 | // We always do this since: |
1517 | // 1. If we are dealing with a non-binade decrement, by definition we |
1518 | // just decrement the significand. |
1519 | // 2. If we are dealing with a normal -> normal binade decrement, since |
1520 | // we have an explicit integral bit the fact that all bits but the |
1521 | // integral bit are zero implies that subtracting one will yield a |
1522 | // significand with 0 integral bit and 1 in all other spots. Thus we |
1523 | // must just adjust the exponent and set the integral bit to 1. |
1524 | // 3. If we are dealing with a normal -> denormal binade decrement, |
1525 | // since we set the integral bit to 0 when we represent denormals, we |
1526 | // just decrement the significand. |
1527 | sig::decrement(&mut self.sig); |
1528 | |
1529 | if crossing_binade_boundary { |
1530 | // Our result is a normal number. Do the following: |
1531 | // 1. Set the integral bit to 1. |
1532 | // 2. Decrement the exponent. |
1533 | sig::set_bit(&mut self.sig, S::PRECISION - 1); |
1534 | self.exp -= 1; |
1535 | } |
1536 | } else { |
1537 | // If we are positive, we need to increment the significand. |
1538 | |
1539 | // We only cross a binade boundary that requires adjusting the exponent if |
1540 | // the input is not a denormal and all of said input's significand bits |
1541 | // are set. If all of said conditions are true: clear the significand, set |
1542 | // the integral bit to 1, and increment the exponent. If we have a |
1543 | // denormal always increment since moving denormals and the numbers in the |
1544 | // smallest normal binade have the same exponent in our representation. |
1545 | let crossing_binade_boundary = !self.is_denormal() && self.sig[0] & sig_mask == sig_mask; |
1546 | |
1547 | if crossing_binade_boundary { |
1548 | self.sig = [0]; |
1549 | sig::set_bit(&mut self.sig, S::PRECISION - 1); |
1550 | assert_ne!( |
1551 | self.exp, |
1552 | S::MAX_EXP, |
1553 | "We can not increment an exponent beyond the MAX_EXP \ |
1554 | allowed by the given floating point semantics." |
1555 | ); |
1556 | self.exp += 1; |
1557 | } else { |
1558 | sig::increment(&mut self.sig); |
1559 | } |
1560 | } |
1561 | Status::OK.and(self) |
1562 | } |
1563 | } |
1564 | } |
1565 | |
1566 | fn from_bits(input: u128) -> Self { |
1567 | // Dispatch to semantics. |
1568 | S::from_bits(input) |
1569 | } |
1570 | |
1571 | fn from_u128_r(input: u128, round: Round) -> StatusAnd<Self> { |
1572 | IeeeFloat { |
1573 | sig: [input], |
1574 | exp: S::PRECISION as ExpInt - 1, |
1575 | read_only_category_do_not_mutate: Category::Normal, |
1576 | read_only_sign_do_not_mutate: false, |
1577 | marker: PhantomData, |
1578 | } |
1579 | .normalize(round, Loss::ExactlyZero) |
1580 | } |
1581 | |
1582 | fn from_str_r(s: &str, mut round: Round) -> Result<StatusAnd<Self>, ParseError> { |
1583 | if s.is_empty() { |
1584 | return Err(ParseError("Invalid string length")); |
1585 | } |
1586 | |
1587 | // Handle a leading minus sign. |
1588 | let (minus, s) = s.strip_prefix("-").map(|s| (true, s)).unwrap_or((false, s)); |
1589 | let from_abs = |r: Self| r.negate_if(minus); |
1590 | |
1591 | // Handle a leading plus sign (mutually exclusive with minus). |
1592 | let (explicit_plus, s) = s |
1593 | .strip_prefix("+") |
1594 | .filter(|_| !minus) |
1595 | .map(|s| (true, s)) |
1596 | .unwrap_or((false, s)); |
1597 | |
1598 | // Handle special cases. |
1599 | let special = match s { |
1600 | "Inf"if minus || explicit_plus => Some(Self::INFINITY), |
1601 | |
1602 | "inf"| "INFINITY"if !explicit_plus => Some(Self::INFINITY), |
1603 | |
1604 | _ if !explicit_plus => { |
1605 | // If we have a 's' (or 'S') prefix, then this is a Signaling NaN. |
1606 | let (is_signaling, s) = s.strip_prefix(['s', 'S']).map_or((false, s), |s| (true, s)); |
1607 | |
1608 | s.strip_prefix("nan").or_else(|| s.strip_prefix( "NaN")).and_then(|s| { |
1609 | // Allow the payload to be inside parentheses. |
1610 | let s = s |
1611 | .strip_prefix("(") |
1612 | .and_then(|s| { |
1613 | // Parentheses should be balanced (and not empty). |
1614 | s.strip_suffix(")").filter(|s| !s.is_empty()) |
1615 | }) |
1616 | .unwrap_or(s); |
1617 | |
1618 | let payload = if s.is_empty() { |
1619 | // A NaN without payload. |
1620 | None |
1621 | } else { |
1622 | // Determine the payload number's radix. |
1623 | let (radix, s) = s |
1624 | .strip_prefix("0") |
1625 | .filter(|s| !s.is_empty()) |
1626 | .map(|s| s.strip_prefix(['x', 'X']).map(|s| (16, s)).unwrap_or((8, s))) |
1627 | .unwrap_or((10, s)); |
1628 | |
1629 | // Parse the payload and make the NaN. |
1630 | Some(u128::from_str_radix(s, radix).ok()?) |
1631 | }; |
1632 | |
1633 | Some(if is_signaling { |
1634 | Self::snan(payload) |
1635 | } else { |
1636 | Self::qnan(payload) |
1637 | }) |
1638 | }) |
1639 | } |
1640 | |
1641 | _ => None, |
1642 | }; |
1643 | if let Some(r) = special { |
1644 | return Ok(Status::OK.and(from_abs(r))); |
1645 | } |
1646 | |
1647 | if s.is_empty() { |
1648 | return Err(ParseError("String has no digits")); |
1649 | } |
1650 | |
1651 | // Adjust the rounding mode for the absolute value below. |
1652 | round = round.negate_if(minus); |
1653 | |
1654 | let (is_hex, s) = s |
1655 | .strip_prefix("0") |
1656 | .and_then(|s| s.strip_prefix(['x', 'X'])) |
1657 | .map(|s| (true, s)) |
1658 | .unwrap_or((false, s)); |
1659 | |
1660 | let r = if is_hex { |
1661 | if s.is_empty() { |
1662 | return Err(ParseError("Invalid string")); |
1663 | } |
1664 | Self::from_hexadecimal_string(s, round)? |
1665 | } else { |
1666 | Self::from_decimal_string(s, round)? |
1667 | }; |
1668 | |
1669 | Ok(r.map(from_abs)) |
1670 | } |
1671 | |
1672 | fn to_bits(self) -> u128 { |
1673 | // Dispatch to semantics. |
1674 | S::to_bits(self) |
1675 | } |
1676 | |
1677 | fn to_u128_r(self, width: usize, round: Round, is_exact: &mut bool) -> StatusAnd<u128> { |
1678 | // The result of trying to convert a number too large. |
1679 | let overflow = if self.is_negative() { |
1680 | // Negative numbers cannot be represented as unsigned. |
1681 | 0 |
1682 | } else { |
1683 | // Largest unsigned integer of the given width. |
1684 | !0 >> (128 - width) |
1685 | }; |
1686 | |
1687 | *is_exact = false; |
1688 | |
1689 | match self.category() { |
1690 | Category::NaN => Status::INVALID_OP.and(0), |
1691 | |
1692 | Category::Infinity => Status::INVALID_OP.and(overflow), |
1693 | |
1694 | Category::Zero => { |
1695 | // Negative zero can't be represented as an int. |
1696 | *is_exact = !self.is_negative(); |
1697 | Status::OK.and(0) |
1698 | } |
1699 | |
1700 | Category::Normal => { |
1701 | let mut r = 0; |
1702 | |
1703 | // Step 1: place our absolute value, with any fraction truncated, in |
1704 | // the destination. |
1705 | let truncated_bits = if self.exp < 0 { |
1706 | // Our absolute value is less than one; truncate everything. |
1707 | // For exponent -1 the integer bit represents .5, look at that. |
1708 | // For smaller exponents leftmost truncated bit is 0. |
1709 | S::PRECISION - 1 + (-self.exp) as usize |
1710 | } else { |
1711 | // We want the most significant (exponent + 1) bits; the rest are |
1712 | // truncated. |
1713 | let bits = self.exp as usize + 1; |
1714 | |
1715 | // Hopelessly large in magnitude? |
1716 | if bits > width { |
1717 | return Status::INVALID_OP.and(overflow); |
1718 | } |
1719 | |
1720 | if bits < S::PRECISION { |
1721 | // We truncate (S::PRECISION - bits) bits. |
1722 | r = self.sig[0] >> (S::PRECISION - bits); |
1723 | S::PRECISION - bits |
1724 | } else { |
1725 | // We want at least as many bits as are available. |
1726 | r = self.sig[0] << (bits - S::PRECISION); |
1727 | 0 |
1728 | } |
1729 | }; |
1730 | |
1731 | // Step 2: work out any lost fraction, and increment the absolute |
1732 | // value if we would round away from zero. |
1733 | let mut loss = Loss::ExactlyZero; |
1734 | if truncated_bits > 0 { |
1735 | loss = Loss::through_truncation(&self.sig, truncated_bits); |
1736 | if loss != Loss::ExactlyZero && self.round_away_from_zero(round, loss, truncated_bits) { |
1737 | r = r.wrapping_add(1); |
1738 | if r == 0 { |
1739 | return Status::INVALID_OP.and(overflow); // Overflow. |
1740 | } |
1741 | } |
1742 | } |
1743 | |
1744 | // Step 3: check if we fit in the destination. |
1745 | if r > overflow { |
1746 | return Status::INVALID_OP.and(overflow); |
1747 | } |
1748 | |
1749 | if loss == Loss::ExactlyZero { |
1750 | *is_exact = true; |
1751 | Status::OK.and(r) |
1752 | } else { |
1753 | Status::INEXACT.and(r) |
1754 | } |
1755 | } |
1756 | } |
1757 | } |
1758 | |
1759 | fn cmp_abs_normal(self, rhs: Self) -> Ordering { |
1760 | assert!(self.is_finite_non_zero()); |
1761 | assert!(rhs.is_finite_non_zero()); |
1762 | |
1763 | // If exponents are equal, do an unsigned comparison of the significands. |
1764 | self.exp.cmp(&rhs.exp).then_with(|| sig::cmp(&self.sig, &rhs.sig)) |
1765 | } |
1766 | |
1767 | fn bitwise_eq(self, rhs: Self) -> bool { |
1768 | if self.category() != rhs.category() || self.is_negative() != rhs.is_negative() { |
1769 | return false; |
1770 | } |
1771 | |
1772 | if self.category() == Category::Zero || self.category() == Category::Infinity { |
1773 | return true; |
1774 | } |
1775 | |
1776 | if self.is_finite_non_zero() && self.exp != rhs.exp { |
1777 | return false; |
1778 | } |
1779 | |
1780 | self.sig == rhs.sig |
1781 | } |
1782 | |
1783 | fn is_negative(self) -> bool { |
1784 | self.read_only_sign_do_not_mutate |
1785 | } |
1786 | |
1787 | fn is_denormal(self) -> bool { |
1788 | self.is_finite_non_zero() && self.exp == S::MIN_EXP && !sig::get_bit(&self.sig, S::PRECISION - 1) |
1789 | } |
1790 | |
1791 | fn is_signaling(self) -> bool { |
1792 | // IEEE-754R 2008 6.2.1: A signaling NaN bit string should be encoded with the |
1793 | // first bit of the trailing significand being 0. |
1794 | self.is_nan() && self.sig[0] & S::QNAN_SIGNIFICAND != S::QNAN_SIGNIFICAND |
1795 | } |
1796 | |
1797 | fn category(self) -> Category { |
1798 | self.read_only_category_do_not_mutate |
1799 | } |
1800 | |
1801 | fn get_exact_inverse(self) -> Option<Self> { |
1802 | // Special floats and denormals have no exact inverse. |
1803 | if !self.is_finite_non_zero() { |
1804 | return None; |
1805 | } |
1806 | |
1807 | // Check that the number is a power of two by making sure that only the |
1808 | // integer bit is set in the significand. |
1809 | if self.sig != [1 << (S::PRECISION - 1)] { |
1810 | return None; |
1811 | } |
1812 | |
1813 | // Get the inverse. |
1814 | let mut reciprocal = Self::from_u128(1).value; |
1815 | let status; |
1816 | reciprocal = unpack!(status=, reciprocal / self); |
1817 | if status != Status::OK { |
1818 | return None; |
1819 | } |
1820 | |
1821 | // Avoid multiplication with a denormal, it is not safe on all platforms and |
1822 | // may be slower than a normal division. |
1823 | if reciprocal.is_denormal() { |
1824 | return None; |
1825 | } |
1826 | |
1827 | assert!(reciprocal.is_finite_non_zero()); |
1828 | assert_eq!(reciprocal.sig, [1 << (S::PRECISION - 1)]); |
1829 | |
1830 | Some(reciprocal) |
1831 | } |
1832 | |
1833 | fn ilogb(mut self) -> ExpInt { |
1834 | if self.is_nan() { |
1835 | return IEK_NAN; |
1836 | } |
1837 | if self.is_zero() { |
1838 | return IEK_ZERO; |
1839 | } |
1840 | if self.is_infinite() { |
1841 | return IEK_INF; |
1842 | } |
1843 | if !self.is_denormal() { |
1844 | return self.exp; |
1845 | } |
1846 | |
1847 | let sig_bits = (S::PRECISION - 1) as ExpInt; |
1848 | self.exp += sig_bits; |
1849 | self = self.normalize(Round::NearestTiesToEven, Loss::ExactlyZero).value; |
1850 | self.exp - sig_bits |
1851 | } |
1852 | |
1853 | fn scalbn_r(mut self, exp: ExpInt, round: Round) -> Self { |
1854 | // If exp is wildly out-of-scale, simply adding it to self.exp will |
1855 | // overflow; clamp it to a safe range before adding, but ensure that the range |
1856 | // is large enough that the clamp does not change the result. The range we |
1857 | // need to support is the difference between the largest possible exponent and |
1858 | // the normalized exponent of half the smallest denormal. |
1859 | |
1860 | let sig_bits = (S::PRECISION - 1) as i32; |
1861 | let max_change = S::MAX_EXP as i32 - (S::MIN_EXP as i32 - sig_bits) + 1; |
1862 | |
1863 | // Clamp to one past the range ends to let normalize handle overflow. |
1864 | let exp_change = cmp::min(cmp::max(exp as i32, -max_change - 1), max_change); |
1865 | self.exp = self.exp.saturating_add(exp_change as ExpInt); |
1866 | self = self.normalize(round, Loss::ExactlyZero).value; |
1867 | if self.is_nan() { |
1868 | self = IeeeDefaultExceptionHandling::result_from_nan(self).value; |
1869 | } |
1870 | self |
1871 | } |
1872 | |
1873 | fn frexp_r(mut self, exp: &mut ExpInt, round: Round) -> Self { |
1874 | *exp = self.ilogb(); |
1875 | |
1876 | // Quiet signalling nans. |
1877 | if *exp == IEK_NAN { |
1878 | self = IeeeDefaultExceptionHandling::result_from_nan(self).value; |
1879 | return self; |
1880 | } |
1881 | |
1882 | if *exp == IEK_INF { |
1883 | return self; |
1884 | } |
1885 | |
1886 | // 1 is added because frexp is defined to return a normalized fraction in |
1887 | // +/-[0.5, 1.0), rather than the usual +/-[1.0, 2.0). |
1888 | if *exp == IEK_ZERO { |
1889 | *exp = 0; |
1890 | } else { |
1891 | *exp += 1; |
1892 | } |
1893 | self.scalbn_r(-*exp, round) |
1894 | } |
1895 | } |
1896 | |
1897 | impl<S: Semantics, T: Semantics> FloatConvert<IeeeFloat<T>> for IeeeFloat<S> { |
1898 | fn convert_r(mut self, round: Round, loses_info: &mut bool) -> StatusAnd<IeeeFloat<T>> { |
1899 | // FIXME(eddyb) move this into the return result. |
1900 | *loses_info = false; |
1901 | |
1902 | // x86 has some unusual NaNs which cannot be represented in any other |
1903 | // format; note them here. |
1904 | fn is_x87_double_extended<S: Semantics>() -> bool { |
1905 | S::QNAN_SIGNIFICAND == X87DoubleExtendedS::QNAN_SIGNIFICAND |
1906 | } |
1907 | let loses_x87_pseudo_nan = is_x87_double_extended::<S>() |
1908 | && !is_x87_double_extended::<T>() |
1909 | && self.category() == Category::NaN |
1910 | && (self.sig[0] & S::QNAN_SIGNIFICAND) != S::QNAN_SIGNIFICAND; |
1911 | |
1912 | // NOTE(eddyb) this is done early because the target semantics may not |
1913 | // actually support encoding the distinction between SNaN and QNaN. |
1914 | // |
1915 | // Convert of sNaN creates qNaN and raises an exception (invalid op). |
1916 | // This also guarantees that a sNaN does not become Inf on a truncation |
1917 | // that loses all payload bits. |
1918 | let mut status = Status::OK; |
1919 | if self.is_nan() { |
1920 | self = unpack!(status|=, IeeeDefaultExceptionHandling::result_from_nan(self)); |
1921 | } |
1922 | |
1923 | let Self { mut sig, mut exp, .. } = self; |
1924 | |
1925 | // If this is a truncation of a denormal number, and the target semantics |
1926 | // has larger exponent range than the source semantics (this can happen |
1927 | // when truncating from PowerPC double-double to double format), the |
1928 | // right shift could lose result mantissa bits. Adjust exponent instead |
1929 | // of performing excessive shift. |
1930 | // Also do a similar trick in case shifting denormal would produce zero |
1931 | // significand as this case isn't handled correctly by normalize. |
1932 | let mut shift = T::PRECISION as ExpInt - S::PRECISION as ExpInt; |
1933 | if shift < 0 && self.is_finite_non_zero() { |
1934 | let omsb = sig::omsb(&sig) as ExpInt; |
1935 | let mut exp_change = omsb - S::PRECISION as ExpInt; |
1936 | if exp + exp_change < T::MIN_EXP { |
1937 | exp_change = T::MIN_EXP - exp; |
1938 | } |
1939 | if exp_change < shift { |
1940 | exp_change = shift; |
1941 | } |
1942 | if exp_change < 0 { |
1943 | shift -= exp_change; |
1944 | exp += exp_change; |
1945 | } else if omsb <= -shift { |
1946 | exp_change = omsb + shift - 1; // leave at least one bit set |
1947 | shift -= exp_change; |
1948 | exp += exp_change; |
1949 | } |
1950 | } |
1951 | |
1952 | // If this is a truncation, perform the shift. |
1953 | let mut loss = Loss::ExactlyZero; |
1954 | if shift < 0 && (self.is_finite_non_zero() || self.category() == Category::NaN && S::NAN_PAYLOAD_MASK != 0) { |
1955 | loss = sig::shift_right(&mut sig, &mut 0, -shift as usize); |
1956 | } |
1957 | |
1958 | // If this is an extension, perform the shift. |
1959 | if shift > 0 && (self.is_finite_non_zero() || self.category() == Category::NaN) { |
1960 | sig::shift_left(&mut sig, &mut 0, shift as usize); |
1961 | } |
1962 | |
1963 | let r = match self.category() { |
1964 | Category::Normal => { |
1965 | let r = IeeeFloat::<T> { |
1966 | sig, |
1967 | exp, |
1968 | read_only_category_do_not_mutate: self.category(), |
1969 | read_only_sign_do_not_mutate: self.is_negative(), |
1970 | marker: PhantomData, |
1971 | }; |
1972 | unpack!(status|=, r.normalize(round, loss)) |
1973 | } |
1974 | Category::NaN => { |
1975 | *loses_info = loss != Loss::ExactlyZero |
1976 | || loses_x87_pseudo_nan |
1977 | || S::NAN_PAYLOAD_MASK != 0 && T::NAN_PAYLOAD_MASK == 0; |
1978 | |
1979 | IeeeFloat::<T>::qnan(Some(sig[0])).with_sign(self.is_negative()) |
1980 | } |
1981 | Category::Infinity => IeeeFloat::<T>::INFINITY.with_sign(self.is_negative()), |
1982 | Category::Zero => IeeeFloat::<T>::ZERO.with_sign(self.is_negative()), |
1983 | }; |
1984 | |
1985 | // NOTE(eddyb) this catches all cases of e.g. ±Inf turning into NaN, |
1986 | // because of `T::NONFINITE_BEHAVIOR` not being `IEEE754`. |
1987 | if matches!(self.category(), Category::Infinity | Category::Zero) |
1988 | && (r.category() != self.category() || r.is_negative() != self.is_negative()) |
1989 | { |
1990 | status |= Status::INEXACT; |
1991 | } |
1992 | |
1993 | *loses_info |= (status - Status::INVALID_OP) != Status::OK; |
1994 | |
1995 | status.and(r) |
1996 | } |
1997 | } |
1998 | |
1999 | impl<S: Semantics> IeeeFloat<S> { |
2000 | /// Handle positive overflow. We either return infinity or |
2001 | /// the largest finite number. For negative overflow, |
2002 | /// negate the `round` argument before calling. |
2003 | fn overflow_result(round: Round) -> StatusAnd<Self> { |
2004 | match round { |
2005 | // Infinity? |
2006 | Round::NearestTiesToEven | Round::NearestTiesToAway | Round::TowardPositive => { |
2007 | (Status::OVERFLOW | Status::INEXACT).and(Self::INFINITY) |
2008 | } |
2009 | // Otherwise we become the largest finite number. |
2010 | Round::TowardNegative | Round::TowardZero => Status::INEXACT.and(Self::largest()), |
2011 | } |
2012 | } |
2013 | |
2014 | /// Returns TRUE if, when truncating the current number, with BIT the |
2015 | /// new LSB, with the given lost fraction and rounding mode, the result |
2016 | /// would need to be rounded away from zero (i.e., by increasing the |
2017 | /// signficand). This routine must work for Category::Zero of both signs, and |
2018 | /// Category::Normal numbers. |
2019 | fn round_away_from_zero(&self, round: Round, loss: Loss, bit: usize) -> bool { |
2020 | // NaNs and infinities should not have lost fractions. |
2021 | assert!(self.is_finite_non_zero() || self.is_zero()); |
2022 | |
2023 | // Current callers never pass this so we don't handle it. |
2024 | assert_ne!(loss, Loss::ExactlyZero); |
2025 | |
2026 | match round { |
2027 | Round::NearestTiesToAway => loss == Loss::ExactlyHalf || loss == Loss::MoreThanHalf, |
2028 | Round::NearestTiesToEven => { |
2029 | if loss == Loss::MoreThanHalf { |
2030 | return true; |
2031 | } |
2032 | |
2033 | // Our zeros don't have a significand to test. |
2034 | if loss == Loss::ExactlyHalf && self.category() != Category::Zero { |
2035 | return sig::get_bit(&self.sig, bit); |
2036 | } |
2037 | |
2038 | false |
2039 | } |
2040 | Round::TowardZero => false, |
2041 | Round::TowardPositive => !self.is_negative(), |
2042 | Round::TowardNegative => self.is_negative(), |
2043 | } |
2044 | } |
2045 | |
2046 | fn normalize(mut self, round: Round, mut loss: Loss) -> StatusAnd<Self> { |
2047 | if !self.is_finite_non_zero() { |
2048 | return Status::OK.and(self); |
2049 | } |
2050 | |
2051 | // Before rounding normalize the exponent of Category::Normal numbers. |
2052 | let mut omsb = sig::omsb(&self.sig); |
2053 | |
2054 | if omsb > 0 { |
2055 | // OMSB is numbered from 1. We want to place it in the integer |
2056 | // bit numbered PRECISION if possible, with a compensating change in |
2057 | // the exponent. |
2058 | let mut final_exp = self.exp.saturating_add(omsb as ExpInt - S::PRECISION as ExpInt); |
2059 | |
2060 | // If the resulting exponent is too high, overflow according to |
2061 | // the rounding mode. |
2062 | if final_exp > S::MAX_EXP { |
2063 | let round = round.negate_if(self.is_negative()); |
2064 | return Self::overflow_result(round).map(|r| r.copy_sign(self)); |
2065 | } |
2066 | |
2067 | // Subnormal numbers have exponent MIN_EXP, and their MSB |
2068 | // is forced based on that. |
2069 | if final_exp < S::MIN_EXP { |
2070 | final_exp = S::MIN_EXP; |
2071 | } |
2072 | |
2073 | // Shifting left is easy as we don't lose precision. |
2074 | if final_exp < self.exp { |
2075 | assert_eq!(loss, Loss::ExactlyZero); |
2076 | |
2077 | let exp_change = (self.exp - final_exp) as usize; |
2078 | sig::shift_left(&mut self.sig, &mut self.exp, exp_change); |
2079 | |
2080 | return Status::OK.and(self); |
2081 | } |
2082 | |
2083 | // Shift right and capture any new lost fraction. |
2084 | if final_exp > self.exp { |
2085 | let exp_change = (final_exp - self.exp) as usize; |
2086 | loss = sig::shift_right(&mut self.sig, &mut self.exp, exp_change).combine(loss); |
2087 | |
2088 | // Keep OMSB up-to-date. |
2089 | omsb = omsb.saturating_sub(exp_change); |
2090 | } |
2091 | } |
2092 | |
2093 | // NOTE(eddyb) for `NonfiniteBehavior::NanOnly`, the unique `NAN` takes up |
2094 | // the largest significand of `MAX_EXP` (which also has normals), though |
2095 | // comparing significands needs to ignore the integer bit `NAN` lacks. |
2096 | if S::NONFINITE_BEHAVIOR == NonfiniteBehavior::NanOnly |
2097 | && self.exp == Self::NAN.exp |
2098 | && [self.sig[0] & S::NAN_SIGNIFICAND_BASE] == Self::NAN.sig |
2099 | { |
2100 | return Self::overflow_result(round).map(|r| r.copy_sign(self)); |
2101 | } |
2102 | |
2103 | // Now round the number according to round given the lost |
2104 | // fraction. |
2105 | |
2106 | // As specified in IEEE 754, since we do not trap we do not report |
2107 | // underflow for exact results. |
2108 | if loss == Loss::ExactlyZero { |
2109 | // Canonicalize zeros. |
2110 | if omsb == 0 { |
2111 | self = Self::ZERO.copy_sign(self); |
2112 | } |
2113 | |
2114 | return Status::OK.and(self); |
2115 | } |
2116 | |
2117 | // Increment the significand if we're rounding away from zero. |
2118 | if self.round_away_from_zero(round, loss, 0) { |
2119 | if omsb == 0 { |
2120 | self.exp = S::MIN_EXP; |
2121 | } |
2122 | |
2123 | // We should never overflow. |
2124 | assert_eq!(sig::increment(&mut self.sig), 0); |
2125 | omsb = sig::omsb(&self.sig); |
2126 | |
2127 | // Did the significand increment overflow? |
2128 | if omsb == S::PRECISION + 1 { |
2129 | // Renormalize by incrementing the exponent and shifting our |
2130 | // significand right one. However if we already have the |
2131 | // maximum exponent we overflow to infinity. |
2132 | if self.exp == S::MAX_EXP { |
2133 | return (Status::OVERFLOW | Status::INEXACT).and(Self::INFINITY.copy_sign(self)); |
2134 | } |
2135 | |
2136 | let _: Loss = sig::shift_right(&mut self.sig, &mut self.exp, 1); |
2137 | |
2138 | return Status::INEXACT.and(self); |
2139 | } |
2140 | |
2141 | // NOTE(eddyb) for `NonfiniteBehavior::NanOnly`, the unique `NAN` takes up |
2142 | // the largest significand of `MAX_EXP` (which also has normals), though |
2143 | // comparing significands needs to ignore the integer bit `NAN` lacks. |
2144 | if S::NONFINITE_BEHAVIOR == NonfiniteBehavior::NanOnly |
2145 | && self.exp == Self::NAN.exp |
2146 | && [self.sig[0] & S::NAN_SIGNIFICAND_BASE] == Self::NAN.sig |
2147 | { |
2148 | return Self::overflow_result(round).map(|r| r.copy_sign(self)); |
2149 | } |
2150 | } |
2151 | |
2152 | // The normal case - we were and are not denormal, and any |
2153 | // significand increment above didn't overflow. |
2154 | if omsb == S::PRECISION { |
2155 | return Status::INEXACT.and(self); |
2156 | } |
2157 | |
2158 | // We have a non-zero denormal. |
2159 | assert!(omsb < S::PRECISION); |
2160 | |
2161 | // Canonicalize zeros. |
2162 | if omsb == 0 { |
2163 | self = Self::ZERO.copy_sign(self); |
2164 | } |
2165 | |
2166 | // The Category::Zero case is a denormal that underflowed to zero. |
2167 | (Status::UNDERFLOW | Status::INEXACT).and(self) |
2168 | } |
2169 | |
2170 | fn from_hexadecimal_string(s: &str, round: Round) -> Result<StatusAnd<Self>, ParseError> { |
2171 | let mut r = IeeeFloat { |
2172 | sig: [0], |
2173 | exp: 0, |
2174 | read_only_category_do_not_mutate: Category::Normal, |
2175 | read_only_sign_do_not_mutate: false, |
2176 | marker: PhantomData, |
2177 | }; |
2178 | |
2179 | let mut any_digits = false; |
2180 | let mut has_exp = false; |
2181 | let mut bit_pos = LIMB_BITS as isize; |
2182 | let mut loss = None; |
2183 | |
2184 | // Without leading or trailing zeros, irrespective of the dot. |
2185 | let mut first_sig_digit = None; |
2186 | let mut dot = s.len(); |
2187 | |
2188 | for (p, c) in s.char_indices() { |
2189 | // Skip leading zeros and any (hexa)decimal point. |
2190 | if c == '.'{ |
2191 | if dot != s.len() { |
2192 | return Err(ParseError("String contains multiple dots")); |
2193 | } |
2194 | dot = p; |
2195 | } else if let Some(hex_value) = c.to_digit(16) { |
2196 | any_digits = true; |
2197 | |
2198 | if first_sig_digit.is_none() { |
2199 | if hex_value == 0 { |
2200 | continue; |
2201 | } |
2202 | first_sig_digit = Some(p); |
2203 | } |
2204 | |
2205 | // Store the number while we have space. |
2206 | bit_pos -= 4; |
2207 | if bit_pos >= 0 { |
2208 | r.sig[0] |= (hex_value as Limb) << bit_pos; |
2209 | } else { |
2210 | // If zero or one-half (the hexadecimal digit 8) are followed |
2211 | // by non-zero, they're a little more than zero or one-half. |
2212 | if let Some(ref mut loss) = loss { |
2213 | if hex_value != 0 { |
2214 | if *loss == Loss::ExactlyZero { |
2215 | *loss = Loss::LessThanHalf; |
2216 | } |
2217 | if *loss == Loss::ExactlyHalf { |
2218 | *loss = Loss::MoreThanHalf; |
2219 | } |
2220 | } |
2221 | } else { |
2222 | loss = Some(match hex_value { |
2223 | 0 => Loss::ExactlyZero, |
2224 | 1..=7 => Loss::LessThanHalf, |
2225 | 8 => Loss::ExactlyHalf, |
2226 | 9..=15 => Loss::MoreThanHalf, |
2227 | _ => unreachable!(), |
2228 | }); |
2229 | } |
2230 | } |
2231 | } else if c == 'p'|| c == 'P'{ |
2232 | if !any_digits { |
2233 | return Err(ParseError("Significand has no digits")); |
2234 | } |
2235 | |
2236 | if dot == s.len() { |
2237 | dot = p; |
2238 | } |
2239 | |
2240 | let mut chars = s[p + 1..].chars().peekable(); |
2241 | |
2242 | // Adjust for the given exponent. |
2243 | let exp_minus = chars.peek() == Some(&'-'); |
2244 | if exp_minus || chars.peek() == Some(&'+') { |
2245 | chars.next(); |
2246 | } |
2247 | |
2248 | for c in chars { |
2249 | if let Some(value) = c.to_digit(10) { |
2250 | has_exp = true; |
2251 | r.exp = r.exp.saturating_mul(10).saturating_add(value as ExpInt); |
2252 | } else { |
2253 | return Err(ParseError("Invalid character in exponent")); |
2254 | } |
2255 | } |
2256 | if !has_exp { |
2257 | return Err(ParseError("Exponent has no digits")); |
2258 | } |
2259 | |
2260 | r.exp = r.exp.negate_if(exp_minus); |
2261 | |
2262 | break; |
2263 | } else { |
2264 | return Err(ParseError("Invalid character in significand")); |
2265 | } |
2266 | } |
2267 | if !any_digits { |
2268 | return Err(ParseError("Significand has no digits")); |
2269 | } |
2270 | |
2271 | // Hex floats require an exponent but not a hexadecimal point. |
2272 | if !has_exp { |
2273 | return Err(ParseError("Hex strings require an exponent")); |
2274 | } |
2275 | |
2276 | // Ignore the exponent if we are zero. |
2277 | let first_sig_digit = match first_sig_digit { |
2278 | Some(p) => p, |
2279 | None => return Ok(Status::OK.and(Self::ZERO)), |
2280 | }; |
2281 | |
2282 | // Calculate the exponent adjustment implicit in the number of |
2283 | // significant digits and adjust for writing the significand starting |
2284 | // at the most significant nibble. |
2285 | let exp_adjustment = if dot > first_sig_digit { |
2286 | ExpInt::try_from(dot - first_sig_digit).unwrap() |
2287 | } else { |
2288 | -ExpInt::try_from(first_sig_digit - dot - 1).unwrap() |
2289 | }; |
2290 | let exp_adjustment = exp_adjustment |
2291 | .saturating_mul(4) |
2292 | .saturating_sub(1) |
2293 | .saturating_add(S::PRECISION as ExpInt) |
2294 | .saturating_sub(LIMB_BITS as ExpInt); |
2295 | r.exp = r.exp.saturating_add(exp_adjustment); |
2296 | |
2297 | Ok(r.normalize(round, loss.unwrap_or(Loss::ExactlyZero))) |
2298 | } |
2299 | |
2300 | fn from_decimal_string(s: &str, round: Round) -> Result<StatusAnd<Self>, ParseError> { |
2301 | // Given a normal decimal floating point number of the form |
2302 | // |
2303 | // dddd.dddd[eE][+-]ddd |
2304 | // |
2305 | // where the decimal point and exponent are optional, fill out the |
2306 | // variables below. Exponent is appropriate if the significand is |
2307 | // treated as an integer, and normalized_exp if the significand |
2308 | // is taken to have the decimal point after a single leading |
2309 | // non-zero digit. |
2310 | // |
2311 | // If the value is zero, first_sig_digit is None. |
2312 | |
2313 | let mut any_digits = false; |
2314 | let mut dec_exp = 0i32; |
2315 | |
2316 | // Without leading or trailing zeros, irrespective of the dot. |
2317 | let mut first_sig_digit = None; |
2318 | let mut last_sig_digit = 0; |
2319 | let mut dot = s.len(); |
2320 | |
2321 | for (p, c) in s.char_indices() { |
2322 | if c == '.'{ |
2323 | if dot != s.len() { |
2324 | return Err(ParseError("String contains multiple dots")); |
2325 | } |
2326 | dot = p; |
2327 | } else if let Some(dec_value) = c.to_digit(10) { |
2328 | any_digits = true; |
2329 | |
2330 | if dec_value != 0 { |
2331 | if first_sig_digit.is_none() { |
2332 | first_sig_digit = Some(p); |
2333 | } |
2334 | last_sig_digit = p; |
2335 | } |
2336 | } else if c == 'e'|| c == 'E'{ |
2337 | if !any_digits { |
2338 | return Err(ParseError("Significand has no digits")); |
2339 | } |
2340 | |
2341 | if dot == s.len() { |
2342 | dot = p; |
2343 | } |
2344 | |
2345 | let mut chars = s[p + 1..].chars().peekable(); |
2346 | |
2347 | // Adjust for the given exponent. |
2348 | let exp_minus = chars.peek() == Some(&'-'); |
2349 | if exp_minus || chars.peek() == Some(&'+') { |
2350 | chars.next(); |
2351 | } |
2352 | |
2353 | let mut any_exp_digits = false; |
2354 | for c in chars { |
2355 | if let Some(value) = c.to_digit(10) { |
2356 | any_exp_digits = true; |
2357 | dec_exp = dec_exp.saturating_mul(10).saturating_add(value as i32); |
2358 | } else { |
2359 | return Err(ParseError("Invalid character in exponent")); |
2360 | } |
2361 | } |
2362 | // Treat no exponent as 0 to match binutils |
2363 | if !any_exp_digits { |
2364 | assert_eq!(dec_exp, 0); |
2365 | } |
2366 | |
2367 | dec_exp = dec_exp.negate_if(exp_minus); |
2368 | |
2369 | break; |
2370 | } else { |
2371 | return Err(ParseError("Invalid character in significand")); |
2372 | } |
2373 | } |
2374 | if !any_digits { |
2375 | return Err(ParseError("Significand has no digits")); |
2376 | } |
2377 | |
2378 | // Test if we have a zero number allowing for non-zero exponents. |
2379 | let first_sig_digit = match first_sig_digit { |
2380 | Some(p) => p, |
2381 | None => return Ok(Status::OK.and(Self::ZERO)), |
2382 | }; |
2383 | |
2384 | // Adjust the exponents for any decimal point. |
2385 | if dot > last_sig_digit { |
2386 | dec_exp = dec_exp.saturating_add((dot - last_sig_digit - 1) as i32); |
2387 | } else { |
2388 | dec_exp = dec_exp.saturating_sub((last_sig_digit - dot) as i32); |
2389 | } |
2390 | let significand_digits = |
2391 | last_sig_digit - first_sig_digit + 1 - (dot > first_sig_digit && dot < last_sig_digit) as usize; |
2392 | let normalized_exp = dec_exp.saturating_add(significand_digits as i32 - 1); |
2393 | |
2394 | // Handle the cases where exponents are obviously too large or too |
2395 | // small. Writing L for log 10 / log 2, a number d.ddddd*10^dec_exp |
2396 | // definitely overflows if |
2397 | // |
2398 | // (dec_exp - 1) * L >= MAX_EXP |
2399 | // |
2400 | // and definitely underflows to zero where |
2401 | // |
2402 | // (dec_exp + 1) * L <= MIN_EXP - PRECISION |
2403 | // |
2404 | // With integer arithmetic the tightest bounds for L are |
2405 | // |
2406 | // 93/28 < L < 196/59 [ numerator <= 256 ] |
2407 | // 42039/12655 < L < 28738/8651 [ numerator <= 65536 ] |
2408 | |
2409 | // Check for MAX_EXP. |
2410 | if normalized_exp.saturating_sub(1).saturating_mul(42039) >= 12655 * S::MAX_EXP as i32 { |
2411 | // Overflow and round. |
2412 | return Ok(Self::overflow_result(round)); |
2413 | } |
2414 | |
2415 | // Check for MIN_EXP. |
2416 | if normalized_exp.saturating_add(1).saturating_mul(28738) <= 8651 * (S::MIN_EXP as i32 - S::PRECISION as i32) { |
2417 | // Underflow to zero and round. |
2418 | let r = if round == Round::TowardPositive { |
2419 | IeeeFloat::SMALLEST |
2420 | } else { |
2421 | IeeeFloat::ZERO |
2422 | }; |
2423 | return Ok((Status::UNDERFLOW | Status::INEXACT).and(r)); |
2424 | } |
2425 | |
2426 | // A tight upper bound on number of bits required to hold an |
2427 | // N-digit decimal integer is N * 196 / 59. Allocate enough space |
2428 | // to hold the full significand, and an extra limb required by |
2429 | // tcMultiplyPart. |
2430 | let max_limbs = limbs_for_bits(1 + 196 * significand_digits / 59); |
2431 | let mut dec_sig = DynPrecisionLimbVec::with_capacity(max_limbs); |
2432 | |
2433 | // Convert to binary efficiently - we do almost all multiplication |
2434 | // in a Limb. When this would overflow do we do a single |
2435 | // bignum multiplication, and then revert again to multiplication |
2436 | // in a Limb. |
2437 | let mut chars = s[first_sig_digit..last_sig_digit + 1].chars(); |
2438 | loop { |
2439 | let mut val = 0; |
2440 | let mut multiplier = 1; |
2441 | |
2442 | loop { |
2443 | let dec_value = match chars.next() { |
2444 | Some('.') => continue, |
2445 | Some(c) => c.to_digit(10).unwrap(), |
2446 | None => break, |
2447 | }; |
2448 | |
2449 | multiplier *= 10; |
2450 | val = val * 10 + dec_value as Limb; |
2451 | |
2452 | // The maximum number that can be multiplied by ten with any |
2453 | // digit added without overflowing a Limb. |
2454 | if multiplier > (!0 - 9) / 10 { |
2455 | break; |
2456 | } |
2457 | } |
2458 | |
2459 | // If we've consumed no digits, we're done. |
2460 | if multiplier == 1 { |
2461 | break; |
2462 | } |
2463 | |
2464 | // Multiply out the current limb. |
2465 | let mut carry = val; |
2466 | for x in &mut dec_sig { |
2467 | let [low, mut high] = sig::widening_mul(*x, multiplier); |
2468 | |
2469 | // Now add carry. |
2470 | let (low, overflow) = low.overflowing_add(carry); |
2471 | high += overflow as Limb; |
2472 | |
2473 | *x = low; |
2474 | carry = high; |
2475 | } |
2476 | |
2477 | // If we had carry, we need another limb (likely but not guaranteed). |
2478 | if carry > 0 { |
2479 | dec_sig.push(carry); |
2480 | } |
2481 | } |
2482 | |
2483 | // Calculate pow(5, abs(dec_exp)) into `pow5_full`. |
2484 | // The *_calc Vec's are reused scratch space, as an optimization. |
2485 | let (pow5_full, mut pow5_calc, mut sig_calc, mut sig_scratch_calc) = { |
2486 | let mut power = dec_exp.abs() as usize; |
2487 | |
2488 | const FIRST_EIGHT_POWERS: [Limb; 8] = [1, 5, 25, 125, 625, 3125, 15625, 78125]; |
2489 | |
2490 | let mut p5_scratch = DynPrecisionLimbVec::new(); |
2491 | let mut p5: DynPrecisionLimbVec = [FIRST_EIGHT_POWERS[4]].into_iter().collect(); |
2492 | |
2493 | let mut r_scratch = DynPrecisionLimbVec::new(); |
2494 | let mut r: DynPrecisionLimbVec = [FIRST_EIGHT_POWERS[power & 7]].into_iter().collect(); |
2495 | power >>= 3; |
2496 | |
2497 | while power > 0 { |
2498 | // Calculate pow(5,pow(2,n+3)). |
2499 | p5_scratch.resize(p5.len() * 2, 0); |
2500 | let _: Loss = sig::mul(&mut p5_scratch, &mut 0, &p5, &p5, p5.len() * 2 * LIMB_BITS); |
2501 | while p5_scratch.last() == Some(&0) { |
2502 | p5_scratch.pop(); |
2503 | } |
2504 | mem::swap(&mut p5, &mut p5_scratch); |
2505 | |
2506 | if power & 1 != 0 { |
2507 | r_scratch.resize(r.len() + p5.len(), 0); |
2508 | let _: Loss = sig::mul(&mut r_scratch, &mut 0, &r, &p5, (r.len() + p5.len()) * LIMB_BITS); |
2509 | while r_scratch.last() == Some(&0) { |
2510 | r_scratch.pop(); |
2511 | } |
2512 | mem::swap(&mut r, &mut r_scratch); |
2513 | } |
2514 | |
2515 | power >>= 1; |
2516 | } |
2517 | |
2518 | (r, r_scratch, p5, p5_scratch) |
2519 | }; |
2520 | |
2521 | // Attempt dec_sig * 10^dec_exp with increasing precision. |
2522 | let mut attempt = 0; |
2523 | loop { |
2524 | let calc_precision = (LIMB_BITS << attempt) - 1; |
2525 | attempt += 1; |
2526 | |
2527 | let calc_normal_from_limbs = |sig: &mut DynPrecisionLimbVec, limbs: &[Limb]| -> StatusAnd<ExpInt> { |
2528 | sig.resize(limbs_for_bits(calc_precision), 0); |
2529 | let (mut loss, mut exp) = sig::from_limbs(sig, limbs, calc_precision); |
2530 | |
2531 | // Before rounding normalize the exponent of Category::Normal numbers. |
2532 | let mut omsb = sig::omsb(sig); |
2533 | |
2534 | assert_ne!(omsb, 0); |
2535 | |
2536 | // OMSB is numbered from 1. We want to place it in the integer |
2537 | // bit numbered PRECISION if possible, with a compensating change in |
2538 | // the exponent. |
2539 | let final_exp = exp.saturating_add(omsb as ExpInt - calc_precision as ExpInt); |
2540 | |
2541 | // Shifting left is easy as we don't lose precision. |
2542 | if final_exp < exp { |
2543 | assert_eq!(loss, Loss::ExactlyZero); |
2544 | |
2545 | let exp_change = (exp - final_exp) as usize; |
2546 | sig::shift_left(sig, &mut exp, exp_change); |
2547 | |
2548 | return Status::OK.and(exp); |
2549 | } |
2550 | |
2551 | // Shift right and capture any new lost fraction. |
2552 | if final_exp > exp { |
2553 | let exp_change = (final_exp - exp) as usize; |
2554 | loss = sig::shift_right(sig, &mut exp, exp_change).combine(loss); |
2555 | |
2556 | // Keep OMSB up-to-date. |
2557 | omsb = omsb.saturating_sub(exp_change); |
2558 | } |
2559 | |
2560 | assert_eq!(omsb, calc_precision); |
2561 | |
2562 | // Now round the number according to round given the lost |
2563 | // fraction. |
2564 | |
2565 | // As specified in IEEE 754, since we do not trap we do not report |
2566 | // underflow for exact results. |
2567 | if loss == Loss::ExactlyZero { |
2568 | return Status::OK.and(exp); |
2569 | } |
2570 | |
2571 | // Increment the significand if we're rounding away from zero. |
2572 | if loss == Loss::MoreThanHalf || loss == Loss::ExactlyHalf && sig::get_bit(sig, 0) { |
2573 | // We should never overflow. |
2574 | assert_eq!(sig::increment(sig), 0); |
2575 | omsb = sig::omsb(sig); |
2576 | |
2577 | // Did the significand increment overflow? |
2578 | if omsb == calc_precision + 1 { |
2579 | let _: Loss = sig::shift_right(sig, &mut exp, 1); |
2580 | |
2581 | return Status::INEXACT.and(exp); |
2582 | } |
2583 | } |
2584 | |
2585 | // The normal case - we were and are not denormal, and any |
2586 | // significand increment above didn't overflow. |
2587 | Status::INEXACT.and(exp) |
2588 | }; |
2589 | |
2590 | let status; |
2591 | let mut exp = unpack!(status=, |
2592 | calc_normal_from_limbs(&mut sig_calc, &dec_sig)); |
2593 | let pow5_status; |
2594 | let pow5_exp = unpack!(pow5_status=, |
2595 | calc_normal_from_limbs(&mut pow5_calc, &pow5_full)); |
2596 | |
2597 | // Add dec_exp, as 10^n = 5^n * 2^n. |
2598 | exp += dec_exp as ExpInt; |
2599 | |
2600 | let mut used_bits = S::PRECISION; |
2601 | let mut truncated_bits = calc_precision - used_bits; |
2602 | |
2603 | let half_ulp_err1 = (status != Status::OK) as Limb; |
2604 | let (calc_loss, half_ulp_err2); |
2605 | if dec_exp >= 0 { |
2606 | exp += pow5_exp; |
2607 | |
2608 | sig_scratch_calc.resize(sig_calc.len() + pow5_calc.len(), 0); |
2609 | calc_loss = sig::mul(&mut sig_scratch_calc, &mut exp, &sig_calc, &pow5_calc, calc_precision); |
2610 | mem::swap(&mut sig_calc, &mut sig_scratch_calc); |
2611 | |
2612 | half_ulp_err2 = (pow5_status != Status::OK) as Limb; |
2613 | } else { |
2614 | exp -= pow5_exp; |
2615 | |
2616 | sig_scratch_calc.resize(sig_calc.len(), 0); |
2617 | calc_loss = sig::div(&mut sig_scratch_calc, &mut exp, &mut sig_calc, &mut pow5_calc, calc_precision); |
2618 | mem::swap(&mut sig_calc, &mut sig_scratch_calc); |
2619 | |
2620 | // Denormal numbers have less precision. |
2621 | if exp < S::MIN_EXP { |
2622 | truncated_bits += (S::MIN_EXP - exp) as usize; |
2623 | used_bits = calc_precision.saturating_sub(truncated_bits); |
2624 | } |
2625 | // Extra half-ulp lost in reciprocal of exponent. |
2626 | half_ulp_err2 = 2 * (pow5_status != Status::OK || calc_loss != Loss::ExactlyZero) as Limb; |
2627 | } |
2628 | |
2629 | // Both sig::mul and sig::div return the |
2630 | // result with the integer bit set. |
2631 | assert!(sig::get_bit(&sig_calc, calc_precision - 1)); |
2632 | |
2633 | // The error from the true value, in half-ulps, on multiplying two |
2634 | // floating point numbers, which differ from the value they |
2635 | // approximate by at most half_ulp_err1 and half_ulp_err2 half-ulps, is strictly less |
2636 | // than the returned value. |
2637 | // |
2638 | // See "How to Read Floating Point Numbers Accurately" by William D Clinger. |
2639 | assert!(half_ulp_err1 < 2 || half_ulp_err2 < 2 || (half_ulp_err1 + half_ulp_err2 < 8)); |
2640 | |
2641 | let inexact = (calc_loss != Loss::ExactlyZero) as Limb; |
2642 | let half_ulp_err = if half_ulp_err1 + half_ulp_err2 == 0 { |
2643 | inexact * 2 // <= inexact half-ulps. |
2644 | } else { |
2645 | inexact + 2 * (half_ulp_err1 + half_ulp_err2) |
2646 | }; |
2647 | |
2648 | let ulps_from_boundary = { |
2649 | let bits = calc_precision - used_bits - 1; |
2650 | |
2651 | let i = bits / LIMB_BITS; |
2652 | let limb = sig_calc[i] & (!0 >> (LIMB_BITS - 1 - bits % LIMB_BITS)); |
2653 | let boundary = match round { |
2654 | Round::NearestTiesToEven | Round::NearestTiesToAway => 1 << (bits % LIMB_BITS), |
2655 | _ => 0, |
2656 | }; |
2657 | if i == 0 { |
2658 | let delta = limb.wrapping_sub(boundary); |
2659 | cmp::min(delta, delta.wrapping_neg()) |
2660 | } else if limb == boundary { |
2661 | if !sig::is_all_zeros(&sig_calc[1..i]) { |
2662 | !0 // A lot. |
2663 | } else { |
2664 | sig_calc[0] |
2665 | } |
2666 | } else if limb == boundary.wrapping_sub(1) { |
2667 | if sig_calc[1..i].iter().any(|&x| x.wrapping_neg() != 1) { |
2668 | !0 // A lot. |
2669 | } else { |
2670 | sig_calc[0].wrapping_neg() |
2671 | } |
2672 | } else { |
2673 | !0 // A lot. |
2674 | } |
2675 | }; |
2676 | |
2677 | // Are we guaranteed to round correctly if we truncate? |
2678 | if ulps_from_boundary.saturating_mul(2) >= half_ulp_err { |
2679 | let mut r = IeeeFloat { |
2680 | sig: [0], |
2681 | exp, |
2682 | read_only_category_do_not_mutate: Category::Normal, |
2683 | read_only_sign_do_not_mutate: false, |
2684 | marker: PhantomData, |
2685 | }; |
2686 | sig::extract(&mut r.sig, &sig_calc, used_bits, calc_precision - used_bits); |
2687 | // If we extracted less bits above we must adjust our exponent |
2688 | // to compensate for the implicit right shift. |
2689 | r.exp += (S::PRECISION - used_bits) as ExpInt; |
2690 | let loss = Loss::through_truncation(&sig_calc, truncated_bits); |
2691 | return Ok(r.normalize(round, loss)); |
2692 | } |
2693 | } |
2694 | } |
2695 | } |
2696 | |
2697 | impl Loss { |
2698 | /// Combine the effect of two lost fractions. |
2699 | #[inline] |
2700 | fn combine(self, less_significant: Loss) -> Loss { |
2701 | let mut more_significant = self; |
2702 | if less_significant != Loss::ExactlyZero { |
2703 | if more_significant == Loss::ExactlyZero { |
2704 | more_significant = Loss::LessThanHalf; |
2705 | } else if more_significant == Loss::ExactlyHalf { |
2706 | more_significant = Loss::MoreThanHalf; |
2707 | } |
2708 | } |
2709 | |
2710 | more_significant |
2711 | } |
2712 | |
2713 | /// Return the fraction lost were a bignum truncated losing the least |
2714 | /// significant `bits` bits. |
2715 | #[inline] |
2716 | fn through_truncation(limbs: &[Limb], bits: usize) -> Loss { |
2717 | if bits == 0 { |
2718 | return Loss::ExactlyZero; |
2719 | } |
2720 | |
2721 | let half_bit = bits - 1; |
2722 | let half_limb = half_bit / LIMB_BITS; |
2723 | let (half_limb, rest) = if half_limb < limbs.len() { |
2724 | (limbs[half_limb], &limbs[..half_limb]) |
2725 | } else { |
2726 | (0, limbs) |
2727 | }; |
2728 | let half = 1 << (half_bit % LIMB_BITS); |
2729 | let has_half = half_limb & half != 0; |
2730 | let has_rest = half_limb & (half - 1) != 0 || !sig::is_all_zeros(rest); |
2731 | |
2732 | match (has_half, has_rest) { |
2733 | (false, false) => Loss::ExactlyZero, |
2734 | (false, true) => Loss::LessThanHalf, |
2735 | (true, false) => Loss::ExactlyHalf, |
2736 | (true, true) => Loss::MoreThanHalf, |
2737 | } |
2738 | } |
2739 | } |
2740 | |
2741 | /// Implementation details of IeeeFloat significands, such as big integer arithmetic. |
2742 | /// As a rule of thumb, no functions in this module should dynamically allocate. |
2743 | mod sig { |
2744 | use super::{limbs_for_bits, ExpInt, Limb, Loss, LIMB_BITS}; |
2745 | use core::cmp::Ordering; |
2746 | use core::mem; |
2747 | |
2748 | #[inline] |
2749 | pub(super) fn is_all_zeros(limbs: &[Limb]) -> bool { |
2750 | limbs.iter().all(|&l| l == 0) |
2751 | } |
2752 | |
2753 | /// One, not zero, based LSB. That is, returns 0 for a zeroed significand. |
2754 | #[inline] |
2755 | pub(super) fn olsb(limbs: &[Limb]) -> usize { |
2756 | for i in 0..limbs.len() { |
2757 | if limbs[i] != 0 { |
2758 | return i * LIMB_BITS + limbs[i].trailing_zeros() as usize + 1; |
2759 | } |
2760 | } |
2761 | |
2762 | 0 |
2763 | } |
2764 | |
2765 | /// One, not zero, based MSB. That is, returns 0 for a zeroed significand. |
2766 | #[inline] |
2767 | pub(super) fn omsb(limbs: &[Limb]) -> usize { |
2768 | for i in (0..limbs.len()).rev() { |
2769 | if limbs[i] != 0 { |
2770 | return (i + 1) * LIMB_BITS - limbs[i].leading_zeros() as usize; |
2771 | } |
2772 | } |
2773 | |
2774 | 0 |
2775 | } |
2776 | |
2777 | /// Comparison (unsigned) of two significands. |
2778 | #[inline] |
2779 | pub(super) fn cmp(a: &[Limb], b: &[Limb]) -> Ordering { |
2780 | assert_eq!(a.len(), b.len()); |
2781 | for (a, b) in a.iter().zip(b).rev() { |
2782 | match a.cmp(b) { |
2783 | Ordering::Equal => {} |
2784 | o => return o, |
2785 | } |
2786 | } |
2787 | |
2788 | Ordering::Equal |
2789 | } |
2790 | |
2791 | /// Extract the given bit. |
2792 | #[inline] |
2793 | pub(super) fn get_bit(limbs: &[Limb], bit: usize) -> bool { |
2794 | limbs[bit / LIMB_BITS] & (1 << (bit % LIMB_BITS)) != 0 |
2795 | } |
2796 | |
2797 | /// Set the given bit. |
2798 | #[inline] |
2799 | pub(super) fn set_bit(limbs: &mut [Limb], bit: usize) { |
2800 | limbs[bit / LIMB_BITS] |= 1 << (bit % LIMB_BITS); |
2801 | } |
2802 | |
2803 | /// Shift `dst` left `bits` bits, subtract `bits` from its exponent. |
2804 | #[inline] |
2805 | pub(super) fn shift_left(dst: &mut [Limb], exp: &mut ExpInt, bits: usize) { |
2806 | if bits > 0 { |
2807 | // Our exponent should not underflow. |
2808 | *exp = exp.checked_sub(bits as ExpInt).unwrap(); |
2809 | |
2810 | // Jump is the inter-limb jump; shift is is intra-limb shift. |
2811 | let jump = bits / LIMB_BITS; |
2812 | let shift = bits % LIMB_BITS; |
2813 | |
2814 | for i in (0..dst.len()).rev() { |
2815 | let mut limb; |
2816 | |
2817 | if i < jump { |
2818 | limb = 0; |
2819 | } else { |
2820 | // dst[i] comes from the two limbs src[i - jump] and, if we have |
2821 | // an intra-limb shift, src[i - jump - 1]. |
2822 | limb = dst[i - jump]; |
2823 | if shift > 0 { |
2824 | limb <<= shift; |
2825 | if i >= jump + 1 { |
2826 | limb |= dst[i - jump - 1] >> (LIMB_BITS - shift); |
2827 | } |
2828 | } |
2829 | } |
2830 | |
2831 | dst[i] = limb; |
2832 | } |
2833 | } |
2834 | } |
2835 | |
2836 | /// Shift `dst` right `bits` bits noting lost fraction. |
2837 | #[inline] |
2838 | pub(super) fn shift_right(dst: &mut [Limb], exp: &mut ExpInt, bits: usize) -> Loss { |
2839 | let loss = Loss::through_truncation(dst, bits); |
2840 | |
2841 | if bits > 0 { |
2842 | // Our exponent should not overflow. |
2843 | *exp = exp.checked_add(bits as ExpInt).unwrap(); |
2844 | |
2845 | // Jump is the inter-limb jump; shift is is intra-limb shift. |
2846 | let jump = bits / LIMB_BITS; |
2847 | let shift = bits % LIMB_BITS; |
2848 | |
2849 | // Perform the shift. This leaves the most significant `bits` bits |
2850 | // of the result at zero. |
2851 | for i in 0..dst.len() { |
2852 | let mut limb; |
2853 | |
2854 | if i + jump >= dst.len() { |
2855 | limb = 0; |
2856 | } else { |
2857 | limb = dst[i + jump]; |
2858 | if shift > 0 { |
2859 | limb >>= shift; |
2860 | if i + jump + 1 < dst.len() { |
2861 | limb |= dst[i + jump + 1] << (LIMB_BITS - shift); |
2862 | } |
2863 | } |
2864 | } |
2865 | |
2866 | dst[i] = limb; |
2867 | } |
2868 | } |
2869 | |
2870 | loss |
2871 | } |
2872 | |
2873 | /// Copy the bit vector of width `src_bits` from `src`, starting at bit SRC_LSB, |
2874 | /// to `dst`, such that the bit SRC_LSB becomes the least significant bit of `dst`. |
2875 | /// All high bits above `src_bits` in `dst` are zero-filled. |
2876 | #[inline] |
2877 | pub(super) fn extract(dst: &mut [Limb], src: &[Limb], src_bits: usize, src_lsb: usize) { |
2878 | if src_bits == 0 { |
2879 | return; |
2880 | } |
2881 | |
2882 | let dst_limbs = limbs_for_bits(src_bits); |
2883 | assert!(dst_limbs <= dst.len()); |
2884 | |
2885 | let src = &src[src_lsb / LIMB_BITS..]; |
2886 | dst[..dst_limbs].copy_from_slice(&src[..dst_limbs]); |
2887 | |
2888 | let shift = src_lsb % LIMB_BITS; |
2889 | let _: Loss = shift_right(&mut dst[..dst_limbs], &mut 0, shift); |
2890 | |
2891 | // We now have (dst_limbs * LIMB_BITS - shift) bits from `src` |
2892 | // in `dst`. If this is less that src_bits, append the rest, else |
2893 | // clear the high bits. |
2894 | let n = dst_limbs * LIMB_BITS - shift; |
2895 | if n < src_bits { |
2896 | let mask = (1 << (src_bits - n)) - 1; |
2897 | dst[dst_limbs - 1] |= (src[dst_limbs] & mask) << n % LIMB_BITS; |
2898 | } else if n > src_bits && src_bits % LIMB_BITS > 0 { |
2899 | dst[dst_limbs - 1] &= (1 << (src_bits % LIMB_BITS)) - 1; |
2900 | } |
2901 | |
2902 | // Clear high limbs. |
2903 | for x in &mut dst[dst_limbs..] { |
2904 | *x = 0; |
2905 | } |
2906 | } |
2907 | |
2908 | /// We want the most significant PRECISION bits of `src`. There may not |
2909 | /// be that many; extract what we can. |
2910 | #[inline] |
2911 | pub(super) fn from_limbs(dst: &mut [Limb], src: &[Limb], precision: usize) -> (Loss, ExpInt) { |
2912 | let omsb = omsb(src); |
2913 | |
2914 | if precision <= omsb { |
2915 | extract(dst, src, precision, omsb - precision); |
2916 | (Loss::through_truncation(src, omsb - precision), omsb as ExpInt - 1) |
2917 | } else { |
2918 | extract(dst, src, omsb, 0); |
2919 | (Loss::ExactlyZero, precision as ExpInt - 1) |
2920 | } |
2921 | } |
2922 | |
2923 | /// For every consecutive chunk of `bits` bits from `limbs`, |
2924 | /// going from most significant to the least significant bits, |
2925 | /// call `f` to transform those bits and store the result back. |
2926 | #[inline] |
2927 | pub(super) fn each_chunk<F: FnMut(Limb) -> Limb>(limbs: &mut [Limb], bits: usize, mut f: F) { |
2928 | assert_eq!(LIMB_BITS % bits, 0); |
2929 | for limb in limbs.iter_mut().rev() { |
2930 | let mut r = 0; |
2931 | for i in (0..LIMB_BITS / bits).rev() { |
2932 | r |= f((*limb >> (i * bits)) & ((1 << bits) - 1)) << (i * bits); |
2933 | } |
2934 | *limb = r; |
2935 | } |
2936 | } |
2937 | |
2938 | /// Increment in-place, return the carry flag. |
2939 | #[inline] |
2940 | pub(super) fn increment(dst: &mut [Limb]) -> Limb { |
2941 | for x in dst { |
2942 | *x = x.wrapping_add(1); |
2943 | if *x != 0 { |
2944 | return 0; |
2945 | } |
2946 | } |
2947 | |
2948 | 1 |
2949 | } |
2950 | |
2951 | /// Decrement in-place, return the borrow flag. |
2952 | #[inline] |
2953 | pub(super) fn decrement(dst: &mut [Limb]) -> Limb { |
2954 | for x in dst { |
2955 | *x = x.wrapping_sub(1); |
2956 | if *x != !0 { |
2957 | return 0; |
2958 | } |
2959 | } |
2960 | |
2961 | 1 |
2962 | } |
2963 | |
2964 | /// `a += b + c` where `c` is zero or one. Returns the carry flag. |
2965 | #[inline] |
2966 | pub(super) fn add(a: &mut [Limb], b: &[Limb], mut c: Limb) -> Limb { |
2967 | assert!(c <= 1); |
2968 | |
2969 | for (a, &b) in a.iter_mut().zip(b) { |
2970 | let (r, overflow) = a.overflowing_add(b); |
2971 | let (r, overflow2) = r.overflowing_add(c); |
2972 | *a = r; |
2973 | c = (overflow | overflow2) as Limb; |
2974 | } |
2975 | |
2976 | c |
2977 | } |
2978 | |
2979 | /// `a -= b + c` where `c` is zero or one. Returns the borrow flag. |
2980 | #[inline] |
2981 | pub(super) fn sub(a: &mut [Limb], b: &[Limb], mut c: Limb) -> Limb { |
2982 | assert!(c <= 1); |
2983 | |
2984 | for (a, &b) in a.iter_mut().zip(b) { |
2985 | let (r, overflow) = a.overflowing_sub(b); |
2986 | let (r, overflow2) = r.overflowing_sub(c); |
2987 | *a = r; |
2988 | c = (overflow | overflow2) as Limb; |
2989 | } |
2990 | |
2991 | c |
2992 | } |
2993 | |
2994 | /// `a += b` or `a -= b`. Does not preserve `b`. |
2995 | #[inline] |
2996 | pub(super) fn add_or_sub( |
2997 | a_sig: &mut [Limb], |
2998 | a_exp: &mut ExpInt, |
2999 | a_sign: &mut bool, |
3000 | b_sig: &mut [Limb], |
3001 | b_exp: ExpInt, |
3002 | b_sign: bool, |
3003 | ) -> Loss { |
3004 | // Are we bigger exponent-wise than the RHS? |
3005 | let bits = *a_exp - b_exp; |
3006 | |
3007 | // Determine if the operation on the absolute values is effectively |
3008 | // an addition or subtraction. |
3009 | // Subtraction is more subtle than one might naively expect. |
3010 | if *a_sign ^ b_sign { |
3011 | let loss; |
3012 | |
3013 | if bits == 0 { |
3014 | loss = Loss::ExactlyZero; |
3015 | } else if bits > 0 { |
3016 | loss = shift_right(b_sig, &mut 0, (bits - 1) as usize); |
3017 | shift_left(a_sig, a_exp, 1); |
3018 | } else { |
3019 | loss = shift_right(a_sig, a_exp, (-bits - 1) as usize); |
3020 | shift_left(b_sig, &mut 0, 1); |
3021 | } |
3022 | |
3023 | let borrow = (loss != Loss::ExactlyZero) as Limb; |
3024 | |
3025 | // Should we reverse the subtraction. |
3026 | if cmp(a_sig, b_sig) == Ordering::Less { |
3027 | // The code above is intended to ensure that no borrow is necessary. |
3028 | assert_eq!(sub(b_sig, a_sig, borrow), 0); |
3029 | a_sig.copy_from_slice(b_sig); |
3030 | *a_sign = !*a_sign; |
3031 | } else { |
3032 | // The code above is intended to ensure that no borrow is necessary. |
3033 | assert_eq!(sub(a_sig, b_sig, borrow), 0); |
3034 | } |
3035 | |
3036 | // Invert the lost fraction - it was on the RHS and subtracted. |
3037 | match loss { |
3038 | Loss::LessThanHalf => Loss::MoreThanHalf, |
3039 | Loss::MoreThanHalf => Loss::LessThanHalf, |
3040 | _ => loss, |
3041 | } |
3042 | } else { |
3043 | let loss = if bits > 0 { |
3044 | shift_right(b_sig, &mut 0, bits as usize) |
3045 | } else { |
3046 | shift_right(a_sig, a_exp, -bits as usize) |
3047 | }; |
3048 | // We have a guard bit; generating a carry cannot happen. |
3049 | assert_eq!(add(a_sig, b_sig, 0), 0); |
3050 | loss |
3051 | } |
3052 | } |
3053 | |
3054 | /// `[low, high] = a * b`. |
3055 | /// |
3056 | /// This cannot overflow, because |
3057 | /// |
3058 | /// `(n - 1) * (n - 1) + 2 * (n - 1) == (n - 1) * (n + 1)` |
3059 | /// |
3060 | /// which is less than n^2. |
3061 | #[inline] |
3062 | pub(super) fn widening_mul(a: Limb, b: Limb) -> [Limb; 2] { |
3063 | let mut wide = [0, 0]; |
3064 | |
3065 | if a == 0 || b == 0 { |
3066 | return wide; |
3067 | } |
3068 | |
3069 | const HALF_BITS: usize = LIMB_BITS / 2; |
3070 | |
3071 | let select = |limb, i| (limb >> (i * HALF_BITS)) & ((1 << HALF_BITS) - 1); |
3072 | for i in 0..2 { |
3073 | for j in 0..2 { |
3074 | let mut x = [select(a, i) * select(b, j), 0]; |
3075 | shift_left(&mut x, &mut 0, (i + j) * HALF_BITS); |
3076 | assert_eq!(add(&mut wide, &x, 0), 0); |
3077 | } |
3078 | } |
3079 | |
3080 | wide |
3081 | } |
3082 | |
3083 | /// `dst = a * b` (for normal `a` and `b`). Returns the lost fraction. |
3084 | #[inline] |
3085 | pub(super) fn mul<'a>( |
3086 | dst: &mut [Limb], |
3087 | exp: &mut ExpInt, |
3088 | mut a: &'a [Limb], |
3089 | mut b: &'a [Limb], |
3090 | precision: usize, |
3091 | ) -> Loss { |
3092 | // Put the narrower number on the `a` for less loops below. |
3093 | if a.len() > b.len() { |
3094 | mem::swap(&mut a, &mut b); |
3095 | } |
3096 | |
3097 | for x in &mut dst[..b.len()] { |
3098 | *x = 0; |
3099 | } |
3100 | |
3101 | for i in 0..a.len() { |
3102 | let mut carry = 0; |
3103 | for j in 0..b.len() { |
3104 | let [low, mut high] = widening_mul(a[i], b[j]); |
3105 | |
3106 | // Now add carry. |
3107 | let (low, overflow) = low.overflowing_add(carry); |
3108 | high += overflow as Limb; |
3109 | |
3110 | // And now `dst[i + j]`, and store the new low part there. |
3111 | let (low, overflow) = low.overflowing_add(dst[i + j]); |
3112 | high += overflow as Limb; |
3113 | |
3114 | dst[i + j] = low; |
3115 | carry = high; |
3116 | } |
3117 | dst[i + b.len()] = carry; |
3118 | } |
3119 | |
3120 | // Assume the operands involved in the multiplication are single-precision |
3121 | // FP, and the two multiplicants are: |
3122 | // a = a23 . a22 ... a0 * 2^e1 |
3123 | // b = b23 . b22 ... b0 * 2^e2 |
3124 | // the result of multiplication is: |
3125 | // dst = c48 c47 c46 . c45 ... c0 * 2^(e1+e2) |
3126 | // Note that there are three significant bits at the left-hand side of the |
3127 | // radix point: two for the multiplication, and an overflow bit for the |
3128 | // addition (that will always be zero at this point). Move the radix point |
3129 | // toward left by two bits, and adjust exponent accordingly. |
3130 | *exp += 2; |
3131 | |
3132 | // Convert the result having "2 * precision" significant-bits back to the one |
3133 | // having "precision" significant-bits. First, move the radix point from |
3134 | // poision "2*precision - 1" to "precision - 1". The exponent need to be |
3135 | // adjusted by "2*precision - 1" - "precision - 1" = "precision". |
3136 | *exp -= precision as ExpInt + 1; |
3137 | |
3138 | // In case MSB resides at the left-hand side of radix point, shift the |
3139 | // mantissa right by some amount to make sure the MSB reside right before |
3140 | // the radix point (i.e. "MSB . rest-significant-bits"). |
3141 | // |
3142 | // Note that the result is not normalized when "omsb < precision". So, the |
3143 | // caller needs to call IeeeFloat::normalize() if normalized value is |
3144 | // expected. |
3145 | let omsb = omsb(dst); |
3146 | if omsb <= precision { |
3147 | Loss::ExactlyZero |
3148 | } else { |
3149 | shift_right(dst, exp, omsb - precision) |
3150 | } |
3151 | } |
3152 | |
3153 | /// `quotient = dividend / divisor`. Returns the lost fraction. |
3154 | /// Does not preserve `dividend` or `divisor`. |
3155 | #[inline] |
3156 | pub(super) fn div( |
3157 | quotient: &mut [Limb], |
3158 | exp: &mut ExpInt, |
3159 | dividend: &mut [Limb], |
3160 | divisor: &mut [Limb], |
3161 | precision: usize, |
3162 | ) -> Loss { |
3163 | // Normalize the divisor. |
3164 | let bits = precision - omsb(divisor); |
3165 | shift_left(divisor, &mut 0, bits); |
3166 | *exp += bits as ExpInt; |
3167 | |
3168 | // Normalize the dividend. |
3169 | let bits = precision - omsb(dividend); |
3170 | shift_left(dividend, exp, bits); |
3171 | |
3172 | // Division by 1. |
3173 | let olsb_divisor = olsb(divisor); |
3174 | if olsb_divisor == precision { |
3175 | quotient.copy_from_slice(dividend); |
3176 | return Loss::ExactlyZero; |
3177 | } |
3178 | |
3179 | // Ensure the dividend >= divisor initially for the loop below. |
3180 | // Incidentally, this means that the division loop below is |
3181 | // guaranteed to set the integer bit to one. |
3182 | if cmp(dividend, divisor) == Ordering::Less { |
3183 | shift_left(dividend, exp, 1); |
3184 | assert_ne!(cmp(dividend, divisor), Ordering::Less) |
3185 | } |
3186 | |
3187 | // Helper for figuring out the lost fraction. |
3188 | let lost_fraction = |dividend: &[Limb], divisor: &[Limb]| match cmp(dividend, divisor) { |
3189 | Ordering::Greater => Loss::MoreThanHalf, |
3190 | Ordering::Equal => Loss::ExactlyHalf, |
3191 | Ordering::Less => { |
3192 | if is_all_zeros(dividend) { |
3193 | Loss::ExactlyZero |
3194 | } else { |
3195 | Loss::LessThanHalf |
3196 | } |
3197 | } |
3198 | }; |
3199 | |
3200 | // Try to perform a (much faster) short division for small divisors. |
3201 | let divisor_bits = precision - (olsb_divisor - 1); |
3202 | macro_rules! try_short_div { |
3203 | ($W:ty, $H:ty, $half:expr) => { |
3204 | if divisor_bits * 2 <= $half { |
3205 | // Extract the small divisor. |
3206 | let _: Loss = shift_right(divisor, &mut 0, olsb_divisor - 1); |
3207 | let divisor = divisor[0] as $H as $W; |
3208 | |
3209 | // Shift the dividend to produce a quotient with the unit bit set. |
3210 | let top_limb = *dividend.last().unwrap(); |
3211 | let mut rem = (top_limb >> (LIMB_BITS - (divisor_bits - 1))) as $H; |
3212 | shift_left(dividend, &mut 0, divisor_bits - 1); |
3213 | |
3214 | // Apply short division in place on $H (of $half bits) chunks. |
3215 | each_chunk(dividend, $half, |chunk| { |
3216 | let chunk = chunk as $H; |
3217 | let combined = ((rem as $W) << $half) | (chunk as $W); |
3218 | rem = (combined % divisor) as $H; |
3219 | (combined / divisor) as $H as Limb |
3220 | }); |
3221 | quotient.copy_from_slice(dividend); |
3222 | |
3223 | return lost_fraction(&[(rem as Limb) << 1], &[divisor as Limb]); |
3224 | } |
3225 | }; |
3226 | } |
3227 | |
3228 | try_short_div!(u32, u16, 16); |
3229 | try_short_div!(u64, u32, 32); |
3230 | try_short_div!(u128, u64, 64); |
3231 | |
3232 | // Zero the quotient before setting bits in it. |
3233 | for x in &mut quotient[..limbs_for_bits(precision)] { |
3234 | *x = 0; |
3235 | } |
3236 | |
3237 | // Long division. |
3238 | for bit in (0..precision).rev() { |
3239 | if cmp(dividend, divisor) != Ordering::Less { |
3240 | sub(dividend, divisor, 0); |
3241 | set_bit(quotient, bit); |
3242 | } |
3243 | shift_left(dividend, &mut 0, 1); |
3244 | } |
3245 | |
3246 | lost_fraction(dividend, divisor) |
3247 | } |
3248 | } |
3249 |
Definitions
- IeeeFloat
- sig
- exp
- read_only_category_do_not_mutate
- read_only_sign_do_not_mutate
- marker
- Limb
- limbs_for_bits
- DynPrecisionLimbVec
- Loss
- ExactlyZero
- LessThanHalf
- ExactlyHalf
- MoreThanHalf
- NonfiniteBehavior
- IEEE754
- NanOnly
- NegExt
- negate_if
- with_sign
- Semantics
- from_bits
- to_bits
- clone
- ieee_semantics
- Half
- HalfS
- Single
- SingleS
- Double
- DoubleS
- Quad
- QuadS
- BFloat
- BFloatS
- Float8E5M2
- Float8E5M2S
- Float8E4M3FN
- Float8E4M3FNS
- X87DoubleExtendedS
- X87DoubleExtended
- from_bits
- to_bits
- eq
- partial_cmp
- Output
- neg
- fmt
- fmt
- IeeeDefaultExceptionHandling
- result_from_nan
- binop_result_from_either_nan
- qnan
- qnan
- snan
- largest
- smallest_normalized
- add_r
- sub_r
- mul_r
- mul_add_r
- div_r
- ieee_rem
- c_fmod
- round_to_integral
- next_up
- from_bits
- from_u128_r
- from_str_r
- to_bits
- to_u128_r
- cmp_abs_normal
- bitwise_eq
- is_negative
- is_denormal
- is_signaling
- category
- get_exact_inverse
- ilogb
- scalbn_r
- frexp_r
- convert_r
- is_x87_double_extended
- sig
- exp
- overflow_result
- round_away_from_zero
- normalize
- from_hexadecimal_string
- from_decimal_string
- combine
- through_truncation
- is_all_zeros
- olsb
- omsb
- cmp
- get_bit
- set_bit
- shift_left
- shift_right
- extract
- from_limbs
- each_chunk
- increment
- decrement
- add
- sub
- add_or_sub
- widening_mul
- mul
- div
Learn Rust with the experts
Find out more