1 | /////////////////////////////////////////////////////////////////////////// |
2 | // |
3 | // Copyright (c) 2002, Industrial Light & Magic, a division of Lucas |
4 | // Digital Ltd. LLC |
5 | // |
6 | // All rights reserved. |
7 | // |
8 | // Redistribution and use in source and binary forms, with or without |
9 | // modification, are permitted provided that the following conditions are |
10 | // met: |
11 | // * Redistributions of source code must retain the above copyright |
12 | // notice, this list of conditions and the following disclaimer. |
13 | // * Redistributions in binary form must reproduce the above |
14 | // copyright notice, this list of conditions and the following disclaimer |
15 | // in the documentation and/or other materials provided with the |
16 | // distribution. |
17 | // * Neither the name of Industrial Light & Magic nor the names of |
18 | // its contributors may be used to endorse or promote products derived |
19 | // from this software without specific prior written permission. |
20 | // |
21 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
22 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
23 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
24 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
25 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
26 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
27 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
28 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
29 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
30 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
31 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
32 | // |
33 | /////////////////////////////////////////////////////////////////////////// |
34 | |
35 | // Primary authors: |
36 | // Florian Kainz <kainz@ilm.com> |
37 | // Rod Bogart <rgb@ilm.com> |
38 | |
39 | //--------------------------------------------------------------------------- |
40 | // |
41 | // half -- a 16-bit floating point number class: |
42 | // |
43 | // Type half can represent positive and negative numbers whose |
44 | // magnitude is between roughly 6.1e-5 and 6.5e+4 with a relative |
45 | // error of 9.8e-4; numbers smaller than 6.1e-5 can be represented |
46 | // with an absolute error of 6.0e-8. All integers from -2048 to |
47 | // +2048 can be represented exactly. |
48 | // |
49 | // Type half behaves (almost) like the built-in C++ floating point |
50 | // types. In arithmetic expressions, half, float and double can be |
51 | // mixed freely. Here are a few examples: |
52 | // |
53 | // half a (3.5); |
54 | // float b (a + sqrt (a)); |
55 | // a += b; |
56 | // b += a; |
57 | // b = a + 7; |
58 | // |
59 | // Conversions from half to float are lossless; all half numbers |
60 | // are exactly representable as floats. |
61 | // |
62 | // Conversions from float to half may not preserve a float's value |
63 | // exactly. If a float is not representable as a half, then the |
64 | // float value is rounded to the nearest representable half. If a |
65 | // float value is exactly in the middle between the two closest |
66 | // representable half values, then the float value is rounded to |
67 | // the closest half whose least significant bit is zero. |
68 | // |
69 | // Overflows during float-to-half conversions cause arithmetic |
70 | // exceptions. An overflow occurs when the float value to be |
71 | // converted is too large to be represented as a half, or if the |
72 | // float value is an infinity or a NAN. |
73 | // |
74 | // The implementation of type half makes the following assumptions |
75 | // about the implementation of the built-in C++ types: |
76 | // |
77 | // float is an IEEE 754 single-precision number |
78 | // sizeof (float) == 4 |
79 | // sizeof (unsigned int) == sizeof (float) |
80 | // alignof (unsigned int) == alignof (float) |
81 | // sizeof (unsigned short) == 2 |
82 | // |
83 | //--------------------------------------------------------------------------- |
84 | |
85 | #ifndef _HALF_H_ |
86 | #define _HALF_H_ |
87 | |
88 | #include "halfExport.h" // for definition of HALF_EXPORT |
89 | #include <iostream> |
90 | |
91 | class half |
92 | { |
93 | public: |
94 | |
95 | //------------- |
96 | // Constructors |
97 | //------------- |
98 | |
99 | half () = default; // no initialization |
100 | half (float f); |
101 | // rule of 5 |
102 | ~half () = default; |
103 | half (const half &) = default; |
104 | half (half &&) noexcept = default; |
105 | |
106 | //-------------------- |
107 | // Conversion to float |
108 | //-------------------- |
109 | |
110 | operator float () const; |
111 | |
112 | |
113 | //------------ |
114 | // Unary minus |
115 | //------------ |
116 | |
117 | half operator - () const; |
118 | |
119 | |
120 | //----------- |
121 | // Assignment |
122 | //----------- |
123 | |
124 | half & operator = (const half &h) = default; |
125 | half & operator = (half &&h) noexcept = default; |
126 | half & operator = (float f); |
127 | |
128 | half & operator += (half h); |
129 | half & operator += (float f); |
130 | |
131 | half & operator -= (half h); |
132 | half & operator -= (float f); |
133 | |
134 | half & operator *= (half h); |
135 | half & operator *= (float f); |
136 | |
137 | half & operator /= (half h); |
138 | half & operator /= (float f); |
139 | |
140 | |
141 | //--------------------------------------------------------- |
142 | // Round to n-bit precision (n should be between 0 and 10). |
143 | // After rounding, the significand's 10-n least significant |
144 | // bits will be zero. |
145 | //--------------------------------------------------------- |
146 | |
147 | half round (unsigned int n) const; |
148 | |
149 | |
150 | //-------------------------------------------------------------------- |
151 | // Classification: |
152 | // |
153 | // h.isFinite() returns true if h is a normalized number, |
154 | // a denormalized number or zero |
155 | // |
156 | // h.isNormalized() returns true if h is a normalized number |
157 | // |
158 | // h.isDenormalized() returns true if h is a denormalized number |
159 | // |
160 | // h.isZero() returns true if h is zero |
161 | // |
162 | // h.isNan() returns true if h is a NAN |
163 | // |
164 | // h.isInfinity() returns true if h is a positive |
165 | // or a negative infinity |
166 | // |
167 | // h.isNegative() returns true if the sign bit of h |
168 | // is set (negative) |
169 | //-------------------------------------------------------------------- |
170 | |
171 | bool isFinite () const; |
172 | bool isNormalized () const; |
173 | bool isDenormalized () const; |
174 | bool isZero () const; |
175 | bool isNan () const; |
176 | bool isInfinity () const; |
177 | bool isNegative () const; |
178 | |
179 | |
180 | //-------------------------------------------- |
181 | // Special values |
182 | // |
183 | // posInf() returns +infinity |
184 | // |
185 | // negInf() returns -infinity |
186 | // |
187 | // qNan() returns a NAN with the bit |
188 | // pattern 0111111111111111 |
189 | // |
190 | // sNan() returns a NAN with the bit |
191 | // pattern 0111110111111111 |
192 | //-------------------------------------------- |
193 | |
194 | static half posInf (); |
195 | static half negInf (); |
196 | static half qNan (); |
197 | static half sNan (); |
198 | |
199 | |
200 | //-------------------------------------- |
201 | // Access to the internal representation |
202 | //-------------------------------------- |
203 | |
204 | HALF_EXPORT unsigned short bits () const; |
205 | HALF_EXPORT void setBits (unsigned short bits); |
206 | |
207 | |
208 | public: |
209 | |
210 | union uif |
211 | { |
212 | unsigned int i; |
213 | float f; |
214 | }; |
215 | |
216 | private: |
217 | |
218 | HALF_EXPORT static short convert (int i); |
219 | HALF_EXPORT static float overflow (); |
220 | |
221 | unsigned short _h; |
222 | |
223 | HALF_EXPORT static const uif _toFloat[1 << 16]; |
224 | HALF_EXPORT static const unsigned short _eLut[1 << 9]; |
225 | }; |
226 | |
227 | |
228 | |
229 | //----------- |
230 | // Stream I/O |
231 | //----------- |
232 | |
233 | HALF_EXPORT std::ostream & operator << (std::ostream &os, half h); |
234 | HALF_EXPORT std::istream & operator >> (std::istream &is, half &h); |
235 | |
236 | |
237 | //---------- |
238 | // Debugging |
239 | //---------- |
240 | |
241 | HALF_EXPORT void printBits (std::ostream &os, half h); |
242 | HALF_EXPORT void printBits (std::ostream &os, float f); |
243 | HALF_EXPORT void printBits (char c[19], half h); |
244 | HALF_EXPORT void printBits (char c[35], float f); |
245 | |
246 | |
247 | //------------------------------------------------------------------------- |
248 | // Limits |
249 | // |
250 | // Visual C++ will complain if HALF_MIN, HALF_NRM_MIN etc. are not float |
251 | // constants, but at least one other compiler (gcc 2.96) produces incorrect |
252 | // results if they are. |
253 | //------------------------------------------------------------------------- |
254 | |
255 | #if (defined _WIN32 || defined _WIN64) && defined _MSC_VER |
256 | |
257 | #define HALF_MIN 5.96046448e-08f // Smallest positive half |
258 | |
259 | #define HALF_NRM_MIN 6.10351562e-05f // Smallest positive normalized half |
260 | |
261 | #define HALF_MAX 65504.0f // Largest positive half |
262 | |
263 | #define HALF_EPSILON 0.00097656f // Smallest positive e for which |
264 | // half (1.0 + e) != half (1.0) |
265 | #else |
266 | |
267 | #define HALF_MIN 5.96046448e-08 // Smallest positive half |
268 | |
269 | #define HALF_NRM_MIN 6.10351562e-05 // Smallest positive normalized half |
270 | |
271 | #define HALF_MAX 65504.0 // Largest positive half |
272 | |
273 | #define HALF_EPSILON 0.00097656 // Smallest positive e for which |
274 | // half (1.0 + e) != half (1.0) |
275 | #endif |
276 | |
277 | |
278 | #define HALF_MANT_DIG 11 // Number of digits in mantissa |
279 | // (significand + hidden leading 1) |
280 | |
281 | // |
282 | // floor( (HALF_MANT_DIG - 1) * log10(2) ) => 3.01... -> 3 |
283 | #define HALF_DIG 3 // Number of base 10 digits that |
284 | // can be represented without change |
285 | |
286 | // ceil(HALF_MANT_DIG * log10(2) + 1) => 4.31... -> 5 |
287 | #define HALF_DECIMAL_DIG 5 // Number of base-10 digits that are |
288 | // necessary to uniquely represent all |
289 | // distinct values |
290 | |
291 | #define HALF_RADIX 2 // Base of the exponent |
292 | |
293 | #define HALF_MIN_EXP -13 // Minimum negative integer such that |
294 | // HALF_RADIX raised to the power of |
295 | // one less than that integer is a |
296 | // normalized half |
297 | |
298 | #define HALF_MAX_EXP 16 // Maximum positive integer such that |
299 | // HALF_RADIX raised to the power of |
300 | // one less than that integer is a |
301 | // normalized half |
302 | |
303 | #define HALF_MIN_10_EXP -4 // Minimum positive integer such |
304 | // that 10 raised to that power is |
305 | // a normalized half |
306 | |
307 | #define HALF_MAX_10_EXP 4 // Maximum positive integer such |
308 | // that 10 raised to that power is |
309 | // a normalized half |
310 | |
311 | |
312 | //--------------------------------------------------------------------------- |
313 | // |
314 | // Implementation -- |
315 | // |
316 | // Representation of a float: |
317 | // |
318 | // We assume that a float, f, is an IEEE 754 single-precision |
319 | // floating point number, whose bits are arranged as follows: |
320 | // |
321 | // 31 (msb) |
322 | // | |
323 | // | 30 23 |
324 | // | | | |
325 | // | | | 22 0 (lsb) |
326 | // | | | | | |
327 | // X XXXXXXXX XXXXXXXXXXXXXXXXXXXXXXX |
328 | // |
329 | // s e m |
330 | // |
331 | // S is the sign-bit, e is the exponent and m is the significand. |
332 | // |
333 | // If e is between 1 and 254, f is a normalized number: |
334 | // |
335 | // s e-127 |
336 | // f = (-1) * 2 * 1.m |
337 | // |
338 | // If e is 0, and m is not zero, f is a denormalized number: |
339 | // |
340 | // s -126 |
341 | // f = (-1) * 2 * 0.m |
342 | // |
343 | // If e and m are both zero, f is zero: |
344 | // |
345 | // f = 0.0 |
346 | // |
347 | // If e is 255, f is an "infinity" or "not a number" (NAN), |
348 | // depending on whether m is zero or not. |
349 | // |
350 | // Examples: |
351 | // |
352 | // 0 00000000 00000000000000000000000 = 0.0 |
353 | // 0 01111110 00000000000000000000000 = 0.5 |
354 | // 0 01111111 00000000000000000000000 = 1.0 |
355 | // 0 10000000 00000000000000000000000 = 2.0 |
356 | // 0 10000000 10000000000000000000000 = 3.0 |
357 | // 1 10000101 11110000010000000000000 = -124.0625 |
358 | // 0 11111111 00000000000000000000000 = +infinity |
359 | // 1 11111111 00000000000000000000000 = -infinity |
360 | // 0 11111111 10000000000000000000000 = NAN |
361 | // 1 11111111 11111111111111111111111 = NAN |
362 | // |
363 | // Representation of a half: |
364 | // |
365 | // Here is the bit-layout for a half number, h: |
366 | // |
367 | // 15 (msb) |
368 | // | |
369 | // | 14 10 |
370 | // | | | |
371 | // | | | 9 0 (lsb) |
372 | // | | | | | |
373 | // X XXXXX XXXXXXXXXX |
374 | // |
375 | // s e m |
376 | // |
377 | // S is the sign-bit, e is the exponent and m is the significand. |
378 | // |
379 | // If e is between 1 and 30, h is a normalized number: |
380 | // |
381 | // s e-15 |
382 | // h = (-1) * 2 * 1.m |
383 | // |
384 | // If e is 0, and m is not zero, h is a denormalized number: |
385 | // |
386 | // S -14 |
387 | // h = (-1) * 2 * 0.m |
388 | // |
389 | // If e and m are both zero, h is zero: |
390 | // |
391 | // h = 0.0 |
392 | // |
393 | // If e is 31, h is an "infinity" or "not a number" (NAN), |
394 | // depending on whether m is zero or not. |
395 | // |
396 | // Examples: |
397 | // |
398 | // 0 00000 0000000000 = 0.0 |
399 | // 0 01110 0000000000 = 0.5 |
400 | // 0 01111 0000000000 = 1.0 |
401 | // 0 10000 0000000000 = 2.0 |
402 | // 0 10000 1000000000 = 3.0 |
403 | // 1 10101 1111000001 = -124.0625 |
404 | // 0 11111 0000000000 = +infinity |
405 | // 1 11111 0000000000 = -infinity |
406 | // 0 11111 1000000000 = NAN |
407 | // 1 11111 1111111111 = NAN |
408 | // |
409 | // Conversion: |
410 | // |
411 | // Converting from a float to a half requires some non-trivial bit |
412 | // manipulations. In some cases, this makes conversion relatively |
413 | // slow, but the most common case is accelerated via table lookups. |
414 | // |
415 | // Converting back from a half to a float is easier because we don't |
416 | // have to do any rounding. In addition, there are only 65536 |
417 | // different half numbers; we can convert each of those numbers once |
418 | // and store the results in a table. Later, all conversions can be |
419 | // done using only simple table lookups. |
420 | // |
421 | //--------------------------------------------------------------------------- |
422 | |
423 | |
424 | //---------------------------- |
425 | // Half-from-float constructor |
426 | //---------------------------- |
427 | |
428 | inline |
429 | half::half (float f) |
430 | { |
431 | uif x; |
432 | |
433 | x.f = f; |
434 | |
435 | if (f == 0) |
436 | { |
437 | // |
438 | // Common special case - zero. |
439 | // Preserve the zero's sign bit. |
440 | // |
441 | |
442 | _h = (x.i >> 16); |
443 | } |
444 | else |
445 | { |
446 | // |
447 | // We extract the combined sign and exponent, e, from our |
448 | // floating-point number, f. Then we convert e to the sign |
449 | // and exponent of the half number via a table lookup. |
450 | // |
451 | // For the most common case, where a normalized half is produced, |
452 | // the table lookup returns a non-zero value; in this case, all |
453 | // we have to do is round f's significand to 10 bits and combine |
454 | // the result with e. |
455 | // |
456 | // For all other cases (overflow, zeroes, denormalized numbers |
457 | // resulting from underflow, infinities and NANs), the table |
458 | // lookup returns zero, and we call a longer, non-inline function |
459 | // to do the float-to-half conversion. |
460 | // |
461 | |
462 | int e = (x.i >> 23) & 0x000001ff; |
463 | |
464 | e = _eLut[e]; |
465 | |
466 | if (e) |
467 | { |
468 | // |
469 | // Simple case - round the significand, m, to 10 |
470 | // bits and combine it with the sign and exponent. |
471 | // |
472 | |
473 | int m = x.i & 0x007fffff; |
474 | _h = e + ((m + 0x00000fff + ((m >> 13) & 1)) >> 13); |
475 | } |
476 | else |
477 | { |
478 | // |
479 | // Difficult case - call a function. |
480 | // |
481 | |
482 | _h = convert (i: x.i); |
483 | } |
484 | } |
485 | } |
486 | |
487 | |
488 | //------------------------------------------ |
489 | // Half-to-float conversion via table lookup |
490 | //------------------------------------------ |
491 | |
492 | inline |
493 | half::operator float () const |
494 | { |
495 | return _toFloat[_h].f; |
496 | } |
497 | |
498 | |
499 | //------------------------- |
500 | // Round to n-bit precision |
501 | //------------------------- |
502 | |
503 | inline half |
504 | half::round (unsigned int n) const |
505 | { |
506 | // |
507 | // Parameter check. |
508 | // |
509 | |
510 | if (n >= 10) |
511 | return *this; |
512 | |
513 | // |
514 | // Disassemble h into the sign, s, |
515 | // and the combined exponent and significand, e. |
516 | // |
517 | |
518 | unsigned short s = _h & 0x8000; |
519 | unsigned short e = _h & 0x7fff; |
520 | |
521 | // |
522 | // Round the exponent and significand to the nearest value |
523 | // where ones occur only in the (10-n) most significant bits. |
524 | // Note that the exponent adjusts automatically if rounding |
525 | // up causes the significand to overflow. |
526 | // |
527 | |
528 | e >>= 9 - n; |
529 | e += e & 1; |
530 | e <<= 9 - n; |
531 | |
532 | // |
533 | // Check for exponent overflow. |
534 | // |
535 | |
536 | if (e >= 0x7c00) |
537 | { |
538 | // |
539 | // Overflow occurred -- truncate instead of rounding. |
540 | // |
541 | |
542 | e = _h; |
543 | e >>= 10 - n; |
544 | e <<= 10 - n; |
545 | } |
546 | |
547 | // |
548 | // Put the original sign bit back. |
549 | // |
550 | |
551 | half h; |
552 | h._h = s | e; |
553 | |
554 | return h; |
555 | } |
556 | |
557 | |
558 | //----------------------- |
559 | // Other inline functions |
560 | //----------------------- |
561 | |
562 | inline half |
563 | half::operator - () const |
564 | { |
565 | half h; |
566 | h._h = _h ^ 0x8000; |
567 | return h; |
568 | } |
569 | |
570 | |
571 | inline half & |
572 | half::operator = (float f) |
573 | { |
574 | *this = half (f); |
575 | return *this; |
576 | } |
577 | |
578 | |
579 | inline half & |
580 | half::operator += (half h) |
581 | { |
582 | *this = half (float (*this) + float (h)); |
583 | return *this; |
584 | } |
585 | |
586 | |
587 | inline half & |
588 | half::operator += (float f) |
589 | { |
590 | *this = half (float (*this) + f); |
591 | return *this; |
592 | } |
593 | |
594 | |
595 | inline half & |
596 | half::operator -= (half h) |
597 | { |
598 | *this = half (float (*this) - float (h)); |
599 | return *this; |
600 | } |
601 | |
602 | |
603 | inline half & |
604 | half::operator -= (float f) |
605 | { |
606 | *this = half (float (*this) - f); |
607 | return *this; |
608 | } |
609 | |
610 | |
611 | inline half & |
612 | half::operator *= (half h) |
613 | { |
614 | *this = half (float (*this) * float (h)); |
615 | return *this; |
616 | } |
617 | |
618 | |
619 | inline half & |
620 | half::operator *= (float f) |
621 | { |
622 | *this = half (float (*this) * f); |
623 | return *this; |
624 | } |
625 | |
626 | |
627 | inline half & |
628 | half::operator /= (half h) |
629 | { |
630 | *this = half (float (*this) / float (h)); |
631 | return *this; |
632 | } |
633 | |
634 | |
635 | inline half & |
636 | half::operator /= (float f) |
637 | { |
638 | *this = half (float (*this) / f); |
639 | return *this; |
640 | } |
641 | |
642 | |
643 | inline bool |
644 | half::isFinite () const |
645 | { |
646 | unsigned short e = (_h >> 10) & 0x001f; |
647 | return e < 31; |
648 | } |
649 | |
650 | |
651 | inline bool |
652 | half::isNormalized () const |
653 | { |
654 | unsigned short e = (_h >> 10) & 0x001f; |
655 | return e > 0 && e < 31; |
656 | } |
657 | |
658 | |
659 | inline bool |
660 | half::isDenormalized () const |
661 | { |
662 | unsigned short e = (_h >> 10) & 0x001f; |
663 | unsigned short m = _h & 0x3ff; |
664 | return e == 0 && m != 0; |
665 | } |
666 | |
667 | |
668 | inline bool |
669 | half::isZero () const |
670 | { |
671 | return (_h & 0x7fff) == 0; |
672 | } |
673 | |
674 | |
675 | inline bool |
676 | half::isNan () const |
677 | { |
678 | unsigned short e = (_h >> 10) & 0x001f; |
679 | unsigned short m = _h & 0x3ff; |
680 | return e == 31 && m != 0; |
681 | } |
682 | |
683 | |
684 | inline bool |
685 | half::isInfinity () const |
686 | { |
687 | unsigned short e = (_h >> 10) & 0x001f; |
688 | unsigned short m = _h & 0x3ff; |
689 | return e == 31 && m == 0; |
690 | } |
691 | |
692 | |
693 | inline bool |
694 | half::isNegative () const |
695 | { |
696 | return (_h & 0x8000) != 0; |
697 | } |
698 | |
699 | |
700 | inline half |
701 | half::posInf () |
702 | { |
703 | half h; |
704 | h._h = 0x7c00; |
705 | return h; |
706 | } |
707 | |
708 | |
709 | inline half |
710 | half::negInf () |
711 | { |
712 | half h; |
713 | h._h = 0xfc00; |
714 | return h; |
715 | } |
716 | |
717 | |
718 | inline half |
719 | half::qNan () |
720 | { |
721 | half h; |
722 | h._h = 0x7fff; |
723 | return h; |
724 | } |
725 | |
726 | |
727 | inline half |
728 | half::sNan () |
729 | { |
730 | half h; |
731 | h._h = 0x7dff; |
732 | return h; |
733 | } |
734 | |
735 | |
736 | inline unsigned short |
737 | half::bits () const |
738 | { |
739 | return _h; |
740 | } |
741 | |
742 | |
743 | inline void |
744 | half::setBits (unsigned short bits) |
745 | { |
746 | _h = bits; |
747 | } |
748 | |
749 | #endif |
750 | |