1 | // Copyright 2016 The SwiftShader Authors. All Rights Reserved. |
2 | // |
3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
4 | // you may not use this file except in compliance with the License. |
5 | // You may obtain a copy of the License at |
6 | // |
7 | // http://www.apache.org/licenses/LICENSE-2.0 |
8 | // |
9 | // Unless required by applicable law or agreed to in writing, software |
10 | // distributed under the License is distributed on an "AS IS" BASIS, |
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | // See the License for the specific language governing permissions and |
13 | // limitations under the License. |
14 | |
15 | #ifndef sw_Half_hpp |
16 | #define sw_Half_hpp |
17 | |
#include "Math.hpp"

#include <algorithm>
#include <cmath>
#include <cstring>
22 | |
23 | namespace sw { |
24 | |
25 | class half |
26 | { |
27 | public: |
28 | half() = default; |
29 | explicit half(float f); |
30 | |
31 | operator float() const; |
32 | |
33 | half &operator=(float f); |
34 | |
35 | private: |
36 | unsigned short fp16i; |
37 | }; |
38 | |
39 | inline half shortAsHalf(short s) |
40 | { |
41 | union |
42 | { |
43 | half h; |
44 | short s; |
45 | } hs; |
46 | |
47 | hs.s = s; |
48 | |
49 | return hs.h; |
50 | } |
51 | |
52 | class RGB9E5 |
53 | { |
54 | union |
55 | { |
56 | struct |
57 | { |
58 | unsigned int R : 9; |
59 | unsigned int G : 9; |
60 | unsigned int B : 9; |
61 | unsigned int E : 5; |
62 | }; |
63 | uint32_t packed; |
64 | }; |
65 | |
66 | public: |
67 | RGB9E5(const float rgb[3]) |
68 | : RGB9E5(rgb[0], rgb[1], rgb[2]) |
69 | { |
70 | } |
71 | |
72 | RGB9E5(float r, float g, float b) |
73 | { |
74 | // Vulkan 1.1.117 section 15.2.1 RGB to Shared Exponent Conversion |
75 | |
76 | // B is the exponent bias (15) |
77 | constexpr int g_sharedexp_bias = 15; |
78 | |
79 | // N is the number of mantissa bits per component (9) |
80 | constexpr int g_sharedexp_mantissabits = 9; |
81 | |
82 | // Emax is the maximum allowed biased exponent value (31) |
83 | constexpr int g_sharedexp_maxexponent = 31; |
84 | |
85 | constexpr float g_sharedexp_max = |
86 | ((static_cast<float>(1 << g_sharedexp_mantissabits) - 1) / |
87 | static_cast<float>(1 << g_sharedexp_mantissabits)) * |
88 | static_cast<float>(1 << (g_sharedexp_maxexponent - g_sharedexp_bias)); |
89 | |
90 | // Clamp components to valid range. NaN becomes 0. |
91 | const float red_c = std::min(a: !(r > 0) ? 0 : r, b: g_sharedexp_max); |
92 | const float green_c = std::min(a: !(g > 0) ? 0 : g, b: g_sharedexp_max); |
93 | const float blue_c = std::min(a: !(b > 0) ? 0 : b, b: g_sharedexp_max); |
94 | |
95 | // We're reducing the mantissa to 9 bits, so we must round up if the next |
96 | // bit is 1. In other words add 0.5 to the new mantissa's position and |
97 | // allow overflow into the exponent so we can scale correctly. |
98 | constexpr int half = 1 << (23 - g_sharedexp_mantissabits); |
99 | const float red_r = bit_cast<float>(source: bit_cast<int>(source: red_c) + half); |
100 | const float green_r = bit_cast<float>(source: bit_cast<int>(source: green_c) + half); |
101 | const float blue_r = bit_cast<float>(source: bit_cast<int>(source: blue_c) + half); |
102 | |
103 | // The largest component determines the shared exponent. It can't be lower |
104 | // than 0 (after bias subtraction) so also limit to the mimimum representable. |
105 | constexpr float min_s = 0.5f / (1 << g_sharedexp_bias); |
106 | float max_s = std::max(a: std::max(a: red_r, b: green_r), b: std::max(a: blue_r, b: min_s)); |
107 | |
108 | // Obtain the reciprocal of the shared exponent by inverting the bits, |
109 | // and scale by the new mantissa's size. Note that the IEEE-754 single-precision |
110 | // format has an implicit leading 1, but this shared component format does not. |
111 | float scale = bit_cast<float>(source: (bit_cast<int>(source: max_s) & 0x7F800000) ^ 0x7F800000) * (1 << (g_sharedexp_mantissabits - 2)); |
112 | |
113 | R = static_cast<unsigned int>(round(lcpp_x: red_c * scale)); |
114 | G = static_cast<unsigned int>(round(lcpp_x: green_c * scale)); |
115 | B = static_cast<unsigned int>(round(lcpp_x: blue_c * scale)); |
116 | E = (bit_cast<unsigned int>(source: max_s) >> 23) - 127 + 15 + 1; |
117 | } |
118 | |
119 | operator unsigned int() const |
120 | { |
121 | return packed; |
122 | } |
123 | |
124 | void toRGB16F(half rgb[3]) const |
125 | { |
126 | constexpr int offset = 24; // Exponent bias (15) + number of mantissa bits per component (9) = 24 |
127 | |
128 | const float factor = (1u << E) * (1.0f / (1 << offset)); |
129 | rgb[0] = half(R * factor); |
130 | rgb[1] = half(G * factor); |
131 | rgb[2] = half(B * factor); |
132 | } |
133 | }; |
134 | |
135 | class R11G11B10F |
136 | { |
137 | union |
138 | { |
139 | struct |
140 | { |
141 | unsigned int R : 11; |
142 | unsigned int G : 11; |
143 | unsigned int B : 10; |
144 | }; |
145 | uint32_t packed; |
146 | }; |
147 | |
148 | public: |
149 | R11G11B10F(const float rgb[3]) |
150 | { |
151 | R = float32ToFloat11(fp32: rgb[0]); |
152 | G = float32ToFloat11(fp32: rgb[1]); |
153 | B = float32ToFloat10(fp32: rgb[2]); |
154 | } |
155 | |
156 | operator unsigned int() const |
157 | { |
158 | return packed; |
159 | } |
160 | |
161 | void toRGB16F(half rgb[3]) const |
162 | { |
163 | rgb[0] = float11ToFloat16(fp11: R); |
164 | rgb[1] = float11ToFloat16(fp11: G); |
165 | rgb[2] = float10ToFloat16(fp10: B); |
166 | } |
167 | |
168 | static inline half float11ToFloat16(unsigned short fp11) |
169 | { |
170 | return shortAsHalf(s: fp11 << 4); // Sign bit 0 |
171 | } |
172 | |
173 | static inline half float10ToFloat16(unsigned short fp10) |
174 | { |
175 | return shortAsHalf(s: fp10 << 5); // Sign bit 0 |
176 | } |
177 | |
178 | static inline unsigned short float32ToFloat11(float fp32) |
179 | { |
180 | const unsigned int float32MantissaMask = 0x7FFFFF; |
181 | const unsigned int float32ExponentMask = 0x7F800000; |
182 | const unsigned int float32SignMask = 0x80000000; |
183 | const unsigned int float32ValueMask = ~float32SignMask; |
184 | const unsigned int float32ExponentFirstBit = 23; |
185 | const unsigned int float32ExponentBias = 127; |
186 | |
187 | const unsigned short float11Max = 0x7BF; |
188 | const unsigned short float11MantissaMask = 0x3F; |
189 | const unsigned short float11ExponentMask = 0x7C0; |
190 | const unsigned short float11BitMask = 0x7FF; |
191 | const unsigned int float11ExponentBias = 14; |
192 | |
193 | const unsigned int float32Maxfloat11 = 0x477E0000; |
194 | const unsigned int float32MinNormfloat11 = 0x38800000; |
195 | const unsigned int float32MinDenormfloat11 = 0x35000080; |
196 | |
197 | const unsigned int float32Bits = bit_cast<unsigned int>(source: fp32); |
198 | const bool float32Sign = (float32Bits & float32SignMask) == float32SignMask; |
199 | |
200 | unsigned int float32Val = float32Bits & float32ValueMask; |
201 | |
202 | if((float32Val & float32ExponentMask) == float32ExponentMask) |
203 | { |
204 | // INF or NAN |
205 | if((float32Val & float32MantissaMask) != 0) |
206 | { |
207 | return float11ExponentMask | |
208 | (((float32Val >> 17) | (float32Val >> 11) | (float32Val >> 6) | (float32Val)) & |
209 | float11MantissaMask); |
210 | } |
211 | else if(float32Sign) |
212 | { |
213 | // -INF is clamped to 0 since float11 is positive only |
214 | return 0; |
215 | } |
216 | else |
217 | { |
218 | return float11ExponentMask; |
219 | } |
220 | } |
221 | else if(float32Sign) |
222 | { |
223 | // float11 is positive only, so clamp to zero |
224 | return 0; |
225 | } |
226 | else if(float32Val > float32Maxfloat11) |
227 | { |
228 | // The number is too large to be represented as a float11, set to max |
229 | return float11Max; |
230 | } |
231 | else if(float32Val < float32MinDenormfloat11) |
232 | { |
233 | // The number is too small to be represented as a denormalized float11, set to 0 |
234 | return 0; |
235 | } |
236 | else |
237 | { |
238 | if(float32Val < float32MinNormfloat11) |
239 | { |
240 | // The number is too small to be represented as a normalized float11 |
241 | // Convert it to a denormalized value. |
242 | const unsigned int shift = (float32ExponentBias - float11ExponentBias) - |
243 | (float32Val >> float32ExponentFirstBit); |
244 | float32Val = |
245 | ((1 << float32ExponentFirstBit) | (float32Val & float32MantissaMask)) >> shift; |
246 | } |
247 | else |
248 | { |
249 | // Rebias the exponent to represent the value as a normalized float11 |
250 | float32Val += 0xC8000000; |
251 | } |
252 | |
253 | return ((float32Val + 0xFFFF + ((float32Val >> 17) & 1)) >> 17) & float11BitMask; |
254 | } |
255 | } |
256 | |
257 | static inline unsigned short float32ToFloat10(float fp32) |
258 | { |
259 | const unsigned int float32MantissaMask = 0x7FFFFF; |
260 | const unsigned int float32ExponentMask = 0x7F800000; |
261 | const unsigned int float32SignMask = 0x80000000; |
262 | const unsigned int float32ValueMask = ~float32SignMask; |
263 | const unsigned int float32ExponentFirstBit = 23; |
264 | const unsigned int float32ExponentBias = 127; |
265 | |
266 | const unsigned short float10Max = 0x3DF; |
267 | const unsigned short float10MantissaMask = 0x1F; |
268 | const unsigned short float10ExponentMask = 0x3E0; |
269 | const unsigned short float10BitMask = 0x3FF; |
270 | const unsigned int float10ExponentBias = 14; |
271 | |
272 | const unsigned int float32Maxfloat10 = 0x477C0000; |
273 | const unsigned int float32MinNormfloat10 = 0x38800000; |
274 | const unsigned int float32MinDenormfloat10 = 0x35800040; |
275 | |
276 | const unsigned int float32Bits = bit_cast<unsigned int>(source: fp32); |
277 | const bool float32Sign = (float32Bits & float32SignMask) == float32SignMask; |
278 | |
279 | unsigned int float32Val = float32Bits & float32ValueMask; |
280 | |
281 | if((float32Val & float32ExponentMask) == float32ExponentMask) |
282 | { |
283 | // INF or NAN |
284 | if((float32Val & float32MantissaMask) != 0) |
285 | { |
286 | return float10ExponentMask | |
287 | (((float32Val >> 18) | (float32Val >> 13) | (float32Val >> 3) | (float32Val)) & |
288 | float10MantissaMask); |
289 | } |
290 | else if(float32Sign) |
291 | { |
292 | // -INF is clamped to 0 since float10 is positive only |
293 | return 0; |
294 | } |
295 | else |
296 | { |
297 | return float10ExponentMask; |
298 | } |
299 | } |
300 | else if(float32Sign) |
301 | { |
302 | // float10 is positive only, so clamp to zero |
303 | return 0; |
304 | } |
305 | else if(float32Val > float32Maxfloat10) |
306 | { |
307 | // The number is too large to be represented as a float10, set to max |
308 | return float10Max; |
309 | } |
310 | else if(float32Val < float32MinDenormfloat10) |
311 | { |
312 | // The number is too small to be represented as a denormalized float10, set to 0 |
313 | return 0; |
314 | } |
315 | else |
316 | { |
317 | if(float32Val < float32MinNormfloat10) |
318 | { |
319 | // The number is too small to be represented as a normalized float10 |
320 | // Convert it to a denormalized value. |
321 | const unsigned int shift = (float32ExponentBias - float10ExponentBias) - |
322 | (float32Val >> float32ExponentFirstBit); |
323 | float32Val = |
324 | ((1 << float32ExponentFirstBit) | (float32Val & float32MantissaMask)) >> shift; |
325 | } |
326 | else |
327 | { |
328 | // Rebias the exponent to represent the value as a normalized float10 |
329 | float32Val += 0xC8000000; |
330 | } |
331 | |
332 | return ((float32Val + 0x1FFFF + ((float32Val >> 18) & 1)) >> 18) & float10BitMask; |
333 | } |
334 | } |
335 | }; |
336 | |
337 | } // namespace sw |
338 | |
339 | #endif // sw_Half_hpp |
340 | |