1 | // Copyright 2009-2021 Intel Corporation |
2 | // SPDX-License-Identifier: Apache-2.0 |
3 | |
4 | #pragma once |
5 | |
6 | // Transcendental functions from "ispc": https://github.com/ispc/ispc/ |
7 | // Most of the transcendental implementations in ispc code come from |
8 | // Solomon Boulos's "syrah": https://github.com/boulos/syrah/ |
9 | |
10 | #include "../simd/simd.h" |
11 | |
12 | namespace embree |
13 | { |
14 | |
15 | namespace fastapprox |
16 | { |
17 | |
18 | template <typename T> |
19 | __forceinline T sin(const T &v) |
20 | { |
21 | static const float piOverTwoVec = 1.57079637050628662109375; |
22 | static const float twoOverPiVec = 0.636619746685028076171875; |
23 | auto scaled = v * twoOverPiVec; |
24 | auto kReal = floor(scaled); |
25 | auto k = toInt(kReal); |
26 | |
27 | // Reduced range version of x |
28 | auto x = v - kReal * piOverTwoVec; |
29 | auto kMod4 = k & 3; |
30 | auto sinUseCos = (kMod4 == 1) | (kMod4 == 3); |
31 | auto flipSign = (kMod4 > 1); |
32 | |
33 | // These coefficients are from sollya with fpminimax(sin(x)/x, [|0, 2, |
34 | // 4, 6, 8, 10|], [|single...|], [0;Pi/2]); |
35 | static const float sinC2 = -0.16666667163372039794921875; |
36 | static const float sinC4 = +8.333347737789154052734375e-3; |
37 | static const float sinC6 = -1.9842604524455964565277099609375e-4; |
38 | static const float sinC8 = +2.760012648650445044040679931640625e-6; |
39 | static const float sinC10 = -2.50293279435709337121807038784027099609375e-8; |
40 | |
41 | static const float cosC2 = -0.5; |
42 | static const float cosC4 = +4.166664183139801025390625e-2; |
43 | static const float cosC6 = -1.388833043165504932403564453125e-3; |
44 | static const float cosC8 = +2.47562347794882953166961669921875e-5; |
45 | static const float cosC10 = -2.59630184018533327616751194000244140625e-7; |
46 | |
47 | auto outside = select(sinUseCos, 1., x); |
48 | auto c2 = select(sinUseCos, T(cosC2), T(sinC2)); |
49 | auto c4 = select(sinUseCos, T(cosC4), T(sinC4)); |
50 | auto c6 = select(sinUseCos, T(cosC6), T(sinC6)); |
51 | auto c8 = select(sinUseCos, T(cosC8), T(sinC8)); |
52 | auto c10 = select(sinUseCos, T(cosC10), T(sinC10)); |
53 | |
54 | auto x2 = x * x; |
55 | auto formula = x2 * c10 + c8; |
56 | formula = x2 * formula + c6; |
57 | formula = x2 * formula + c4; |
58 | formula = x2 * formula + c2; |
59 | formula = x2 * formula + 1.; |
60 | formula *= outside; |
61 | |
62 | formula = select(flipSign, -formula, formula); |
63 | return formula; |
64 | } |
65 | |
66 | template <typename T> |
67 | __forceinline T cos(const T &v) |
68 | { |
69 | static const float piOverTwoVec = 1.57079637050628662109375; |
70 | static const float twoOverPiVec = 0.636619746685028076171875; |
71 | auto scaled = v * twoOverPiVec; |
72 | auto kReal = floor(scaled); |
73 | auto k = toInt(kReal); |
74 | |
75 | // Reduced range version of x |
76 | auto x = v - kReal * piOverTwoVec; |
77 | |
78 | auto kMod4 = k & 3; |
79 | auto cosUseCos = (kMod4 == 0) | (kMod4 == 2); |
80 | auto flipSign = (kMod4 == 1) | (kMod4 == 2); |
81 | |
82 | const float sinC2 = -0.16666667163372039794921875; |
83 | const float sinC4 = +8.333347737789154052734375e-3; |
84 | const float sinC6 = -1.9842604524455964565277099609375e-4; |
85 | const float sinC8 = +2.760012648650445044040679931640625e-6; |
86 | const float sinC10 = -2.50293279435709337121807038784027099609375e-8; |
87 | |
88 | const float cosC2 = -0.5; |
89 | const float cosC4 = +4.166664183139801025390625e-2; |
90 | const float cosC6 = -1.388833043165504932403564453125e-3; |
91 | const float cosC8 = +2.47562347794882953166961669921875e-5; |
92 | const float cosC10 = -2.59630184018533327616751194000244140625e-7; |
93 | |
94 | auto outside = select(cosUseCos, 1., x); |
95 | auto c2 = select(cosUseCos, T(cosC2), T(sinC2)); |
96 | auto c4 = select(cosUseCos, T(cosC4), T(sinC4)); |
97 | auto c6 = select(cosUseCos, T(cosC6), T(sinC6)); |
98 | auto c8 = select(cosUseCos, T(cosC8), T(sinC8)); |
99 | auto c10 = select(cosUseCos, T(cosC10), T(sinC10)); |
100 | |
101 | auto x2 = x * x; |
102 | auto formula = x2 * c10 + c8; |
103 | formula = x2 * formula + c6; |
104 | formula = x2 * formula + c4; |
105 | formula = x2 * formula + c2; |
106 | formula = x2 * formula + 1.; |
107 | formula *= outside; |
108 | |
109 | formula = select(flipSign, -formula, formula); |
110 | return formula; |
111 | } |
112 | |
113 | template <typename T> |
114 | __forceinline void sincos(const T &v, T &sinResult, T &cosResult) |
115 | { |
116 | const float piOverTwoVec = 1.57079637050628662109375; |
117 | const float twoOverPiVec = 0.636619746685028076171875; |
118 | auto scaled = v * twoOverPiVec; |
119 | auto kReal = floor(scaled); |
120 | auto k = toInt(kReal); |
121 | |
122 | // Reduced range version of x |
123 | auto x = v - kReal * piOverTwoVec; |
124 | auto kMod4 = k & 3; |
125 | auto cosUseCos = ((kMod4 == 0) | (kMod4 == 2)); |
126 | auto sinUseCos = ((kMod4 == 1) | (kMod4 == 3)); |
127 | auto sinFlipSign = (kMod4 > 1); |
128 | auto cosFlipSign = ((kMod4 == 1) | (kMod4 == 2)); |
129 | |
130 | const float oneVec = +1.; |
131 | const float sinC2 = -0.16666667163372039794921875; |
132 | const float sinC4 = +8.333347737789154052734375e-3; |
133 | const float sinC6 = -1.9842604524455964565277099609375e-4; |
134 | const float sinC8 = +2.760012648650445044040679931640625e-6; |
135 | const float sinC10 = -2.50293279435709337121807038784027099609375e-8; |
136 | |
137 | const float cosC2 = -0.5; |
138 | const float cosC4 = +4.166664183139801025390625e-2; |
139 | const float cosC6 = -1.388833043165504932403564453125e-3; |
140 | const float cosC8 = +2.47562347794882953166961669921875e-5; |
141 | const float cosC10 = -2.59630184018533327616751194000244140625e-7; |
142 | |
143 | auto x2 = x * x; |
144 | |
145 | auto sinFormula = x2 * sinC10 + sinC8; |
146 | auto cosFormula = x2 * cosC10 + cosC8; |
147 | sinFormula = x2 * sinFormula + sinC6; |
148 | cosFormula = x2 * cosFormula + cosC6; |
149 | |
150 | sinFormula = x2 * sinFormula + sinC4; |
151 | cosFormula = x2 * cosFormula + cosC4; |
152 | |
153 | sinFormula = x2 * sinFormula + sinC2; |
154 | cosFormula = x2 * cosFormula + cosC2; |
155 | |
156 | sinFormula = x2 * sinFormula + oneVec; |
157 | cosFormula = x2 * cosFormula + oneVec; |
158 | |
159 | sinFormula *= x; |
160 | |
161 | sinResult = select(sinUseCos, cosFormula, sinFormula); |
162 | cosResult = select(cosUseCos, cosFormula, sinFormula); |
163 | |
164 | sinResult = select(sinFlipSign, -sinResult, sinResult); |
165 | cosResult = select(cosFlipSign, -cosResult, cosResult); |
166 | } |
167 | |
168 | template <typename T> |
169 | __forceinline T tan(const T &v) |
170 | { |
171 | const float piOverFourVec = 0.785398185253143310546875; |
172 | const float fourOverPiVec = 1.27323949337005615234375; |
173 | |
174 | auto xLt0 = v < 0.; |
175 | auto y = select(xLt0, -v, v); |
176 | auto scaled = y * fourOverPiVec; |
177 | |
178 | auto kReal = floor(scaled); |
179 | auto k = toInt(kReal); |
180 | |
181 | auto x = y - kReal * piOverFourVec; |
182 | |
183 | // If k & 1, x -= Pi/4 |
184 | auto needOffset = (k & 1) != 0; |
185 | x = select(needOffset, x - piOverFourVec, x); |
186 | |
187 | // If k & 3 == (0 or 3) let z = tan_In...(y) otherwise z = -cot_In0To... |
188 | auto kMod4 = k & 3; |
189 | auto useCotan = (kMod4 == 1) | (kMod4 == 2); |
190 | |
191 | const float oneVec = 1.0; |
192 | |
193 | const float tanC2 = +0.33333075046539306640625; |
194 | const float tanC4 = +0.13339905440807342529296875; |
195 | const float tanC6 = +5.3348250687122344970703125e-2; |
196 | const float tanC8 = +2.46033705770969390869140625e-2; |
197 | const float tanC10 = +2.892402000725269317626953125e-3; |
198 | const float tanC12 = +9.500005282461643218994140625e-3; |
199 | |
200 | const float cotC2 = -0.3333333432674407958984375; |
201 | const float cotC4 = -2.222204394638538360595703125e-2; |
202 | const float cotC6 = -2.11752182804048061370849609375e-3; |
203 | const float cotC8 = -2.0846328698098659515380859375e-4; |
204 | const float cotC10 = -2.548247357481159269809722900390625e-5; |
205 | const float cotC12 = -3.5257363606433500535786151885986328125e-7; |
206 | |
207 | auto x2 = x * x; |
208 | T z; |
209 | if (any(useCotan)) |
210 | { |
211 | auto cotVal = x2 * cotC12 + cotC10; |
212 | cotVal = x2 * cotVal + cotC8; |
213 | cotVal = x2 * cotVal + cotC6; |
214 | cotVal = x2 * cotVal + cotC4; |
215 | cotVal = x2 * cotVal + cotC2; |
216 | cotVal = x2 * cotVal + oneVec; |
217 | // The equation is for x * cot(x) but we need -x * cot(x) for the tan part. |
218 | cotVal /= -x; |
219 | z = cotVal; |
220 | } |
221 | auto useTan = !useCotan; |
222 | if (any(useTan)) |
223 | { |
224 | auto tanVal = x2 * tanC12 + tanC10; |
225 | tanVal = x2 * tanVal + tanC8; |
226 | tanVal = x2 * tanVal + tanC6; |
227 | tanVal = x2 * tanVal + tanC4; |
228 | tanVal = x2 * tanVal + tanC2; |
229 | tanVal = x2 * tanVal + oneVec; |
230 | // Equation was for tan(x)/x |
231 | tanVal *= x; |
232 | z = select(useTan, tanVal, z); |
233 | } |
234 | return select(xLt0, -z, z); |
235 | } |
236 | |
237 | template <typename T> |
238 | __forceinline T asin(const T &x0) |
239 | { |
240 | auto isneg = (x0 < 0.f); |
241 | auto x = abs(x0); |
242 | auto isnan = (x > 1.f); |
243 | |
244 | // sollya |
245 | // fpminimax(((asin(x)-pi/2)/-sqrt(1-x)), [|0,1,2,3,4,5|],[|single...|], |
246 | // [1e-20;.9999999999999999]); |
247 | // avg error: 1.1105439e-06, max error 1.3187528e-06 |
248 | auto v = 1.57079517841339111328125f + |
249 | x * (-0.21450997889041900634765625f + |
250 | x * (8.78556668758392333984375e-2f + |
251 | x * (-4.489909112453460693359375e-2f + |
252 | x * (1.928029954433441162109375e-2f + |
253 | x * (-4.3095736764371395111083984375e-3f))))); |
254 | |
255 | v *= -sqrt(1.f - x); |
256 | v = v + 1.57079637050628662109375f; |
257 | |
258 | v = select(v < 0.f, T(0.f), v); |
259 | v = select(isneg, -v, v); |
260 | v = select(isnan, T(cast_i2f(i: 0x7fc00000)), v); |
261 | |
262 | return v; |
263 | } |
264 | |
265 | template <typename T> |
266 | __forceinline T acos(const T &v) |
267 | { |
268 | return 1.57079637050628662109375f - asin(v); |
269 | } |
270 | |
271 | template <typename T> |
272 | __forceinline T atan(const T &v) |
273 | { |
274 | const float piOverTwoVec = 1.57079637050628662109375; |
275 | // atan(-x) = -atan(x) (so flip from negative to positive first) |
276 | // If x > 1 -> atan(x) = Pi/2 - atan(1/x) |
277 | auto xNeg = v < 0.f; |
278 | auto xFlipped = select(xNeg, -v, v); |
279 | |
280 | auto xGt1 = xFlipped > 1.; |
281 | auto x = select(xGt1, rcpSafe(xFlipped), xFlipped); |
282 | |
283 | // These coefficients approximate atan(x)/x |
284 | const float atanC0 = +0.99999988079071044921875; |
285 | const float atanC2 = -0.3333191573619842529296875; |
286 | const float atanC4 = +0.199689209461212158203125; |
287 | const float atanC6 = -0.14015688002109527587890625; |
288 | const float atanC8 = +9.905083477497100830078125e-2; |
289 | const float atanC10 = -5.93664981424808502197265625e-2; |
290 | const float atanC12 = +2.417283318936824798583984375e-2; |
291 | const float atanC14 = -4.6721356920897960662841796875e-3; |
292 | |
293 | auto x2 = x * x; |
294 | auto result = x2 * atanC14 + atanC12; |
295 | result = x2 * result + atanC10; |
296 | result = x2 * result + atanC8; |
297 | result = x2 * result + atanC6; |
298 | result = x2 * result + atanC4; |
299 | result = x2 * result + atanC2; |
300 | result = x2 * result + atanC0; |
301 | result *= x; |
302 | |
303 | result = select(xGt1, piOverTwoVec - result, result); |
304 | result = select(xNeg, -result, result); |
305 | return result; |
306 | } |
307 | |
308 | template <typename T> |
309 | __forceinline T atan2(const T &y, const T &x) |
310 | { |
311 | const float piVec = 3.1415926536; |
312 | // atan2(y, x) = |
313 | // |
314 | // atan2(y > 0, x = +-0) -> Pi/2 |
315 | // atan2(y < 0, x = +-0) -> -Pi/2 |
316 | // atan2(y = +-0, x < +0) -> +-Pi |
317 | // atan2(y = +-0, x >= +0) -> +-0 |
318 | // |
319 | // atan2(y >= 0, x < 0) -> Pi + atan(y/x) |
320 | // atan2(y < 0, x < 0) -> -Pi + atan(y/x) |
321 | // atan2(y, x > 0) -> atan(y/x) |
322 | // |
323 | // and then a bunch of code for dealing with infinities. |
324 | auto yOverX = y * rcpSafe(x); |
325 | auto atanArg = atan(yOverX); |
326 | auto xLt0 = x < 0.f; |
327 | auto yLt0 = y < 0.f; |
328 | auto offset = select(xLt0, |
329 | select(yLt0, T(-piVec), T(piVec)), 0.f); |
330 | return offset + atanArg; |
331 | } |
332 | |
333 | template <typename T> |
334 | __forceinline T exp(const T &v) |
335 | { |
336 | const float ln2Part1 = 0.6931457519; |
337 | const float ln2Part2 = 1.4286067653e-6; |
338 | const float oneOverLn2 = 1.44269502162933349609375; |
339 | |
340 | auto scaled = v * oneOverLn2; |
341 | auto kReal = floor(scaled); |
342 | auto k = toInt(kReal); |
343 | |
344 | // Reduced range version of x |
345 | auto x = v - kReal * ln2Part1; |
346 | x -= kReal * ln2Part2; |
347 | |
348 | // These coefficients are for e^x in [0, ln(2)] |
349 | const float one = 1.; |
350 | const float c2 = 0.4999999105930328369140625; |
351 | const float c3 = 0.166668415069580078125; |
352 | const float c4 = 4.16539050638675689697265625e-2; |
353 | const float c5 = 8.378830738365650177001953125e-3; |
354 | const float c6 = 1.304379315115511417388916015625e-3; |
355 | const float c7 = 2.7555381529964506626129150390625e-4; |
356 | |
357 | auto result = x * c7 + c6; |
358 | result = x * result + c5; |
359 | result = x * result + c4; |
360 | result = x * result + c3; |
361 | result = x * result + c2; |
362 | result = x * result + one; |
363 | result = x * result + one; |
364 | |
365 | // Compute 2^k (should differ for float and double, but I'll avoid |
366 | // it for now and just do floats) |
367 | const int fpbias = 127; |
368 | auto biasedN = k + fpbias; |
369 | auto overflow = kReal > fpbias; |
370 | // Minimum exponent is -126, so if k is <= -127 (k + 127 <= 0) |
371 | // we've got underflow. -127 * ln(2) -> -88.02. So the most |
372 | // negative float input that doesn't result in zero is like -88. |
373 | auto underflow = kReal <= -fpbias; |
374 | const int infBits = 0x7f800000; |
375 | biasedN <<= 23; |
376 | // Reinterpret this thing as float |
377 | auto twoToTheN = asFloat(biasedN); |
378 | // Handle both doubles and floats (hopefully eliding the copy for float) |
379 | auto elemtype2n = twoToTheN; |
380 | result *= elemtype2n; |
381 | result = select(overflow, cast_i2f(i: infBits), result); |
382 | result = select(underflow, 0., result); |
383 | return result; |
384 | } |
385 | |
386 | // Range reduction for logarithms takes log(x) -> log(2^n * y) -> n |
387 | // * log(2) + log(y) where y is the reduced range (usually in [1/2, 1)). |
388 | template <typename T, typename R> |
389 | __forceinline void __rangeReduceLog(const T &input, |
390 | T &reduced, |
391 | R &exponent) |
392 | { |
393 | auto intVersion = asInt(input); |
394 | // single precision = SEEE EEEE EMMM MMMM MMMM MMMM MMMM MMMM |
395 | // exponent mask = 0111 1111 1000 0000 0000 0000 0000 0000 |
396 | // 0x7 0xF 0x8 0x0 0x0 0x0 0x0 0x0 |
397 | // non-exponent = 1000 0000 0111 1111 1111 1111 1111 1111 |
398 | // = 0x8 0x0 0x7 0xF 0xF 0xF 0xF 0xF |
399 | |
400 | //const int exponentMask(0x7F800000) |
401 | static const int nonexponentMask = 0x807FFFFF; |
402 | |
403 | // We want the reduced version to have an exponent of -1 which is |
404 | // -1 + 127 after biasing or 126 |
405 | static const int exponentNeg1 = (126l << 23); |
406 | // NOTE(boulos): We don't need to mask anything out since we know |
407 | // the sign bit has to be 0. If it's 1, we need to return infinity/nan |
408 | // anyway (log(x), x = +-0 -> infinity, x < 0 -> NaN). |
409 | auto biasedExponent = intVersion >> 23; // This number is [0, 255] but it means [-127, 128] |
410 | |
411 | auto offsetExponent = biasedExponent + 1; // Treat the number as if it were 2^{e+1} * (1.m)/2 |
412 | exponent = offsetExponent - 127; // get the real value |
413 | |
414 | // Blend the offset_exponent with the original input (do this in |
415 | // int for now, until I decide if float can have & and ¬) |
416 | auto blended = (intVersion & nonexponentMask) | (exponentNeg1); |
417 | reduced = asFloat(blended); |
418 | } |
419 | |
420 | template <typename T> struct ExponentType { }; |
421 | template <int N> struct ExponentType<vfloat_impl<N>> { typedef vint<N> Ty; }; |
422 | template <> struct ExponentType<float> { typedef int Ty; }; |
423 | |
424 | template <typename T> |
425 | __forceinline T log(const T &v) |
426 | { |
427 | T reduced; |
428 | typename ExponentType<T>::Ty exponent; |
429 | |
430 | const int nanBits = 0x7fc00000; |
431 | const int negInfBits = 0xFF800000; |
432 | const float nan = cast_i2f(i: nanBits); |
433 | const float negInf = cast_i2f(i: negInfBits); |
434 | auto useNan = v < 0.; |
435 | auto useInf = v == 0.; |
436 | auto exceptional = useNan | useInf; |
437 | const float one = 1.0; |
438 | |
439 | auto patched = select(exceptional, one, v); |
440 | __rangeReduceLog(patched, reduced, exponent); |
441 | |
442 | const float ln2 = 0.693147182464599609375; |
443 | |
444 | auto x1 = one - reduced; |
445 | const float c1 = +0.50000095367431640625; |
446 | const float c2 = +0.33326041698455810546875; |
447 | const float c3 = +0.2519190013408660888671875; |
448 | const float c4 = +0.17541764676570892333984375; |
449 | const float c5 = +0.3424419462680816650390625; |
450 | const float c6 = -0.599632322788238525390625; |
451 | const float c7 = +1.98442304134368896484375; |
452 | const float c8 = -2.4899270534515380859375; |
453 | const float c9 = +1.7491014003753662109375; |
454 | |
455 | auto result = x1 * c9 + c8; |
456 | result = x1 * result + c7; |
457 | result = x1 * result + c6; |
458 | result = x1 * result + c5; |
459 | result = x1 * result + c4; |
460 | result = x1 * result + c3; |
461 | result = x1 * result + c2; |
462 | result = x1 * result + c1; |
463 | result = x1 * result + one; |
464 | |
465 | // Equation was for -(ln(red)/(1-red)) |
466 | result *= -x1; |
467 | result += toFloat(exponent) * ln2; |
468 | |
469 | return select(exceptional, |
470 | select(useNan, T(nan), T(negInf)), |
471 | result); |
472 | } |
473 | |
474 | template <typename T> |
475 | __forceinline T pow(const T &x, const T &y) |
476 | { |
477 | auto x1 = abs(x); |
478 | auto z = exp(y * log(x1)); |
479 | |
480 | // Handle special cases |
481 | const float twoOver23 = 8388608.0f; |
482 | auto yInt = y == round(y); |
483 | auto yOddInt = select(yInt, asInt(abs(y) + twoOver23) << 31, 0); // set sign bit |
484 | |
485 | // x == 0 |
486 | z = select(x == 0.0f, |
487 | select(y < 0.0f, T(inf) | signmsk(x), |
488 | select(y == 0.0f, T(1.0f), asFloat(yOddInt) & x)), z); |
489 | |
490 | // x < 0 |
491 | auto xNegative = x < 0.0f; |
492 | if (any(xNegative)) |
493 | { |
494 | auto z1 = z | asFloat(yOddInt); |
495 | z1 = select(yInt, z1, std::numeric_limits<float>::quiet_NaN()); |
496 | z = select(xNegative, z1, z); |
497 | } |
498 | |
499 | auto xFinite = isfinite(x); |
500 | auto yFinite = isfinite(y); |
501 | if (all(xFinite & yFinite)) |
502 | return z; |
503 | |
504 | // x finite and y infinite |
505 | z = select(andn(xFinite, yFinite), |
506 | select(x1 == 1.0f, 1.0f, |
507 | select((x1 > 1.0f) ^ (y < 0.0f), inf, T(0.0f))), z); |
508 | |
509 | // x infinite |
510 | z = select(xFinite, z, |
511 | select(y == 0.0f, 1.0f, |
512 | select(y < 0.0f, T(0.0f), inf) | (asFloat(yOddInt) & x))); |
513 | |
514 | return z; |
515 | } |
516 | |
517 | template <typename T> |
518 | __forceinline T pow(const T &x, float y) |
519 | { |
520 | return pow(x, T(y)); |
521 | } |
522 | |
523 | } // namespace fastapprox |
524 | |
525 | } // namespace embree |
526 | |