| 1 | // Copyright 2009-2021 Intel Corporation | 
| 2 | // SPDX-License-Identifier: Apache-2.0 | 
| 3 |  | 
| 4 | #pragma once | 
| 5 |  | 
| 6 | // Transcendental functions from "ispc": https://github.com/ispc/ispc/ | 
| 7 | // Most of the transcendental implementations in ispc code come from | 
| 8 | // Solomon Boulos's "syrah": https://github.com/boulos/syrah/ | 
| 9 |  | 
| 10 | #include "../simd/simd.h" | 
| 11 |  | 
| 12 | namespace embree | 
| 13 | { | 
| 14 |  | 
| 15 | namespace fastapprox | 
| 16 | { | 
| 17 |  | 
| 18 | template <typename T> | 
| 19 | __forceinline T sin(const T &v) | 
| 20 | { | 
| 21 |   static const float piOverTwoVec = 1.57079637050628662109375; | 
| 22 |   static const float twoOverPiVec = 0.636619746685028076171875; | 
| 23 |   auto scaled = v * twoOverPiVec; | 
| 24 |   auto kReal = floor(scaled); | 
| 25 |   auto k = toInt(kReal); | 
| 26 |  | 
| 27 |   // Reduced range version of x | 
| 28 |   auto x = v - kReal * piOverTwoVec; | 
| 29 |   auto kMod4 = k & 3; | 
| 30 |   auto sinUseCos = (kMod4 == 1) | (kMod4 == 3); | 
| 31 |   auto flipSign = (kMod4 > 1); | 
| 32 |  | 
| 33 |   // These coefficients are from sollya with fpminimax(sin(x)/x, [|0, 2, | 
| 34 |   // 4, 6, 8, 10|], [|single...|], [0;Pi/2]); | 
| 35 |   static const float sinC2  = -0.16666667163372039794921875; | 
| 36 |   static const float sinC4  = +8.333347737789154052734375e-3; | 
| 37 |   static const float sinC6  = -1.9842604524455964565277099609375e-4; | 
| 38 |   static const float sinC8  = +2.760012648650445044040679931640625e-6; | 
| 39 |   static const float sinC10 = -2.50293279435709337121807038784027099609375e-8; | 
| 40 |  | 
| 41 |   static const float cosC2  = -0.5; | 
| 42 |   static const float cosC4  = +4.166664183139801025390625e-2; | 
| 43 |   static const float cosC6  = -1.388833043165504932403564453125e-3; | 
| 44 |   static const float cosC8  = +2.47562347794882953166961669921875e-5; | 
| 45 |   static const float cosC10 = -2.59630184018533327616751194000244140625e-7; | 
| 46 |  | 
| 47 |   auto outside = select(sinUseCos, 1., x); | 
| 48 |   auto c2  = select(sinUseCos, T(cosC2),  T(sinC2)); | 
| 49 |   auto c4  = select(sinUseCos, T(cosC4),  T(sinC4)); | 
| 50 |   auto c6  = select(sinUseCos, T(cosC6),  T(sinC6)); | 
| 51 |   auto c8  = select(sinUseCos, T(cosC8),  T(sinC8)); | 
| 52 |   auto c10 = select(sinUseCos, T(cosC10), T(sinC10)); | 
| 53 |  | 
| 54 |   auto x2 = x * x; | 
| 55 |   auto formula = x2 * c10 + c8; | 
| 56 |   formula = x2 * formula + c6; | 
| 57 |   formula = x2 * formula + c4; | 
| 58 |   formula = x2 * formula + c2; | 
| 59 |   formula = x2 * formula + 1.; | 
| 60 |   formula *= outside; | 
| 61 |  | 
| 62 |   formula = select(flipSign, -formula, formula); | 
| 63 |   return formula; | 
| 64 | } | 
| 65 |  | 
| 66 | template <typename T> | 
| 67 | __forceinline T cos(const T &v) | 
| 68 | { | 
| 69 |   static const float piOverTwoVec = 1.57079637050628662109375; | 
| 70 |   static const float twoOverPiVec = 0.636619746685028076171875; | 
| 71 |   auto scaled = v * twoOverPiVec; | 
| 72 |   auto kReal = floor(scaled); | 
| 73 |   auto k = toInt(kReal); | 
| 74 |  | 
| 75 |   // Reduced range version of x | 
| 76 |   auto x = v - kReal * piOverTwoVec; | 
| 77 |  | 
| 78 |   auto kMod4 = k & 3; | 
| 79 |   auto cosUseCos = (kMod4 == 0) | (kMod4 == 2); | 
| 80 |   auto flipSign = (kMod4 == 1) | (kMod4 == 2); | 
| 81 |  | 
| 82 |   const float sinC2  = -0.16666667163372039794921875; | 
| 83 |   const float sinC4  = +8.333347737789154052734375e-3; | 
| 84 |   const float sinC6  = -1.9842604524455964565277099609375e-4; | 
| 85 |   const float sinC8  = +2.760012648650445044040679931640625e-6; | 
| 86 |   const float sinC10 = -2.50293279435709337121807038784027099609375e-8; | 
| 87 |  | 
| 88 |   const float cosC2  = -0.5; | 
| 89 |   const float cosC4  = +4.166664183139801025390625e-2; | 
| 90 |   const float cosC6  = -1.388833043165504932403564453125e-3; | 
| 91 |   const float cosC8  = +2.47562347794882953166961669921875e-5; | 
| 92 |   const float cosC10 = -2.59630184018533327616751194000244140625e-7; | 
| 93 |  | 
| 94 |   auto outside = select(cosUseCos, 1., x); | 
| 95 |   auto c2  = select(cosUseCos, T(cosC2),  T(sinC2)); | 
| 96 |   auto c4  = select(cosUseCos, T(cosC4),  T(sinC4)); | 
| 97 |   auto c6  = select(cosUseCos, T(cosC6),  T(sinC6)); | 
| 98 |   auto c8  = select(cosUseCos, T(cosC8),  T(sinC8)); | 
| 99 |   auto c10 = select(cosUseCos, T(cosC10), T(sinC10)); | 
| 100 |  | 
| 101 |   auto x2 = x * x; | 
| 102 |   auto formula = x2 * c10 + c8; | 
| 103 |   formula = x2 * formula + c6; | 
| 104 |   formula = x2 * formula + c4; | 
| 105 |   formula = x2 * formula + c2; | 
| 106 |   formula = x2 * formula + 1.; | 
| 107 |   formula *= outside; | 
| 108 |  | 
| 109 |   formula = select(flipSign, -formula, formula); | 
| 110 |   return formula; | 
| 111 | } | 
| 112 |  | 
| 113 | template <typename T> | 
| 114 | __forceinline void sincos(const T &v, T &sinResult, T &cosResult) | 
| 115 | { | 
| 116 |   const float piOverTwoVec = 1.57079637050628662109375; | 
| 117 |   const float twoOverPiVec = 0.636619746685028076171875; | 
| 118 |   auto scaled = v * twoOverPiVec; | 
| 119 |   auto kReal = floor(scaled); | 
| 120 |   auto k = toInt(kReal); | 
| 121 |  | 
| 122 |   // Reduced range version of x | 
| 123 |   auto x = v - kReal * piOverTwoVec; | 
| 124 |   auto kMod4 = k & 3; | 
| 125 |   auto cosUseCos = ((kMod4 == 0) | (kMod4 == 2)); | 
| 126 |   auto sinUseCos = ((kMod4 == 1) | (kMod4 == 3)); | 
| 127 |   auto sinFlipSign = (kMod4 > 1); | 
| 128 |   auto cosFlipSign = ((kMod4 == 1) | (kMod4 == 2)); | 
| 129 |  | 
| 130 |   const float oneVec = +1.; | 
| 131 |   const float sinC2  = -0.16666667163372039794921875; | 
| 132 |   const float sinC4  = +8.333347737789154052734375e-3; | 
| 133 |   const float sinC6  = -1.9842604524455964565277099609375e-4; | 
| 134 |   const float sinC8  = +2.760012648650445044040679931640625e-6; | 
| 135 |   const float sinC10 = -2.50293279435709337121807038784027099609375e-8; | 
| 136 |  | 
| 137 |   const float cosC2  = -0.5; | 
| 138 |   const float cosC4  = +4.166664183139801025390625e-2; | 
| 139 |   const float cosC6  = -1.388833043165504932403564453125e-3; | 
| 140 |   const float cosC8  = +2.47562347794882953166961669921875e-5; | 
| 141 |   const float cosC10 = -2.59630184018533327616751194000244140625e-7; | 
| 142 |  | 
| 143 |   auto x2 = x * x; | 
| 144 |  | 
| 145 |   auto sinFormula = x2 * sinC10 + sinC8; | 
| 146 |   auto cosFormula = x2 * cosC10 + cosC8; | 
| 147 |   sinFormula = x2 * sinFormula + sinC6; | 
| 148 |   cosFormula = x2 * cosFormula + cosC6; | 
| 149 |  | 
| 150 |   sinFormula = x2 * sinFormula + sinC4; | 
| 151 |   cosFormula = x2 * cosFormula + cosC4; | 
| 152 |  | 
| 153 |   sinFormula = x2 * sinFormula + sinC2; | 
| 154 |   cosFormula = x2 * cosFormula + cosC2; | 
| 155 |  | 
| 156 |   sinFormula = x2 * sinFormula + oneVec; | 
| 157 |   cosFormula = x2 * cosFormula + oneVec; | 
| 158 |  | 
| 159 |   sinFormula *= x; | 
| 160 |  | 
| 161 |   sinResult = select(sinUseCos, cosFormula, sinFormula); | 
| 162 |   cosResult = select(cosUseCos, cosFormula, sinFormula); | 
| 163 |  | 
| 164 |   sinResult = select(sinFlipSign, -sinResult, sinResult); | 
| 165 |   cosResult = select(cosFlipSign, -cosResult, cosResult); | 
| 166 | } | 
| 167 |  | 
| 168 | template <typename T> | 
| 169 | __forceinline T tan(const T &v) | 
| 170 | { | 
| 171 |   const float piOverFourVec = 0.785398185253143310546875; | 
| 172 |   const float fourOverPiVec = 1.27323949337005615234375; | 
| 173 |  | 
| 174 |   auto xLt0 = v < 0.; | 
| 175 |   auto y = select(xLt0, -v, v); | 
| 176 |   auto scaled = y * fourOverPiVec; | 
| 177 |  | 
| 178 |   auto kReal = floor(scaled); | 
| 179 |   auto k = toInt(kReal); | 
| 180 |  | 
| 181 |   auto x = y - kReal * piOverFourVec; | 
| 182 |  | 
| 183 |   // If k & 1, x -= Pi/4 | 
| 184 |   auto needOffset = (k & 1) != 0; | 
| 185 |   x = select(needOffset, x - piOverFourVec, x); | 
| 186 |  | 
| 187 |   // If k & 3 == (0 or 3) let z = tan_In...(y) otherwise z = -cot_In0To... | 
| 188 |   auto kMod4 = k & 3; | 
| 189 |   auto useCotan = (kMod4 == 1) | (kMod4 == 2); | 
| 190 |  | 
| 191 |   const float oneVec = 1.0; | 
| 192 |  | 
| 193 |   const float tanC2  = +0.33333075046539306640625; | 
| 194 |   const float tanC4  = +0.13339905440807342529296875; | 
| 195 |   const float tanC6  = +5.3348250687122344970703125e-2; | 
| 196 |   const float tanC8  = +2.46033705770969390869140625e-2; | 
| 197 |   const float tanC10 = +2.892402000725269317626953125e-3; | 
| 198 |   const float tanC12 = +9.500005282461643218994140625e-3; | 
| 199 |  | 
| 200 |   const float cotC2  = -0.3333333432674407958984375; | 
| 201 |   const float cotC4  = -2.222204394638538360595703125e-2; | 
| 202 |   const float cotC6  = -2.11752182804048061370849609375e-3; | 
| 203 |   const float cotC8  = -2.0846328698098659515380859375e-4; | 
| 204 |   const float cotC10 = -2.548247357481159269809722900390625e-5; | 
| 205 |   const float cotC12 = -3.5257363606433500535786151885986328125e-7; | 
| 206 |  | 
| 207 |   auto x2 = x * x; | 
| 208 |   T z; | 
| 209 |   if (any(useCotan)) | 
| 210 |   { | 
| 211 |     auto cotVal = x2 * cotC12 + cotC10; | 
| 212 |     cotVal = x2 * cotVal + cotC8; | 
| 213 |     cotVal = x2 * cotVal + cotC6; | 
| 214 |     cotVal = x2 * cotVal + cotC4; | 
| 215 |     cotVal = x2 * cotVal + cotC2; | 
| 216 |     cotVal = x2 * cotVal + oneVec; | 
| 217 |     // The equation is for x * cot(x) but we need -x * cot(x) for the tan part. | 
| 218 |     cotVal /= -x; | 
| 219 |     z = cotVal; | 
| 220 |   } | 
| 221 |   auto useTan = !useCotan; | 
| 222 |   if (any(useTan)) | 
| 223 |   { | 
| 224 |     auto tanVal = x2 * tanC12 + tanC10; | 
| 225 |     tanVal = x2 * tanVal + tanC8; | 
| 226 |     tanVal = x2 * tanVal + tanC6; | 
| 227 |     tanVal = x2 * tanVal + tanC4; | 
| 228 |     tanVal = x2 * tanVal + tanC2; | 
| 229 |     tanVal = x2 * tanVal + oneVec; | 
| 230 |     // Equation was for tan(x)/x | 
| 231 |     tanVal *= x; | 
| 232 |     z = select(useTan, tanVal, z); | 
| 233 |   } | 
| 234 |   return select(xLt0, -z, z); | 
| 235 | } | 
| 236 |  | 
| 237 | template <typename T> | 
| 238 | __forceinline T asin(const T &x0) | 
| 239 | { | 
| 240 |   auto isneg = (x0 < 0.f); | 
| 241 |   auto x = abs(x0); | 
| 242 |   auto isnan = (x > 1.f); | 
| 243 |  | 
| 244 |   // sollya | 
| 245 |   // fpminimax(((asin(x)-pi/2)/-sqrt(1-x)), [|0,1,2,3,4,5|],[|single...|], | 
| 246 |   //           [1e-20;.9999999999999999]); | 
| 247 |   // avg error: 1.1105439e-06, max error 1.3187528e-06 | 
| 248 |   auto v = 1.57079517841339111328125f + | 
| 249 |            x * (-0.21450997889041900634765625f + | 
| 250 |                 x * (8.78556668758392333984375e-2f + | 
| 251 |                      x * (-4.489909112453460693359375e-2f + | 
| 252 |                           x * (1.928029954433441162109375e-2f + | 
| 253 |                                x * (-4.3095736764371395111083984375e-3f))))); | 
| 254 |  | 
| 255 |   v *= -sqrt(1.f - x); | 
| 256 |   v = v + 1.57079637050628662109375f; | 
| 257 |  | 
| 258 |   v = select(v < 0.f, T(0.f), v); | 
| 259 |   v = select(isneg, -v, v); | 
| 260 |   v = select(isnan, T(cast_i2f(i: 0x7fc00000)), v); | 
| 261 |  | 
| 262 |   return v; | 
| 263 | } | 
| 264 |  | 
| 265 | template <typename T> | 
| 266 | __forceinline T acos(const T &v) | 
| 267 | { | 
| 268 |   return 1.57079637050628662109375f - asin(v); | 
| 269 | } | 
| 270 |  | 
| 271 | template <typename T> | 
| 272 | __forceinline T atan(const T &v) | 
| 273 | { | 
| 274 |   const float piOverTwoVec = 1.57079637050628662109375; | 
| 275 |   // atan(-x) = -atan(x) (so flip from negative to positive first) | 
| 276 |   // If x > 1 -> atan(x) = Pi/2 - atan(1/x) | 
| 277 |   auto xNeg = v < 0.f; | 
| 278 |   auto xFlipped = select(xNeg, -v, v); | 
| 279 |  | 
| 280 |   auto xGt1 = xFlipped > 1.; | 
| 281 |   auto x = select(xGt1, rcpSafe(xFlipped), xFlipped); | 
| 282 |  | 
| 283 |   // These coefficients approximate atan(x)/x | 
| 284 |   const float atanC0  = +0.99999988079071044921875; | 
| 285 |   const float atanC2  = -0.3333191573619842529296875; | 
| 286 |   const float atanC4  = +0.199689209461212158203125; | 
| 287 |   const float atanC6  = -0.14015688002109527587890625; | 
| 288 |   const float atanC8  = +9.905083477497100830078125e-2; | 
| 289 |   const float atanC10 = -5.93664981424808502197265625e-2; | 
| 290 |   const float atanC12 = +2.417283318936824798583984375e-2; | 
| 291 |   const float atanC14 = -4.6721356920897960662841796875e-3; | 
| 292 |  | 
| 293 |   auto x2 = x * x; | 
| 294 |   auto result = x2 * atanC14 + atanC12; | 
| 295 |   result = x2 * result + atanC10; | 
| 296 |   result = x2 * result + atanC8; | 
| 297 |   result = x2 * result + atanC6; | 
| 298 |   result = x2 * result + atanC4; | 
| 299 |   result = x2 * result + atanC2; | 
| 300 |   result = x2 * result + atanC0; | 
| 301 |   result *= x; | 
| 302 |  | 
| 303 |   result = select(xGt1, piOverTwoVec - result, result); | 
| 304 |   result = select(xNeg, -result, result); | 
| 305 |   return result; | 
| 306 | } | 
| 307 |  | 
| 308 | template <typename T> | 
| 309 | __forceinline T atan2(const T &y, const T &x) | 
| 310 | { | 
| 311 |   const float piVec = 3.1415926536; | 
| 312 |   // atan2(y, x) = | 
| 313 |   // | 
| 314 |   // atan2(y > 0, x = +-0) ->  Pi/2 | 
| 315 |   // atan2(y < 0, x = +-0) -> -Pi/2 | 
| 316 |   // atan2(y = +-0, x < +0) -> +-Pi | 
| 317 |   // atan2(y = +-0, x >= +0) -> +-0 | 
| 318 |   // | 
| 319 |   // atan2(y >= 0, x < 0) ->  Pi + atan(y/x) | 
| 320 |   // atan2(y <  0, x < 0) -> -Pi + atan(y/x) | 
| 321 |   // atan2(y, x > 0) -> atan(y/x) | 
| 322 |   // | 
| 323 |   // and then a bunch of code for dealing with infinities. | 
| 324 |   auto yOverX = y * rcpSafe(x); | 
| 325 |   auto atanArg = atan(yOverX); | 
| 326 |   auto xLt0 = x < 0.f; | 
| 327 |   auto yLt0 = y < 0.f; | 
| 328 |   auto offset = select(xLt0, | 
| 329 |                 select(yLt0, T(-piVec), T(piVec)), 0.f); | 
| 330 |   return offset + atanArg; | 
| 331 | } | 
| 332 |  | 
| 333 | template <typename T> | 
| 334 | __forceinline T exp(const T &v) | 
| 335 | { | 
| 336 |   const float ln2Part1 = 0.6931457519; | 
| 337 |   const float ln2Part2 = 1.4286067653e-6; | 
| 338 |   const float oneOverLn2 = 1.44269502162933349609375; | 
| 339 |  | 
| 340 |   auto scaled = v * oneOverLn2; | 
| 341 |   auto kReal = floor(scaled); | 
| 342 |   auto k = toInt(kReal); | 
| 343 |  | 
| 344 |   // Reduced range version of x | 
| 345 |   auto x = v - kReal * ln2Part1; | 
| 346 |   x -= kReal * ln2Part2; | 
| 347 |  | 
| 348 |   // These coefficients are for e^x in [0, ln(2)] | 
| 349 |   const float one = 1.; | 
| 350 |   const float c2 = 0.4999999105930328369140625; | 
| 351 |   const float c3 = 0.166668415069580078125; | 
| 352 |   const float c4 = 4.16539050638675689697265625e-2; | 
| 353 |   const float c5 = 8.378830738365650177001953125e-3; | 
| 354 |   const float c6 = 1.304379315115511417388916015625e-3; | 
| 355 |   const float c7 = 2.7555381529964506626129150390625e-4; | 
| 356 |  | 
| 357 |   auto result = x * c7 + c6; | 
| 358 |   result = x * result + c5; | 
| 359 |   result = x * result + c4; | 
| 360 |   result = x * result + c3; | 
| 361 |   result = x * result + c2; | 
| 362 |   result = x * result + one; | 
| 363 |   result = x * result + one; | 
| 364 |  | 
| 365 |   // Compute 2^k (should differ for float and double, but I'll avoid | 
| 366 |   // it for now and just do floats) | 
| 367 |   const int fpbias = 127; | 
| 368 |   auto biasedN = k + fpbias; | 
| 369 |   auto overflow = kReal > fpbias; | 
| 370 |   // Minimum exponent is -126, so if k is <= -127 (k + 127 <= 0) | 
| 371 |   // we've got underflow. -127 * ln(2) -> -88.02. So the most | 
| 372 |   // negative float input that doesn't result in zero is like -88. | 
| 373 |   auto underflow = kReal <= -fpbias; | 
| 374 |   const int infBits = 0x7f800000; | 
| 375 |   biasedN <<= 23; | 
| 376 |   // Reinterpret this thing as float | 
| 377 |   auto twoToTheN = asFloat(biasedN); | 
| 378 |   // Handle both doubles and floats (hopefully eliding the copy for float) | 
| 379 |   auto elemtype2n = twoToTheN; | 
| 380 |   result *= elemtype2n; | 
| 381 |   result = select(overflow, cast_i2f(i: infBits), result); | 
| 382 |   result = select(underflow, 0., result); | 
| 383 |   return result; | 
| 384 | } | 
| 385 |  | 
| 386 | // Range reduction for logarithms takes log(x) -> log(2^n * y) -> n | 
| 387 | // * log(2) + log(y) where y is the reduced range (usually in [1/2, 1)). | 
| 388 | template <typename T, typename R> | 
| 389 | __forceinline void __rangeReduceLog(const T &input, | 
| 390 |                                     T &reduced, | 
| 391 |                                     R &exponent) | 
| 392 | { | 
| 393 |   auto intVersion = asInt(input); | 
| 394 |   // single precision = SEEE EEEE EMMM MMMM MMMM MMMM MMMM MMMM | 
| 395 |   // exponent mask    = 0111 1111 1000 0000 0000 0000 0000 0000 | 
| 396 |   //                    0x7  0xF  0x8  0x0  0x0  0x0  0x0  0x0 | 
| 397 |   // non-exponent     = 1000 0000 0111 1111 1111 1111 1111 1111 | 
| 398 |   //                  = 0x8  0x0  0x7  0xF  0xF  0xF  0xF  0xF | 
| 399 |  | 
| 400 |   //const int exponentMask(0x7F800000) | 
| 401 |   static const int nonexponentMask = 0x807FFFFF; | 
| 402 |  | 
| 403 |   // We want the reduced version to have an exponent of -1 which is | 
| 404 |   // -1 + 127 after biasing or 126 | 
| 405 |   static const int exponentNeg1 = (126l << 23); | 
| 406 |   // NOTE(boulos): We don't need to mask anything out since we know | 
| 407 |   // the sign bit has to be 0. If it's 1, we need to return infinity/nan | 
| 408 |   // anyway (log(x), x = +-0 -> infinity, x < 0 -> NaN). | 
| 409 |   auto biasedExponent = intVersion >> 23; // This number is [0, 255] but it means [-127, 128] | 
| 410 |  | 
| 411 |   auto offsetExponent = biasedExponent + 1; // Treat the number as if it were 2^{e+1} * (1.m)/2 | 
| 412 |   exponent = offsetExponent - 127;          // get the real value | 
| 413 |  | 
| 414 |   // Blend the offset_exponent with the original input (do this in | 
| 415 |   // int for now, until I decide if float can have & and ¬) | 
| 416 |   auto blended = (intVersion & nonexponentMask) | (exponentNeg1); | 
| 417 |   reduced = asFloat(blended); | 
| 418 | } | 
| 419 |  | 
| 420 | template <typename T> struct ExponentType            { }; | 
| 421 | template <int N>      struct ExponentType<vfloat_impl<N>> { typedef vint<N> Ty; }; | 
| 422 | template <>           struct ExponentType<float>     { typedef int     Ty; }; | 
| 423 |  | 
| 424 | template <typename T> | 
| 425 | __forceinline T log(const T &v) | 
| 426 | { | 
| 427 |   T reduced; | 
| 428 |   typename ExponentType<T>::Ty exponent; | 
| 429 |  | 
| 430 |   const int nanBits = 0x7fc00000; | 
| 431 |   const int negInfBits = 0xFF800000; | 
| 432 |   const float nan = cast_i2f(i: nanBits); | 
| 433 |   const float negInf = cast_i2f(i: negInfBits); | 
| 434 |   auto useNan = v < 0.; | 
| 435 |   auto useInf = v == 0.; | 
| 436 |   auto exceptional = useNan | useInf; | 
| 437 |   const float one = 1.0; | 
| 438 |  | 
| 439 |   auto patched = select(exceptional, one, v); | 
| 440 |   __rangeReduceLog(patched, reduced, exponent); | 
| 441 |  | 
| 442 |   const float ln2 = 0.693147182464599609375; | 
| 443 |  | 
| 444 |   auto x1 = one - reduced; | 
| 445 |   const float c1 = +0.50000095367431640625; | 
| 446 |   const float c2 = +0.33326041698455810546875; | 
| 447 |   const float c3 = +0.2519190013408660888671875; | 
| 448 |   const float c4 = +0.17541764676570892333984375; | 
| 449 |   const float c5 = +0.3424419462680816650390625; | 
| 450 |   const float c6 = -0.599632322788238525390625; | 
| 451 |   const float c7 = +1.98442304134368896484375; | 
| 452 |   const float c8 = -2.4899270534515380859375; | 
| 453 |   const float c9 = +1.7491014003753662109375; | 
| 454 |  | 
| 455 |   auto result = x1 * c9 + c8; | 
| 456 |   result = x1 * result + c7; | 
| 457 |   result = x1 * result + c6; | 
| 458 |   result = x1 * result + c5; | 
| 459 |   result = x1 * result + c4; | 
| 460 |   result = x1 * result + c3; | 
| 461 |   result = x1 * result + c2; | 
| 462 |   result = x1 * result + c1; | 
| 463 |   result = x1 * result + one; | 
| 464 |  | 
| 465 |   // Equation was for -(ln(red)/(1-red)) | 
| 466 |   result *= -x1; | 
| 467 |   result += toFloat(exponent) * ln2; | 
| 468 |  | 
| 469 |   return select(exceptional, | 
| 470 |                 select(useNan, T(nan), T(negInf)), | 
| 471 |                 result); | 
| 472 | } | 
| 473 |  | 
| 474 | template <typename T> | 
| 475 | __forceinline T pow(const T &x, const T &y) | 
| 476 | { | 
| 477 |   auto x1 = abs(x); | 
| 478 |   auto z = exp(y * log(x1)); | 
| 479 |  | 
| 480 |   // Handle special cases | 
| 481 |   const float twoOver23 = 8388608.0f; | 
| 482 |   auto yInt = y == round(y); | 
| 483 |   auto yOddInt = select(yInt, asInt(abs(y) + twoOver23) << 31, 0); // set sign bit | 
| 484 |  | 
| 485 |   // x == 0 | 
| 486 |   z = select(x == 0.0f, | 
| 487 |       select(y < 0.0f, T(inf) | signmsk(x), | 
| 488 |       select(y == 0.0f, T(1.0f), asFloat(yOddInt) & x)), z); | 
| 489 |  | 
| 490 |   // x < 0 | 
| 491 |   auto xNegative = x < 0.0f; | 
| 492 |   if (any(xNegative)) | 
| 493 |   { | 
| 494 |     auto z1 = z | asFloat(yOddInt); | 
| 495 |     z1 = select(yInt, z1, std::numeric_limits<float>::quiet_NaN()); | 
| 496 |     z = select(xNegative, z1, z); | 
| 497 |   } | 
| 498 |  | 
| 499 |   auto xFinite = isfinite(x); | 
| 500 |   auto yFinite = isfinite(y); | 
| 501 |   if (all(xFinite & yFinite)) | 
| 502 |     return z; | 
| 503 |  | 
| 504 |   // x finite and y infinite | 
| 505 |   z = select(andn(xFinite, yFinite), | 
| 506 |       select(x1 == 1.0f, 1.0f, | 
| 507 |       select((x1 > 1.0f) ^ (y < 0.0f), inf, T(0.0f))), z); | 
| 508 |  | 
| 509 |   // x infinite | 
| 510 |   z = select(xFinite, z, | 
| 511 |       select(y == 0.0f, 1.0f, | 
| 512 |       select(y < 0.0f, T(0.0f), inf) | (asFloat(yOddInt) & x))); | 
| 513 |  | 
| 514 |   return z; | 
| 515 | } | 
| 516 |  | 
| 517 | template <typename T> | 
| 518 | __forceinline T pow(const T &x, float y) | 
| 519 | { | 
| 520 |   return pow(x, T(y)); | 
| 521 | } | 
| 522 |  | 
| 523 | } // namespace fastapprox | 
| 524 |  | 
| 525 | } // namespace embree | 
| 526 |  |