| 1 | // Copyright 2009-2021 Intel Corporation | 
| 2 | // SPDX-License-Identifier: Apache-2.0 | 
| 3 |  | 
| 4 | #pragma once | 
| 5 |  | 
| 6 | #include "../sys/alloc.h" | 
| 7 | #include "math.h" | 
| 8 | #include "../simd/sse.h" | 
| 9 |  | 
| 10 | namespace embree | 
| 11 | { | 
| 12 |   //////////////////////////////////////////////////////////////////////////////// | 
| 13 |   /// SSE Vec2fa Type | 
| 14 |   //////////////////////////////////////////////////////////////////////////////// | 
| 15 |  | 
| 16 |   struct __aligned(16) Vec2fa | 
| 17 |   { | 
| 18 |     ALIGNED_STRUCT_(16); | 
| 19 |  | 
| 20 |     typedef float Scalar; | 
| 21 |     enum { N = 2 }; | 
| 22 |     union { | 
| 23 |       __m128 m128; | 
| 24 |       struct { float x,y,az,aw; }; | 
| 25 |     }; | 
| 26 |  | 
| 27 |     //////////////////////////////////////////////////////////////////////////////// | 
| 28 |     /// Constructors, Assignment & Cast Operators | 
| 29 |     //////////////////////////////////////////////////////////////////////////////// | 
| 30 |  | 
| 31 |     __forceinline Vec2fa( ) {} | 
| 32 |     __forceinline Vec2fa( const __m128 a ) : m128(a) {} | 
| 33 |  | 
| 34 |     __forceinline Vec2fa            ( const Vec2<float>& other  ) { x = other.x; y = other.y; } | 
| 35 |     __forceinline Vec2fa& operator =( const Vec2<float>& other ) { x = other.x; y = other.y; return *this; } | 
| 36 |  | 
| 37 |     __forceinline Vec2fa            ( const Vec2fa& other ) { m128 = other.m128; } | 
| 38 |     __forceinline Vec2fa& operator =( const Vec2fa& other ) { m128 = other.m128; return *this; } | 
| 39 |  | 
| 40 |     __forceinline explicit Vec2fa( const float a ) : m128(_mm_set1_ps(w: a)) {} | 
| 41 |     __forceinline          Vec2fa( const float x, const float y) : m128(_mm_set_ps(z: y, y: y, x: y, w: x)) {} | 
| 42 |  | 
| 43 |     __forceinline explicit Vec2fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a: a)) {} | 
| 44 |  | 
| 45 |     __forceinline operator const __m128&() const { return m128; } | 
| 46 |     __forceinline operator       __m128&()       { return m128; } | 
| 47 |  | 
| 48 |     //////////////////////////////////////////////////////////////////////////////// | 
| 49 |     /// Loads and Stores | 
| 50 |     //////////////////////////////////////////////////////////////////////////////// | 
| 51 |  | 
| 52 |     static __forceinline Vec2fa load( const void* const a ) { | 
| 53 |       return Vec2fa(_mm_and_ps(a: _mm_load_ps(p: (float*)a),b: _mm_castsi128_ps(a: _mm_set_epi32(i3: 0, i2: 0, i1: -1, i0: -1)))); | 
| 54 |     } | 
| 55 |  | 
| 56 |     static __forceinline Vec2fa loadu( const void* const a ) { | 
| 57 |       return Vec2fa(_mm_and_ps(a: _mm_loadu_ps(p: (float*)a),b: _mm_castsi128_ps(a: _mm_set_epi32(i3: 0, i2: 0, i1: -1, i0: -1)))); | 
| 58 |     } | 
| 59 |  | 
| 60 |     static __forceinline void storeu ( void* ptr, const Vec2fa& v ) { | 
| 61 |       _mm_storeu_ps(p: (float*)ptr,a: v); | 
| 62 |     } | 
| 63 |  | 
| 64 |     //////////////////////////////////////////////////////////////////////////////// | 
| 65 |     /// Constants | 
| 66 |     //////////////////////////////////////////////////////////////////////////////// | 
| 67 |  | 
| 68 |     __forceinline Vec2fa( ZeroTy   ) : m128(_mm_setzero_ps()) {} | 
| 69 |     __forceinline Vec2fa( OneTy    ) : m128(_mm_set1_ps(w: 1.0f)) {} | 
| 70 |     __forceinline Vec2fa( PosInfTy ) : m128(_mm_set1_ps(w: pos_inf)) {} | 
| 71 |     __forceinline Vec2fa( NegInfTy ) : m128(_mm_set1_ps(w: neg_inf)) {} | 
| 72 |  | 
| 73 |     //////////////////////////////////////////////////////////////////////////////// | 
| 74 |     /// Array Access | 
| 75 |     //////////////////////////////////////////////////////////////////////////////// | 
| 76 |  | 
| 77 |     __forceinline const float& operator []( const size_t index ) const { assert(index < 2); return (&x)[index]; } | 
| 78 |     __forceinline       float& operator []( const size_t index )       { assert(index < 2); return (&x)[index]; } | 
| 79 |   }; | 
| 80 |  | 
| 81 |   //////////////////////////////////////////////////////////////////////////////// | 
| 82 |   /// Unary Operators | 
| 83 |   //////////////////////////////////////////////////////////////////////////////// | 
| 84 |  | 
| 85 |   __forceinline Vec2fa operator +( const Vec2fa& a ) { return a; } | 
| 86 |   __forceinline Vec2fa operator -( const Vec2fa& a ) { | 
| 87 |     const __m128 mask = _mm_castsi128_ps(a: _mm_set1_epi32(i: 0x80000000)); | 
| 88 |     return _mm_xor_ps(a: a.m128, b: mask); | 
| 89 |   } | 
| 90 |   __forceinline Vec2fa abs  ( const Vec2fa& a ) { | 
| 91 |     const __m128 mask = _mm_castsi128_ps(a: _mm_set1_epi32(i: 0x7fffffff)); | 
| 92 |     return _mm_and_ps(a: a.m128, b: mask); | 
| 93 |   } | 
| 94 |   __forceinline Vec2fa sign ( const Vec2fa& a ) { | 
| 95 |     return blendv_ps(f: Vec2fa(one), t: -Vec2fa(one), mask: _mm_cmplt_ps (a: a,b: Vec2fa(zero))); | 
| 96 |   } | 
| 97 |  | 
| 98 |   __forceinline Vec2fa rcp  ( const Vec2fa& a ) | 
| 99 |   { | 
| 100 | #if defined(__AVX512VL__) | 
| 101 |     const Vec2fa r = _mm_rcp14_ps(a.m128); | 
| 102 | #else | 
| 103 |     const Vec2fa r = _mm_rcp_ps(a: a.m128); | 
| 104 | #endif | 
| 105 |  | 
| 106 | #if defined(__AVX2__) | 
| 107 |     const Vec2fa h_n = _mm_fnmadd_ps(a, r, vfloat4(1.0));  // First, compute 1 - a * r (which will be very close to 0) | 
| 108 |     const Vec2fa res = _mm_fmadd_ps(r, h_n, r);            // Then compute r + r * h_n | 
| 109 | #else | 
| 110 |     const Vec2fa h_n = _mm_sub_ps(a: vfloat4(1.0f), b: _mm_mul_ps(a: a, b: r));  // First, compute 1 - a * r (which will be very close to 0) | 
| 111 |     const Vec2fa res = _mm_add_ps(a: r,b: _mm_mul_ps(a: r, b: h_n));             // Then compute r + r * h_n   | 
| 112 | #endif | 
| 113 |  | 
| 114 |     return res; | 
| 115 |   } | 
| 116 |  | 
| 117 |   __forceinline Vec2fa sqrt ( const Vec2fa& a ) { return _mm_sqrt_ps(a: a.m128); } | 
| 118 |   __forceinline Vec2fa sqr  ( const Vec2fa& a ) { return _mm_mul_ps(a: a,b: a); } | 
| 119 |  | 
| 120 |   __forceinline Vec2fa rsqrt( const Vec2fa& a ) | 
| 121 |   { | 
| 122 | #if defined(__AVX512VL__) | 
| 123 |     __m128 r = _mm_rsqrt14_ps(a.m128); | 
| 124 | #else | 
| 125 |     __m128 r = _mm_rsqrt_ps(a: a.m128); | 
| 126 | #endif | 
| 127 |     return _mm_add_ps(a: _mm_mul_ps(a: _mm_set1_ps(w: 1.5f),b: r), b: _mm_mul_ps(a: _mm_mul_ps(a: _mm_mul_ps(a: a, b: _mm_set1_ps(w: -0.5f)), b: r), b: _mm_mul_ps(a: r, b: r))); | 
| 128 |   } | 
| 129 |  | 
| 130 |   __forceinline Vec2fa zero_fix(const Vec2fa& a) { | 
| 131 |     return blendv_ps(f: a, t: _mm_set1_ps(w: min_rcp_input), mask: _mm_cmplt_ps (a: abs(a).m128, b: _mm_set1_ps(w: min_rcp_input))); | 
| 132 |   } | 
| 133 |   __forceinline Vec2fa rcp_safe(const Vec2fa& a) { | 
| 134 |     return rcp(a: zero_fix(a)); | 
| 135 |   } | 
| 136 |   __forceinline Vec2fa log ( const Vec2fa& a ) { | 
| 137 |     return Vec2fa(logf(x: a.x),logf(x: a.y)); | 
| 138 |   } | 
| 139 |  | 
| 140 |   __forceinline Vec2fa exp ( const Vec2fa& a ) { | 
| 141 |     return Vec2fa(expf(x: a.x),expf(x: a.y)); | 
| 142 |   } | 
| 143 |  | 
| 144 |   //////////////////////////////////////////////////////////////////////////////// | 
| 145 |   /// Binary Operators | 
| 146 |   //////////////////////////////////////////////////////////////////////////////// | 
| 147 |  | 
| 148 |   __forceinline Vec2fa operator +( const Vec2fa& a, const Vec2fa& b ) { return _mm_add_ps(a: a.m128, b: b.m128); } | 
| 149 |   __forceinline Vec2fa operator -( const Vec2fa& a, const Vec2fa& b ) { return _mm_sub_ps(a: a.m128, b: b.m128); } | 
| 150 |   __forceinline Vec2fa operator *( const Vec2fa& a, const Vec2fa& b ) { return _mm_mul_ps(a: a.m128, b: b.m128); } | 
| 151 |   __forceinline Vec2fa operator *( const Vec2fa& a, const float b ) { return a * Vec2fa(b); } | 
| 152 |   __forceinline Vec2fa operator *( const float a, const Vec2fa& b ) { return Vec2fa(a) * b; } | 
| 153 |   __forceinline Vec2fa operator /( const Vec2fa& a, const Vec2fa& b ) { return _mm_div_ps(a: a.m128,b: b.m128); } | 
| 154 |   __forceinline Vec2fa operator /( const Vec2fa& a, const float b        ) { return _mm_div_ps(a: a.m128,b: _mm_set1_ps(w: b)); } | 
| 155 |   __forceinline Vec2fa operator /( const        float a, const Vec2fa& b ) { return _mm_div_ps(a: _mm_set1_ps(w: a),b: b.m128); } | 
| 156 |  | 
| 157 |   __forceinline Vec2fa min( const Vec2fa& a, const Vec2fa& b ) { return _mm_min_ps(a: a.m128,b: b.m128); } | 
| 158 |   __forceinline Vec2fa max( const Vec2fa& a, const Vec2fa& b ) { return _mm_max_ps(a: a.m128,b: b.m128); } | 
| 159 |  | 
#if defined(__SSE4_1__)
  /*! Minimum computed on the raw bit patterns: each lane is reinterpreted
   *  as a signed 32-bit integer, compared, and reinterpreted back as float. */
  __forceinline Vec2fa mini(const Vec2fa& a, const Vec2fa& b) {
    const __m128i a_bits = _mm_castps_si128(a.m128);
    const __m128i b_bits = _mm_castps_si128(b.m128);
    return _mm_castsi128_ps(_mm_min_epi32(a_bits, b_bits));
  }

  /*! Maximum computed on the raw bit patterns; integer counterpart of mini(). */
  __forceinline Vec2fa maxi(const Vec2fa& a, const Vec2fa& b) {
    const __m128i a_bits = _mm_castps_si128(a.m128);
    const __m128i b_bits = _mm_castps_si128(b.m128);
    return _mm_castsi128_ps(_mm_max_epi32(a_bits, b_bits));
  }
#endif
| 177 |  | 
| 178 |     __forceinline Vec2fa pow ( const Vec2fa& a, const float& b ) { | 
| 179 |       return Vec2fa(powf(x: a.x,y: b),powf(x: a.y,y: b)); | 
| 180 |     } | 
| 181 |  | 
| 182 |   //////////////////////////////////////////////////////////////////////////////// | 
| 183 |   /// Ternary Operators | 
| 184 |   //////////////////////////////////////////////////////////////////////////////// | 
| 185 |  | 
| 186 | #if defined(__AVX2__) | 
| 187 |   __forceinline Vec2fa madd  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmadd_ps(a,b,c); } | 
| 188 |   __forceinline Vec2fa msub  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmsub_ps(a,b,c); } | 
| 189 |   __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmadd_ps(a,b,c); } | 
| 190 |   __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmsub_ps(a,b,c); } | 
| 191 | #else | 
| 192 |   __forceinline Vec2fa madd  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b+c; } | 
| 193 |   __forceinline Vec2fa msub  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b-c; } | 
| 194 |   __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b+c;} | 
| 195 |   __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b-c; } | 
| 196 | #endif | 
| 197 |  | 
| 198 |   __forceinline Vec2fa madd  ( const float a, const Vec2fa& b, const Vec2fa& c) { return madd(a: Vec2fa(a),b,c); } | 
| 199 |   __forceinline Vec2fa msub  ( const float a, const Vec2fa& b, const Vec2fa& c) { return msub(a: Vec2fa(a),b,c); } | 
| 200 |   __forceinline Vec2fa nmadd ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmadd(a: Vec2fa(a),b,c); } | 
| 201 |   __forceinline Vec2fa nmsub ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmsub(a: Vec2fa(a),b,c); } | 
| 202 |  | 
| 203 |   //////////////////////////////////////////////////////////////////////////////// | 
| 204 |   /// Assignment Operators | 
| 205 |   //////////////////////////////////////////////////////////////////////////////// | 
| 206 |  | 
| 207 |   __forceinline Vec2fa& operator +=( Vec2fa& a, const Vec2fa& b ) { return a = a + b; } | 
| 208 |   __forceinline Vec2fa& operator -=( Vec2fa& a, const Vec2fa& b ) { return a = a - b; } | 
| 209 |   __forceinline Vec2fa& operator *=( Vec2fa& a, const Vec2fa& b ) { return a = a * b; } | 
| 210 |   __forceinline Vec2fa& operator *=( Vec2fa& a, const float   b ) { return a = a * b; } | 
| 211 |   __forceinline Vec2fa& operator /=( Vec2fa& a, const Vec2fa& b ) { return a = a / b; } | 
| 212 |   __forceinline Vec2fa& operator /=( Vec2fa& a, const float   b ) { return a = a / b; } | 
| 213 |  | 
| 214 |   //////////////////////////////////////////////////////////////////////////////// | 
| 215 |   /// Reductions | 
| 216 |   //////////////////////////////////////////////////////////////////////////////// | 
| 217 |  | 
| 218 |   __forceinline float reduce_add(const Vec2fa& v) { return v.x+v.y; } | 
| 219 |   __forceinline float reduce_mul(const Vec2fa& v) { return v.x*v.y; } | 
| 220 |   __forceinline float reduce_min(const Vec2fa& v) { return min(a: v.x,b: v.y); } | 
| 221 |   __forceinline float reduce_max(const Vec2fa& v) { return max(a: v.x,b: v.y); } | 
| 222 |  | 
| 223 |   //////////////////////////////////////////////////////////////////////////////// | 
| 224 |   /// Comparison Operators | 
| 225 |   //////////////////////////////////////////////////////////////////////////////// | 
| 226 |  | 
| 227 |   __forceinline bool operator ==( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(a: _mm_cmpeq_ps (a: a.m128, b: b.m128)) & 3) == 3; } | 
| 228 |   __forceinline bool operator !=( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(a: _mm_cmpneq_ps(a: a.m128, b: b.m128)) & 3) != 0; } | 
| 229 |  | 
| 230 |   //////////////////////////////////////////////////////////////////////////////// | 
  /// Euclidean Space Operators
| 232 |   //////////////////////////////////////////////////////////////////////////////// | 
| 233 |  | 
| 234 | #if defined(__SSE4_1__) | 
| 235 |   __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) { | 
| 236 |     return _mm_cvtss_f32(_mm_dp_ps(a,b,0x3F)); | 
| 237 |   } | 
| 238 | #else | 
| 239 |   __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) { | 
| 240 |     return reduce_add(v: a*b); | 
| 241 |   } | 
| 242 | #endif | 
| 243 |  | 
| 244 |   __forceinline Vec2fa cross ( const Vec2fa& a ) { | 
| 245 |     return Vec2fa(-a.y,a.x); | 
| 246 |   } | 
| 247 |  | 
| 248 |   __forceinline float  sqr_length ( const Vec2fa& a )                { return dot(a,b: a); } | 
| 249 |   __forceinline float  rcp_length ( const Vec2fa& a )                { return rsqrt(x: dot(a,b: a)); } | 
| 250 |   __forceinline float  rcp_length2( const Vec2fa& a )                { return rcp(x: dot(a,b: a)); } | 
| 251 |   __forceinline float  length   ( const Vec2fa& a )                  { return sqrt(x: dot(a,b: a)); } | 
| 252 |   __forceinline Vec2fa normalize( const Vec2fa& a )                  { return a*rsqrt(x: dot(a,b: a)); } | 
| 253 |   __forceinline float  distance ( const Vec2fa& a, const Vec2fa& b ) { return length(a: a-b); } | 
| 254 |  | 
| 255 |   //////////////////////////////////////////////////////////////////////////////// | 
| 256 |   /// Select | 
| 257 |   //////////////////////////////////////////////////////////////////////////////// | 
| 258 |  | 
| 259 |   __forceinline Vec2fa select( bool s, const Vec2fa& t, const Vec2fa& f ) { | 
| 260 |     __m128 mask = s ? _mm_castsi128_ps(a: _mm_cmpeq_epi32(a: _mm_setzero_si128(), b: _mm_setzero_si128())) : _mm_setzero_ps(); | 
| 261 |     return blendv_ps(f, t, mask); | 
| 262 |   } | 
| 263 |  | 
| 264 |   __forceinline Vec2fa lerp(const Vec2fa& v0, const Vec2fa& v1, const float t) { | 
| 265 |     return madd(a: 1.0f-t,b: v0,c: t*v1); | 
| 266 |   } | 
| 267 |  | 
| 268 |   __forceinline int maxDim ( const Vec2fa& a ) | 
| 269 |   { | 
| 270 |     const Vec2fa b = abs(a); | 
| 271 |     if (b.x > b.y) return 0; | 
| 272 |     else return 1; | 
| 273 |   } | 
| 274 |  | 
| 275 |   //////////////////////////////////////////////////////////////////////////////// | 
| 276 |   /// Rounding Functions | 
| 277 |   //////////////////////////////////////////////////////////////////////////////// | 
| 278 |  | 
| 279 | #if defined(__aarch64__) | 
| 280 |   //__forceinline Vec2fa trunc(const Vec2fa& a) { return vrndq_f32(a); } | 
| 281 |   __forceinline Vec2fa floor(const Vec2fa& a) { return vrndmq_f32(a); } | 
| 282 |   __forceinline Vec2fa ceil (const Vec2fa& a) { return vrndpq_f32(a); } | 
| 283 | #elif defined (__SSE4_1__) | 
| 284 |   //__forceinline Vec2fa trunc( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); } | 
| 285 |   __forceinline Vec2fa floor( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF    ); } | 
| 286 |   __forceinline Vec2fa ceil ( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF    ); } | 
| 287 | #else | 
| 288 |   //__forceinline Vec2fa trunc( const Vec2fa& a ) { return Vec2fa(truncf(a.x),truncf(a.y),truncf(a.z)); } | 
| 289 |   __forceinline Vec2fa floor( const Vec2fa& a ) { return Vec2fa(floorf(x: a.x),floorf(x: a.y)); } | 
| 290 |   __forceinline Vec2fa ceil ( const Vec2fa& a ) { return Vec2fa(ceilf (x: a.x),ceilf (x: a.y)); } | 
| 291 | #endif | 
| 292 |  | 
| 293 |   //////////////////////////////////////////////////////////////////////////////// | 
| 294 |   /// Output Operators | 
| 295 |   //////////////////////////////////////////////////////////////////////////////// | 
| 296 |  | 
| 297 |   __forceinline embree_ostream operator<<(embree_ostream cout, const Vec2fa& a) { | 
| 298 |     return cout << "("  << a.x << ", "  << a.y << ")" ; | 
| 299 |   } | 
| 300 |  | 
| 301 |   typedef Vec2fa Vec2fa_t; | 
| 302 | } | 
| 303 |  |