// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "../sys/alloc.h"
#include "math.h"
#include "../simd/sse.h"

namespace embree
{
  ////////////////////////////////////////////////////////////////////////////////
  /// SSE Vec2fa Type
  ////////////////////////////////////////////////////////////////////////////////

  struct __aligned(16) Vec2fa
  {
    ALIGNED_STRUCT_(16);

    typedef float Scalar;
    enum { N = 2 };
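    // az/aw are padding lanes that keep the full 16-byte SSE register layout.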
    union {
      __m128 m128;
      struct { float x,y,az,aw; };
    };

    ////////////////////////////////////////////////////////////////////////////////
    /// Constructors, Assignment & Cast Operators
    ////////////////////////////////////////////////////////////////////////////////

    __forceinline Vec2fa( ) {}
    __forceinline Vec2fa( const __m128 a ) : m128(a) {}

    __forceinline Vec2fa            ( const Vec2<float>& other ) { x = other.x; y = other.y; }
    __forceinline Vec2fa& operator =( const Vec2<float>& other ) { x = other.x; y = other.y; return *this; }

    __forceinline Vec2fa            ( const Vec2fa& other ) { m128 = other.m128; }
    __forceinline Vec2fa& operator =( const Vec2fa& other ) { m128 = other.m128; return *this; }

    __forceinline explicit Vec2fa( const float a ) : m128(_mm_set1_ps(a)) {}
    __forceinline Vec2fa( const float x, const float y) : m128(_mm_set_ps(y,y,y,x)) {}

    __forceinline explicit Vec2fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {}

    __forceinline operator const __m128&() const { return m128; }
    __forceinline operator       __m128&()       { return m128; }

    ////////////////////////////////////////////////////////////////////////////////
    /// Loads and Stores
    ////////////////////////////////////////////////////////////////////////////////
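    // Note: load() and loadu() fetch four floats and mask off the upper two
    // lanes, so only x and y are kept; storeu() likewise writes all four lanes,
    // so the destination must span at least 16 bytes.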

    static __forceinline Vec2fa load( const void* const a ) {
      return Vec2fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1))));
    }

    static __forceinline Vec2fa loadu( const void* const a ) {
      return Vec2fa(_mm_and_ps(_mm_loadu_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1))));
    }

    static __forceinline void storeu ( void* ptr, const Vec2fa& v ) {
      _mm_storeu_ps((float*)ptr,v);
    }

    ////////////////////////////////////////////////////////////////////////////////
    /// Constants
    ////////////////////////////////////////////////////////////////////////////////

    __forceinline Vec2fa( ZeroTy   ) : m128(_mm_setzero_ps()) {}
    __forceinline Vec2fa( OneTy    ) : m128(_mm_set1_ps(1.0f)) {}
    __forceinline Vec2fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
    __forceinline Vec2fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}

    ////////////////////////////////////////////////////////////////////////////////
    /// Array Access
    ////////////////////////////////////////////////////////////////////////////////

    __forceinline const float& operator []( const size_t index ) const { assert(index < 2); return (&x)[index]; }
    __forceinline       float& operator []( const size_t index )       { assert(index < 2); return (&x)[index]; }
  };

  ////////////////////////////////////////////////////////////////////////////////
  /// Unary Operators
  ////////////////////////////////////////////////////////////////////////////////

  __forceinline Vec2fa operator +( const Vec2fa& a ) { return a; }
  __forceinline Vec2fa operator -( const Vec2fa& a ) {
    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
    return _mm_xor_ps(a.m128, mask);
  }
  __forceinline Vec2fa abs ( const Vec2fa& a ) {
    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
    return _mm_and_ps(a.m128, mask);
  }
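  // sign() maps each lane to +1 or -1; lanes equal to zero compare as
  // not-less-than-zero and therefore yield +1.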
  __forceinline Vec2fa sign ( const Vec2fa& a ) {
    return blendv_ps(Vec2fa(one), -Vec2fa(one), _mm_cmplt_ps (a,Vec2fa(zero)));
  }
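
  // rcp() refines the hardware reciprocal estimate with one Newton-Raphson
  // step, r' = r + r*(1 - a*r), roughly doubling the number of correct bits.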

  __forceinline Vec2fa rcp ( const Vec2fa& a )
  {
#if defined(__AVX512VL__)
    const Vec2fa r = _mm_rcp14_ps(a.m128);
#else
    const Vec2fa r = _mm_rcp_ps(a.m128);
#endif

#if defined(__AVX2__)
    const Vec2fa h_n = _mm_fnmadd_ps(a, r, vfloat4(1.0));  // First, compute 1 - a * r (which will be very close to 0)
    const Vec2fa res = _mm_fmadd_ps(r, h_n, r);            // Then compute r + r * h_n
#else
    const Vec2fa h_n = _mm_sub_ps(vfloat4(1.0f), _mm_mul_ps(a, r));  // First, compute 1 - a * r (which will be very close to 0)
    const Vec2fa res = _mm_add_ps(r,_mm_mul_ps(r, h_n));             // Then compute r + r * h_n
#endif

    return res;
  }

  __forceinline Vec2fa sqrt ( const Vec2fa& a ) { return _mm_sqrt_ps(a.m128); }
  __forceinline Vec2fa sqr  ( const Vec2fa& a ) { return _mm_mul_ps(a,a); }
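
  // rsqrt() applies one Newton-Raphson step to the hardware estimate:
  // r' = 1.5*r - 0.5*a*r^3.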

  __forceinline Vec2fa rsqrt( const Vec2fa& a )
  {
#if defined(__AVX512VL__)
    __m128 r = _mm_rsqrt14_ps(a.m128);
#else
    __m128 r = _mm_rsqrt_ps(a.m128);
#endif
    return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
  }
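
  // zero_fix() replaces lanes whose magnitude is below min_rcp_input, so that
  // rcp_safe() never takes the reciprocal of (almost) zero.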

  __forceinline Vec2fa zero_fix(const Vec2fa& a) {
    return blendv_ps(a, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input)));
  }
  __forceinline Vec2fa rcp_safe(const Vec2fa& a) {
    return rcp(zero_fix(a));
  }
  __forceinline Vec2fa log ( const Vec2fa& a ) {
    return Vec2fa(logf(a.x),logf(a.y));
  }

  __forceinline Vec2fa exp ( const Vec2fa& a ) {
    return Vec2fa(expf(a.x),expf(a.y));
  }

  ////////////////////////////////////////////////////////////////////////////////
  /// Binary Operators
  ////////////////////////////////////////////////////////////////////////////////

  __forceinline Vec2fa operator +( const Vec2fa& a, const Vec2fa& b ) { return _mm_add_ps(a.m128, b.m128); }
  __forceinline Vec2fa operator -( const Vec2fa& a, const Vec2fa& b ) { return _mm_sub_ps(a.m128, b.m128); }
  __forceinline Vec2fa operator *( const Vec2fa& a, const Vec2fa& b ) { return _mm_mul_ps(a.m128, b.m128); }
  __forceinline Vec2fa operator *( const Vec2fa& a, const float b ) { return a * Vec2fa(b); }
  __forceinline Vec2fa operator *( const float a, const Vec2fa& b ) { return Vec2fa(a) * b; }
  __forceinline Vec2fa operator /( const Vec2fa& a, const Vec2fa& b ) { return _mm_div_ps(a.m128,b.m128); }
  __forceinline Vec2fa operator /( const Vec2fa& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); }
  __forceinline Vec2fa operator /( const float a, const Vec2fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); }

  __forceinline Vec2fa min( const Vec2fa& a, const Vec2fa& b ) { return _mm_min_ps(a.m128,b.m128); }
  __forceinline Vec2fa max( const Vec2fa& a, const Vec2fa& b ) { return _mm_max_ps(a.m128,b.m128); }

#if defined(__SSE4_1__)
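  // mini()/maxi() compare the raw float bits as signed integers; this matches
  // the float ordering only when both inputs are non-negative.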
  __forceinline Vec2fa mini(const Vec2fa& a, const Vec2fa& b) {
    const vint4 ai = _mm_castps_si128(a);
    const vint4 bi = _mm_castps_si128(b);
    const vint4 ci = _mm_min_epi32(ai,bi);
    return _mm_castsi128_ps(ci);
  }
#endif

#if defined(__SSE4_1__)
  __forceinline Vec2fa maxi(const Vec2fa& a, const Vec2fa& b) {
    const vint4 ai = _mm_castps_si128(a);
    const vint4 bi = _mm_castps_si128(b);
    const vint4 ci = _mm_max_epi32(ai,bi);
    return _mm_castsi128_ps(ci);
  }
#endif

  __forceinline Vec2fa pow ( const Vec2fa& a, const float& b ) {
    return Vec2fa(powf(a.x,b),powf(a.y,b));
  }

  ////////////////////////////////////////////////////////////////////////////////
  /// Ternary Operators
  ////////////////////////////////////////////////////////////////////////////////

#if defined(__AVX2__)
  __forceinline Vec2fa madd  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmadd_ps(a,b,c); }
  __forceinline Vec2fa msub  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmsub_ps(a,b,c); }
  __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmadd_ps(a,b,c); }
  __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmsub_ps(a,b,c); }
#else
  __forceinline Vec2fa madd  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b+c; }
  __forceinline Vec2fa msub  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b-c; }
  __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b+c; }
  __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b-c; }
#endif

  __forceinline Vec2fa madd  ( const float a, const Vec2fa& b, const Vec2fa& c) { return madd(Vec2fa(a),b,c); }
  __forceinline Vec2fa msub  ( const float a, const Vec2fa& b, const Vec2fa& c) { return msub(Vec2fa(a),b,c); }
  __forceinline Vec2fa nmadd ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmadd(Vec2fa(a),b,c); }
  __forceinline Vec2fa nmsub ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmsub(Vec2fa(a),b,c); }

  ////////////////////////////////////////////////////////////////////////////////
  /// Assignment Operators
  ////////////////////////////////////////////////////////////////////////////////

  __forceinline Vec2fa& operator +=( Vec2fa& a, const Vec2fa& b ) { return a = a + b; }
  __forceinline Vec2fa& operator -=( Vec2fa& a, const Vec2fa& b ) { return a = a - b; }
  __forceinline Vec2fa& operator *=( Vec2fa& a, const Vec2fa& b ) { return a = a * b; }
  __forceinline Vec2fa& operator *=( Vec2fa& a, const float   b ) { return a = a * b; }
  __forceinline Vec2fa& operator /=( Vec2fa& a, const Vec2fa& b ) { return a = a / b; }
  __forceinline Vec2fa& operator /=( Vec2fa& a, const float   b ) { return a = a / b; }

  ////////////////////////////////////////////////////////////////////////////////
  /// Reductions
  ////////////////////////////////////////////////////////////////////////////////

  __forceinline float reduce_add(const Vec2fa& v) { return v.x+v.y; }
  __forceinline float reduce_mul(const Vec2fa& v) { return v.x*v.y; }
  __forceinline float reduce_min(const Vec2fa& v) { return min(v.x,v.y); }
  __forceinline float reduce_max(const Vec2fa& v) { return max(v.x,v.y); }

  ////////////////////////////////////////////////////////////////////////////////
  /// Comparison Operators
  ////////////////////////////////////////////////////////////////////////////////

  __forceinline bool operator ==( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 3) == 3; }
  __forceinline bool operator !=( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 3) != 0; }

  ////////////////////////////////////////////////////////////////////////////////
  /// Euclidian Space Operators
  ////////////////////////////////////////////////////////////////////////////////

#if defined(__SSE4_1__)
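  // _mm_dp_ps with mask 0x3F multiplies the two low lanes and broadcasts the
  // sum, so the scalar result is a.x*b.x + a.y*b.y.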
  __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) {
    return _mm_cvtss_f32(_mm_dp_ps(a,b,0x3F));
  }
#else
  __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) {
    return reduce_add(a*b);
  }
#endif
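
  // In 2D, cross(a) is the perpendicular vector, i.e. a rotated by +90 degrees.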

  __forceinline Vec2fa cross ( const Vec2fa& a ) {
    return Vec2fa(-a.y,a.x);
  }

  __forceinline float  sqr_length ( const Vec2fa& a )                  { return dot(a,a); }
  __forceinline float  rcp_length ( const Vec2fa& a )                  { return rsqrt(dot(a,a)); }
  __forceinline float  rcp_length2( const Vec2fa& a )                  { return rcp(dot(a,a)); }
  __forceinline float  length     ( const Vec2fa& a )                  { return sqrt(dot(a,a)); }
  __forceinline Vec2fa normalize  ( const Vec2fa& a )                  { return a*rsqrt(dot(a,a)); }
  __forceinline float  distance   ( const Vec2fa& a, const Vec2fa& b ) { return length(a-b); }

  ////////////////////////////////////////////////////////////////////////////////
  /// Select
  ////////////////////////////////////////////////////////////////////////////////
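
  // select() expands the bool into an all-ones or all-zero lane mask and
  // blends whole vectors; both arguments are evaluated regardless of s.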

  __forceinline Vec2fa select( bool s, const Vec2fa& t, const Vec2fa& f ) {
    __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps();
    return blendv_ps(f, t, mask);
  }
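
  // lerp() uses the formulation (1-t)*v0 + t*v1, which reproduces the
  // endpoints exactly at t=0 and t=1.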

  __forceinline Vec2fa lerp(const Vec2fa& v0, const Vec2fa& v1, const float t) {
    return madd(1.0f-t,v0,t*v1);
  }

  __forceinline int maxDim ( const Vec2fa& a )
  {
    const Vec2fa b = abs(a);
    if (b.x > b.y) return 0;
    else return 1;
  }

  ////////////////////////////////////////////////////////////////////////////////
  /// Rounding Functions
  ////////////////////////////////////////////////////////////////////////////////

#if defined(__aarch64__)
  //__forceinline Vec2fa trunc(const Vec2fa& a) { return vrndq_f32(a); }
  __forceinline Vec2fa floor(const Vec2fa& a) { return vrndmq_f32(a); }
  __forceinline Vec2fa ceil (const Vec2fa& a) { return vrndpq_f32(a); }
#elif defined (__SSE4_1__)
  //__forceinline Vec2fa trunc( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_ZERO); }
  __forceinline Vec2fa floor( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); }
  __forceinline Vec2fa ceil ( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); }
#else
  //__forceinline Vec2fa trunc( const Vec2fa& a ) { return Vec2fa(truncf(a.x),truncf(a.y)); }
  __forceinline Vec2fa floor( const Vec2fa& a ) { return Vec2fa(floorf(a.x),floorf(a.y)); }
  __forceinline Vec2fa ceil ( const Vec2fa& a ) { return Vec2fa(ceilf (a.x),ceilf (a.y)); }
#endif

  ////////////////////////////////////////////////////////////////////////////////
  /// Output Operators
  ////////////////////////////////////////////////////////////////////////////////

  __forceinline embree_ostream operator<<(embree_ostream cout, const Vec2fa& a) {
    return cout << "(" << a.x << ", " << a.y << ")";
  }

  typedef Vec2fa Vec2fa_t;
}
| 303 | |