1// Copyright 2009-2021 Intel Corporation
2// SPDX-License-Identifier: Apache-2.0
3
4#pragma once
5
6#include "../sys/alloc.h"
7#include "math.h"
8#include "../simd/sse.h"
9
10namespace embree
11{
12 ////////////////////////////////////////////////////////////////////////////////
13 /// SSE Vec3fa Type
14 ////////////////////////////////////////////////////////////////////////////////
15
16 struct __aligned(16) Vec3fa
17 {
18 ALIGNED_STRUCT_(16);
19
20 typedef float Scalar;
21 enum { N = 3 };
22 union {
23 __m128 m128;
24 struct { float x,y,z; };
25 };
26
27 ////////////////////////////////////////////////////////////////////////////////
28 /// Constructors, Assignment & Cast Operators
29 ////////////////////////////////////////////////////////////////////////////////
30
31 __forceinline Vec3fa( ) {}
32 __forceinline Vec3fa( const __m128 a ) : m128(a) {}
33
34 __forceinline Vec3fa ( const Vec3<float>& other ) { m128 = _mm_set_ps(z: 0, y: other.z, x: other.y, w: other.x); }
35 //__forceinline Vec3fa& operator =( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); return *this; }
36
37 __forceinline Vec3fa ( const Vec3fa& other ) { m128 = other.m128; }
38 __forceinline Vec3fa& operator =( const Vec3fa& other ) { m128 = other.m128; return *this; }
39
40 __forceinline explicit Vec3fa( const float a ) : m128(_mm_set1_ps(w: a)) {}
41 __forceinline Vec3fa( const float x, const float y, const float z) : m128(_mm_set_ps(z: 0, y: z, x: y, w: x)) {}
42
43 __forceinline explicit Vec3fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a: a)) {}
44
45 __forceinline explicit operator const vfloat4() const { return vfloat4(m128); }
46 __forceinline explicit operator const vint4() const { return vint4(_mm_cvtps_epi32(a: m128)); }
47 __forceinline explicit operator const Vec2fa() const { return Vec2fa(m128); }
48 __forceinline explicit operator const Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(a: m128)); }
49
50 //__forceinline operator const __m128&() const { return m128; }
51 //__forceinline operator __m128&() { return m128; }
52
53 ////////////////////////////////////////////////////////////////////////////////
54 /// Loads and Stores
55 ////////////////////////////////////////////////////////////////////////////////
56
57 static __forceinline Vec3fa load( const void* const a ) {
58 return Vec3fa(_mm_and_ps(a: _mm_load_ps(p: (float*)a),b: _mm_castsi128_ps(a: _mm_set_epi32(i3: 0, i2: -1, i1: -1, i0: -1))));
59 }
60
61 static __forceinline Vec3fa loadu( const void* const a ) {
62 return Vec3fa(_mm_loadu_ps(p: (float*)a));
63 }
64
65 static __forceinline void storeu ( void* ptr, const Vec3fa& v ) {
66 _mm_storeu_ps(p: (float*)ptr,a: v.m128);
67 }
68
69 ////////////////////////////////////////////////////////////////////////////////
70 /// Constants
71 ////////////////////////////////////////////////////////////////////////////////
72
73 __forceinline Vec3fa( ZeroTy ) : m128(_mm_setzero_ps()) {}
74 __forceinline Vec3fa( OneTy ) : m128(_mm_set1_ps(w: 1.0f)) {}
75 __forceinline Vec3fa( PosInfTy ) : m128(_mm_set1_ps(w: pos_inf)) {}
76 __forceinline Vec3fa( NegInfTy ) : m128(_mm_set1_ps(w: neg_inf)) {}
77
78 ////////////////////////////////////////////////////////////////////////////////
79 /// Array Access
80 ////////////////////////////////////////////////////////////////////////////////
81
82 __forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; }
83 __forceinline float& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; }
84 };
85
86 ////////////////////////////////////////////////////////////////////////////////
87 /// Unary Operators
88 ////////////////////////////////////////////////////////////////////////////////
89
90 __forceinline Vec3fa operator +( const Vec3fa& a ) { return a; }
91 __forceinline Vec3fa operator -( const Vec3fa& a ) {
92 const __m128 mask = _mm_castsi128_ps(a: _mm_set1_epi32(i: 0x80000000));
93 return _mm_xor_ps(a: a.m128, b: mask);
94 }
95 __forceinline Vec3fa abs ( const Vec3fa& a ) {
96 const __m128 mask = _mm_castsi128_ps(a: _mm_set1_epi32(i: 0x7fffffff));
97 return _mm_and_ps(a: a.m128, b: mask);
98 }
99 __forceinline Vec3fa sign ( const Vec3fa& a ) {
100 return blendv_ps(f: Vec3fa(one).m128, t: (-Vec3fa(one)).m128, mask: _mm_cmplt_ps (a: a.m128,b: Vec3fa(zero).m128));
101 }
102
103 __forceinline Vec3fa rcp ( const Vec3fa& a )
104 {
105#if defined(__AVX512VL__)
106 const Vec3fa r = _mm_rcp14_ps(a.m128);
107#else
108 const Vec3fa r = _mm_rcp_ps(a: a.m128);
109#endif
110
111#if defined(__AVX2__)
112 const Vec3fa h_n = _mm_fnmadd_ps(a.m128, r.m128, vfloat4(1.0)); // First, compute 1 - a * r (which will be very close to 0)
113 const Vec3fa res = _mm_fmadd_ps(r.m128, h_n.m128, r.m128); // Then compute r + r * h_n
114#else
115 const Vec3fa h_n = _mm_sub_ps(a: vfloat4(1.0f), b: _mm_mul_ps(a: a.m128, b: r.m128)); // First, compute 1 - a * r (which will be very close to 0)
116 const Vec3fa res = _mm_add_ps(a: r.m128,b: _mm_mul_ps(a: r.m128, b: h_n.m128)); // Then compute r + r * h_n
117#endif
118
119 return res;
120 }
121
122 __forceinline Vec3fa sqrt ( const Vec3fa& a ) { return _mm_sqrt_ps(a: a.m128); }
123 __forceinline Vec3fa sqr ( const Vec3fa& a ) { return _mm_mul_ps(a: a.m128,b: a.m128); }
124
125 __forceinline Vec3fa rsqrt( const Vec3fa& a )
126 {
127#if defined(__AVX512VL__)
128 __m128 r = _mm_rsqrt14_ps(a.m128);
129#else
130 __m128 r = _mm_rsqrt_ps(a: a.m128);
131#endif
132 return _mm_add_ps(a: _mm_mul_ps(a: _mm_set1_ps(w: 1.5f),b: r), b: _mm_mul_ps(a: _mm_mul_ps(a: _mm_mul_ps(a: a.m128, b: _mm_set1_ps(w: -0.5f)), b: r), b: _mm_mul_ps(a: r, b: r)));
133 }
134
135 __forceinline Vec3fa zero_fix(const Vec3fa& a) {
136 return blendv_ps(f: a.m128, t: _mm_set1_ps(w: min_rcp_input), mask: _mm_cmplt_ps (a: abs(a).m128, b: _mm_set1_ps(w: min_rcp_input)));
137 }
138 __forceinline Vec3fa rcp_safe(const Vec3fa& a) {
139 return rcp(a: zero_fix(a));
140 }
141 __forceinline Vec3fa log ( const Vec3fa& a ) {
142 return Vec3fa(logf(x: a.x),logf(x: a.y),logf(x: a.z));
143 }
144
145 __forceinline Vec3fa exp ( const Vec3fa& a ) {
146 return Vec3fa(expf(x: a.x),expf(x: a.y),expf(x: a.z));
147 }
148
149 ////////////////////////////////////////////////////////////////////////////////
150 /// Binary Operators
151 ////////////////////////////////////////////////////////////////////////////////
152
153 __forceinline Vec3fa operator +( const Vec3fa& a, const Vec3fa& b ) { return _mm_add_ps(a: a.m128, b: b.m128); }
154 __forceinline Vec3fa operator -( const Vec3fa& a, const Vec3fa& b ) { return _mm_sub_ps(a: a.m128, b: b.m128); }
155 __forceinline Vec3fa operator *( const Vec3fa& a, const Vec3fa& b ) { return _mm_mul_ps(a: a.m128, b: b.m128); }
156 __forceinline Vec3fa operator *( const Vec3fa& a, const float b ) { return a * Vec3fa(b); }
157 __forceinline Vec3fa operator *( const float a, const Vec3fa& b ) { return Vec3fa(a) * b; }
158 __forceinline Vec3fa operator /( const Vec3fa& a, const Vec3fa& b ) { return _mm_div_ps(a: a.m128,b: b.m128); }
159 __forceinline Vec3fa operator /( const Vec3fa& a, const float b ) { return _mm_div_ps(a: a.m128,b: _mm_set1_ps(w: b)); }
160 __forceinline Vec3fa operator /( const float a, const Vec3fa& b ) { return _mm_div_ps(a: _mm_set1_ps(w: a),b: b.m128); }
161
162 __forceinline Vec3fa min( const Vec3fa& a, const Vec3fa& b ) { return _mm_min_ps(a: a.m128,b: b.m128); }
163 __forceinline Vec3fa max( const Vec3fa& a, const Vec3fa& b ) { return _mm_max_ps(a: a.m128,b: b.m128); }
164
#if defined(__SSE4_1__)
 /*! per-lane minimum computed on the raw bit patterns as signed 32-bit integers */
 __forceinline Vec3fa mini(const Vec3fa& a, const Vec3fa& b) {
   const vint4 lhs_bits = _mm_castps_si128(a.m128);
   const vint4 rhs_bits = _mm_castps_si128(b.m128);
   const vint4 smaller  = _mm_min_epi32(lhs_bits, rhs_bits);
   return _mm_castsi128_ps(smaller);
 }
#endif
173
#if defined(__SSE4_1__)
 /*! per-lane maximum computed on the raw bit patterns as signed 32-bit integers */
 __forceinline Vec3fa maxi(const Vec3fa& a, const Vec3fa& b) {
   const vint4 lhs_bits = _mm_castps_si128(a.m128);
   const vint4 rhs_bits = _mm_castps_si128(b.m128);
   const vint4 larger   = _mm_max_epi32(lhs_bits, rhs_bits);
   return _mm_castsi128_ps(larger);
 }
#endif
182
183 __forceinline Vec3fa pow ( const Vec3fa& a, const float& b ) {
184 return Vec3fa(powf(x: a.x,y: b),powf(x: a.y,y: b),powf(x: a.z,y: b));
185 }
186
187 ////////////////////////////////////////////////////////////////////////////////
188 /// Ternary Operators
189 ////////////////////////////////////////////////////////////////////////////////
190
191#if defined(__AVX2__)
192 __forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); }
193 __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); }
194 __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); }
195 __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); }
196#else
197 __forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b+c; }
198 __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b-c; }
199 __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b+c;}
200 __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b-c; }
201#endif
202
203 __forceinline Vec3fa madd ( const float a, const Vec3fa& b, const Vec3fa& c) { return madd(a: Vec3fa(a),b,c); }
204 __forceinline Vec3fa msub ( const float a, const Vec3fa& b, const Vec3fa& c) { return msub(a: Vec3fa(a),b,c); }
205 __forceinline Vec3fa nmadd ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmadd(a: Vec3fa(a),b,c); }
206 __forceinline Vec3fa nmsub ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmsub(a: Vec3fa(a),b,c); }
207
208 ////////////////////////////////////////////////////////////////////////////////
209 /// Assignment Operators
210 ////////////////////////////////////////////////////////////////////////////////
211
212 __forceinline Vec3fa& operator +=( Vec3fa& a, const Vec3fa& b ) { return a = a + b; }
213 __forceinline Vec3fa& operator -=( Vec3fa& a, const Vec3fa& b ) { return a = a - b; }
214 __forceinline Vec3fa& operator *=( Vec3fa& a, const Vec3fa& b ) { return a = a * b; }
215 __forceinline Vec3fa& operator *=( Vec3fa& a, const float b ) { return a = a * b; }
216 __forceinline Vec3fa& operator /=( Vec3fa& a, const Vec3fa& b ) { return a = a / b; }
217 __forceinline Vec3fa& operator /=( Vec3fa& a, const float b ) { return a = a / b; }
218
219 ////////////////////////////////////////////////////////////////////////////////
220 /// Reductions
221 ////////////////////////////////////////////////////////////////////////////////
222
223 __forceinline float reduce_add(const Vec3fa& v) {
224 const vfloat4 a(v.m128);
225 const vfloat4 b = shuffle<1>(v: a);
226 const vfloat4 c = shuffle<2>(v: a);
227 return _mm_cvtss_f32(a: a+b+c);
228 }
229
230 __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; }
231 __forceinline float reduce_min(const Vec3fa& v) { return min(a: v.x,b: v.y,c: v.z); }
232 __forceinline float reduce_max(const Vec3fa& v) { return max(a: v.x,b: v.y,c: v.z); }
233
234 ////////////////////////////////////////////////////////////////////////////////
235 /// Comparison Operators
236 ////////////////////////////////////////////////////////////////////////////////
237
238 __forceinline bool operator ==( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(a: _mm_cmpeq_ps (a: a.m128, b: b.m128)) & 7) == 7; }
239 __forceinline bool operator !=( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(a: _mm_cmpneq_ps(a: a.m128, b: b.m128)) & 7) != 0; }
240
241 __forceinline Vec3ba eq_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpeq_ps (a: a.m128, b: b.m128); }
242 __forceinline Vec3ba neq_mask(const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpneq_ps(a: a.m128, b: b.m128); }
243 __forceinline Vec3ba lt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmplt_ps (a: a.m128, b: b.m128); }
244 __forceinline Vec3ba le_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmple_ps (a: a.m128, b: b.m128); }
245 __forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpnle_ps(a: a.m128, b: b.m128); }
246 __forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpnlt_ps(a: a.m128, b: b.m128); }
247
248 __forceinline bool isvalid ( const Vec3fa& v ) {
249 return all(b: gt_mask(a: v,b: Vec3fa(-FLT_LARGE)) & lt_mask(a: v,b: Vec3fa(+FLT_LARGE)));
250 }
251
252 __forceinline bool is_finite ( const Vec3fa& a ) {
253 return all(b: ge_mask(a,b: Vec3fa(-FLT_MAX)) & le_mask(a,b: Vec3fa(+FLT_MAX)));
254 }
255
256 __forceinline bool isvalid4 ( const Vec3fa& v ) {
257 return all(b: (vfloat4(v.m128) > vfloat4(-FLT_LARGE)) & (vfloat4(v.m128) < vfloat4(+FLT_LARGE)));
258 }
259
260 __forceinline bool is_finite4 ( const Vec3fa& a ) {
261 return all(b: (vfloat4(a.m128) >= vfloat4(-FLT_MAX)) & (vfloat4(a.m128) <= vfloat4(+FLT_MAX)));
262 }
263
264 ////////////////////////////////////////////////////////////////////////////////
 /// Euclidean Space Operators
266 ////////////////////////////////////////////////////////////////////////////////
267
268#if defined(__SSE4_1__)
269 __forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) {
270 return _mm_cvtss_f32(_mm_dp_ps(a.m128,b.m128,0x7F));
271 }
272#else
273 __forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) {
274 return reduce_add(v: a*b);
275 }
276#endif
277
278 __forceinline Vec3fa cross ( const Vec3fa& a, const Vec3fa& b )
279 {
280 vfloat4 a0 = vfloat4(a.m128);
281 vfloat4 b0 = shuffle<1,2,0,3>(v: vfloat4(b.m128));
282 vfloat4 a1 = shuffle<1,2,0,3>(v: vfloat4(a.m128));
283 vfloat4 b1 = vfloat4(b.m128);
284 return Vec3fa(shuffle<1,2,0,3>(v: msub(a: a0,b: b0,c: a1*b1)));
285 }
286
287 __forceinline float sqr_length ( const Vec3fa& a ) { return dot(a,b: a); }
288 __forceinline float rcp_length ( const Vec3fa& a ) { return rsqrt(x: dot(a,b: a)); }
289 __forceinline float rcp_length2( const Vec3fa& a ) { return rcp(x: dot(a,b: a)); }
290 __forceinline float length ( const Vec3fa& a ) { return sqrt(x: dot(a,b: a)); }
291 __forceinline Vec3fa normalize( const Vec3fa& a ) { return a*rsqrt(x: dot(a,b: a)); }
292 __forceinline float distance ( const Vec3fa& a, const Vec3fa& b ) { return length(a: a-b); }
293 __forceinline float halfArea ( const Vec3fa& d ) { return madd(a: d.x,b: (d.y+d.z),c: d.y*d.z); }
294 __forceinline float area ( const Vec3fa& d ) { return 2.0f*halfArea(d); }
295
296 __forceinline Vec3fa normalize_safe( const Vec3fa& a ) {
297 const float d = dot(a,b: a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(x: d);
298 }
299
300 /*! differentiated normalization */
301 __forceinline Vec3fa dnormalize(const Vec3fa& p, const Vec3fa& dp)
302 {
303 const float pp = dot(a: p,b: p);
304 const float pdp = dot(a: p,b: dp);
305 return (pp*dp-pdp*p)*rcp(x: pp)*rsqrt(x: pp);
306 }
307
308 ////////////////////////////////////////////////////////////////////////////////
309 /// Select
310 ////////////////////////////////////////////////////////////////////////////////
311
312 __forceinline Vec3fa select( bool s, const Vec3fa& t, const Vec3fa& f ) {
313 __m128 mask = s ? _mm_castsi128_ps(a: _mm_cmpeq_epi32(a: _mm_setzero_si128(), b: _mm_setzero_si128())) : _mm_setzero_ps();
314 return blendv_ps(f: f.m128, t: t.m128, mask);
315 }
316
317 __forceinline Vec3fa select( const Vec3ba& s, const Vec3fa& t, const Vec3fa& f ) {
318 return blendv_ps(f: f.m128, t: t.m128, mask: s);
319 }
320
321 __forceinline Vec3fa lerp(const Vec3fa& v0, const Vec3fa& v1, const float t) {
322 return madd(a: 1.0f-t,b: v0,c: t*v1);
323 }
324
325 __forceinline int maxDim ( const Vec3fa& a )
326 {
327 const Vec3fa b = abs(a);
328 if (b.x > b.y) {
329 if (b.x > b.z) return 0; else return 2;
330 } else {
331 if (b.y > b.z) return 1; else return 2;
332 }
333 }
334
335 ////////////////////////////////////////////////////////////////////////////////
336 /// Rounding Functions
337 ////////////////////////////////////////////////////////////////////////////////
338
339#if defined (__SSE4_1__)
340 __forceinline Vec3fa trunc( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); }
341 __forceinline Vec3fa floor( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); }
342 __forceinline Vec3fa ceil ( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); }
343#else
344 __forceinline Vec3fa trunc( const Vec3fa& a ) { return Vec3fa(truncf(x: a.x),truncf(x: a.y),truncf(x: a.z)); }
345 __forceinline Vec3fa floor( const Vec3fa& a ) { return Vec3fa(floorf(x: a.x),floorf(x: a.y),floorf(x: a.z)); }
346 __forceinline Vec3fa ceil ( const Vec3fa& a ) { return Vec3fa(ceilf (x: a.x),ceilf (x: a.y),ceilf (x: a.z)); }
347#endif
348
349 ////////////////////////////////////////////////////////////////////////////////
350 /// Output Operators
351 ////////////////////////////////////////////////////////////////////////////////
352
353 __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3fa& a) {
354 return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")";
355 }
356
 /*! alias: Vec3fa_t names the same type as Vec3fa */
 typedef Vec3fa Vec3fa_t;
358
359
360 ////////////////////////////////////////////////////////////////////////////////
361 /// SSE Vec3fx Type
362 ////////////////////////////////////////////////////////////////////////////////
363
364 struct __aligned(16) Vec3fx
365 {
366 ALIGNED_STRUCT_(16);
367
368 typedef float Scalar;
369 enum { N = 3 };
370 union {
371 __m128 m128;
372 struct { float x,y,z; union { int a; unsigned u; float w; }; };
373 };
374
375 ////////////////////////////////////////////////////////////////////////////////
376 /// Constructors, Assignment & Cast Operators
377 ////////////////////////////////////////////////////////////////////////////////
378
379 __forceinline Vec3fx( ) {}
380 __forceinline Vec3fx( const __m128 a ) : m128(a) {}
381
382 __forceinline explicit Vec3fx(const Vec3fa& v) : m128(v.m128) {}
383 __forceinline operator Vec3fa () const { return Vec3fa(m128); }
384
385 __forceinline explicit Vec3fx ( const Vec3<float>& other ) { m128 = _mm_set_ps(z: 0, y: other.z, x: other.y, w: other.x); }
386 //__forceinline Vec3fx& operator =( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); return *this; }
387
388 __forceinline Vec3fx ( const Vec3fx& other ) { m128 = other.m128; }
389
390 __forceinline Vec3fx& operator =( const Vec3fx& other ) { m128 = other.m128; return *this; }
391
392 __forceinline explicit Vec3fx( const float a ) : m128(_mm_set1_ps(w: a)) {}
393 __forceinline Vec3fx( const float x, const float y, const float z) : m128(_mm_set_ps(z: 0, y: z, x: y, w: x)) {}
394
395 __forceinline Vec3fx( const Vec3fa& other, const int a1) { m128 = other.m128; a = a1; }
396 __forceinline Vec3fx( const Vec3fa& other, const unsigned a1) { m128 = other.m128; u = a1; }
397 __forceinline Vec3fx( const Vec3fa& other, const float w1) {
398#if defined (__SSE4_1__)
399 m128 = _mm_insert_ps(other.m128, _mm_set_ss(w1),3 << 4);
400#else
401 const vint4 mask(-1,-1,-1,0);
402 m128 = select(m: vboolf4(_mm_castsi128_ps(a: mask)),t: vfloat4(other.m128),f: vfloat4(w1));
403#endif
404 }
405 //__forceinline Vec3fx( const float x, const float y, const float z, const int a) : x(x), y(y), z(z), a(a) {} // not working properly!
406 //__forceinline Vec3fx( const float x, const float y, const float z, const unsigned a) : x(x), y(y), z(z), u(a) {} // not working properly!
407 __forceinline Vec3fx( const float x, const float y, const float z, const float w) : m128(_mm_set_ps(z: w, y: z, x: y, w: x)) {}
408
409 //__forceinline explicit Vec3fx( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {}
410
411 __forceinline explicit operator const vfloat4() const { return vfloat4(m128); }
412 __forceinline explicit operator const vint4() const { return vint4(_mm_cvtps_epi32(a: m128)); }
413 __forceinline explicit operator const Vec2fa() const { return Vec2fa(m128); }
414 __forceinline explicit operator const Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(a: m128)); }
415
416 //__forceinline operator const __m128&() const { return m128; }
417 //__forceinline operator __m128&() { return m128; }
418
419 ////////////////////////////////////////////////////////////////////////////////
420 /// Loads and Stores
421 ////////////////////////////////////////////////////////////////////////////////
422
423 static __forceinline Vec3fx load( const void* const a ) {
424 return Vec3fx(_mm_and_ps(a: _mm_load_ps(p: (float*)a),b: _mm_castsi128_ps(a: _mm_set_epi32(i3: 0, i2: -1, i1: -1, i0: -1))));
425 }
426
427 static __forceinline Vec3fx loadu( const void* const a ) {
428 return Vec3fx(_mm_loadu_ps(p: (float*)a));
429 }
430
431 static __forceinline void storeu ( void* ptr, const Vec3fx& v ) {
432 _mm_storeu_ps(p: (float*)ptr,a: v.m128);
433 }
434
435 ////////////////////////////////////////////////////////////////////////////////
436 /// Constants
437 ////////////////////////////////////////////////////////////////////////////////
438
439 __forceinline Vec3fx( ZeroTy ) : m128(_mm_setzero_ps()) {}
440 __forceinline Vec3fx( OneTy ) : m128(_mm_set1_ps(w: 1.0f)) {}
441 __forceinline Vec3fx( PosInfTy ) : m128(_mm_set1_ps(w: pos_inf)) {}
442 __forceinline Vec3fx( NegInfTy ) : m128(_mm_set1_ps(w: neg_inf)) {}
443
444 ////////////////////////////////////////////////////////////////////////////////
445 /// Array Access
446 ////////////////////////////////////////////////////////////////////////////////
447
448 __forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; }
449 __forceinline float& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; }
450 };
451
452 ////////////////////////////////////////////////////////////////////////////////
453 /// Unary Operators
454 ////////////////////////////////////////////////////////////////////////////////
455
456 __forceinline Vec3fx operator +( const Vec3fx& a ) { return a; }
457 __forceinline Vec3fx operator -( const Vec3fx& a ) {
458 const __m128 mask = _mm_castsi128_ps(a: _mm_set1_epi32(i: 0x80000000));
459 return _mm_xor_ps(a: a.m128, b: mask);
460 }
461 __forceinline Vec3fx abs ( const Vec3fx& a ) {
462 const __m128 mask = _mm_castsi128_ps(a: _mm_set1_epi32(i: 0x7fffffff));
463 return _mm_and_ps(a: a.m128, b: mask);
464 }
465 __forceinline Vec3fx sign ( const Vec3fx& a ) {
466 return blendv_ps(f: Vec3fx(one).m128, t: (-Vec3fx(one)).m128, mask: _mm_cmplt_ps (a: a.m128,b: Vec3fx(zero).m128));
467 }
468
469 __forceinline Vec3fx rcp ( const Vec3fx& a )
470 {
471#if defined(__AVX512VL__)
472 const Vec3fx r = _mm_rcp14_ps(a.m128);
473#else
474 const Vec3fx r = _mm_rcp_ps(a: a.m128);
475#endif
476
477#if defined(__AVX2__)
478 const Vec3fx res = _mm_mul_ps(r.m128,_mm_fnmadd_ps(r.m128, a.m128, vfloat4(2.0f)));
479#else
480 const Vec3fx res = _mm_mul_ps(a: r.m128,b: _mm_sub_ps(a: vfloat4(2.0f), b: _mm_mul_ps(a: r.m128, b: a.m128)));
481 //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
482#endif
483
484 return res;
485 }
486
487 __forceinline Vec3fx sqrt ( const Vec3fx& a ) { return _mm_sqrt_ps(a: a.m128); }
488 __forceinline Vec3fx sqr ( const Vec3fx& a ) { return _mm_mul_ps(a: a.m128,b: a.m128); }
489
490 __forceinline Vec3fx rsqrt( const Vec3fx& a )
491 {
492#if defined(__AVX512VL__)
493 __m128 r = _mm_rsqrt14_ps(a.m128);
494#else
495 __m128 r = _mm_rsqrt_ps(a: a.m128);
496#endif
497 return _mm_add_ps(a: _mm_mul_ps(a: _mm_set1_ps(w: 1.5f),b: r), b: _mm_mul_ps(a: _mm_mul_ps(a: _mm_mul_ps(a: a.m128, b: _mm_set1_ps(w: -0.5f)), b: r), b: _mm_mul_ps(a: r, b: r)));
498 }
499
500 __forceinline Vec3fx zero_fix(const Vec3fx& a) {
501 return blendv_ps(f: a.m128, t: _mm_set1_ps(w: min_rcp_input), mask: _mm_cmplt_ps (a: abs(a).m128, b: _mm_set1_ps(w: min_rcp_input)));
502 }
503 __forceinline Vec3fx rcp_safe(const Vec3fx& a) {
504 return rcp(a: zero_fix(a));
505 }
506 __forceinline Vec3fx log ( const Vec3fx& a ) {
507 return Vec3fx(logf(x: a.x),logf(x: a.y),logf(x: a.z));
508 }
509
510 __forceinline Vec3fx exp ( const Vec3fx& a ) {
511 return Vec3fx(expf(x: a.x),expf(x: a.y),expf(x: a.z));
512 }
513
514 ////////////////////////////////////////////////////////////////////////////////
515 /// Binary Operators
516 ////////////////////////////////////////////////////////////////////////////////
517
518 __forceinline Vec3fx operator +( const Vec3fx& a, const Vec3fx& b ) { return _mm_add_ps(a: a.m128, b: b.m128); }
519 __forceinline Vec3fx operator -( const Vec3fx& a, const Vec3fx& b ) { return _mm_sub_ps(a: a.m128, b: b.m128); }
520 __forceinline Vec3fx operator *( const Vec3fx& a, const Vec3fx& b ) { return _mm_mul_ps(a: a.m128, b: b.m128); }
521 __forceinline Vec3fx operator *( const Vec3fx& a, const float b ) { return a * Vec3fx(b); }
522 __forceinline Vec3fx operator *( const float a, const Vec3fx& b ) { return Vec3fx(a) * b; }
523 __forceinline Vec3fx operator /( const Vec3fx& a, const Vec3fx& b ) { return _mm_div_ps(a: a.m128,b: b.m128); }
524 __forceinline Vec3fx operator /( const Vec3fx& a, const float b ) { return _mm_div_ps(a: a.m128,b: _mm_set1_ps(w: b)); }
525 __forceinline Vec3fx operator /( const float a, const Vec3fx& b ) { return _mm_div_ps(a: _mm_set1_ps(w: a),b: b.m128); }
526
527 __forceinline Vec3fx min( const Vec3fx& a, const Vec3fx& b ) { return _mm_min_ps(a: a.m128,b: b.m128); }
528 __forceinline Vec3fx max( const Vec3fx& a, const Vec3fx& b ) { return _mm_max_ps(a: a.m128,b: b.m128); }
529
#if defined(__SSE4_1__)
 /*! per-lane minimum computed on the raw bit patterns as signed 32-bit integers */
 __forceinline Vec3fx mini(const Vec3fx& a, const Vec3fx& b) {
   const vint4 lhs_bits = _mm_castps_si128(a.m128);
   const vint4 rhs_bits = _mm_castps_si128(b.m128);
   const vint4 smaller  = _mm_min_epi32(lhs_bits, rhs_bits);
   return _mm_castsi128_ps(smaller);
 }
#endif
538
#if defined(__SSE4_1__)
 /*! per-lane maximum computed on the raw bit patterns as signed 32-bit integers */
 __forceinline Vec3fx maxi(const Vec3fx& a, const Vec3fx& b) {
   const vint4 lhs_bits = _mm_castps_si128(a.m128);
   const vint4 rhs_bits = _mm_castps_si128(b.m128);
   const vint4 larger   = _mm_max_epi32(lhs_bits, rhs_bits);
   return _mm_castsi128_ps(larger);
 }
#endif
547
548 __forceinline Vec3fx pow ( const Vec3fx& a, const float& b ) {
549 return Vec3fx(powf(x: a.x,y: b),powf(x: a.y,y: b),powf(x: a.z,y: b));
550 }
551
552 ////////////////////////////////////////////////////////////////////////////////
553 /// Ternary Operators
554 ////////////////////////////////////////////////////////////////////////////////
555
556#if defined(__AVX2__)
557 __forceinline Vec3fx madd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); }
558 __forceinline Vec3fx msub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); }
559 __forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); }
560 __forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); }
561#else
562 __forceinline Vec3fx madd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return a*b+c; }
563 __forceinline Vec3fx msub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return a*b-c; }
564 __forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return -a*b+c;}
565 __forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return -a*b-c; }
566#endif
567
568 __forceinline Vec3fx madd ( const float a, const Vec3fx& b, const Vec3fx& c) { return madd(a: Vec3fx(a),b,c); }
569 __forceinline Vec3fx msub ( const float a, const Vec3fx& b, const Vec3fx& c) { return msub(a: Vec3fx(a),b,c); }
570 __forceinline Vec3fx nmadd ( const float a, const Vec3fx& b, const Vec3fx& c) { return nmadd(a: Vec3fx(a),b,c); }
571 __forceinline Vec3fx nmsub ( const float a, const Vec3fx& b, const Vec3fx& c) { return nmsub(a: Vec3fx(a),b,c); }
572
573 ////////////////////////////////////////////////////////////////////////////////
574 /// Assignment Operators
575 ////////////////////////////////////////////////////////////////////////////////
576
577 __forceinline Vec3fx& operator +=( Vec3fx& a, const Vec3fx& b ) { return a = a + b; }
578 __forceinline Vec3fx& operator -=( Vec3fx& a, const Vec3fx& b ) { return a = a - b; }
579 __forceinline Vec3fx& operator *=( Vec3fx& a, const Vec3fx& b ) { return a = a * b; }
580 __forceinline Vec3fx& operator *=( Vec3fx& a, const float b ) { return a = a * b; }
581 __forceinline Vec3fx& operator /=( Vec3fx& a, const Vec3fx& b ) { return a = a / b; }
582 __forceinline Vec3fx& operator /=( Vec3fx& a, const float b ) { return a = a / b; }
583
584 ////////////////////////////////////////////////////////////////////////////////
585 /// Reductions
586 ////////////////////////////////////////////////////////////////////////////////
587
588 __forceinline float reduce_add(const Vec3fx& v) {
589 const vfloat4 a(v.m128);
590 const vfloat4 b = shuffle<1>(v: a);
591 const vfloat4 c = shuffle<2>(v: a);
592 return _mm_cvtss_f32(a: a+b+c);
593 }
594
595 __forceinline float reduce_mul(const Vec3fx& v) { return v.x*v.y*v.z; }
596 __forceinline float reduce_min(const Vec3fx& v) { return min(a: v.x,b: v.y,c: v.z); }
597 __forceinline float reduce_max(const Vec3fx& v) { return max(a: v.x,b: v.y,c: v.z); }
598
599 ////////////////////////////////////////////////////////////////////////////////
600 /// Comparison Operators
601 ////////////////////////////////////////////////////////////////////////////////
602
603 __forceinline bool operator ==( const Vec3fx& a, const Vec3fx& b ) { return (_mm_movemask_ps(a: _mm_cmpeq_ps (a: a.m128, b: b.m128)) & 7) == 7; }
604 __forceinline bool operator !=( const Vec3fx& a, const Vec3fx& b ) { return (_mm_movemask_ps(a: _mm_cmpneq_ps(a: a.m128, b: b.m128)) & 7) != 0; }
605
606 __forceinline Vec3ba eq_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpeq_ps (a: a.m128, b: b.m128); }
607 __forceinline Vec3ba neq_mask(const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpneq_ps(a: a.m128, b: b.m128); }
608 __forceinline Vec3ba lt_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmplt_ps (a: a.m128, b: b.m128); }
609 __forceinline Vec3ba le_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmple_ps (a: a.m128, b: b.m128); }
610 __forceinline Vec3ba gt_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpnle_ps(a: a.m128, b: b.m128); }
611 __forceinline Vec3ba ge_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpnlt_ps(a: a.m128, b: b.m128); }
612
613 __forceinline bool isvalid ( const Vec3fx& v ) {
614 return all(b: gt_mask(a: v,b: Vec3fx(-FLT_LARGE)) & lt_mask(a: v,b: Vec3fx(+FLT_LARGE)));
615 }
616
617 __forceinline bool is_finite ( const Vec3fx& a ) {
618 return all(b: ge_mask(a,b: Vec3fx(-FLT_MAX)) & le_mask(a,b: Vec3fx(+FLT_MAX)));
619 }
620
621 __forceinline bool isvalid4 ( const Vec3fx& v ) {
622 return all(b: (vfloat4(v.m128) > vfloat4(-FLT_LARGE)) & (vfloat4(v.m128) < vfloat4(+FLT_LARGE)));
623 }
624
625 __forceinline bool is_finite4 ( const Vec3fx& a ) {
626 return all(b: (vfloat4(a.m128) >= vfloat4(-FLT_MAX)) & (vfloat4(a.m128) <= vfloat4(+FLT_MAX)));
627 }
628
629 ////////////////////////////////////////////////////////////////////////////////
 /// Euclidean Space Operators
631 ////////////////////////////////////////////////////////////////////////////////
632
633#if defined(__SSE4_1__)
634 __forceinline float dot ( const Vec3fx& a, const Vec3fx& b ) {
635 return _mm_cvtss_f32(_mm_dp_ps(a.m128,b.m128,0x7F));
636 }
637#else
638 __forceinline float dot ( const Vec3fx& a, const Vec3fx& b ) {
639 return reduce_add(v: a*b);
640 }
641#endif
642
643 __forceinline Vec3fx cross ( const Vec3fx& a, const Vec3fx& b )
644 {
645 vfloat4 a0 = vfloat4(a.m128);
646 vfloat4 b0 = shuffle<1,2,0,3>(v: vfloat4(b.m128));
647 vfloat4 a1 = shuffle<1,2,0,3>(v: vfloat4(a.m128));
648 vfloat4 b1 = vfloat4(b.m128);
649 return Vec3fx(shuffle<1,2,0,3>(v: msub(a: a0,b: b0,c: a1*b1)));
650 }
651
652 __forceinline float sqr_length ( const Vec3fx& a ) { return dot(a,b: a); }
653 __forceinline float rcp_length ( const Vec3fx& a ) { return rsqrt(x: dot(a,b: a)); }
654 __forceinline float rcp_length2( const Vec3fx& a ) { return rcp(x: dot(a,b: a)); }
655 __forceinline float length ( const Vec3fx& a ) { return sqrt(x: dot(a,b: a)); }
656 __forceinline Vec3fx normalize( const Vec3fx& a ) { return a*rsqrt(x: dot(a,b: a)); }
657 __forceinline float distance ( const Vec3fx& a, const Vec3fx& b ) { return length(a: a-b); }
658 __forceinline float halfArea ( const Vec3fx& d ) { return madd(a: d.x,b: (d.y+d.z),c: d.y*d.z); }
659 __forceinline float area ( const Vec3fx& d ) { return 2.0f*halfArea(d); }
660
661 __forceinline Vec3fx normalize_safe( const Vec3fx& a ) {
662 const float d = dot(a,b: a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(x: d);
663 }
664
665 /*! differentiated normalization */
666 __forceinline Vec3fx dnormalize(const Vec3fx& p, const Vec3fx& dp)
667 {
668 const float pp = dot(a: p,b: p);
669 const float pdp = dot(a: p,b: dp);
670 return (pp*dp-pdp*p)*rcp(x: pp)*rsqrt(x: pp);
671 }
672
673 ////////////////////////////////////////////////////////////////////////////////
674 /// Select
675 ////////////////////////////////////////////////////////////////////////////////
676
677 __forceinline Vec3fx select( bool s, const Vec3fx& t, const Vec3fx& f ) {
678 __m128 mask = s ? _mm_castsi128_ps(a: _mm_cmpeq_epi32(a: _mm_setzero_si128(), b: _mm_setzero_si128())) : _mm_setzero_ps();
679 return blendv_ps(f: f.m128, t: t.m128, mask);
680 }
681
682 __forceinline Vec3fx select( const Vec3ba& s, const Vec3fx& t, const Vec3fx& f ) {
683 return blendv_ps(f: f.m128, t: t.m128, mask: s);
684 }
685
686 __forceinline Vec3fx lerp(const Vec3fx& v0, const Vec3fx& v1, const float t) {
687 return madd(a: 1.0f-t,b: v0,c: t*v1);
688 }
689
690 __forceinline int maxDim ( const Vec3fx& a )
691 {
692 const Vec3fx b = abs(a);
693 if (b.x > b.y) {
694 if (b.x > b.z) return 0; else return 2;
695 } else {
696 if (b.y > b.z) return 1; else return 2;
697 }
698 }
699
700 ////////////////////////////////////////////////////////////////////////////////
701 /// Rounding Functions
702 ////////////////////////////////////////////////////////////////////////////////
703
704#if defined(__aarch64__)
705 __forceinline Vec3fx trunc(const Vec3fx& a) { return vrndq_f32(a.m128); }
706 __forceinline Vec3fx floor(const Vec3fx& a) { return vrndmq_f32(a.m128); }
707 __forceinline Vec3fx ceil (const Vec3fx& a) { return vrndpq_f32(a.m128); }
708#elif defined (__SSE4_1__)
709 __forceinline Vec3fx trunc( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); }
710 __forceinline Vec3fx floor( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); }
711 __forceinline Vec3fx ceil ( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); }
712#else
713 __forceinline Vec3fx trunc( const Vec3fx& a ) { return Vec3fx(truncf(x: a.x),truncf(x: a.y),truncf(x: a.z)); }
714 __forceinline Vec3fx floor( const Vec3fx& a ) { return Vec3fx(floorf(x: a.x),floorf(x: a.y),floorf(x: a.z)); }
715 __forceinline Vec3fx ceil ( const Vec3fx& a ) { return Vec3fx(ceilf (x: a.x),ceilf (x: a.y),ceilf (x: a.z)); }
716#endif
717
718 ////////////////////////////////////////////////////////////////////////////////
719 /// Output Operators
720 ////////////////////////////////////////////////////////////////////////////////
721
722 __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3fx& a) {
723 return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")";
724 }
725
726
 /*! alias: Vec3ff names the same type as Vec3fx */
 typedef Vec3fx Vec3ff;
728}
729

// source code of qtquick3d/src/3rdparty/embree/common/math/vec3fa.h