// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
3
4#pragma once
5
6#include "../sys/alloc.h"
7#include "math.h"
8#include "../simd/sse.h"
9
10namespace embree
11{
////////////////////////////////////////////////////////////////////////////////
/// SSE Vec2fa Type
////////////////////////////////////////////////////////////////////////////////
15
16 struct __aligned(16) Vec2fa
17 {
18 ALIGNED_STRUCT_(16);
19
20 typedef float Scalar;
21 enum { N = 2 };
22 union {
23 __m128 m128;
24 struct { float x,y,az,aw; };
25 };
26
27 ////////////////////////////////////////////////////////////////////////////////
28 /// Constructors, Assignment & Cast Operators
29 ////////////////////////////////////////////////////////////////////////////////
30
31 __forceinline Vec2fa( ) {}
32 __forceinline Vec2fa( const __m128 a ) : m128(a) {}
33
34 __forceinline Vec2fa ( const Vec2<float>& other ) { x = other.x; y = other.y; }
35 __forceinline Vec2fa& operator =( const Vec2<float>& other ) { x = other.x; y = other.y; return *this; }
36
37 __forceinline Vec2fa ( const Vec2fa& other ) { m128 = other.m128; }
38 __forceinline Vec2fa& operator =( const Vec2fa& other ) { m128 = other.m128; return *this; }
39
40 __forceinline explicit Vec2fa( const float a ) : m128(_mm_set1_ps(w: a)) {}
41 __forceinline Vec2fa( const float x, const float y) : m128(_mm_set_ps(z: y, y: y, x: y, w: x)) {}
42
43 __forceinline explicit Vec2fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a: a)) {}
44
45 __forceinline operator const __m128&() const { return m128; }
46 __forceinline operator __m128&() { return m128; }
47
48 ////////////////////////////////////////////////////////////////////////////////
49 /// Loads and Stores
50 ////////////////////////////////////////////////////////////////////////////////
51
52 static __forceinline Vec2fa load( const void* const a ) {
53 return Vec2fa(_mm_and_ps(a: _mm_load_ps(p: (float*)a),b: _mm_castsi128_ps(a: _mm_set_epi32(i3: 0, i2: 0, i1: -1, i0: -1))));
54 }
55
56 static __forceinline Vec2fa loadu( const void* const a ) {
57 return Vec2fa(_mm_and_ps(a: _mm_loadu_ps(p: (float*)a),b: _mm_castsi128_ps(a: _mm_set_epi32(i3: 0, i2: 0, i1: -1, i0: -1))));
58 }
59
60 static __forceinline void storeu ( void* ptr, const Vec2fa& v ) {
61 _mm_storeu_ps(p: (float*)ptr,a: v);
62 }
63
64 ////////////////////////////////////////////////////////////////////////////////
65 /// Constants
66 ////////////////////////////////////////////////////////////////////////////////
67
68 __forceinline Vec2fa( ZeroTy ) : m128(_mm_setzero_ps()) {}
69 __forceinline Vec2fa( OneTy ) : m128(_mm_set1_ps(w: 1.0f)) {}
70 __forceinline Vec2fa( PosInfTy ) : m128(_mm_set1_ps(w: pos_inf)) {}
71 __forceinline Vec2fa( NegInfTy ) : m128(_mm_set1_ps(w: neg_inf)) {}
72
73 ////////////////////////////////////////////////////////////////////////////////
74 /// Array Access
75 ////////////////////////////////////////////////////////////////////////////////
76
77 __forceinline const float& operator []( const size_t index ) const { assert(index < 2); return (&x)[index]; }
78 __forceinline float& operator []( const size_t index ) { assert(index < 2); return (&x)[index]; }
79 };
80
////////////////////////////////////////////////////////////////////////////////
/// Unary Operators
////////////////////////////////////////////////////////////////////////////////
84
85 __forceinline Vec2fa operator +( const Vec2fa& a ) { return a; }
86 __forceinline Vec2fa operator -( const Vec2fa& a ) {
87 const __m128 mask = _mm_castsi128_ps(a: _mm_set1_epi32(i: 0x80000000));
88 return _mm_xor_ps(a: a.m128, b: mask);
89 }
90 __forceinline Vec2fa abs ( const Vec2fa& a ) {
91 const __m128 mask = _mm_castsi128_ps(a: _mm_set1_epi32(i: 0x7fffffff));
92 return _mm_and_ps(a: a.m128, b: mask);
93 }
94 __forceinline Vec2fa sign ( const Vec2fa& a ) {
95 return blendv_ps(f: Vec2fa(one), t: -Vec2fa(one), mask: _mm_cmplt_ps (a: a,b: Vec2fa(zero)));
96 }
97
98 __forceinline Vec2fa rcp ( const Vec2fa& a )
99 {
100#if defined(__AVX512VL__)
101 const Vec2fa r = _mm_rcp14_ps(a.m128);
102#else
103 const Vec2fa r = _mm_rcp_ps(a: a.m128);
104#endif
105
106#if defined(__AVX2__)
107 const Vec2fa h_n = _mm_fnmadd_ps(a, r, vfloat4(1.0)); // First, compute 1 - a * r (which will be very close to 0)
108 const Vec2fa res = _mm_fmadd_ps(r, h_n, r); // Then compute r + r * h_n
109#else
110 const Vec2fa h_n = _mm_sub_ps(a: vfloat4(1.0f), b: _mm_mul_ps(a: a, b: r)); // First, compute 1 - a * r (which will be very close to 0)
111 const Vec2fa res = _mm_add_ps(a: r,b: _mm_mul_ps(a: r, b: h_n)); // Then compute r + r * h_n
112#endif
113
114 return res;
115 }
116
117 __forceinline Vec2fa sqrt ( const Vec2fa& a ) { return _mm_sqrt_ps(a: a.m128); }
118 __forceinline Vec2fa sqr ( const Vec2fa& a ) { return _mm_mul_ps(a: a,b: a); }
119
120 __forceinline Vec2fa rsqrt( const Vec2fa& a )
121 {
122#if defined(__AVX512VL__)
123 __m128 r = _mm_rsqrt14_ps(a.m128);
124#else
125 __m128 r = _mm_rsqrt_ps(a: a.m128);
126#endif
127 return _mm_add_ps(a: _mm_mul_ps(a: _mm_set1_ps(w: 1.5f),b: r), b: _mm_mul_ps(a: _mm_mul_ps(a: _mm_mul_ps(a: a, b: _mm_set1_ps(w: -0.5f)), b: r), b: _mm_mul_ps(a: r, b: r)));
128 }
129
130 __forceinline Vec2fa zero_fix(const Vec2fa& a) {
131 return blendv_ps(f: a, t: _mm_set1_ps(w: min_rcp_input), mask: _mm_cmplt_ps (a: abs(a).m128, b: _mm_set1_ps(w: min_rcp_input)));
132 }
133 __forceinline Vec2fa rcp_safe(const Vec2fa& a) {
134 return rcp(a: zero_fix(a));
135 }
136 __forceinline Vec2fa log ( const Vec2fa& a ) {
137 return Vec2fa(logf(x: a.x),logf(x: a.y));
138 }
139
140 __forceinline Vec2fa exp ( const Vec2fa& a ) {
141 return Vec2fa(expf(x: a.x),expf(x: a.y));
142 }
143
////////////////////////////////////////////////////////////////////////////////
/// Binary Operators
////////////////////////////////////////////////////////////////////////////////
147
148 __forceinline Vec2fa operator +( const Vec2fa& a, const Vec2fa& b ) { return _mm_add_ps(a: a.m128, b: b.m128); }
149 __forceinline Vec2fa operator -( const Vec2fa& a, const Vec2fa& b ) { return _mm_sub_ps(a: a.m128, b: b.m128); }
150 __forceinline Vec2fa operator *( const Vec2fa& a, const Vec2fa& b ) { return _mm_mul_ps(a: a.m128, b: b.m128); }
151 __forceinline Vec2fa operator *( const Vec2fa& a, const float b ) { return a * Vec2fa(b); }
152 __forceinline Vec2fa operator *( const float a, const Vec2fa& b ) { return Vec2fa(a) * b; }
153 __forceinline Vec2fa operator /( const Vec2fa& a, const Vec2fa& b ) { return _mm_div_ps(a: a.m128,b: b.m128); }
154 __forceinline Vec2fa operator /( const Vec2fa& a, const float b ) { return _mm_div_ps(a: a.m128,b: _mm_set1_ps(w: b)); }
155 __forceinline Vec2fa operator /( const float a, const Vec2fa& b ) { return _mm_div_ps(a: _mm_set1_ps(w: a),b: b.m128); }
156
157 __forceinline Vec2fa min( const Vec2fa& a, const Vec2fa& b ) { return _mm_min_ps(a: a.m128,b: b.m128); }
158 __forceinline Vec2fa max( const Vec2fa& a, const Vec2fa& b ) { return _mm_max_ps(a: a.m128,b: b.m128); }
159
#if defined(__SSE4_1__)
/// Per-lane minimum taken on the raw bit patterns of a and b, reinterpreted as
/// signed 32-bit integers; the winning bit pattern is returned as float lanes.
__forceinline Vec2fa mini(const Vec2fa& a, const Vec2fa& b) {
  const vint4 ai = _mm_castps_si128(a);
  const vint4 bi = _mm_castps_si128(b);
  const vint4 ci = _mm_min_epi32(ai,bi);
  return _mm_castsi128_ps(ci);
}
#endif
168
#if defined(__SSE4_1__)
/// Per-lane maximum taken on the raw bit patterns of a and b, reinterpreted as
/// signed 32-bit integers; the winning bit pattern is returned as float lanes.
__forceinline Vec2fa maxi(const Vec2fa& a, const Vec2fa& b) {
  const vint4 ai = _mm_castps_si128(a);
  const vint4 bi = _mm_castps_si128(b);
  const vint4 ci = _mm_max_epi32(ai,bi);
  return _mm_castsi128_ps(ci);
}
#endif
177
178 __forceinline Vec2fa pow ( const Vec2fa& a, const float& b ) {
179 return Vec2fa(powf(x: a.x,y: b),powf(x: a.y,y: b));
180 }
181
////////////////////////////////////////////////////////////////////////////////
/// Ternary Operators
////////////////////////////////////////////////////////////////////////////////
185
186#if defined(__AVX2__)
187 __forceinline Vec2fa madd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmadd_ps(a,b,c); }
188 __forceinline Vec2fa msub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmsub_ps(a,b,c); }
189 __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmadd_ps(a,b,c); }
190 __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmsub_ps(a,b,c); }
191#else
192 __forceinline Vec2fa madd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b+c; }
193 __forceinline Vec2fa msub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b-c; }
194 __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b+c;}
195 __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b-c; }
196#endif
197
198 __forceinline Vec2fa madd ( const float a, const Vec2fa& b, const Vec2fa& c) { return madd(a: Vec2fa(a),b,c); }
199 __forceinline Vec2fa msub ( const float a, const Vec2fa& b, const Vec2fa& c) { return msub(a: Vec2fa(a),b,c); }
200 __forceinline Vec2fa nmadd ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmadd(a: Vec2fa(a),b,c); }
201 __forceinline Vec2fa nmsub ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmsub(a: Vec2fa(a),b,c); }
202
////////////////////////////////////////////////////////////////////////////////
/// Assignment Operators
////////////////////////////////////////////////////////////////////////////////
206
207 __forceinline Vec2fa& operator +=( Vec2fa& a, const Vec2fa& b ) { return a = a + b; }
208 __forceinline Vec2fa& operator -=( Vec2fa& a, const Vec2fa& b ) { return a = a - b; }
209 __forceinline Vec2fa& operator *=( Vec2fa& a, const Vec2fa& b ) { return a = a * b; }
210 __forceinline Vec2fa& operator *=( Vec2fa& a, const float b ) { return a = a * b; }
211 __forceinline Vec2fa& operator /=( Vec2fa& a, const Vec2fa& b ) { return a = a / b; }
212 __forceinline Vec2fa& operator /=( Vec2fa& a, const float b ) { return a = a / b; }
213
////////////////////////////////////////////////////////////////////////////////
/// Reductions
////////////////////////////////////////////////////////////////////////////////
217
218 __forceinline float reduce_add(const Vec2fa& v) { return v.x+v.y; }
219 __forceinline float reduce_mul(const Vec2fa& v) { return v.x*v.y; }
220 __forceinline float reduce_min(const Vec2fa& v) { return min(a: v.x,b: v.y); }
221 __forceinline float reduce_max(const Vec2fa& v) { return max(a: v.x,b: v.y); }
222
////////////////////////////////////////////////////////////////////////////////
/// Comparison Operators
////////////////////////////////////////////////////////////////////////////////
226
227 __forceinline bool operator ==( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(a: _mm_cmpeq_ps (a: a.m128, b: b.m128)) & 3) == 3; }
228 __forceinline bool operator !=( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(a: _mm_cmpneq_ps(a: a.m128, b: b.m128)) & 3) != 0; }
229
////////////////////////////////////////////////////////////////////////////////
/// Euclidean Space Operators
////////////////////////////////////////////////////////////////////////////////
233
234#if defined(__SSE4_1__)
235 __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) {
236 return _mm_cvtss_f32(_mm_dp_ps(a,b,0x3F));
237 }
238#else
239 __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) {
240 return reduce_add(v: a*b);
241 }
242#endif
243
244 __forceinline Vec2fa cross ( const Vec2fa& a ) {
245 return Vec2fa(-a.y,a.x);
246 }
247
248 __forceinline float sqr_length ( const Vec2fa& a ) { return dot(a,b: a); }
249 __forceinline float rcp_length ( const Vec2fa& a ) { return rsqrt(x: dot(a,b: a)); }
250 __forceinline float rcp_length2( const Vec2fa& a ) { return rcp(x: dot(a,b: a)); }
251 __forceinline float length ( const Vec2fa& a ) { return sqrt(x: dot(a,b: a)); }
252 __forceinline Vec2fa normalize( const Vec2fa& a ) { return a*rsqrt(x: dot(a,b: a)); }
253 __forceinline float distance ( const Vec2fa& a, const Vec2fa& b ) { return length(a: a-b); }
254
////////////////////////////////////////////////////////////////////////////////
/// Select
////////////////////////////////////////////////////////////////////////////////
258
259 __forceinline Vec2fa select( bool s, const Vec2fa& t, const Vec2fa& f ) {
260 __m128 mask = s ? _mm_castsi128_ps(a: _mm_cmpeq_epi32(a: _mm_setzero_si128(), b: _mm_setzero_si128())) : _mm_setzero_ps();
261 return blendv_ps(f, t, mask);
262 }
263
264 __forceinline Vec2fa lerp(const Vec2fa& v0, const Vec2fa& v1, const float t) {
265 return madd(a: 1.0f-t,b: v0,c: t*v1);
266 }
267
268 __forceinline int maxDim ( const Vec2fa& a )
269 {
270 const Vec2fa b = abs(a);
271 if (b.x > b.y) return 0;
272 else return 1;
273 }
274
////////////////////////////////////////////////////////////////////////////////
/// Rounding Functions
////////////////////////////////////////////////////////////////////////////////
278
279#if defined(__aarch64__)
280 //__forceinline Vec2fa trunc(const Vec2fa& a) { return vrndq_f32(a); }
281 __forceinline Vec2fa floor(const Vec2fa& a) { return vrndmq_f32(a); }
282 __forceinline Vec2fa ceil (const Vec2fa& a) { return vrndpq_f32(a); }
283#elif defined (__SSE4_1__)
284 //__forceinline Vec2fa trunc( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); }
285 __forceinline Vec2fa floor( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); }
286 __forceinline Vec2fa ceil ( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); }
287#else
288 //__forceinline Vec2fa trunc( const Vec2fa& a ) { return Vec2fa(truncf(a.x),truncf(a.y),truncf(a.z)); }
289 __forceinline Vec2fa floor( const Vec2fa& a ) { return Vec2fa(floorf(x: a.x),floorf(x: a.y)); }
290 __forceinline Vec2fa ceil ( const Vec2fa& a ) { return Vec2fa(ceilf (x: a.x),ceilf (x: a.y)); }
291#endif
292
////////////////////////////////////////////////////////////////////////////////
/// Output Operators
////////////////////////////////////////////////////////////////////////////////
296
297 __forceinline embree_ostream operator<<(embree_ostream cout, const Vec2fa& a) {
298 return cout << "(" << a.x << ", " << a.y << ")";
299 }
300
301 typedef Vec2fa Vec2fa_t;
302}
303

// source code of qtquick3d/src/3rdparty/embree/common/math/vec2fa.h