// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "../sys/alloc.h"
#include "math.h"
#include "../simd/sse.h"
10 | namespace embree |
11 | { |
12 | //////////////////////////////////////////////////////////////////////////////// |
13 | /// SSE Vec2fa Type |
14 | //////////////////////////////////////////////////////////////////////////////// |
15 | |
16 | struct __aligned(16) Vec2fa |
17 | { |
18 | ALIGNED_STRUCT_(16); |
19 | |
20 | typedef float Scalar; |
21 | enum { N = 2 }; |
22 | union { |
23 | __m128 m128; |
24 | struct { float x,y,az,aw; }; |
25 | }; |
26 | |
27 | //////////////////////////////////////////////////////////////////////////////// |
28 | /// Constructors, Assignment & Cast Operators |
29 | //////////////////////////////////////////////////////////////////////////////// |
30 | |
31 | __forceinline Vec2fa( ) {} |
32 | __forceinline Vec2fa( const __m128 a ) : m128(a) {} |
33 | |
34 | __forceinline Vec2fa ( const Vec2<float>& other ) { x = other.x; y = other.y; } |
35 | __forceinline Vec2fa& operator =( const Vec2<float>& other ) { x = other.x; y = other.y; return *this; } |
36 | |
37 | __forceinline Vec2fa ( const Vec2fa& other ) { m128 = other.m128; } |
38 | __forceinline Vec2fa& operator =( const Vec2fa& other ) { m128 = other.m128; return *this; } |
39 | |
40 | __forceinline explicit Vec2fa( const float a ) : m128(_mm_set1_ps(w: a)) {} |
41 | __forceinline Vec2fa( const float x, const float y) : m128(_mm_set_ps(z: y, y: y, x: y, w: x)) {} |
42 | |
43 | __forceinline explicit Vec2fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a: a)) {} |
44 | |
45 | __forceinline operator const __m128&() const { return m128; } |
46 | __forceinline operator __m128&() { return m128; } |
47 | |
48 | //////////////////////////////////////////////////////////////////////////////// |
49 | /// Loads and Stores |
50 | //////////////////////////////////////////////////////////////////////////////// |
51 | |
52 | static __forceinline Vec2fa load( const void* const a ) { |
53 | return Vec2fa(_mm_and_ps(a: _mm_load_ps(p: (float*)a),b: _mm_castsi128_ps(a: _mm_set_epi32(i3: 0, i2: 0, i1: -1, i0: -1)))); |
54 | } |
55 | |
56 | static __forceinline Vec2fa loadu( const void* const a ) { |
57 | return Vec2fa(_mm_and_ps(a: _mm_loadu_ps(p: (float*)a),b: _mm_castsi128_ps(a: _mm_set_epi32(i3: 0, i2: 0, i1: -1, i0: -1)))); |
58 | } |
59 | |
60 | static __forceinline void storeu ( void* ptr, const Vec2fa& v ) { |
61 | _mm_storeu_ps(p: (float*)ptr,a: v); |
62 | } |
63 | |
64 | //////////////////////////////////////////////////////////////////////////////// |
65 | /// Constants |
66 | //////////////////////////////////////////////////////////////////////////////// |
67 | |
68 | __forceinline Vec2fa( ZeroTy ) : m128(_mm_setzero_ps()) {} |
69 | __forceinline Vec2fa( OneTy ) : m128(_mm_set1_ps(w: 1.0f)) {} |
70 | __forceinline Vec2fa( PosInfTy ) : m128(_mm_set1_ps(w: pos_inf)) {} |
71 | __forceinline Vec2fa( NegInfTy ) : m128(_mm_set1_ps(w: neg_inf)) {} |
72 | |
73 | //////////////////////////////////////////////////////////////////////////////// |
74 | /// Array Access |
75 | //////////////////////////////////////////////////////////////////////////////// |
76 | |
77 | __forceinline const float& operator []( const size_t index ) const { assert(index < 2); return (&x)[index]; } |
78 | __forceinline float& operator []( const size_t index ) { assert(index < 2); return (&x)[index]; } |
79 | }; |
80 | |
81 | //////////////////////////////////////////////////////////////////////////////// |
82 | /// Unary Operators |
83 | //////////////////////////////////////////////////////////////////////////////// |
84 | |
85 | __forceinline Vec2fa operator +( const Vec2fa& a ) { return a; } |
86 | __forceinline Vec2fa operator -( const Vec2fa& a ) { |
87 | const __m128 mask = _mm_castsi128_ps(a: _mm_set1_epi32(i: 0x80000000)); |
88 | return _mm_xor_ps(a: a.m128, b: mask); |
89 | } |
90 | __forceinline Vec2fa abs ( const Vec2fa& a ) { |
91 | const __m128 mask = _mm_castsi128_ps(a: _mm_set1_epi32(i: 0x7fffffff)); |
92 | return _mm_and_ps(a: a.m128, b: mask); |
93 | } |
94 | __forceinline Vec2fa sign ( const Vec2fa& a ) { |
95 | return blendv_ps(f: Vec2fa(one), t: -Vec2fa(one), mask: _mm_cmplt_ps (a: a,b: Vec2fa(zero))); |
96 | } |
97 | |
98 | __forceinline Vec2fa rcp ( const Vec2fa& a ) |
99 | { |
100 | #if defined(__AVX512VL__) |
101 | const Vec2fa r = _mm_rcp14_ps(a.m128); |
102 | #else |
103 | const Vec2fa r = _mm_rcp_ps(a: a.m128); |
104 | #endif |
105 | |
106 | #if defined(__AVX2__) |
107 | const Vec2fa h_n = _mm_fnmadd_ps(a, r, vfloat4(1.0)); // First, compute 1 - a * r (which will be very close to 0) |
108 | const Vec2fa res = _mm_fmadd_ps(r, h_n, r); // Then compute r + r * h_n |
109 | #else |
110 | const Vec2fa h_n = _mm_sub_ps(a: vfloat4(1.0f), b: _mm_mul_ps(a: a, b: r)); // First, compute 1 - a * r (which will be very close to 0) |
111 | const Vec2fa res = _mm_add_ps(a: r,b: _mm_mul_ps(a: r, b: h_n)); // Then compute r + r * h_n |
112 | #endif |
113 | |
114 | return res; |
115 | } |
116 | |
117 | __forceinline Vec2fa sqrt ( const Vec2fa& a ) { return _mm_sqrt_ps(a: a.m128); } |
118 | __forceinline Vec2fa sqr ( const Vec2fa& a ) { return _mm_mul_ps(a: a,b: a); } |
119 | |
120 | __forceinline Vec2fa rsqrt( const Vec2fa& a ) |
121 | { |
122 | #if defined(__AVX512VL__) |
123 | __m128 r = _mm_rsqrt14_ps(a.m128); |
124 | #else |
125 | __m128 r = _mm_rsqrt_ps(a: a.m128); |
126 | #endif |
127 | return _mm_add_ps(a: _mm_mul_ps(a: _mm_set1_ps(w: 1.5f),b: r), b: _mm_mul_ps(a: _mm_mul_ps(a: _mm_mul_ps(a: a, b: _mm_set1_ps(w: -0.5f)), b: r), b: _mm_mul_ps(a: r, b: r))); |
128 | } |
129 | |
130 | __forceinline Vec2fa zero_fix(const Vec2fa& a) { |
131 | return blendv_ps(f: a, t: _mm_set1_ps(w: min_rcp_input), mask: _mm_cmplt_ps (a: abs(a).m128, b: _mm_set1_ps(w: min_rcp_input))); |
132 | } |
133 | __forceinline Vec2fa rcp_safe(const Vec2fa& a) { |
134 | return rcp(a: zero_fix(a)); |
135 | } |
136 | __forceinline Vec2fa log ( const Vec2fa& a ) { |
137 | return Vec2fa(logf(x: a.x),logf(x: a.y)); |
138 | } |
139 | |
140 | __forceinline Vec2fa exp ( const Vec2fa& a ) { |
141 | return Vec2fa(expf(x: a.x),expf(x: a.y)); |
142 | } |
143 | |
144 | //////////////////////////////////////////////////////////////////////////////// |
145 | /// Binary Operators |
146 | //////////////////////////////////////////////////////////////////////////////// |
147 | |
148 | __forceinline Vec2fa operator +( const Vec2fa& a, const Vec2fa& b ) { return _mm_add_ps(a: a.m128, b: b.m128); } |
149 | __forceinline Vec2fa operator -( const Vec2fa& a, const Vec2fa& b ) { return _mm_sub_ps(a: a.m128, b: b.m128); } |
150 | __forceinline Vec2fa operator *( const Vec2fa& a, const Vec2fa& b ) { return _mm_mul_ps(a: a.m128, b: b.m128); } |
151 | __forceinline Vec2fa operator *( const Vec2fa& a, const float b ) { return a * Vec2fa(b); } |
152 | __forceinline Vec2fa operator *( const float a, const Vec2fa& b ) { return Vec2fa(a) * b; } |
153 | __forceinline Vec2fa operator /( const Vec2fa& a, const Vec2fa& b ) { return _mm_div_ps(a: a.m128,b: b.m128); } |
154 | __forceinline Vec2fa operator /( const Vec2fa& a, const float b ) { return _mm_div_ps(a: a.m128,b: _mm_set1_ps(w: b)); } |
155 | __forceinline Vec2fa operator /( const float a, const Vec2fa& b ) { return _mm_div_ps(a: _mm_set1_ps(w: a),b: b.m128); } |
156 | |
157 | __forceinline Vec2fa min( const Vec2fa& a, const Vec2fa& b ) { return _mm_min_ps(a: a.m128,b: b.m128); } |
158 | __forceinline Vec2fa max( const Vec2fa& a, const Vec2fa& b ) { return _mm_max_ps(a: a.m128,b: b.m128); } |
159 | |
#if defined(__SSE4_1__)
  /*! min via signed-integer compare of the raw float bits;
   *  NOTE(review): this ordering trick matches float ordering only when both
   *  inputs are non-negative (IEEE-754 bit patterns are monotone there) —
   *  callers appear to rely on that precondition */
  __forceinline Vec2fa mini(const Vec2fa& a, const Vec2fa& b) {
    const vint4 ai = _mm_castps_si128(a);
    const vint4 bi = _mm_castps_si128(b);
    const vint4 ci = _mm_min_epi32(ai,bi);
    return _mm_castsi128_ps(ci);
  }
#endif

#if defined(__SSE4_1__)
  /*! max via signed-integer compare of the raw float bits; same non-negative
   *  precondition as mini() */
  __forceinline Vec2fa maxi(const Vec2fa& a, const Vec2fa& b) {
    const vint4 ai = _mm_castps_si128(a);
    const vint4 bi = _mm_castps_si128(b);
    const vint4 ci = _mm_max_epi32(ai,bi);
    return _mm_castsi128_ps(ci);
  }
#endif
177 | |
178 | __forceinline Vec2fa pow ( const Vec2fa& a, const float& b ) { |
179 | return Vec2fa(powf(x: a.x,y: b),powf(x: a.y,y: b)); |
180 | } |
181 | |
182 | //////////////////////////////////////////////////////////////////////////////// |
183 | /// Ternary Operators |
184 | //////////////////////////////////////////////////////////////////////////////// |
185 | |
186 | #if defined(__AVX2__) |
187 | __forceinline Vec2fa madd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmadd_ps(a,b,c); } |
188 | __forceinline Vec2fa msub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmsub_ps(a,b,c); } |
189 | __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmadd_ps(a,b,c); } |
190 | __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmsub_ps(a,b,c); } |
191 | #else |
192 | __forceinline Vec2fa madd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b+c; } |
193 | __forceinline Vec2fa msub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b-c; } |
194 | __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b+c;} |
195 | __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b-c; } |
196 | #endif |
197 | |
198 | __forceinline Vec2fa madd ( const float a, const Vec2fa& b, const Vec2fa& c) { return madd(a: Vec2fa(a),b,c); } |
199 | __forceinline Vec2fa msub ( const float a, const Vec2fa& b, const Vec2fa& c) { return msub(a: Vec2fa(a),b,c); } |
200 | __forceinline Vec2fa nmadd ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmadd(a: Vec2fa(a),b,c); } |
201 | __forceinline Vec2fa nmsub ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmsub(a: Vec2fa(a),b,c); } |
202 | |
203 | //////////////////////////////////////////////////////////////////////////////// |
204 | /// Assignment Operators |
205 | //////////////////////////////////////////////////////////////////////////////// |
206 | |
207 | __forceinline Vec2fa& operator +=( Vec2fa& a, const Vec2fa& b ) { return a = a + b; } |
208 | __forceinline Vec2fa& operator -=( Vec2fa& a, const Vec2fa& b ) { return a = a - b; } |
209 | __forceinline Vec2fa& operator *=( Vec2fa& a, const Vec2fa& b ) { return a = a * b; } |
210 | __forceinline Vec2fa& operator *=( Vec2fa& a, const float b ) { return a = a * b; } |
211 | __forceinline Vec2fa& operator /=( Vec2fa& a, const Vec2fa& b ) { return a = a / b; } |
212 | __forceinline Vec2fa& operator /=( Vec2fa& a, const float b ) { return a = a / b; } |
213 | |
214 | //////////////////////////////////////////////////////////////////////////////// |
215 | /// Reductions |
216 | //////////////////////////////////////////////////////////////////////////////// |
217 | |
218 | __forceinline float reduce_add(const Vec2fa& v) { return v.x+v.y; } |
219 | __forceinline float reduce_mul(const Vec2fa& v) { return v.x*v.y; } |
220 | __forceinline float reduce_min(const Vec2fa& v) { return min(a: v.x,b: v.y); } |
221 | __forceinline float reduce_max(const Vec2fa& v) { return max(a: v.x,b: v.y); } |
222 | |
223 | //////////////////////////////////////////////////////////////////////////////// |
224 | /// Comparison Operators |
225 | //////////////////////////////////////////////////////////////////////////////// |
226 | |
227 | __forceinline bool operator ==( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(a: _mm_cmpeq_ps (a: a.m128, b: b.m128)) & 3) == 3; } |
228 | __forceinline bool operator !=( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(a: _mm_cmpneq_ps(a: a.m128, b: b.m128)) & 3) != 0; } |
229 | |
230 | //////////////////////////////////////////////////////////////////////////////// |
231 | /// Euclidian Space Operators |
232 | //////////////////////////////////////////////////////////////////////////////// |
233 | |
234 | #if defined(__SSE4_1__) |
235 | __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) { |
236 | return _mm_cvtss_f32(_mm_dp_ps(a,b,0x3F)); |
237 | } |
238 | #else |
239 | __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) { |
240 | return reduce_add(v: a*b); |
241 | } |
242 | #endif |
243 | |
244 | __forceinline Vec2fa cross ( const Vec2fa& a ) { |
245 | return Vec2fa(-a.y,a.x); |
246 | } |
247 | |
248 | __forceinline float sqr_length ( const Vec2fa& a ) { return dot(a,b: a); } |
249 | __forceinline float rcp_length ( const Vec2fa& a ) { return rsqrt(x: dot(a,b: a)); } |
250 | __forceinline float rcp_length2( const Vec2fa& a ) { return rcp(x: dot(a,b: a)); } |
251 | __forceinline float length ( const Vec2fa& a ) { return sqrt(x: dot(a,b: a)); } |
252 | __forceinline Vec2fa normalize( const Vec2fa& a ) { return a*rsqrt(x: dot(a,b: a)); } |
253 | __forceinline float distance ( const Vec2fa& a, const Vec2fa& b ) { return length(a: a-b); } |
254 | |
255 | //////////////////////////////////////////////////////////////////////////////// |
256 | /// Select |
257 | //////////////////////////////////////////////////////////////////////////////// |
258 | |
259 | __forceinline Vec2fa select( bool s, const Vec2fa& t, const Vec2fa& f ) { |
260 | __m128 mask = s ? _mm_castsi128_ps(a: _mm_cmpeq_epi32(a: _mm_setzero_si128(), b: _mm_setzero_si128())) : _mm_setzero_ps(); |
261 | return blendv_ps(f, t, mask); |
262 | } |
263 | |
264 | __forceinline Vec2fa lerp(const Vec2fa& v0, const Vec2fa& v1, const float t) { |
265 | return madd(a: 1.0f-t,b: v0,c: t*v1); |
266 | } |
267 | |
268 | __forceinline int maxDim ( const Vec2fa& a ) |
269 | { |
270 | const Vec2fa b = abs(a); |
271 | if (b.x > b.y) return 0; |
272 | else return 1; |
273 | } |
274 | |
275 | //////////////////////////////////////////////////////////////////////////////// |
276 | /// Rounding Functions |
277 | //////////////////////////////////////////////////////////////////////////////// |
278 | |
279 | #if defined(__aarch64__) |
280 | //__forceinline Vec2fa trunc(const Vec2fa& a) { return vrndq_f32(a); } |
281 | __forceinline Vec2fa floor(const Vec2fa& a) { return vrndmq_f32(a); } |
282 | __forceinline Vec2fa ceil (const Vec2fa& a) { return vrndpq_f32(a); } |
283 | #elif defined (__SSE4_1__) |
284 | //__forceinline Vec2fa trunc( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); } |
285 | __forceinline Vec2fa floor( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); } |
286 | __forceinline Vec2fa ceil ( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); } |
287 | #else |
288 | //__forceinline Vec2fa trunc( const Vec2fa& a ) { return Vec2fa(truncf(a.x),truncf(a.y),truncf(a.z)); } |
289 | __forceinline Vec2fa floor( const Vec2fa& a ) { return Vec2fa(floorf(x: a.x),floorf(x: a.y)); } |
290 | __forceinline Vec2fa ceil ( const Vec2fa& a ) { return Vec2fa(ceilf (x: a.x),ceilf (x: a.y)); } |
291 | #endif |
292 | |
293 | //////////////////////////////////////////////////////////////////////////////// |
294 | /// Output Operators |
295 | //////////////////////////////////////////////////////////////////////////////// |
296 | |
297 | __forceinline embree_ostream operator<<(embree_ostream cout, const Vec2fa& a) { |
298 | return cout << "(" << a.x << ", " << a.y << ")" ; |
299 | } |
300 | |
301 | typedef Vec2fa Vec2fa_t; |
302 | } |
303 | |