// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "../math/math.h"

#define vboolf vboolf_impl
#define vboold vboold_impl
#define vint vint_impl
#define vuint vuint_impl
#define vllong vllong_impl
#define vfloat vfloat_impl
#define vdouble vdouble_impl

namespace embree
{
  /* 4-wide SSE integer type */
  template<>
  struct vint<4>
  {
    ALIGNED_STRUCT_(16);

    typedef vboolf4 Bool;
    typedef vint4   Int;
    typedef vfloat4 Float;

    enum { size = 4 };              // number of SIMD elements
    union { __m128i v; int i[4]; }; // data

    ////////////////////////////////////////////////////////////////////////////////
    /// Constructors, Assignment & Cast Operators
    ////////////////////////////////////////////////////////////////////////////////

    __forceinline vint() {}
    __forceinline vint(const vint4& a) { v = a.v; }
    __forceinline vint4& operator =(const vint4& a) { v = a.v; return *this; }

    __forceinline vint(__m128i a) : v(a) {}
    __forceinline operator const __m128i&() const { return v; }
    __forceinline operator       __m128i&()       { return v; }

    __forceinline vint(int a) : v(_mm_set1_epi32(a)) {}
    __forceinline vint(int a, int b, int c, int d) : v(_mm_set_epi32(d, c, b, a)) {}

    __forceinline explicit vint(__m128 a) : v(_mm_cvtps_epi32(a)) {}
#if defined(__AVX512VL__)
    __forceinline explicit vint(const vboolf4& a) : v(_mm_movm_epi32(a)) {}
#else
    __forceinline explicit vint(const vboolf4& a) : v(_mm_castps_si128((__m128)a)) {}
#endif

    __forceinline vint(long long a, long long b) : v(_mm_set_epi64x(b,a)) {}

    ////////////////////////////////////////////////////////////////////////////////
    /// Constants
    ////////////////////////////////////////////////////////////////////////////////

    __forceinline vint(ZeroTy)        : v(_mm_setzero_si128()) {}
    __forceinline vint(OneTy)         : v(_mm_set_epi32(1, 1, 1, 1)) {}
    __forceinline vint(PosInfTy)      : v(_mm_set_epi32(pos_inf, pos_inf, pos_inf, pos_inf)) {}
    __forceinline vint(NegInfTy)      : v(_mm_set_epi32(neg_inf, neg_inf, neg_inf, neg_inf)) {}
    __forceinline vint(StepTy)        : v(_mm_set_epi32(3, 2, 1, 0)) {}
    __forceinline vint(ReverseStepTy) : v(_mm_set_epi32(0, 1, 2, 3)) {}

    __forceinline vint(TrueTy)      { v = _mm_cmpeq_epi32(v,v); }
    __forceinline vint(UndefinedTy) : v(_mm_castps_si128(_mm_undefined_ps())) {}

    ////////////////////////////////////////////////////////////////////////////////
    /// Loads and Stores
    ////////////////////////////////////////////////////////////////////////////////

    static __forceinline vint4 load (const void* a) { return _mm_load_si128((__m128i*)a); }
    static __forceinline vint4 loadu(const void* a) { return _mm_loadu_si128((__m128i*)a); }

    static __forceinline void store (void* ptr, const vint4& v) { _mm_store_si128((__m128i*)ptr,v); }
    static __forceinline void storeu(void* ptr, const vint4& v) { _mm_storeu_si128((__m128i*)ptr,v); }

#if defined(__AVX512VL__)

    static __forceinline vint4 compact(const vboolf4& mask, vint4 &v) {
      return _mm_mask_compress_epi32(v, mask, v);
    }
    static __forceinline vint4 compact(const vboolf4& mask, vint4 &a, const vint4& b) {
      return _mm_mask_compress_epi32(a, mask, b);
    }

    static __forceinline vint4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_epi32 (_mm_setzero_si128(),mask,ptr); }
    static __forceinline vint4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_epi32(_mm_setzero_si128(),mask,ptr); }

    static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& v) { _mm_mask_store_epi32 (ptr,mask,v); }
    static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& v) { _mm_mask_storeu_epi32(ptr,mask,v); }
#elif defined(__AVX__)
    static __forceinline vint4 load (const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); }
    static __forceinline vint4 loadu(const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); }

    static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); }
    static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); }
#else
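    // SSE2 fallback: the masked load is emulated by zeroing inactive lanes
    // with an AND, and the masked store by blending the new values over the
    // previous memory contents and writing back all four lanes. Unlike a
    // true masked store, this reads and rewrites the inactive lanes.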
    static __forceinline vint4 load (const vbool4& mask, const void* a) { return _mm_and_si128(_mm_load_si128 ((__m128i*)a),mask); }
    static __forceinline vint4 loadu(const vbool4& mask, const void* a) { return _mm_and_si128(_mm_loadu_si128((__m128i*)a),mask); }

    static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& i) { store (ptr,select(mask,i,load (ptr))); }
    static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& i) { storeu(ptr,select(mask,i,loadu(ptr))); }
#endif

#if defined(__SSE4_1__)
    static __forceinline vint4 load(const unsigned char* ptr) {
      return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr));
    }

    static __forceinline vint4 loadu(const unsigned char* ptr) {
      return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr));
    }
#else

    static __forceinline vint4 load(const unsigned char* ptr) {
      return vint4(ptr[0],ptr[1],ptr[2],ptr[3]);
    }

    static __forceinline vint4 loadu(const unsigned char* ptr) {
      return vint4(ptr[0],ptr[1],ptr[2],ptr[3]);
    }

#endif

    static __forceinline vint4 load(const unsigned short* ptr) {
#if defined (__SSE4_1__)
      return _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr));
#else
      return vint4(ptr[0],ptr[1],ptr[2],ptr[3]);
#endif
    }

    static __forceinline void store(unsigned char* ptr, const vint4& v) {
#if defined(__SSE4_1__)
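      // Two saturating packs narrow the 32-bit lanes to 16 bits and then to
      // 8 bits, leaving the four result bytes in the low dword of x.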
      __m128i x = v;
      x = _mm_packus_epi32(x, x);
      x = _mm_packus_epi16(x, x);
      *(int*)ptr = _mm_cvtsi128_si32(x);
#else
      for (size_t i=0;i<4;i++)
        ptr[i] = (unsigned char)v[i];
#endif
    }

    static __forceinline void store(unsigned short* ptr, const vint4& v) {
      for (size_t i=0;i<4;i++)
        ptr[i] = (unsigned short)v[i];
    }

    static __forceinline vint4 load_nt(void* ptr) {
#if defined(__SSE4_1__)
      return _mm_stream_load_si128((__m128i*)ptr);
#else
      return _mm_load_si128((__m128i*)ptr);
#endif
    }

    static __forceinline void store_nt(void* ptr, const vint4& v) {
#if defined(__SSE4_1__)
      _mm_stream_ps((float*)ptr, _mm_castsi128_ps(v));
#else
      _mm_store_si128((__m128i*)ptr,v);
#endif
    }

    template<int scale = 4>
    static __forceinline vint4 gather(const int* ptr, const vint4& index) {
#if defined(__AVX2__)
      return _mm_i32gather_epi32(ptr, index, scale);
#else
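      // No gather instruction before AVX2: fetch each lane with a scalar
      // load. The scale template parameter is a byte stride, matching the
      // semantics of _mm_i32gather_epi32.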
      return vint4(
          *(int*)(((char*)ptr)+scale*index[0]),
          *(int*)(((char*)ptr)+scale*index[1]),
          *(int*)(((char*)ptr)+scale*index[2]),
          *(int*)(((char*)ptr)+scale*index[3]));
#endif
    }

    template<int scale = 4>
    static __forceinline vint4 gather(const vboolf4& mask, const int* ptr, const vint4& index) {
      vint4 r = zero;
#if defined(__AVX512VL__)
      return _mm_mmask_i32gather_epi32(r, mask, index, ptr, scale);
#elif defined(__AVX2__)
      return _mm_mask_i32gather_epi32(r, ptr, index, mask, scale);
#else
      if (likely(mask[0])) r[0] = *(int*)(((char*)ptr)+scale*index[0]);
      if (likely(mask[1])) r[1] = *(int*)(((char*)ptr)+scale*index[1]);
      if (likely(mask[2])) r[2] = *(int*)(((char*)ptr)+scale*index[2]);
      if (likely(mask[3])) r[3] = *(int*)(((char*)ptr)+scale*index[3]);
      return r;
#endif
    }

    template<int scale = 4>
    static __forceinline void scatter(void* ptr, const vint4& index, const vint4& v)
    {
#if defined(__AVX512VL__)
      _mm_i32scatter_epi32((int*)ptr, index, v, scale);
#else
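      // Scatter instructions only exist with AVX-512 (AVX512VL here);
      // otherwise write each lane with a scalar store.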
      *(int*)(((char*)ptr)+scale*index[0]) = v[0];
      *(int*)(((char*)ptr)+scale*index[1]) = v[1];
      *(int*)(((char*)ptr)+scale*index[2]) = v[2];
      *(int*)(((char*)ptr)+scale*index[3]) = v[3];
#endif
    }

    template<int scale = 4>
    static __forceinline void scatter(const vboolf4& mask, void* ptr, const vint4& index, const vint4& v)
    {
#if defined(__AVX512VL__)
      _mm_mask_i32scatter_epi32((int*)ptr, mask, index, v, scale);
#else
      if (likely(mask[0])) *(int*)(((char*)ptr)+scale*index[0]) = v[0];
      if (likely(mask[1])) *(int*)(((char*)ptr)+scale*index[1]) = v[1];
      if (likely(mask[2])) *(int*)(((char*)ptr)+scale*index[2]) = v[2];
      if (likely(mask[3])) *(int*)(((char*)ptr)+scale*index[3]) = v[3];
#endif
    }

#if defined(__x86_64__)
    static __forceinline vint4 broadcast64(long long a) { return _mm_set1_epi64x(a); }
#endif

    ////////////////////////////////////////////////////////////////////////////////
    /// Array Access
    ////////////////////////////////////////////////////////////////////////////////

    __forceinline const int& operator [](size_t index) const { assert(index < 4); return i[index]; }
    __forceinline       int& operator [](size_t index)       { assert(index < 4); return i[index]; }

    friend __forceinline vint4 select(const vboolf4& m, const vint4& t, const vint4& f) {
#if defined(__AVX512VL__)
      return _mm_mask_blend_epi32(m, (__m128i)f, (__m128i)t);
#elif defined(__SSE4_1__)
      return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m));
#else
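      // Classic SSE2 bitwise blend: (m & t) | (~m & f). This relies on
      // comparison masks being all-ones or all-zeros per lane.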
      return _mm_or_si128(_mm_and_si128(m, t), _mm_andnot_si128(m, f));
#endif
    }
  };

  ////////////////////////////////////////////////////////////////////////////////
  /// Unary Operators
  ////////////////////////////////////////////////////////////////////////////////

#if defined(__AVX512VL__)
  __forceinline vboolf4 asBool(const vint4& a) { return _mm_movepi32_mask(a); }
#else
  __forceinline vboolf4 asBool(const vint4& a) { return _mm_castsi128_ps(a); }
#endif

  __forceinline vint4 operator +(const vint4& a) { return a; }
  __forceinline vint4 operator -(const vint4& a) { return _mm_sub_epi32(_mm_setzero_si128(), a); }
#if defined(__SSSE3__)
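  // _mm_abs_epi32 requires SSSE3; note abs(INT_MIN) wraps to INT_MIN.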
  __forceinline vint4 abs(const vint4& a) { return _mm_abs_epi32(a); }
#endif

  ////////////////////////////////////////////////////////////////////////////////
  /// Binary Operators
  ////////////////////////////////////////////////////////////////////////////////

  __forceinline vint4 operator +(const vint4& a, const vint4& b) { return _mm_add_epi32(a, b); }
  __forceinline vint4 operator +(const vint4& a, int          b) { return a + vint4(b); }
  __forceinline vint4 operator +(int          a, const vint4& b) { return vint4(a) + b; }

  __forceinline vint4 operator -(const vint4& a, const vint4& b) { return _mm_sub_epi32(a, b); }
  __forceinline vint4 operator -(const vint4& a, int          b) { return a - vint4(b); }
  __forceinline vint4 operator -(int          a, const vint4& b) { return vint4(a) - b; }

#if defined(__SSE4_1__)
  __forceinline vint4 operator *(const vint4& a, const vint4& b) { return _mm_mullo_epi32(a, b); }
#else
  __forceinline vint4 operator *(const vint4& a, const vint4& b) { return vint4(a[0]*b[0],a[1]*b[1],a[2]*b[2],a[3]*b[3]); }
#endif
  __forceinline vint4 operator *(const vint4& a, int          b) { return a * vint4(b); }
  __forceinline vint4 operator *(int          a, const vint4& b) { return vint4(a) * b; }

  __forceinline vint4 operator &(const vint4& a, const vint4& b) { return _mm_and_si128(a, b); }
  __forceinline vint4 operator &(const vint4& a, int          b) { return a & vint4(b); }
  __forceinline vint4 operator &(int          a, const vint4& b) { return vint4(a) & b; }

  __forceinline vint4 operator |(const vint4& a, const vint4& b) { return _mm_or_si128(a, b); }
  __forceinline vint4 operator |(const vint4& a, int          b) { return a | vint4(b); }
  __forceinline vint4 operator |(int          a, const vint4& b) { return vint4(a) | b; }

  __forceinline vint4 operator ^(const vint4& a, const vint4& b) { return _mm_xor_si128(a, b); }
  __forceinline vint4 operator ^(const vint4& a, int          b) { return a ^ vint4(b); }
  __forceinline vint4 operator ^(int          a, const vint4& b) { return vint4(a) ^ b; }

  __forceinline vint4 operator <<(const vint4& a, int n) { return _mm_slli_epi32(a, n); }
  __forceinline vint4 operator >>(const vint4& a, int n) { return _mm_srai_epi32(a, n); }

  __forceinline vint4 sll (const vint4& a, int b) { return _mm_slli_epi32(a, b); }
  __forceinline vint4 sra (const vint4& a, int b) { return _mm_srai_epi32(a, b); }
  __forceinline vint4 srl (const vint4& a, int b) { return _mm_srli_epi32(a, b); }

  ////////////////////////////////////////////////////////////////////////////////
  /// Assignment Operators
  ////////////////////////////////////////////////////////////////////////////////

  __forceinline vint4& operator +=(vint4& a, const vint4& b) { return a = a + b; }
  __forceinline vint4& operator +=(vint4& a, int          b) { return a = a + b; }

  __forceinline vint4& operator -=(vint4& a, const vint4& b) { return a = a - b; }
  __forceinline vint4& operator -=(vint4& a, int          b) { return a = a - b; }

#if defined(__SSE4_1__)
  __forceinline vint4& operator *=(vint4& a, const vint4& b) { return a = a * b; }
  __forceinline vint4& operator *=(vint4& a, int          b) { return a = a * b; }
#endif

  __forceinline vint4& operator &=(vint4& a, const vint4& b) { return a = a & b; }
  __forceinline vint4& operator &=(vint4& a, int          b) { return a = a & b; }

  __forceinline vint4& operator |=(vint4& a, const vint4& b) { return a = a | b; }
  __forceinline vint4& operator |=(vint4& a, int          b) { return a = a | b; }

  __forceinline vint4& operator <<=(vint4& a, int b) { return a = a << b; }
  __forceinline vint4& operator >>=(vint4& a, int b) { return a = a >> b; }

  ////////////////////////////////////////////////////////////////////////////////
  /// Comparison Operators + Select
  ////////////////////////////////////////////////////////////////////////////////

#if defined(__AVX512VL__)
  __forceinline vboolf4 operator ==(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); }
  __forceinline vboolf4 operator !=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_NE); }
  __forceinline vboolf4 operator < (const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_LT); }
  __forceinline vboolf4 operator >=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_GE); }
  __forceinline vboolf4 operator > (const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_GT); }
  __forceinline vboolf4 operator <=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_LE); }
#else
  __forceinline vboolf4 operator ==(const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); }
  __forceinline vboolf4 operator !=(const vint4& a, const vint4& b) { return !(a == b); }
  __forceinline vboolf4 operator < (const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmplt_epi32(a, b)); }
  __forceinline vboolf4 operator >=(const vint4& a, const vint4& b) { return !(a <  b); }
  __forceinline vboolf4 operator > (const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmpgt_epi32(a, b)); }
  __forceinline vboolf4 operator <=(const vint4& a, const vint4& b) { return !(a >  b); }
#endif

  __forceinline vboolf4 operator ==(const vint4& a, int          b) { return a == vint4(b); }
  __forceinline vboolf4 operator ==(int          a, const vint4& b) { return vint4(a) == b; }

  __forceinline vboolf4 operator !=(const vint4& a, int          b) { return a != vint4(b); }
  __forceinline vboolf4 operator !=(int          a, const vint4& b) { return vint4(a) != b; }

  __forceinline vboolf4 operator < (const vint4& a, int          b) { return a <  vint4(b); }
  __forceinline vboolf4 operator < (int          a, const vint4& b) { return vint4(a) <  b; }

  __forceinline vboolf4 operator >=(const vint4& a, int          b) { return a >= vint4(b); }
  __forceinline vboolf4 operator >=(int          a, const vint4& b) { return vint4(a) >= b; }

  __forceinline vboolf4 operator > (const vint4& a, int          b) { return a >  vint4(b); }
  __forceinline vboolf4 operator > (int          a, const vint4& b) { return vint4(a) >  b; }

  __forceinline vboolf4 operator <=(const vint4& a, int          b) { return a <= vint4(b); }
  __forceinline vboolf4 operator <=(int          a, const vint4& b) { return vint4(a) <= b; }

  __forceinline vboolf4 eq(const vint4& a, const vint4& b) { return a == b; }
  __forceinline vboolf4 ne(const vint4& a, const vint4& b) { return a != b; }
  __forceinline vboolf4 lt(const vint4& a, const vint4& b) { return a <  b; }
  __forceinline vboolf4 ge(const vint4& a, const vint4& b) { return a >= b; }
  __forceinline vboolf4 gt(const vint4& a, const vint4& b) { return a >  b; }
  __forceinline vboolf4 le(const vint4& a, const vint4& b) { return a <= b; }

#if defined(__AVX512VL__)
  __forceinline vboolf4 eq(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_EQ); }
  __forceinline vboolf4 ne(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_NE); }
  __forceinline vboolf4 lt(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LT); }
  __forceinline vboolf4 ge(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GE); }
  __forceinline vboolf4 gt(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GT); }
  __forceinline vboolf4 le(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LE); }
#else
  __forceinline vboolf4 eq(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a == b); }
  __forceinline vboolf4 ne(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a != b); }
  __forceinline vboolf4 lt(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a <  b); }
  __forceinline vboolf4 ge(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a >= b); }
  __forceinline vboolf4 gt(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a >  b); }
  __forceinline vboolf4 le(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a <= b); }
#endif

  template<int mask>
  __forceinline vint4 select(const vint4& t, const vint4& f) {
#if defined(__SSE4_1__)
    return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), mask));
#else
    return select(vboolf4(mask), t, f);
#endif
  }

#if defined(__SSE4_1__)
  __forceinline vint4 min(const vint4& a, const vint4& b) { return _mm_min_epi32(a, b); }
  __forceinline vint4 max(const vint4& a, const vint4& b) { return _mm_max_epi32(a, b); }

  __forceinline vint4 umin(const vint4& a, const vint4& b) { return _mm_min_epu32(a, b); }
  __forceinline vint4 umax(const vint4& a, const vint4& b) { return _mm_max_epu32(a, b); }

#else
  __forceinline vint4 min(const vint4& a, const vint4& b) { return select(a < b,a,b); }
  __forceinline vint4 max(const vint4& a, const vint4& b) { return select(a < b,b,a); }
#endif

  __forceinline vint4 min(const vint4& a, int          b) { return min(a,vint4(b)); }
  __forceinline vint4 min(int          a, const vint4& b) { return min(vint4(a),b); }
  __forceinline vint4 max(const vint4& a, int          b) { return max(a,vint4(b)); }
  __forceinline vint4 max(int          a, const vint4& b) { return max(vint4(a),b); }

  ////////////////////////////////////////////////////////////////////////////////
  // Movement/Shifting/Shuffling Functions
  ////////////////////////////////////////////////////////////////////////////////

  __forceinline vint4 unpacklo(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); }
  __forceinline vint4 unpackhi(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); }

  template<int i0, int i1, int i2, int i3>
  __forceinline vint4 shuffle(const vint4& v) {
    return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0));
  }

  template<int i0, int i1, int i2, int i3>
  __forceinline vint4 shuffle(const vint4& a, const vint4& b) {
    return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
  }

#if defined(__SSE3__)
  template<> __forceinline vint4 shuffle<0, 0, 2, 2>(const vint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v))); }
  template<> __forceinline vint4 shuffle<1, 1, 3, 3>(const vint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v))); }
  template<> __forceinline vint4 shuffle<0, 1, 0, 1>(const vint4& v) { return _mm_castpd_si128(_mm_movedup_pd (_mm_castsi128_pd(v))); }
#endif

  template<int i>
  __forceinline vint4 shuffle(const vint4& v) {
    return shuffle<i,i,i,i>(v);
  }

#if defined(__SSE4_1__)
  template<int src> __forceinline int extract(const vint4& b) { return _mm_extract_epi32(b, src); }
  template<int dst> __forceinline vint4 insert(const vint4& a, const int b) { return _mm_insert_epi32(a, b, dst); }
#else
  template<int src> __forceinline int extract(const vint4& b) { return b[src&3]; }
  template<int dst> __forceinline vint4 insert(const vint4& a, int b) { vint4 c = a; c[dst&3] = b; return c; }
#endif

  template<> __forceinline int extract<0>(const vint4& b) { return _mm_cvtsi128_si32(b); }

  __forceinline int toScalar(const vint4& v) { return _mm_cvtsi128_si32(v); }

  __forceinline size_t toSizeT(const vint4& v) {
#if defined(__WIN32__) && !defined(__X86_64__) // win32 workaround
    return toScalar(v);
#else
    return _mm_cvtsi128_si64(v);
#endif
  }

#if defined(__AVX512VL__)

  __forceinline vint4 permute(const vint4 &a, const vint4 &index) {
    return _mm_castps_si128(_mm_permutevar_ps(_mm_castsi128_ps(a),index));
  }

  template<int i>
  __forceinline vint4 align_shift_right(const vint4& a, const vint4& b) {
    return _mm_alignr_epi32(a, b, i);
  }
#endif

  ////////////////////////////////////////////////////////////////////////////////
  /// Reductions
  ////////////////////////////////////////////////////////////////////////////////

#if defined(__SSE4_1__)
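  // Each reduction combines lanes pairwise via two shuffles, leaving the
  // result broadcast across all four lanes; select_min/select_max then use
  // movemask+bsf to return the index of the first lane attaining it.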
  __forceinline vint4 vreduce_min(const vint4& v) { vint4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); }
  __forceinline vint4 vreduce_max(const vint4& v) { vint4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); }
  __forceinline vint4 vreduce_add(const vint4& v) { vint4 h = shuffle<1,0,3,2>(v)   + v ; return shuffle<2,3,0,1>(h)   + h ; }

  __forceinline int reduce_min(const vint4& v) { return toScalar(vreduce_min(v)); }
  __forceinline int reduce_max(const vint4& v) { return toScalar(vreduce_max(v)); }
  __forceinline int reduce_add(const vint4& v) { return toScalar(vreduce_add(v)); }

  __forceinline size_t select_min(const vint4& v) { return bsf(movemask(v == vreduce_min(v))); }
  __forceinline size_t select_max(const vint4& v) { return bsf(movemask(v == vreduce_max(v))); }

  __forceinline size_t select_min(const vboolf4& valid, const vint4& v) { const vint4 a = select(valid,v,vint4(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); }
  __forceinline size_t select_max(const vboolf4& valid, const vint4& v) { const vint4 a = select(valid,v,vint4(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); }

#else

  __forceinline int reduce_min(const vint4& v) { return min(v[0],v[1],v[2],v[3]); }
  __forceinline int reduce_max(const vint4& v) { return max(v[0],v[1],v[2],v[3]); }
  __forceinline int reduce_add(const vint4& v) { return v[0]+v[1]+v[2]+v[3]; }

#endif

  ////////////////////////////////////////////////////////////////////////////////
  /// Sorting networks
  ////////////////////////////////////////////////////////////////////////////////

#if defined(__SSE4_1__)

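  // 4-wide sorting network in three compare-exchange stages: each stage
  // pairs lanes with a shuffle, takes the per-lane unsigned min/max, and
  // merges them with a blend mask so every pair ends up in the requested
  // order.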
  __forceinline vint4 usort_ascending(const vint4& v)
  {
    const vint4 a0 = v;
    const vint4 b0 = shuffle<1,0,3,2>(a0);
    const vint4 c0 = umin(a0,b0);
    const vint4 d0 = umax(a0,b0);
    const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0);
    const vint4 b1 = shuffle<2,3,0,1>(a1);
    const vint4 c1 = umin(a1,b1);
    const vint4 d1 = umax(a1,b1);
    const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1);
    const vint4 b2 = shuffle<0,2,1,3>(a2);
    const vint4 c2 = umin(a2,b2);
    const vint4 d2 = umax(a2,b2);
    const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2);
    return a3;
  }

  __forceinline vint4 usort_descending(const vint4& v)
  {
    const vint4 a0 = v;
    const vint4 b0 = shuffle<1,0,3,2>(a0);
    const vint4 c0 = umax(a0,b0);
    const vint4 d0 = umin(a0,b0);
    const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0);
    const vint4 b1 = shuffle<2,3,0,1>(a1);
    const vint4 c1 = umax(a1,b1);
    const vint4 d1 = umin(a1,b1);
    const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1);
    const vint4 b2 = shuffle<0,2,1,3>(a2);
    const vint4 c2 = umax(a2,b2);
    const vint4 d2 = umin(a2,b2);
    const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2);
    return a3;
  }

#else

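  // Pre-SSE4.1 fallback: there are no unsigned 32-bit min/max instructions,
  // so the inputs are biased by 0x80000000 (self-inverse mod 2^32), which
  // maps unsigned order onto signed order; the bias is removed again after
  // the signed network has run.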
  __forceinline vint4 usort_ascending(const vint4& v)
  {
    const vint4 a0 = v-vint4(0x80000000);
    const vint4 b0 = shuffle<1,0,3,2>(a0);
    const vint4 c0 = min(a0,b0);
    const vint4 d0 = max(a0,b0);
    const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0);
    const vint4 b1 = shuffle<2,3,0,1>(a1);
    const vint4 c1 = min(a1,b1);
    const vint4 d1 = max(a1,b1);
    const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1);
    const vint4 b2 = shuffle<0,2,1,3>(a2);
    const vint4 c2 = min(a2,b2);
    const vint4 d2 = max(a2,b2);
    const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2);
    return a3+vint4(0x80000000);
  }

  __forceinline vint4 usort_descending(const vint4& v)
  {
    const vint4 a0 = v-vint4(0x80000000);
    const vint4 b0 = shuffle<1,0,3,2>(a0);
    const vint4 c0 = max(a0,b0);
    const vint4 d0 = min(a0,b0);
    const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0);
    const vint4 b1 = shuffle<2,3,0,1>(a1);
    const vint4 c1 = max(a1,b1);
    const vint4 d1 = min(a1,b1);
    const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1);
    const vint4 b2 = shuffle<0,2,1,3>(a2);
    const vint4 c2 = max(a2,b2);
    const vint4 d2 = min(a2,b2);
    const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2);
    return a3+vint4(0x80000000);
  }

#endif

  ////////////////////////////////////////////////////////////////////////////////
  /// Output Operators
  ////////////////////////////////////////////////////////////////////////////////

  __forceinline embree_ostream operator <<(embree_ostream cout, const vint4& a) {
    return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">";
  }
}

#undef vboolf
#undef vboold
#undef vint
#undef vuint
#undef vllong
#undef vfloat
#undef vdouble
