vint4_sse2.h source code [qtquick3d/src/3rdparty/embree/common/simd/vint4_sse2.h]

1	// Copyright 2009-2021 Intel Corporation
2	// SPDX-License-Identifier: Apache-2.0
3
4	#pragma once
5
6	#include "../math/math.h"
7
// Within this header the SIMD template names are spelled through their
// *_impl aliases; the matching #undef lines at the end of the file remove
// these macros again so they do not leak into including code.
#define vboolf vboolf_impl
#define vboold vboold_impl
#define vint vint_impl
#define vuint vuint_impl
#define vllong vllong_impl
#define vfloat vfloat_impl
#define vdouble vdouble_impl
15
16	namespace embree
17	{
18	/ 4-wide SSE integer type /
19	template<>
20	struct vint<`4`>
21	{
22	ALIGNED_STRUCT_(`16`);
23
24	typedef vboolf4 Bool;
25	typedef vint4 Int;
26	typedef vfloat4 Float;
27
28	enum { size = `4` }; // number of SIMD elements
29	union { __m128i v; int i[`4`]; }; // data
30
31	////////////////////////////////////////////////////////////////////////////////
32	/// Constructors, Assignment & Cast Operators
33	////////////////////////////////////////////////////////////////////////////////
34
35	__forceinline vint() {}
36	__forceinline vint(const vint4& a) { v = a.v; }
37	__forceinline vint4& operator =(const vint4& a) { v = a.v; return *this; }
38
39	__forceinline vint(__m128i a) : v(a) {}
40	__forceinline operator const __m128i&() const { return v; }
41	__forceinline operator __m128i&() { return v; }
42
43	__forceinline vint(int a) : v(_mm_set1_epi32(i: a)) {}
44	__forceinline vint(int a, int b, int c, int d) : v(_mm_set_epi32(i3: d, i2: c, i1: b, i0: a)) {}
45
46	__forceinline explicit vint(__m128 a) : v(_mm_cvtps_epi32(a: a)) {}
47	#if defined(__AVX512VL__)
48	__forceinline explicit vint(const vboolf4& a) : v(_mm_movm_epi32(a)) {}
49	#else
50	__forceinline explicit vint(const vboolf4& a) : v(_mm_castps_si128(a: (__m128)a)) {}
51	#endif
52
53	__forceinline vint(long long a, long long b) : v(_mm_set_epi64x(q1: b,q0: a)) {}
54
55	////////////////////////////////////////////////////////////////////////////////
56	/// Constants
57	////////////////////////////////////////////////////////////////////////////////
58
59	__forceinline vint(ZeroTy) : v(_mm_setzero_si128()) {}
60	__forceinline vint(OneTy) : v(_mm_set_epi32(i3: `1`, i2: `1`, i1: `1`, i0: `1`)) {}
61	__forceinline vint(PosInfTy) : v(_mm_set_epi32(i3: pos_inf, i2: pos_inf, i1: pos_inf, i0: pos_inf)) {}
62	__forceinline vint(NegInfTy) : v(_mm_set_epi32(i3: neg_inf, i2: neg_inf, i1: neg_inf, i0: neg_inf)) {}
63	__forceinline vint(StepTy) : v(_mm_set_epi32(i3: `3`, i2: `2`, i1: `1`, i0: `0`)) {}
64	__forceinline vint(ReverseStepTy) : v(_mm_set_epi32(i3: `0`, i2: `1`, i1: `2`, i0: `3`)) {}
65
66	__forceinline vint(TrueTy) { v = _mm_cmpeq_epi32(a: v,b: v); }
67	__forceinline vint(UndefinedTy) : v(_mm_castps_si128(a: _mm_undefined_ps())) {}
68
69
70	////////////////////////////////////////////////////////////////////////////////
71	/// Loads and Stores
72	////////////////////////////////////////////////////////////////////////////////
73
74	static __forceinline vint4 load (const void* a) { return _mm_load_si128(p: (__m128i*)a); }
75	static __forceinline vint4 loadu(const void* a) { return _mm_loadu_si128(p: (__m128i*)a); }
76
77	static __forceinline void store (void* ptr, const vint4& v) { _mm_store_si128(p: (__m128i*)ptr,b: v); }
78	static __forceinline void storeu(void* ptr, const vint4& v) { _mm_storeu_si128(p: (__m128i*)ptr,b: v); }
79
80	#if defined(__AVX512VL__)
81
82	static __forceinline vint4 compact(const vboolf4& mask, vint4 &v) {
83	return _mm_mask_compress_epi32(v, mask, v);
84	}
85	static __forceinline vint4 compact(const vboolf4& mask, vint4 &a, const vint4& b) {
86	return _mm_mask_compress_epi32(a, mask, b);
87	}
88
89	static __forceinline vint4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_epi32 (_mm_setzero_si128(),mask,ptr); }
90	static __forceinline vint4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_epi32(_mm_setzero_si128(),mask,ptr); }
91
92	static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& v) { _mm_mask_store_epi32 (ptr,mask,v); }
93	static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& v) { _mm_mask_storeu_epi32(ptr,mask,v); }
94	#elif defined(__AVX__)
95	static __forceinline vint4 load (const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); }
96	static __forceinline vint4 loadu(const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); }
97
98	static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); }
99	static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); }
100	#else
101	static __forceinline vint4 load (const vbool4& mask, const void* a) { return _mm_and_si128(a: _mm_load_si128 (p: (__m128i*)a),b: mask); }
102	static __forceinline vint4 loadu(const vbool4& mask, const void* a) { return _mm_and_si128(a: _mm_loadu_si128(p: (__m128i*)a),b: mask); }
103
104	static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& i) { store (ptr,v: select(m: mask,t: i,f: load (a: ptr))); }
105	static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& i) { storeu(ptr,v: select(m: mask,t: i,f: loadu(a: ptr))); }
106	#endif
107
108
109	#if defined(__SSE4_1__)
110	static __forceinline vint4 load(const unsigned char* ptr) {
111	return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr));
112	}
113
114	static __forceinline vint4 loadu(const unsigned char* ptr) {
115	return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr));
116	}
117	#else
118
119	static __forceinline vint4 load(const unsigned char* ptr) {
120	return vint4 (ptr[`0`],ptr[`1`],ptr[`2`],ptr[`3`]);
121	}
122
123	static __forceinline vint4 loadu(const unsigned char* ptr) {
124	return vint4 (ptr[`0`],ptr[`1`],ptr[`2`],ptr[`3`]);
125	}
126
127	#endif
128
129	static __forceinline vint4 load(const unsigned short* ptr) {
130	#if defined (__SSE4_1__)
131	return _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr));
132	#else
133	return vint4 (ptr[`0`],ptr[`1`],ptr[`2`],ptr[`3`]);
134	#endif
135	}
136
137	static __forceinline void store(unsigned char* ptr, const vint4& v) {
138	#if defined(__SSE4_1__)
139	__m128i x = v;
140	x = _mm_packus_epi32(x, x);
141	x = _mm_packus_epi16(x, x);
142	(int**)ptr = _mm_cvtsi128_si32(x);
143	#else
144	for (size_t i=`0`;i<`4`;i++)
145	ptr[i] = (unsigned char)v [i];
146	#endif
147	}
148
149	static __forceinline void store(unsigned short* ptr, const vint4& v) {
150	for (size_t i=`0`;i<`4`;i++)
151	ptr[i] = (unsigned short)v [i];
152	}
153
154	static __forceinline vint4 load_nt(void* ptr) {
155	#if defined(__SSE4_1__)
156	return _mm_stream_load_si128((__m128i*)ptr);
157	#else
158	return _mm_load_si128(p: (__m128i*)ptr);
159	#endif
160	}
161
162	static __forceinline void store_nt(void* ptr, const vint4& v) {
163	#if defined(__SSE4_1__)
164	_mm_stream_ps((float*)ptr, _mm_castsi128_ps(v));
165	#else
166	_mm_store_si128(p: (__m128i*)ptr,b: v);
167	#endif
168	}
169
170	template<int scale = `4`>
171	static __forceinline vint4 gather(const int* ptr, const vint4& index) {
172	#if defined(__AVX2__)
173	return _mm_i32gather_epi32(ptr, index, scale);
174	#else
175	return vint4 (
176	(int*)(((char*)ptr)+scaleindex [`0`]),
177	(int*)(((char*)ptr)+scaleindex [`1`]),
178	(int*)(((char*)ptr)+scaleindex [`2`]),
179	(int*)(((char*)ptr)+scaleindex [`3`]));
180	#endif
181	}
182
183	template<int scale = `4`>
184	static __forceinline vint4 gather(const vboolf4& mask, const int* ptr, const vint4& index) {
185	vint4 r = zero;
186	#if defined(__AVX512VL__)
187	return _mm_mmask_i32gather_epi32(r, mask, index, ptr, scale);
188	#elif defined(__AVX2__)
189	return _mm_mask_i32gather_epi32(r, ptr, index, mask, scale);
190	#else
191	if (likely(mask[`0`])) r [`0`] = (int*)(((char*)ptr)+scaleindex [`0`]);
192	if (likely(mask[`1`])) r [`1`] = (int*)(((char*)ptr)+scaleindex [`1`]);
193	if (likely(mask[`2`])) r [`2`] = (int*)(((char*)ptr)+scaleindex [`2`]);
194	if (likely(mask[`3`])) r [`3`] = (int*)(((char*)ptr)+scaleindex [`3`]);
195	return r;
196	#endif
197	}
198
199	template<int scale = `4`>
200	static __forceinline void scatter(void* ptr, const vint4& index, const vint4& v)
201	{
202	#if defined(__AVX512VL__)
203	_mm_i32scatter_epi32((int*)ptr, index, v, scale);
204	#else
205	(int*)(((char*)ptr)+scaleindex [`0`]) = v [`0`];
206	(int*)(((char*)ptr)+scaleindex [`1`]) = v [`1`];
207	(int*)(((char*)ptr)+scaleindex [`2`]) = v [`2`];
208	(int*)(((char*)ptr)+scaleindex [`3`]) = v [`3`];
209	#endif
210	}
211
212	template<int scale = `4`>
213	static __forceinline void scatter(const vboolf4& mask, void* ptr, const vint4& index, const vint4& v)
214	{
215	#if defined(__AVX512VL__)
216	_mm_mask_i32scatter_epi32((int*)ptr, mask, index, v, scale);
217	#else
218	if (likely(mask[`0`])) (int*)(((char*)ptr)+scaleindex [`0`]) = v [`0`];
219	if (likely(mask[`1`])) (int*)(((char*)ptr)+scaleindex [`1`]) = v [`1`];
220	if (likely(mask[`2`])) (int*)(((char*)ptr)+scaleindex [`2`]) = v [`2`];
221	if (likely(mask[`3`])) (int*)(((char*)ptr)+scaleindex [`3`]) = v [`3`];
222	#endif
223	}
224
225	#if defined(__x86_64__)
226	static __forceinline vint4 broadcast64(long long a) { return _mm_set1_epi64x(q: a); }
227	#endif
228
229	////////////////////////////////////////////////////////////////////////////////
230	/// Array Access
231	////////////////////////////////////////////////////////////////////////////////
232
233	__forceinline const int& operator [](size_t index) const { assert(index < `4`); return i[index]; }
234	__forceinline int& operator [](size_t index) { assert(index < `4`); return i[index]; }
235
236	friend __forceinline vint4 select(const vboolf4& m, const vint4& t, const vint4& f) {
237	#if defined(__AVX512VL__)
238	return _mm_mask_blend_epi32(m, (__m128i)f, (__m128i)t);
239	#elif defined(__SSE4_1__)
240	return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m));
241	#else
242	return _mm_or_si128(a: _mm_and_si128(a: m, b: t), b: _mm_andnot_si128(a: m, b: f));
243	#endif
244	}
245	};
246
247	////////////////////////////////////////////////////////////////////////////////
248	/// Unary Operators
249	////////////////////////////////////////////////////////////////////////////////
250
251	#if defined(__AVX512VL__)
252	__forceinline vboolf4 asBool(const vint4& a) { return _mm_movepi32_mask(a); }
253	#else
254	__forceinline vboolf4 asBool(const vint4& a) { return _mm_castsi128_ps(a: a); }
255	#endif
256
257	__forceinline vint4 operator +(const vint4& a) { return a; }
258	__forceinline vint4 operator -(const vint4& a) { return _mm_sub_epi32(a: _mm_setzero_si128(), b: a); }
259	#if defined(__SSSE3__)
260	__forceinline vint4 abs(const vint4& a) { return _mm_abs_epi32(a); }
261	#endif
262
263	////////////////////////////////////////////////////////////////////////////////
264	/// Binary Operators
265	////////////////////////////////////////////////////////////////////////////////
266
267	__forceinline vint4 operator +(const vint4& a, const vint4& b) { return _mm_add_epi32(a: a, b: b); }
268	__forceinline vint4 operator +(const vint4& a, int b) { return a + vint4 (b); }
269	__forceinline vint4 operator +(int a, const vint4& b) { return vint4 (a) + b; }
270
271	__forceinline vint4 operator -(const vint4& a, const vint4& b) { return _mm_sub_epi32(a: a, b: b); }
272	__forceinline vint4 operator -(const vint4& a, int b) { return a - vint4 (b); }
273	__forceinline vint4 operator -(int a, const vint4& b) { return vint4 (a) - b; }
274
275	#if defined(__SSE4_1__)
276	__forceinline vint4 operator (const* vint4& a, const vint4& b) { return _mm_mullo_epi32(a, b); }
277	#else
278	__forceinline vint4 operator (const* vint4& a, const vint4& b) { return vint4 (a [`0`]b [`0`],a [`1`]b [`1`],a [`2`]b [`2`],a [`3`]b [`3`]); }
279	#endif
280	__forceinline vint4 operator (const* vint4& a, int b) { return a * vint4 (b); }
281	__forceinline vint4 operator (int* a, const vint4& b) { return vint4 (a) * b; }
282
283	__forceinline vint4 operator &(const vint4& a, const vint4& b) { return _mm_and_si128(a: a, b: b); }
284	__forceinline vint4 operator &(const vint4& a, int b) { return a & vint4 (b); }
285	__forceinline vint4 operator &(int a, const vint4& b) { return vint4 (a) & b; }
286
287	__forceinline vint4 operator \|(const vint4& a, const vint4& b) { return _mm_or_si128(a: a, b: b); }
288	__forceinline vint4 operator \|(const vint4& a, int b) { return a \| vint4 (b); }
289	__forceinline vint4 operator \|(int a, const vint4& b) { return vint4 (a) \| b; }
290
291	__forceinline vint4 operator ^(const vint4& a, const vint4& b) { return _mm_xor_si128(a: a, b: b); }
292	__forceinline vint4 operator ^(const vint4& a, int b) { return a ^ vint4 (b); }
293	__forceinline vint4 operator ^(int a, const vint4& b) { return vint4 (a) ^ b; }
294
295	__forceinline vint4 operator <<(const vint4& a, int n) { return _mm_slli_epi32(a: a, count: n); }
296	__forceinline vint4 operator >>(const vint4& a, int n) { return _mm_srai_epi32(a: a, count: n); }
297
298	__forceinline vint4 sll (const vint4& a, int b) { return _mm_slli_epi32(a: a, count: b); }
299	__forceinline vint4 sra (const vint4& a, int b) { return _mm_srai_epi32(a: a, count: b); }
300	__forceinline vint4 srl (const vint4& a, int b) { return _mm_srli_epi32(a: a, count: b); }
301
302	////////////////////////////////////////////////////////////////////////////////
303	/// Assignment Operators
304	////////////////////////////////////////////////////////////////////////////////
305
306	__forceinline vint4& operator +=(vint4& a, const vint4& b) { return a = a + b; }
307	__forceinline vint4& operator +=(vint4& a, int b) { return a = a + b; }
308
309	__forceinline vint4& operator -=(vint4& a, const vint4& b) { return a = a - b; }
310	__forceinline vint4& operator -=(vint4& a, int b) { return a = a - b; }
311
312	#if defined(__SSE4_1__)
313	__forceinline vint4& operator =(vint4& a, const* vint4& b) { return a = a * b; }
314	__forceinline vint4& operator =(vint4& a, int* b) { return a = a * b; }
315	#endif
316
317	__forceinline vint4& operator &=(vint4& a, const vint4& b) { return a = a & b; }
318	__forceinline vint4& operator &=(vint4& a, int b) { return a = a & b; }
319
320	__forceinline vint4& operator \|=(vint4& a, const vint4& b) { return a = a \| b; }
321	__forceinline vint4& operator \|=(vint4& a, int b) { return a = a \| b; }
322
323	__forceinline vint4& operator <<=(vint4& a, int b) { return a = a << b; }
324	__forceinline vint4& operator >>=(vint4& a, int b) { return a = a >> b; }
325
326	////////////////////////////////////////////////////////////////////////////////
327	/// Comparison Operators + Select
328	////////////////////////////////////////////////////////////////////////////////
329
330	#if defined(__AVX512VL__)
331	__forceinline vboolf4 operator ==(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); }
332	__forceinline vboolf4 operator !=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_NE); }
333	__forceinline vboolf4 operator < (const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_LT); }
334	__forceinline vboolf4 operator >=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_GE); }
335	__forceinline vboolf4 operator > (const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_GT); }
336	__forceinline vboolf4 operator <=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_LE); }
337	#else
338	__forceinline vboolf4 operator ==(const vint4& a, const vint4& b) { return _mm_castsi128_ps(a: _mm_cmpeq_epi32(a: a, b: b)); }
339	__forceinline vboolf4 operator !=(const vint4& a, const vint4& b) { return !(a == b); }
340	__forceinline vboolf4 operator < (const vint4& a, const vint4& b) { return _mm_castsi128_ps(a: _mm_cmplt_epi32(a: a, b: b)); }
341	__forceinline vboolf4 operator >=(const vint4& a, const vint4& b) { return !(a < b); }
342	__forceinline vboolf4 operator > (const vint4& a, const vint4& b) { return _mm_castsi128_ps(a: _mm_cmpgt_epi32(a: a, b: b)); }
343	__forceinline vboolf4 operator <=(const vint4& a, const vint4& b) { return !(a > b); }
344	#endif
345
346	__forceinline vboolf4 operator ==(const vint4& a, int b) { return a == vint4 (b); }
347	__forceinline vboolf4 operator ==(int a, const vint4& b) { return vint4 (a) == b; }
348
349	__forceinline vboolf4 operator !=(const vint4& a, int b) { return a != vint4 (b); }
350	__forceinline vboolf4 operator !=(int a, const vint4& b) { return vint4 (a) != b; }
351
352	__forceinline vboolf4 operator < (const vint4& a, int b) { return a < vint4 (b); }
353	__forceinline vboolf4 operator < (int a, const vint4& b) { return vint4 (a) < b; }
354
355	__forceinline vboolf4 operator >=(const vint4& a, int b) { return a >= vint4 (b); }
356	__forceinline vboolf4 operator >=(int a, const vint4& b) { return vint4 (a) >= b; }
357
358	__forceinline vboolf4 operator > (const vint4& a, int b) { return a > vint4 (b); }
359	__forceinline vboolf4 operator > (int a, const vint4& b) { return vint4 (a) > b; }
360
361	__forceinline vboolf4 operator <=(const vint4& a, int b) { return a <= vint4 (b); }
362	__forceinline vboolf4 operator <=(int a, const vint4& b) { return vint4 (a) <= b; }
363
364	__forceinline vboolf4 eq(const vint4& a, const vint4& b) { return a == b; }
365	__forceinline vboolf4 ne(const vint4& a, const vint4& b) { return a != b; }
366	__forceinline vboolf4 lt(const vint4& a, const vint4& b) { return a < b; }
367	__forceinline vboolf4 ge(const vint4& a, const vint4& b) { return a >= b; }
368	__forceinline vboolf4 gt(const vint4& a, const vint4& b) { return a > b; }
369	__forceinline vboolf4 le(const vint4& a, const vint4& b) { return a <= b; }
370
371	#if defined(__AVX512VL__)
372	__forceinline vboolf4 eq(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_EQ); }
373	__forceinline vboolf4 ne(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_NE); }
374	__forceinline vboolf4 lt(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LT); }
375	__forceinline vboolf4 ge(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GE); }
376	__forceinline vboolf4 gt(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GT); }
377	__forceinline vboolf4 le(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LE); }
378	#else
379	__forceinline vboolf4 eq(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a == b); }
380	__forceinline vboolf4 ne(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a != b); }
381	__forceinline vboolf4 lt(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a < b); }
382	__forceinline vboolf4 ge(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a >= b); }
383	__forceinline vboolf4 gt(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a > b); }
384	__forceinline vboolf4 le(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a <= b); }
385	#endif
386
387	template<int mask>
388	__forceinline vint4 select(const vint4& t, const vint4& f) {
389	#if defined(__SSE4_1__)
390	return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), mask));
391	#else
392	return select(m: vboolf4(mask), t, f);
393	#endif
394	}
395
396	#if defined(__SSE4_1__)
397	__forceinline vint4 min(const vint4& a, const vint4& b) { return _mm_min_epi32(a, b); }
398	__forceinline vint4 max(const vint4& a, const vint4& b) { return _mm_max_epi32(a, b); }
399
400	__forceinline vint4 umin(const vint4& a, const vint4& b) { return _mm_min_epu32(a, b); }
401	__forceinline vint4 umax(const vint4& a, const vint4& b) { return _mm_max_epu32(a, b); }
402
403	#else
404	__forceinline vint4 min(const vint4& a, const vint4& b) { return select(m: a < b,t: a,f: b); }
405	__forceinline vint4 max(const vint4& a, const vint4& b) { return select(m: a < b,t: b,f: a); }
406	#endif
407
408	__forceinline vint4 min(const vint4& a, int b) { return min(a,b: vint4 (b)); }
409	__forceinline vint4 min(int a, const vint4& b) { return min(a: vint4 (a),b); }
410	__forceinline vint4 max(const vint4& a, int b) { return max(a,b: vint4 (b)); }
411	__forceinline vint4 max(int a, const vint4& b) { return max(a: vint4 (a),b); }
412
413	////////////////////////////////////////////////////////////////////////////////
414	// Movement/Shifting/Shuffling Functions
415	////////////////////////////////////////////////////////////////////////////////
416
417	__forceinline vint4 unpacklo(const vint4& a, const vint4& b) { return _mm_castps_si128(a: _mm_unpacklo_ps(a: _mm_castsi128_ps(a: a), b: _mm_castsi128_ps(a: b))); }
418	__forceinline vint4 unpackhi(const vint4& a, const vint4& b) { return _mm_castps_si128(a: _mm_unpackhi_ps(a: _mm_castsi128_ps(a: a), b: _mm_castsi128_ps(a: b))); }
419
420	template<int i0, int i1, int i2, int i3>
421	__forceinline vint4 shuffle(const vint4& v) {
422	return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0));
423	}
424
425	template<int i0, int i1, int i2, int i3>
426	__forceinline vint4 shuffle(const vint4& a, const vint4& b) {
427	return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
428	}
429
430	#if defined(__SSE3__)
431	template<> __forceinline vint4 shuffle<`0`, `0`, `2`, `2`>(const vint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v))); }
432	template<> __forceinline vint4 shuffle<`1`, `1`, `3`, `3`>(const vint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v))); }
433	template<> __forceinline vint4 shuffle<`0`, `1`, `0`, `1`>(const vint4& v) { return _mm_castpd_si128(_mm_movedup_pd (_mm_castsi128_pd(v))); }
434	#endif
435
436	template<int i>
437	__forceinline vint4 shuffle(const vint4& v) {
438	return shuffle<i,i,i,i>(v);
439	}
440
441	#if defined(__SSE4_1__)
442	template<int src> __forceinline int extract(const vint4& b) { return _mm_extract_epi32(b, src); }
443	template<int dst> __forceinline vint4 insert(const vint4& a, const int b) { return _mm_insert_epi32(a, b, dst); }
444	#else
445	template<int src> __forceinline int extract(const vint4& b) { return b [src&`3`]; }
446	template<int dst> __forceinline vint4 insert(const vint4& a, int b) { vint4 c = a; c [dst&`3`] = b; return c; }
447	#endif
448
449
450	template<> __forceinline int extract<`0`>(const vint4& b) { return _mm_cvtsi128_si32(a: b); }
451
452	__forceinline int toScalar(const vint4& v) { return _mm_cvtsi128_si32(a: v); }
453
454	__forceinline size_t toSizeT(const vint4& v) {
455	#if defined(__WIN32__) && !defined(__X86_64__) // win32 workaround
456	return toScalar(v);
457	#else
458	return _mm_cvtsi128_si64(a: v);
459	#endif
460	}
461
#if defined(__AVX512VL__)

// Run-time permutation of the lanes of a by `index`, implemented with the
// float permutevar intrinsic and bit casts.
__forceinline vint4 permute(const vint4 &a, const vint4 &index) {
  return _mm_castps_si128(_mm_permutevar_ps(_mm_castsi128_ps(a),index));
}

// Thin wrapper around _mm_alignr_epi32 with compile-time element shift i.
template<int i>
__forceinline vint4 align_shift_right(const vint4& a, const vint4& b) {
  return _mm_alignr_epi32(a, b, i);
}
#endif
473
474	////////////////////////////////////////////////////////////////////////////////
475	/// Reductions
476	////////////////////////////////////////////////////////////////////////////////
477
478	#if defined(__SSE4_1__)
479	__forceinline vint4 vreduce_min(const vint4& v) { vint4 h = min(shuffle<`1`,`0`,`3`,`2`>(v),v); return min(shuffle<`2`,`3`,`0`,`1`>(h),h); }
480	__forceinline vint4 vreduce_max(const vint4& v) { vint4 h = max(shuffle<`1`,`0`,`3`,`2`>(v),v); return max(shuffle<`2`,`3`,`0`,`1`>(h),h); }
481	__forceinline vint4 vreduce_add(const vint4& v) { vint4 h = shuffle<`1`,`0`,`3`,`2`>(v) + v ; return shuffle<`2`,`3`,`0`,`1`>(h) + h ; }
482
483	__forceinline int reduce_min(const vint4& v) { return toScalar(vreduce_min(v)); }
484	__forceinline int reduce_max(const vint4& v) { return toScalar(vreduce_max(v)); }
485	__forceinline int reduce_add(const vint4& v) { return toScalar(vreduce_add(v)); }
486
487	__forceinline size_t select_min(const vint4& v) { return bsf(movemask(v == vreduce_min(v))); }
488	__forceinline size_t select_max(const vint4& v) { return bsf(movemask(v == vreduce_max(v))); }
489
490	__forceinline size_t select_min(const vboolf4& valid, const vint4& v) { const vint4 a = select(valid,v,vint4(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); }
491	__forceinline size_t select_max(const vboolf4& valid, const vint4& v) { const vint4 a = select(valid,v,vint4(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); }
492
493	#else
494
495	__forceinline int reduce_min(const vint4& v) { return min(a: v [`0`],b: v [`1`],c: v [`2`],d: v [`3`]); }
496	__forceinline int reduce_max(const vint4& v) { return max(a: v [`0`],b: v [`1`],c: v [`2`],d: v [`3`]); }
497	__forceinline int reduce_add(const vint4& v) { return v [`0`]+v [`1`]+v [`2`]+v [`3`]; }
498
499	#endif
500
501	////////////////////////////////////////////////////////////////////////////////
502	/// Sorting networks
503	////////////////////////////////////////////////////////////////////////////////
504
505	#if defined(__SSE4_1__)
506
507	__forceinline vint4 usort_ascending(const vint4& v)
508	{
509	const vint4 a0 = v;
510	const vint4 b0 = shuffle<`1`,`0`,`3`,`2`>(a0);
511	const vint4 c0 = umin(a0,b0);
512	const vint4 d0 = umax(a0,b0);
513	const vint4 a1 = select<`0x5` / 0b0101 />(c0,d0);
514	const vint4 b1 = shuffle<`2`,`3`,`0`,`1`>(a1);
515	const vint4 c1 = umin(a1,b1);
516	const vint4 d1 = umax(a1,b1);
517	const vint4 a2 = select<`0x3` / 0b0011 />(c1,d1);
518	const vint4 b2 = shuffle<`0`,`2`,`1`,`3`>(a2);
519	const vint4 c2 = umin(a2,b2);
520	const vint4 d2 = umax(a2,b2);
521	const vint4 a3 = select<`0x2` / 0b0010 />(c2,d2);
522	return a3;
523	}
524
525	__forceinline vint4 usort_descending(const vint4& v)
526	{
527	const vint4 a0 = v;
528	const vint4 b0 = shuffle<`1`,`0`,`3`,`2`>(a0);
529	const vint4 c0 = umax(a0,b0);
530	const vint4 d0 = umin(a0,b0);
531	const vint4 a1 = select<`0x5` / 0b0101 />(c0,d0);
532	const vint4 b1 = shuffle<`2`,`3`,`0`,`1`>(a1);
533	const vint4 c1 = umax(a1,b1);
534	const vint4 d1 = umin(a1,b1);
535	const vint4 a2 = select<`0x3` / 0b0011 />(c1,d1);
536	const vint4 b2 = shuffle<`0`,`2`,`1`,`3`>(a2);
537	const vint4 c2 = umax(a2,b2);
538	const vint4 d2 = umin(a2,b2);
539	const vint4 a3 = select<`0x2` / 0b0010 />(c2,d2);
540	return a3;
541	}
542
543	#else
544
545	__forceinline vint4 usort_ascending(const vint4& v)
546	{
547	const vint4 a0 = v -vint4 (`0x80000000`);
548	const vint4 b0 = shuffle<`1`,`0`,`3`,`2`>(v: a0);
549	const vint4 c0 = min(a: a0,b: b0);
550	const vint4 d0 = max(a: a0,b: b0);
551	const vint4 a1 = select<`0x5` / 0b0101 />(t: c0,f: d0);
552	const vint4 b1 = shuffle<`2`,`3`,`0`,`1`>(v: a1);
553	const vint4 c1 = min(a: a1,b: b1);
554	const vint4 d1 = max(a: a1,b: b1);
555	const vint4 a2 = select<`0x3` / 0b0011 />(t: c1,f: d1);
556	const vint4 b2 = shuffle<`0`,`2`,`1`,`3`>(v: a2);
557	const vint4 c2 = min(a: a2,b: b2);
558	const vint4 d2 = max(a: a2,b: b2);
559	const vint4 a3 = select<`0x2` / 0b0010 />(t: c2,f: d2);
560	return a3 +vint4 (`0x80000000`);
561	}
562
563	__forceinline vint4 usort_descending(const vint4& v)
564	{
565	const vint4 a0 = v -vint4 (`0x80000000`);
566	const vint4 b0 = shuffle<`1`,`0`,`3`,`2`>(v: a0);
567	const vint4 c0 = max(a: a0,b: b0);
568	const vint4 d0 = min(a: a0,b: b0);
569	const vint4 a1 = select<`0x5` / 0b0101 />(t: c0,f: d0);
570	const vint4 b1 = shuffle<`2`,`3`,`0`,`1`>(v: a1);
571	const vint4 c1 = max(a: a1,b: b1);
572	const vint4 d1 = min(a: a1,b: b1);
573	const vint4 a2 = select<`0x3` / 0b0011 />(t: c1,f: d1);
574	const vint4 b2 = shuffle<`0`,`2`,`1`,`3`>(v: a2);
575	const vint4 c2 = max(a: a2,b: b2);
576	const vint4 d2 = min(a: a2,b: b2);
577	const vint4 a3 = select<`0x2` / 0b0010 />(t: c2,f: d2);
578	return a3 +vint4 (`0x80000000`);
579	}
580
581	#endif
582
583	////////////////////////////////////////////////////////////////////////////////
584	/// Output Operators
585	////////////////////////////////////////////////////////////////////////////////
586
587	__forceinline embree_ostream operator <<(embree_ostream cout, const vint4& a) {
588	return cout << "<" << a [`0`] << ", " << a [`1`] << ", " << a [`2`] << ", " << a [`3`] << ">";
589	}
590	}
591
// Remove the *_impl name aliases defined at the top of this header.
#undef vboolf
#undef vboold
#undef vint
#undef vuint
#undef vllong
#undef vfloat
#undef vdouble
599

source code of qtquick3d/src/3rdparty/embree/common/simd/vint4_sse2.h