//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//  * Neither the name of NVIDIA CORPORATION nor the names of its
//    contributors may be used to endorse or promote products derived
//    from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#ifndef PSFOUNDATION_PSUNIXSSE2INLINEAOS_H
#define PSFOUNDATION_PSUNIXSSE2INLINEAOS_H

#if !COMPILE_VECTOR_INTRINSICS
#error Vector intrinsics should not be included when using scalar implementation.
#endif

#ifdef __SSE4_2__
#include "smmintrin.h"
#endif

#include "../../PsVecMathSSE.h"

namespace physx
{
namespace shdfnd
{
namespace aos
{

// Bit patterns for the IEEE-754 floating-point classes used by the finiteness tests below.
#define PX_FPCLASS_SNAN 0x0001 /* signaling NaN */
#define PX_FPCLASS_QNAN 0x0002 /* quiet NaN */
#define PX_FPCLASS_NINF 0x0004 /* negative infinity */
#define PX_FPCLASS_PINF 0x0200 /* positive infinity */

55PX_FORCE_INLINE __m128 m128_I2F(__m128i n)
56{
57 return _mm_castsi128_ps(a: n);
58}
59PX_FORCE_INLINE __m128i m128_F2I(__m128 n)
60{
61 return _mm_castps_si128(a: n);
62}
63
64//////////////////////////////////////////////////////////////////////
65//Test that Vec3V and FloatV are legal
66//////////////////////////////////////////////////////////////////////
67
68#define FLOAT_COMPONENTS_EQUAL_THRESHOLD 0.01f
69PX_FORCE_INLINE static bool isValidFloatV(const FloatV a)
70{
71 const PxF32 x = V4ReadX(v: a);
72 const PxF32 y = V4ReadY(v: a);
73 const PxF32 z = V4ReadZ(v: a);
74 const PxF32 w = V4ReadW(v: a);
75
76 if (
77 (PxAbs(a: x - y) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) &&
78 (PxAbs(a: x - z) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) &&
79 (PxAbs(a: x - w) < FLOAT_COMPONENTS_EQUAL_THRESHOLD)
80 )
81 {
82 return true;
83 }
84
85 if (
86 (PxAbs(a: (x - y) / x) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) &&
87 (PxAbs(a: (x - z) / x) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) &&
88 (PxAbs(a: (x - w) / x) < FLOAT_COMPONENTS_EQUAL_THRESHOLD)
89 )
90 {
91 return true;
92 }
93
94 return false;
95}
96
97PX_FORCE_INLINE bool isValidVec3V(const Vec3V a)
98{
99 PX_ALIGN(16, PxF32 f[4]);
100 V4StoreA(a, f);
101 return (f[3] == 0.0f);
102}
103
104PX_FORCE_INLINE bool isFiniteLength(const Vec3V a)
105{
106 return !FAllEq(a: V4LengthSq(a), b: FZero());
107}
108
109PX_FORCE_INLINE bool isAligned16(void* a)
110{
111 return(0 == (size_t(a) & 0x0f));
112}
113
//ASSERT_FINITELENGTH is deactivated because there is a lot of code that calls a simd normalisation function with zero length but then ignores the result.

#if PX_DEBUG
#define ASSERT_ISVALIDVEC3V(a) PX_ASSERT(isValidVec3V(a))
#define ASSERT_ISVALIDFLOATV(a) PX_ASSERT(isValidFloatV(a))
#define ASSERT_ISALIGNED16(a) PX_ASSERT(isAligned16(reinterpret_cast<void*>(a)))
#define ASSERT_ISFINITELENGTH(a) //PX_ASSERT(isFiniteLength(a))
#else
#define ASSERT_ISVALIDVEC3V(a)
#define ASSERT_ISVALIDFLOATV(a)
#define ASSERT_ISALIGNED16(a)
#define ASSERT_ISFINITELENGTH(a)
#endif


129namespace internalUnitSSE2Simd
130{
131PX_FORCE_INLINE PxU32 BAllTrue4_R(const BoolV a)
132{
133 const PxI32 moveMask = _mm_movemask_ps(a: a);
134 return PxU32(moveMask == 0xf);
135}
136
137PX_FORCE_INLINE PxU32 BAllTrue3_R(const BoolV a)
138{
139 const PxI32 moveMask = _mm_movemask_ps(a: a);
140 return PxU32((moveMask & 0x7) == 0x7);
141}
142
143PX_FORCE_INLINE PxU32 BAnyTrue4_R(const BoolV a)
144{
145 const PxI32 moveMask = _mm_movemask_ps(a: a);
146 return PxU32(moveMask != 0x0);
147}
148
149PX_FORCE_INLINE PxU32 BAnyTrue3_R(const BoolV a)
150{
151 const PxI32 moveMask = _mm_movemask_ps(a: a);
152 return PxU32((moveMask & 0x7) != 0x0);
153}
154
155PX_FORCE_INLINE PxU32 FiniteTestEq(const Vec4V a, const Vec4V b)
156{
157 // This is a bit of a bodge.
158 //_mm_comieq_ss returns 1 if either value is nan so we need to re-cast a and b with true encoded as a non-nan
159 // number.
160 // There must be a better way of doing this in sse.
161 const BoolV one = FOne();
162 const BoolV zero = FZero();
163 const BoolV a1 = V4Sel(c: a, a: one, b: zero);
164 const BoolV b1 = V4Sel(c: b, a: one, b: zero);
165 return (
166 _mm_comieq_ss(a: a1, b: b1) &&
167 _mm_comieq_ss(_mm_shuffle_ps(a1, a1, _MM_SHUFFLE(1, 1, 1, 1)), _mm_shuffle_ps(b1, b1, _MM_SHUFFLE(1, 1, 1, 1))) &&
168 _mm_comieq_ss(_mm_shuffle_ps(a1, a1, _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(b1, b1, _MM_SHUFFLE(2, 2, 2, 2))) &&
169 _mm_comieq_ss(_mm_shuffle_ps(a1, a1, _MM_SHUFFLE(3, 3, 3, 3)), _mm_shuffle_ps(b1, b1, _MM_SHUFFLE(3, 3, 3, 3))));
170}
171
172#if !PX_EMSCRIPTEN
173const PX_ALIGN(16, PxF32 gMaskXYZ[4]) = { physx::PxUnionCast<PxF32>(b: 0xffffffff), physx::PxUnionCast<PxF32>(b: 0xffffffff),
174 physx::PxUnionCast<PxF32>(b: 0xffffffff), 0 };
175#else
176// emscripten doesn't like the PxUnionCast data structure
177// the following is what windows and xbox does -- using these for emscripten
178const PX_ALIGN(16, PxU32 gMaskXYZ[4]) = { 0xffffffff, 0xffffffff, 0xffffffff, 0 };
179#endif
180}
181
182namespace _VecMathTests
183{
184// PT: this function returns an invalid Vec3V (W!=0.0f) just for unit-testing 'isValidVec3V'
185PX_FORCE_INLINE Vec3V getInvalidVec3V()
186{
187 const float f = 1.0f;
188 return _mm_load1_ps(p: &f);
189}
190
191PX_FORCE_INLINE bool allElementsEqualFloatV(const FloatV a, const FloatV b)
192{
193 ASSERT_ISVALIDFLOATV(a);
194 ASSERT_ISVALIDFLOATV(b);
195 return _mm_comieq_ss(a: a, b: b) != 0;
196}
197
198PX_FORCE_INLINE bool allElementsEqualVec3V(const Vec3V a, const Vec3V b)
199{
200 return V3AllEq(a, b) != 0;
201}
202
203PX_FORCE_INLINE bool allElementsEqualVec4V(const Vec4V a, const Vec4V b)
204{
205 return V4AllEq(a, b) != 0;
206}
207
208PX_FORCE_INLINE bool allElementsEqualBoolV(const BoolV a, const BoolV b)
209{
210 return internalUnitSSE2Simd::BAllTrue4_R(a: VecI32V_IsEq(a: m128_F2I(n: a), b: m128_F2I(n: b))) != 0;
211}
212
213PX_FORCE_INLINE bool allElementsEqualVecU32V(const VecU32V a, const VecU32V b)
214{
215 return internalUnitSSE2Simd::BAllTrue4_R(a: V4IsEqU32(a, b)) != 0;
216}
217
218PX_FORCE_INLINE bool allElementsEqualVecI32V(const VecI32V a, const VecI32V b)
219{
220 BoolV c = m128_I2F(n: _mm_cmpeq_epi32(a: a, b: b));
221 return internalUnitSSE2Simd::BAllTrue4_R(a: c) != 0;
222}
223
224#define VECMATH_AOS_EPSILON (1e-3f)
225
226PX_FORCE_INLINE bool allElementsNearEqualFloatV(const FloatV a, const FloatV b)
227{
228 ASSERT_ISVALIDFLOATV(a);
229 ASSERT_ISVALIDFLOATV(b);
230 const FloatV c = FSub(a, b);
231 const FloatV minError = FLoad(f: -VECMATH_AOS_EPSILON);
232 const FloatV maxError = FLoad(VECMATH_AOS_EPSILON);
233 return _mm_comigt_ss(a: c, b: minError) && _mm_comilt_ss(a: c, b: maxError);
234}
235
236PX_FORCE_INLINE bool allElementsNearEqualVec3V(const Vec3V a, const Vec3V b)
237{
238 const Vec3V c = V3Sub(a, b);
239 const Vec3V minError = V3Load(f: -VECMATH_AOS_EPSILON);
240 const Vec3V maxError = V3Load(VECMATH_AOS_EPSILON);
241 return (_mm_comigt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)), b: minError) &&
242 _mm_comilt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)), b: maxError) &&
243 _mm_comigt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1)), b: minError) &&
244 _mm_comilt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1)), b: maxError) &&
245 _mm_comigt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2)), b: minError) &&
246 _mm_comilt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2)), b: maxError));
247}
248
249PX_FORCE_INLINE bool allElementsNearEqualVec4V(const Vec4V a, const Vec4V b)
250{
251 const Vec4V c = V4Sub(a, b);
252 const Vec4V minError = V4Load(f: -VECMATH_AOS_EPSILON);
253 const Vec4V maxError = V4Load(VECMATH_AOS_EPSILON);
254 return (_mm_comigt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)), b: minError) &&
255 _mm_comilt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)), b: maxError) &&
256 _mm_comigt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1)), b: minError) &&
257 _mm_comilt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1)), b: maxError) &&
258 _mm_comigt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2)), b: minError) &&
259 _mm_comilt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2)), b: maxError) &&
260 _mm_comigt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 3, 3, 3)), b: minError) &&
261 _mm_comilt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 3, 3, 3)), b: maxError));
262}
263}
264
265/////////////////////////////////////////////////////////////////////
266////FUNCTIONS USED ONLY FOR ASSERTS IN VECTORISED IMPLEMENTATIONS
267/////////////////////////////////////////////////////////////////////
268
269PX_FORCE_INLINE bool isFiniteFloatV(const FloatV a)
270{
271 PxF32 badNumber =
272 physx::PxUnionCast<PxF32, PxU32>(PX_FPCLASS_SNAN | PX_FPCLASS_QNAN | PX_FPCLASS_NINF | PX_FPCLASS_PINF);
273 const FloatV vBadNum = FLoad(f: badNumber);
274 const BoolV vMask = BAnd(a: vBadNum, b: a);
275 return internalUnitSSE2Simd::FiniteTestEq(a: vMask, b: BFFFF()) == 1;
276}
277
278PX_FORCE_INLINE bool isFiniteVec3V(const Vec3V a)
279{
280 PxF32 badNumber =
281 physx::PxUnionCast<PxF32, PxU32>(PX_FPCLASS_SNAN | PX_FPCLASS_QNAN | PX_FPCLASS_NINF | PX_FPCLASS_PINF);
282 const Vec3V vBadNum = V3Load(f: badNumber);
283 const BoolV vMask = BAnd(a: BAnd(a: vBadNum, b: a), b: BTTTF());
284 return internalUnitSSE2Simd::FiniteTestEq(a: vMask, b: BFFFF()) == 1;
285}
286
287PX_FORCE_INLINE bool isFiniteVec4V(const Vec4V a)
288{
289 /*Vec4V a;
290 PX_ALIGN(16, PxF32 f[4]);
291 F32Array_Aligned_From_Vec4V(a, f);
292 return PxIsFinite(f[0])
293 && PxIsFinite(f[1])
294 && PxIsFinite(f[2])
295 && PxIsFinite(f[3]);*/
296
297 PxF32 badNumber =
298 physx::PxUnionCast<PxF32, PxU32>(PX_FPCLASS_SNAN | PX_FPCLASS_QNAN | PX_FPCLASS_NINF | PX_FPCLASS_PINF);
299 const Vec4V vBadNum = V4Load(f: badNumber);
300 const BoolV vMask = BAnd(a: vBadNum, b: a);
301
302 return internalUnitSSE2Simd::FiniteTestEq(a: vMask, b: BFFFF()) == 1;
303}
304
305PX_FORCE_INLINE bool hasZeroElementinFloatV(const FloatV a)
306{
307 ASSERT_ISVALIDFLOATV(a);
308 return _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)), b: FZero()) ? true : false;
309}
310
311PX_FORCE_INLINE bool hasZeroElementInVec3V(const Vec3V a)
312{
313 return (_mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)), b: FZero()) ||
314 _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)), b: FZero()) ||
315 _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)), b: FZero()));
316}
317
318PX_FORCE_INLINE bool hasZeroElementInVec4V(const Vec4V a)
319{
320 return (_mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)), b: FZero()) ||
321 _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)), b: FZero()) ||
322 _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)), b: FZero()) ||
323 _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 3, 3, 3)), b: FZero()));
324}
325
326/////////////////////////////////////////////////////////////////////
327////VECTORISED FUNCTION IMPLEMENTATIONS
328/////////////////////////////////////////////////////////////////////
329
330PX_FORCE_INLINE FloatV FLoad(const PxF32 f)
331{
332 return _mm_load1_ps(p: &f);
333}
334
335PX_FORCE_INLINE Vec3V V3Load(const PxF32 f)
336{
337 return _mm_set_ps(z: 0.0f, y: f, x: f, w: f);
338}
339
340PX_FORCE_INLINE Vec4V V4Load(const PxF32 f)
341{
342 return _mm_load1_ps(p: &f);
343}
344
345PX_FORCE_INLINE BoolV BLoad(const bool f)
346{
347 const PxU32 i = -PxI32(f);
348 return _mm_load1_ps(p: reinterpret_cast<const float*>(&i));
349}
350
351PX_FORCE_INLINE Vec3V V3LoadA(const PxVec3& f)
352{
353 ASSERT_ISALIGNED16(const_cast<PxVec3*>(&f));
354#if !PX_EMSCRIPTEN
355 return _mm_and_ps(a: reinterpret_cast<const Vec3V&>(f), b: V4LoadA(f: internalUnitSSE2Simd::gMaskXYZ));
356#else
357 return _mm_and_ps((Vec3V&)f, (VecI32V&)internalUnitSSE2Simd::gMaskXYZ);
358#endif
359}
360
361PX_FORCE_INLINE Vec3V V3LoadU(const PxVec3& f)
362{
363 return _mm_set_ps(z: 0.0f, y: f.z, x: f.y, w: f.x);
364}
365
366PX_FORCE_INLINE Vec3V V3LoadUnsafeA(const PxVec3& f)
367{
368 ASSERT_ISALIGNED16(const_cast<PxVec3*>(&f));
369 return _mm_set_ps(z: 0.0f, y: f.z, x: f.y, w: f.x);
370}
371
372PX_FORCE_INLINE Vec3V V3LoadA(const PxF32* const f)
373{
374 ASSERT_ISALIGNED16(const_cast<PxF32*>(f));
375#if !PX_EMSCRIPTEN
376 return _mm_and_ps(a: V4LoadA(f), b: V4LoadA(f: internalUnitSSE2Simd::gMaskXYZ));
377#else
378 return _mm_and_ps((Vec3V&)*f, (VecI32V&)internalUnitSSE2Simd::gMaskXYZ);
379#endif
380}
381
382PX_FORCE_INLINE Vec3V V3LoadU(const PxF32* const i)
383{
384 return _mm_set_ps(z: 0.0f, y: i[2], x: i[1], w: i[0]);
385}
386
387PX_FORCE_INLINE Vec3V Vec3V_From_Vec4V(Vec4V v)
388{
389 return V4ClearW(v);
390}
391
392PX_FORCE_INLINE Vec3V Vec3V_From_Vec4V_WUndefined(const Vec4V v)
393{
394 return v;
395}
396
397PX_FORCE_INLINE Vec4V Vec4V_From_Vec3V(Vec3V f)
398{
399 ASSERT_ISVALIDVEC3V(f);
400 return f; // ok if it is implemented as the same type.
401}
402
403PX_FORCE_INLINE Vec4V Vec4V_From_PxVec3_WUndefined(const PxVec3& f)
404{
405 return _mm_set_ps(z: 0.0f, y: f.z, x: f.y, w: f.x);
406}
407
408PX_FORCE_INLINE Vec4V Vec4V_From_FloatV(FloatV f)
409{
410 return f;
411}
412
413PX_FORCE_INLINE Vec3V Vec3V_From_FloatV(FloatV f)
414{
415 ASSERT_ISVALIDFLOATV(f);
416 return Vec3V_From_Vec4V(v: Vec4V_From_FloatV(f));
417}
418
419PX_FORCE_INLINE Vec3V Vec3V_From_FloatV_WUndefined(FloatV f)
420{
421 ASSERT_ISVALIDVEC3V(f);
422 return Vec3V_From_Vec4V_WUndefined(v: Vec4V_From_FloatV(f));
423}
424
425PX_FORCE_INLINE Mat33V Mat33V_From_PxMat33(const PxMat33& m)
426{
427 return Mat33V(V3LoadU(f: m.column0), V3LoadU(f: m.column1), V3LoadU(f: m.column2));
428}
429
430PX_FORCE_INLINE void PxMat33_From_Mat33V(const Mat33V& m, PxMat33& out)
431{
432 V3StoreU(a: m.col0, f&: out.column0);
433 V3StoreU(a: m.col1, f&: out.column1);
434 V3StoreU(a: m.col2, f&: out.column2);
435}
436
437PX_FORCE_INLINE Vec4V V4LoadA(const PxF32* const f)
438{
439 ASSERT_ISALIGNED16(const_cast<PxF32*>(f));
440 return _mm_load_ps(p: f);
441}
442
443PX_FORCE_INLINE void V4StoreA(Vec4V a, PxF32* f)
444{
445 ASSERT_ISALIGNED16(f);
446 _mm_store_ps(p: f, a: a);
447}
448
449PX_FORCE_INLINE void V4StoreU(const Vec4V a, PxF32* f)
450{
451 _mm_storeu_ps(p: f, a: a);
452}
453
454PX_FORCE_INLINE void BStoreA(const BoolV a, PxU32* f)
455{
456 ASSERT_ISALIGNED16(f);
457 _mm_store_ps(p: reinterpret_cast<PxF32*>(f), a: a);
458}
459
460PX_FORCE_INLINE void U4StoreA(const VecU32V uv, PxU32* u)
461{
462 ASSERT_ISALIGNED16(u);
463 _mm_store_ps(p: reinterpret_cast<float*>(u), a: uv);
464}
465
466PX_FORCE_INLINE void I4StoreA(const VecI32V iv, PxI32* i)
467{
468 ASSERT_ISALIGNED16(i);
469 _mm_store_ps(p: reinterpret_cast<float*>(i), a: m128_I2F(n: iv));
470}
471
472PX_FORCE_INLINE Vec4V V4LoadU(const PxF32* const f)
473{
474 return _mm_loadu_ps(p: f);
475}
476
477PX_FORCE_INLINE BoolV BLoad(const bool* const f)
478{
479 const PX_ALIGN(16, PxI32) b[4] = { -PxI32(f[0]), -PxI32(f[1]), -PxI32(f[2]), -PxI32(f[3]) };
480 return _mm_load_ps(p: reinterpret_cast<const float*>(&b));
481}
482
483PX_FORCE_INLINE void FStore(const FloatV a, PxF32* PX_RESTRICT f)
484{
485 ASSERT_ISVALIDFLOATV(a);
486 _mm_store_ss(p: f, a: a);
487}
488
489PX_FORCE_INLINE void V3StoreA(const Vec3V a, PxVec3& f)
490{
491 ASSERT_ISALIGNED16(&f);
492 PX_ALIGN(16, PxF32) f2[4];
493 _mm_store_ps(p: f2, a: a);
494 f = PxVec3(f2[0], f2[1], f2[2]);
495}
496
497PX_FORCE_INLINE void V3StoreU(const Vec3V a, PxVec3& f)
498{
499 PX_ALIGN(16, PxF32) f2[4];
500 _mm_store_ps(p: f2, a: a);
501 f = PxVec3(f2[0], f2[1], f2[2]);
502}
503
504PX_FORCE_INLINE void Store_From_BoolV(const BoolV b, PxU32* b2)
505{
506 _mm_store_ss(p: reinterpret_cast<PxF32*>(b2), a: b);
507}
508
509PX_FORCE_INLINE VecU32V U4Load(const PxU32 i)
510{
511 return _mm_load1_ps(p: reinterpret_cast<const PxF32*>(&i));
512}
513
514PX_FORCE_INLINE VecU32V U4LoadU(const PxU32* i)
515{
516 return _mm_loadu_ps(p: reinterpret_cast<const PxF32*>(i));
517}
518
519PX_FORCE_INLINE VecU32V U4LoadA(const PxU32* i)
520{
521 ASSERT_ISALIGNED16(const_cast<PxU32*>(i));
522 return _mm_load_ps(p: reinterpret_cast<const PxF32*>(i));
523}
524
525//////////////////////////////////
526// FLOATV
527//////////////////////////////////
528
529PX_FORCE_INLINE FloatV FZero()
530{
531 return FLoad(f: 0.0f);
532}
533
534PX_FORCE_INLINE FloatV FOne()
535{
536 return FLoad(f: 1.0f);
537}
538
539PX_FORCE_INLINE FloatV FHalf()
540{
541 return FLoad(f: 0.5f);
542}
543
544PX_FORCE_INLINE FloatV FEps()
545{
546 return FLoad(PX_EPS_REAL);
547}
548
549PX_FORCE_INLINE FloatV FEps6()
550{
551 return FLoad(f: 1e-6f);
552}
553
554PX_FORCE_INLINE FloatV FMax()
555{
556 return FLoad(PX_MAX_REAL);
557}
558
559PX_FORCE_INLINE FloatV FNegMax()
560{
561 return FLoad(f: -PX_MAX_REAL);
562}
563
564PX_FORCE_INLINE FloatV IZero()
565{
566 const PxU32 zero = 0;
567 return _mm_load1_ps(p: reinterpret_cast<const PxF32*>(&zero));
568}
569
570PX_FORCE_INLINE FloatV IOne()
571{
572 const PxU32 one = 1;
573 return _mm_load1_ps(p: reinterpret_cast<const PxF32*>(&one));
574}
575
576PX_FORCE_INLINE FloatV ITwo()
577{
578 const PxU32 two = 2;
579 return _mm_load1_ps(p: reinterpret_cast<const PxF32*>(&two));
580}
581
582PX_FORCE_INLINE FloatV IThree()
583{
584 const PxU32 three = 3;
585 return _mm_load1_ps(p: reinterpret_cast<const PxF32*>(&three));
586}
587
588PX_FORCE_INLINE FloatV IFour()
589{
590 PxU32 four = 4;
591 return _mm_load1_ps(p: reinterpret_cast<const PxF32*>(&four));
592}
593
594PX_FORCE_INLINE FloatV FNeg(const FloatV f)
595{
596 ASSERT_ISVALIDFLOATV(f);
597 return _mm_sub_ps(a: _mm_setzero_ps(), b: f);
598}
599
600PX_FORCE_INLINE FloatV FAdd(const FloatV a, const FloatV b)
601{
602 ASSERT_ISVALIDFLOATV(a);
603 ASSERT_ISVALIDFLOATV(b);
604/*
605 if(!isValidFloatV(a))
606 {
607assert(false);
608 }
609 if(!isValidFloatV(b))
610 {
611assert(false);
612 }
613*/
614 return _mm_add_ps(a: a, b: b);
615}
616
617PX_FORCE_INLINE FloatV FSub(const FloatV a, const FloatV b)
618{
619 ASSERT_ISVALIDFLOATV(a);
620 ASSERT_ISVALIDFLOATV(b);
621 return _mm_sub_ps(a: a, b: b);
622}
623
624PX_FORCE_INLINE FloatV FMul(const FloatV a, const FloatV b)
625{
626 ASSERT_ISVALIDFLOATV(a);
627 ASSERT_ISVALIDFLOATV(b);
628 return _mm_mul_ps(a: a, b: b);
629}
630
631PX_FORCE_INLINE FloatV FDiv(const FloatV a, const FloatV b)
632{
633 ASSERT_ISVALIDFLOATV(a);
634 ASSERT_ISVALIDFLOATV(b);
635 return _mm_div_ps(a: a, b: b);
636}
637
638PX_FORCE_INLINE FloatV FDivFast(const FloatV a, const FloatV b)
639{
640 ASSERT_ISVALIDFLOATV(a);
641 ASSERT_ISVALIDFLOATV(b);
642 return _mm_mul_ps(a: a, b: _mm_rcp_ps(a: b));
643}
644
645PX_FORCE_INLINE FloatV FRecip(const FloatV a)
646{
647 ASSERT_ISVALIDFLOATV(a);
648 return _mm_div_ps(a: FOne(), b: a);
649}
650
651PX_FORCE_INLINE FloatV FRecipFast(const FloatV a)
652{
653 ASSERT_ISVALIDFLOATV(a);
654 return _mm_rcp_ps(a: a);
655}
656
657PX_FORCE_INLINE FloatV FRsqrt(const FloatV a)
658{
659 ASSERT_ISVALIDFLOATV(a);
660 return _mm_div_ps(a: FOne(), b: _mm_sqrt_ps(a: a));
661}
662
663PX_FORCE_INLINE FloatV FSqrt(const FloatV a)
664{
665 ASSERT_ISVALIDFLOATV(a);
666 return _mm_sqrt_ps(a: a);
667}
668
669PX_FORCE_INLINE FloatV FRsqrtFast(const FloatV a)
670{
671 ASSERT_ISVALIDFLOATV(a);
672 return _mm_rsqrt_ps(a: a);
673}
674
675PX_FORCE_INLINE FloatV FScaleAdd(const FloatV a, const FloatV b, const FloatV c)
676{
677 ASSERT_ISVALIDFLOATV(a);
678 ASSERT_ISVALIDFLOATV(b);
679 ASSERT_ISVALIDFLOATV(c);
680 return FAdd(a: FMul(a, b), b: c);
681}
682
683PX_FORCE_INLINE FloatV FNegScaleSub(const FloatV a, const FloatV b, const FloatV c)
684{
685 ASSERT_ISVALIDFLOATV(a);
686 ASSERT_ISVALIDFLOATV(b);
687 ASSERT_ISVALIDFLOATV(c);
688 return FSub(a: c, b: FMul(a, b));
689}
690
691PX_FORCE_INLINE FloatV FAbs(const FloatV a)
692{
693 ASSERT_ISVALIDFLOATV(a);
694 PX_ALIGN(16, const PxU32) absMask[4] = { 0x7fFFffFF, 0x7fFFffFF, 0x7fFFffFF, 0x7fFFffFF };
695 return _mm_and_ps(a: a, b: _mm_load_ps(p: reinterpret_cast<const PxF32*>(absMask)));
696}
697
698PX_FORCE_INLINE FloatV FSel(const BoolV c, const FloatV a, const FloatV b)
699{
700 PX_ASSERT(_VecMathTests::allElementsEqualBoolV(c,BTTTT()) ||
701 _VecMathTests::allElementsEqualBoolV(c,BFFFF()));
702 ASSERT_ISVALIDFLOATV(_mm_or_ps(_mm_andnot_ps(c, b), _mm_and_ps(c, a)));
703 return _mm_or_ps(a: _mm_andnot_ps(a: c, b: b), b: _mm_and_ps(a: c, b: a));
704}
705
706PX_FORCE_INLINE BoolV FIsGrtr(const FloatV a, const FloatV b)
707{
708 ASSERT_ISVALIDFLOATV(a);
709 ASSERT_ISVALIDFLOATV(b);
710 return _mm_cmpgt_ps(a: a, b: b);
711}
712
713PX_FORCE_INLINE BoolV FIsGrtrOrEq(const FloatV a, const FloatV b)
714{
715 ASSERT_ISVALIDFLOATV(a);
716 ASSERT_ISVALIDFLOATV(b);
717 return _mm_cmpge_ps(a: a, b: b);
718}
719
720PX_FORCE_INLINE BoolV FIsEq(const FloatV a, const FloatV b)
721{
722 ASSERT_ISVALIDFLOATV(a);
723 ASSERT_ISVALIDFLOATV(b);
724 return _mm_cmpeq_ps(a: a, b: b);
725}
726
727PX_FORCE_INLINE FloatV FMax(const FloatV a, const FloatV b)
728{
729 ASSERT_ISVALIDFLOATV(a);
730 ASSERT_ISVALIDFLOATV(b);
731 return _mm_max_ps(a: a, b: b);
732}
733
734PX_FORCE_INLINE FloatV FMin(const FloatV a, const FloatV b)
735{
736 ASSERT_ISVALIDFLOATV(a);
737 ASSERT_ISVALIDFLOATV(b);
738 return _mm_min_ps(a: a, b: b);
739}
740
741PX_FORCE_INLINE FloatV FClamp(const FloatV a, const FloatV minV, const FloatV maxV)
742{
743 ASSERT_ISVALIDFLOATV(minV);
744 ASSERT_ISVALIDFLOATV(maxV);
745 return _mm_max_ps(a: _mm_min_ps(a: a, b: maxV), b: minV);
746}
747
748PX_FORCE_INLINE PxU32 FAllGrtr(const FloatV a, const FloatV b)
749{
750 ASSERT_ISVALIDFLOATV(a);
751 ASSERT_ISVALIDFLOATV(b);
752 return _mm_comigt_ss(a: a, b: b);
753}
754
755PX_FORCE_INLINE PxU32 FAllGrtrOrEq(const FloatV a, const FloatV b)
756{
757 ASSERT_ISVALIDFLOATV(a);
758 ASSERT_ISVALIDFLOATV(b);
759 return _mm_comige_ss(a: a, b: b);
760}
761
762PX_FORCE_INLINE PxU32 FAllEq(const FloatV a, const FloatV b)
763{
764 ASSERT_ISVALIDFLOATV(a);
765 ASSERT_ISVALIDFLOATV(b);
766 return _mm_comieq_ss(a: a, b: b);
767}
768
769PX_FORCE_INLINE FloatV FRound(const FloatV a)
770{
771 ASSERT_ISVALIDFLOATV(a);
772#ifdef __SSE4_2__
773 return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
774#else
775 // return _mm_round_ps(a, 0x0);
776 const FloatV half = FLoad(f: 0.5f);
777 const __m128 signBit = _mm_cvtepi32_ps(a: _mm_srli_epi32(a: _mm_cvtps_epi32(a: a), count: 31));
778 const FloatV aRound = FSub(a: FAdd(a, b: half), b: signBit);
779 __m128i tmp = _mm_cvttps_epi32(a: aRound);
780 return _mm_cvtepi32_ps(a: tmp);
781#endif
782}
783
784PX_FORCE_INLINE FloatV FSin(const FloatV a)
785{
786 ASSERT_ISVALIDFLOATV(a);
787
788 // Modulo the range of the given angles such that -XM_2PI <= Angles < XM_2PI
789 const FloatV recipTwoPi = V4LoadA(f: g_PXReciprocalTwoPi.f);
790 const FloatV twoPi = V4LoadA(f: g_PXTwoPi.f);
791 const FloatV tmp = FMul(a, b: recipTwoPi);
792 const FloatV b = FRound(a: tmp);
793 const FloatV V1 = FNegScaleSub(a: twoPi, b, c: a);
794
795 // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! -
796 // V^15 / 15! + V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23! (for -PI <= V < PI)
797 const FloatV V2 = FMul(a: V1, b: V1);
798 const FloatV V3 = FMul(a: V2, b: V1);
799 const FloatV V5 = FMul(a: V3, b: V2);
800 const FloatV V7 = FMul(a: V5, b: V2);
801 const FloatV V9 = FMul(a: V7, b: V2);
802 const FloatV V11 = FMul(a: V9, b: V2);
803 const FloatV V13 = FMul(a: V11, b: V2);
804 const FloatV V15 = FMul(a: V13, b: V2);
805 const FloatV V17 = FMul(a: V15, b: V2);
806 const FloatV V19 = FMul(a: V17, b: V2);
807 const FloatV V21 = FMul(a: V19, b: V2);
808 const FloatV V23 = FMul(a: V21, b: V2);
809
810 const Vec4V sinCoefficients0 = V4LoadA(f: g_PXSinCoefficients0.f);
811 const Vec4V sinCoefficients1 = V4LoadA(f: g_PXSinCoefficients1.f);
812 const Vec4V sinCoefficients2 = V4LoadA(f: g_PXSinCoefficients2.f);
813
814 const FloatV S1 = V4GetY(f: sinCoefficients0);
815 const FloatV S2 = V4GetZ(f: sinCoefficients0);
816 const FloatV S3 = V4GetW(f: sinCoefficients0);
817 const FloatV S4 = V4GetX(f: sinCoefficients1);
818 const FloatV S5 = V4GetY(f: sinCoefficients1);
819 const FloatV S6 = V4GetZ(f: sinCoefficients1);
820 const FloatV S7 = V4GetW(f: sinCoefficients1);
821 const FloatV S8 = V4GetX(f: sinCoefficients2);
822 const FloatV S9 = V4GetY(f: sinCoefficients2);
823 const FloatV S10 = V4GetZ(f: sinCoefficients2);
824 const FloatV S11 = V4GetW(f: sinCoefficients2);
825
826 FloatV Result;
827 Result = FScaleAdd(a: S1, b: V3, c: V1);
828 Result = FScaleAdd(a: S2, b: V5, c: Result);
829 Result = FScaleAdd(a: S3, b: V7, c: Result);
830 Result = FScaleAdd(a: S4, b: V9, c: Result);
831 Result = FScaleAdd(a: S5, b: V11, c: Result);
832 Result = FScaleAdd(a: S6, b: V13, c: Result);
833 Result = FScaleAdd(a: S7, b: V15, c: Result);
834 Result = FScaleAdd(a: S8, b: V17, c: Result);
835 Result = FScaleAdd(a: S9, b: V19, c: Result);
836 Result = FScaleAdd(a: S10, b: V21, c: Result);
837 Result = FScaleAdd(a: S11, b: V23, c: Result);
838
839 return Result;
840}
841
842PX_FORCE_INLINE FloatV FCos(const FloatV a)
843{
844 ASSERT_ISVALIDFLOATV(a);
845
846 // Modulo the range of the given angles such that -XM_2PI <= Angles < XM_2PI
847 const FloatV recipTwoPi = V4LoadA(f: g_PXReciprocalTwoPi.f);
848 const FloatV twoPi = V4LoadA(f: g_PXTwoPi.f);
849 const FloatV tmp = FMul(a, b: recipTwoPi);
850 const FloatV b = FRound(a: tmp);
851 const FloatV V1 = FNegScaleSub(a: twoPi, b, c: a);
852
853 // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! + V^12 / 12! -
854 // V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! (for -PI <= V < PI)
855 const FloatV V2 = FMul(a: V1, b: V1);
856 const FloatV V4 = FMul(a: V2, b: V2);
857 const FloatV V6 = FMul(a: V4, b: V2);
858 const FloatV V8 = FMul(a: V4, b: V4);
859 const FloatV V10 = FMul(a: V6, b: V4);
860 const FloatV V12 = FMul(a: V6, b: V6);
861 const FloatV V14 = FMul(a: V8, b: V6);
862 const FloatV V16 = FMul(a: V8, b: V8);
863 const FloatV V18 = FMul(a: V10, b: V8);
864 const FloatV V20 = FMul(a: V10, b: V10);
865 const FloatV V22 = FMul(a: V12, b: V10);
866
867 const Vec4V cosCoefficients0 = V4LoadA(f: g_PXCosCoefficients0.f);
868 const Vec4V cosCoefficients1 = V4LoadA(f: g_PXCosCoefficients1.f);
869 const Vec4V cosCoefficients2 = V4LoadA(f: g_PXCosCoefficients2.f);
870
871 const FloatV C1 = V4GetY(f: cosCoefficients0);
872 const FloatV C2 = V4GetZ(f: cosCoefficients0);
873 const FloatV C3 = V4GetW(f: cosCoefficients0);
874 const FloatV C4 = V4GetX(f: cosCoefficients1);
875 const FloatV C5 = V4GetY(f: cosCoefficients1);
876 const FloatV C6 = V4GetZ(f: cosCoefficients1);
877 const FloatV C7 = V4GetW(f: cosCoefficients1);
878 const FloatV C8 = V4GetX(f: cosCoefficients2);
879 const FloatV C9 = V4GetY(f: cosCoefficients2);
880 const FloatV C10 = V4GetZ(f: cosCoefficients2);
881 const FloatV C11 = V4GetW(f: cosCoefficients2);
882
883 FloatV Result;
884 Result = FScaleAdd(a: C1, b: V2, c: V4One());
885 Result = FScaleAdd(a: C2, b: V4, c: Result);
886 Result = FScaleAdd(a: C3, b: V6, c: Result);
887 Result = FScaleAdd(a: C4, b: V8, c: Result);
888 Result = FScaleAdd(a: C5, b: V10, c: Result);
889 Result = FScaleAdd(a: C6, b: V12, c: Result);
890 Result = FScaleAdd(a: C7, b: V14, c: Result);
891 Result = FScaleAdd(a: C8, b: V16, c: Result);
892 Result = FScaleAdd(a: C9, b: V18, c: Result);
893 Result = FScaleAdd(a: C10, b: V20, c: Result);
894 Result = FScaleAdd(a: C11, b: V22, c: Result);
895
896 return Result;
897}
898
899PX_FORCE_INLINE PxU32 FOutOfBounds(const FloatV a, const FloatV min, const FloatV max)
900{
901 ASSERT_ISVALIDFLOATV(a);
902 ASSERT_ISVALIDFLOATV(min);
903 ASSERT_ISVALIDFLOATV(max);
904 const BoolV c = BOr(a: FIsGrtr(a, b: max), b: FIsGrtr(a: min, b: a));
905 return !BAllEqFFFF(a: c);
906}
907
908PX_FORCE_INLINE PxU32 FInBounds(const FloatV a, const FloatV min, const FloatV max)
909{
910 ASSERT_ISVALIDFLOATV(a);
911 ASSERT_ISVALIDFLOATV(min);
912 ASSERT_ISVALIDFLOATV(max)
913 const BoolV c = BAnd(a: FIsGrtrOrEq(a, b: min), b: FIsGrtrOrEq(a: max, b: a));
914 return BAllEqTTTT(a: c);
915}
916
917PX_FORCE_INLINE PxU32 FOutOfBounds(const FloatV a, const FloatV bounds)
918{
919 ASSERT_ISVALIDFLOATV(a);
920 ASSERT_ISVALIDFLOATV(bounds);
921 return FOutOfBounds(a, min: FNeg(f: bounds), max: bounds);
922}
923
924PX_FORCE_INLINE PxU32 FInBounds(const FloatV a, const FloatV bounds)
925{
926 ASSERT_ISVALIDFLOATV(a);
927 ASSERT_ISVALIDFLOATV(bounds);
928 return FInBounds(a, min: FNeg(f: bounds), max: bounds);
929}
930
931//////////////////////////////////
932// VEC3V
933//////////////////////////////////
934
935PX_FORCE_INLINE Vec3V V3Splat(const FloatV f)
936{
937 ASSERT_ISVALIDFLOATV(f);
938 const __m128 zero = FZero();
939 const __m128 fff0 = _mm_move_ss(a: f, b: zero);
940 return _mm_shuffle_ps(fff0, fff0, _MM_SHUFFLE(0, 1, 2, 3));
941}
942
943PX_FORCE_INLINE Vec3V V3Merge(const FloatVArg x, const FloatVArg y, const FloatVArg z)
944{
945 ASSERT_ISVALIDFLOATV(x);
946 ASSERT_ISVALIDFLOATV(y);
947 ASSERT_ISVALIDFLOATV(z);
948 // static on zero causes compiler crash on x64 debug_opt
949 const __m128 zero = FZero();
950 const __m128 xy = _mm_move_ss(a: x, b: y);
951 const __m128 z0 = _mm_move_ss(a: zero, b: z);
952
953 return _mm_shuffle_ps(xy, z0, _MM_SHUFFLE(1, 0, 0, 1));
954}
955
956PX_FORCE_INLINE Vec3V V3UnitX()
957{
958 const PX_ALIGN(16, PxF32) x[4] = { 1.0f, 0.0f, 0.0f, 0.0f };
959 const __m128 x128 = _mm_load_ps(p: x);
960 return x128;
961}
962
963PX_FORCE_INLINE Vec3V V3UnitY()
964{
965 const PX_ALIGN(16, PxF32) y[4] = { 0.0f, 1.0f, 0.0f, 0.0f };
966 const __m128 y128 = _mm_load_ps(p: y);
967 return y128;
968}
969
970PX_FORCE_INLINE Vec3V V3UnitZ()
971{
972 const PX_ALIGN(16, PxF32) z[4] = { 0.0f, 0.0f, 1.0f, 0.0f };
973 const __m128 z128 = _mm_load_ps(p: z);
974 return z128;
975}
976
977PX_FORCE_INLINE FloatV V3GetX(const Vec3V f)
978{
979 ASSERT_ISVALIDVEC3V(f);
980 return _mm_shuffle_ps(f, f, _MM_SHUFFLE(0, 0, 0, 0));
981}
982
983PX_FORCE_INLINE FloatV V3GetY(const Vec3V f)
984{
985 ASSERT_ISVALIDVEC3V(f)
986 return _mm_shuffle_ps(f, f, _MM_SHUFFLE(1, 1, 1, 1));
987}
988
989PX_FORCE_INLINE FloatV V3GetZ(const Vec3V f)
990{
991 ASSERT_ISVALIDVEC3V(f);
992 return _mm_shuffle_ps(f, f, _MM_SHUFFLE(2, 2, 2, 2));
993}
994
995PX_FORCE_INLINE Vec3V V3SetX(const Vec3V v, const FloatV f)
996{
997 ASSERT_ISVALIDVEC3V(v);
998 ASSERT_ISVALIDFLOATV(f);
999 return V4Sel(c: BFTTT(), a: v, b: f);
1000}
1001
1002PX_FORCE_INLINE Vec3V V3SetY(const Vec3V v, const FloatV f)
1003{
1004 ASSERT_ISVALIDVEC3V(v);
1005 ASSERT_ISVALIDFLOATV(f);
1006 return V4Sel(c: BTFTT(), a: v, b: f);
1007}
1008
1009PX_FORCE_INLINE Vec3V V3SetZ(const Vec3V v, const FloatV f)
1010{
1011 ASSERT_ISVALIDVEC3V(v);
1012 ASSERT_ISVALIDFLOATV(f);
1013 return V4Sel(c: BTTFT(), a: v, b: f);
1014}
1015
1016PX_FORCE_INLINE Vec3V V3ColX(const Vec3V a, const Vec3V b, const Vec3V c)
1017{
1018 ASSERT_ISVALIDVEC3V(a);
1019 ASSERT_ISVALIDVEC3V(b);
1020 ASSERT_ISVALIDVEC3V(c);
1021 Vec3V r = _mm_shuffle_ps(a, c, _MM_SHUFFLE(3, 0, 3, 0));
1022 return V3SetY(v: r, f: V3GetX(f: b));
1023}
1024
1025PX_FORCE_INLINE Vec3V V3ColY(const Vec3V a, const Vec3V b, const Vec3V c)
1026{
1027 ASSERT_ISVALIDVEC3V(a);
1028 ASSERT_ISVALIDVEC3V(b);
1029 ASSERT_ISVALIDVEC3V(c)
1030 Vec3V r = _mm_shuffle_ps(a, c, _MM_SHUFFLE(3, 1, 3, 1));
1031 return V3SetY(v: r, f: V3GetY(f: b));
1032}
1033
1034PX_FORCE_INLINE Vec3V V3ColZ(const Vec3V a, const Vec3V b, const Vec3V c)
1035{
1036 ASSERT_ISVALIDVEC3V(a);
1037 ASSERT_ISVALIDVEC3V(b);
1038 ASSERT_ISVALIDVEC3V(c);
1039 Vec3V r = _mm_shuffle_ps(a, c, _MM_SHUFFLE(3, 2, 3, 2));
1040 return V3SetY(v: r, f: V3GetZ(f: b));
1041}
1042
1043PX_FORCE_INLINE Vec3V V3Zero()
1044{
1045 return V3Load(f: 0.0f);
1046}
1047
1048PX_FORCE_INLINE Vec3V V3Eps()
1049{
1050 return V3Load(PX_EPS_REAL);
1051}
1052PX_FORCE_INLINE Vec3V V3One()
1053{
1054 return V3Load(f: 1.0f);
1055}
1056
1057PX_FORCE_INLINE Vec3V V3Neg(const Vec3V f)
1058{
1059 ASSERT_ISVALIDVEC3V(f);
1060 return _mm_sub_ps(a: _mm_setzero_ps(), b: f);
1061}
1062
1063PX_FORCE_INLINE Vec3V V3Add(const Vec3V a, const Vec3V b)
1064{
1065 ASSERT_ISVALIDVEC3V(a);
1066 ASSERT_ISVALIDVEC3V(b);
1067 return _mm_add_ps(a: a, b: b);
1068}
1069
1070PX_FORCE_INLINE Vec3V V3Sub(const Vec3V a, const Vec3V b)
1071{
1072 ASSERT_ISVALIDVEC3V(a);
1073 ASSERT_ISVALIDVEC3V(b);
1074 return _mm_sub_ps(a: a, b: b);
1075}
1076
1077PX_FORCE_INLINE Vec3V V3Scale(const Vec3V a, const FloatV b)
1078{
1079 ASSERT_ISVALIDVEC3V(a);
1080 ASSERT_ISVALIDFLOATV(b);
1081 return _mm_mul_ps(a: a, b: b);
1082}
1083
1084PX_FORCE_INLINE Vec3V V3Mul(const Vec3V a, const Vec3V b)
1085{
1086 ASSERT_ISVALIDVEC3V(a);
1087 ASSERT_ISVALIDVEC3V(b);
1088 return _mm_mul_ps(a: a, b: b);
1089}
1090
1091PX_FORCE_INLINE Vec3V V3ScaleInv(const Vec3V a, const FloatV b)
1092{
1093 ASSERT_ISVALIDVEC3V(a);
1094 ASSERT_ISVALIDFLOATV(b);
1095 return _mm_div_ps(a: a, b: b);
1096}
1097
1098PX_FORCE_INLINE Vec3V V3Div(const Vec3V a, const Vec3V b)
1099{
1100 ASSERT_ISVALIDVEC3V(a);
1101 ASSERT_ISVALIDVEC3V(b);
1102 return V4ClearW(v: _mm_div_ps(a: a, b: b));
1103}
1104
1105PX_FORCE_INLINE Vec3V V3ScaleInvFast(const Vec3V a, const FloatV b)
1106{
1107 ASSERT_ISVALIDVEC3V(a);
1108 ASSERT_ISVALIDFLOATV(b);
1109 return _mm_mul_ps(a: a, b: _mm_rcp_ps(a: b));
1110}
1111
1112PX_FORCE_INLINE Vec3V V3DivFast(const Vec3V a, const Vec3V b)
1113{
1114 ASSERT_ISVALIDVEC3V(a);
1115 ASSERT_ISVALIDVEC3V(b);
1116 return V4ClearW(v: _mm_mul_ps(a: a, b: _mm_rcp_ps(a: b)));
1117}
1118
1119PX_FORCE_INLINE Vec3V V3Recip(const Vec3V a)
1120{
1121 ASSERT_ISVALIDVEC3V(a);
1122 const __m128 zero = V3Zero();
1123 const __m128 tttf = BTTTF();
1124 const __m128 recipA = _mm_div_ps(a: V3One(), b: a);
1125 return V4Sel(c: tttf, a: recipA, b: zero);
1126}
1127
1128PX_FORCE_INLINE Vec3V V3RecipFast(const Vec3V a)
1129{
1130 ASSERT_ISVALIDVEC3V(a);
1131 const __m128 zero = V3Zero();
1132 const __m128 tttf = BTTTF();
1133 const __m128 recipA = _mm_rcp_ps(a: a);
1134 return V4Sel(c: tttf, a: recipA, b: zero);
1135}
1136
1137PX_FORCE_INLINE Vec3V V3Rsqrt(const Vec3V a)
1138{
1139 ASSERT_ISVALIDVEC3V(a);
1140 const __m128 zero = V3Zero();
1141 const __m128 tttf = BTTTF();
1142 const __m128 recipA = _mm_div_ps(a: V3One(), b: _mm_sqrt_ps(a: a));
1143 return V4Sel(c: tttf, a: recipA, b: zero);
1144}
1145
1146PX_FORCE_INLINE Vec3V V3RsqrtFast(const Vec3V a)
1147{
1148 ASSERT_ISVALIDVEC3V(a);
1149 const __m128 zero = V3Zero();
1150 const __m128 tttf = BTTTF();
1151 const __m128 recipA = _mm_rsqrt_ps(a: a);
1152 return V4Sel(c: tttf, a: recipA, b: zero);
1153}
1154
1155PX_FORCE_INLINE Vec3V V3ScaleAdd(const Vec3V a, const FloatV b, const Vec3V c)
1156{
1157 ASSERT_ISVALIDVEC3V(a);
1158 ASSERT_ISVALIDFLOATV(b);
1159 ASSERT_ISVALIDVEC3V(c);
1160 return V3Add(a: V3Scale(a, b), b: c);
1161}
1162
1163PX_FORCE_INLINE Vec3V V3NegScaleSub(const Vec3V a, const FloatV b, const Vec3V c)
1164{
1165 ASSERT_ISVALIDVEC3V(a);
1166 ASSERT_ISVALIDFLOATV(b);
1167 ASSERT_ISVALIDVEC3V(c);
1168 return V3Sub(a: c, b: V3Scale(a, b));
1169}
1170
1171PX_FORCE_INLINE Vec3V V3MulAdd(const Vec3V a, const Vec3V b, const Vec3V c)
1172{
1173 ASSERT_ISVALIDVEC3V(a);
1174 ASSERT_ISVALIDVEC3V(b);
1175 ASSERT_ISVALIDVEC3V(c);
1176 return V3Add(a: V3Mul(a, b), b: c);
1177}
1178
1179PX_FORCE_INLINE Vec3V V3NegMulSub(const Vec3V a, const Vec3V b, const Vec3V c)
1180{
1181 ASSERT_ISVALIDVEC3V(a);
1182 ASSERT_ISVALIDVEC3V(b);
1183 ASSERT_ISVALIDVEC3V(c);
1184 return V3Sub(a: c, b: V3Mul(a, b));
1185}
1186
1187PX_FORCE_INLINE Vec3V V3Abs(const Vec3V a)
1188{
1189 ASSERT_ISVALIDVEC3V(a);
1190 return V3Max(a, b: V3Neg(f: a));
1191}
1192
1193PX_FORCE_INLINE FloatV V3Dot(const Vec3V a, const Vec3V b)
1194{
1195 ASSERT_ISVALIDVEC3V(a);
1196 ASSERT_ISVALIDVEC3V(b);
1197#ifdef __SSE4_2__
1198 return _mm_dp_ps(a, b, 0x7f);
1199#else
1200 const __m128 t0 = _mm_mul_ps(a: a, b: b); // aw*bw | az*bz | ay*by | ax*bx
1201 const __m128 t1 = _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(1,0,3,2)); // ay*by | ax*bx | aw*bw | az*bz
1202 const __m128 t2 = _mm_add_ps(a: t0, b: t1); // ay*by + aw*bw | ax*bx + az*bz | aw*bw + ay*by | az*bz + ax*bx
1203 const __m128 t3 = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2,3,0,1)); // ax*bx + az*bz | ay*by + aw*bw | az*bz + ax*bx | aw*bw + ay*by
1204 return _mm_add_ps(a: t3, b: t2); // ax*bx + az*bz + ay*by + aw*bw
1205 // ay*by + aw*bw + ax*bx + az*bz
1206 // az*bz + ax*bx + aw*bw + ay*by
1207 // aw*bw + ay*by + az*bz + ax*bx
1208#endif
1209}
1210
1211PX_FORCE_INLINE Vec3V V3Cross(const Vec3V a, const Vec3V b)
1212{
1213 ASSERT_ISVALIDVEC3V(a);
1214 ASSERT_ISVALIDVEC3V(b);
1215 const __m128 r1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w
1216 const __m128 r2 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w
1217 const __m128 l1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w
1218 const __m128 l2 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w
1219 return _mm_sub_ps(a: _mm_mul_ps(a: l1, b: l2), b: _mm_mul_ps(a: r1, b: r2));
1220}
1221
1222PX_FORCE_INLINE VecCrossV V3PrepareCross(const Vec3V a)
1223{
1224 ASSERT_ISVALIDVEC3V(a);
1225 VecCrossV v;
1226 v.mR1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w
1227 v.mL1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w
1228 return v;
1229}
1230
1231PX_FORCE_INLINE Vec3V V3Cross(const VecCrossV& a, const Vec3V b)
1232{
1233 ASSERT_ISVALIDVEC3V(b);
1234 const __m128 r2 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w
1235 const __m128 l2 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w
1236 return _mm_sub_ps(a: _mm_mul_ps(a: a.mL1, b: l2), b: _mm_mul_ps(a: a.mR1, b: r2));
1237}
1238
1239PX_FORCE_INLINE Vec3V V3Cross(const Vec3V a, const VecCrossV& b)
1240{
1241 ASSERT_ISVALIDVEC3V(a);
1242 const __m128 r2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w
1243 const __m128 l2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w
1244 return _mm_sub_ps(a: _mm_mul_ps(a: b.mR1, b: r2), b: _mm_mul_ps(a: b.mL1, b: l2));
1245}
1246
1247PX_FORCE_INLINE Vec3V V3Cross(const VecCrossV& a, const VecCrossV& b)
1248{
1249 return _mm_sub_ps(a: _mm_mul_ps(a: a.mL1, b: b.mR1), b: _mm_mul_ps(a: a.mR1, b: b.mL1));
1250}
1251
1252PX_FORCE_INLINE FloatV V3Length(const Vec3V a)
1253{
1254 ASSERT_ISVALIDVEC3V(a);
1255 return _mm_sqrt_ps(a: V3Dot(a, b: a));
1256}
1257
1258PX_FORCE_INLINE FloatV V3LengthSq(const Vec3V a)
1259{
1260 ASSERT_ISVALIDVEC3V(a);
1261 return V3Dot(a, b: a);
1262}
1263
1264PX_FORCE_INLINE Vec3V V3Normalize(const Vec3V a)
1265{
1266 ASSERT_ISVALIDVEC3V(a);
1267 ASSERT_ISFINITELENGTH(a);
1268 return V3ScaleInv(a, b: _mm_sqrt_ps(a: V3Dot(a, b: a)));
1269}
1270
1271PX_FORCE_INLINE Vec3V V3NormalizeFast(const Vec3V a)
1272{
1273 ASSERT_ISVALIDVEC3V(a);
1274 ASSERT_ISFINITELENGTH(a);
1275 return V3Scale(a, b: _mm_rsqrt_ps(a: V3Dot(a, b: a)));
1276}
1277
1278PX_FORCE_INLINE Vec3V V3NormalizeSafe(const Vec3V a, const Vec3V unsafeReturnValue)
1279{
1280 ASSERT_ISVALIDVEC3V(a);
1281 const __m128 eps = V3Eps();
1282 const __m128 length = V3Length(a);
1283 const __m128 isGreaterThanZero = FIsGrtr(a: length, b: eps);
1284 return V3Sel(c: isGreaterThanZero, a: V3ScaleInv(a, b: length), b: unsafeReturnValue);
1285}
1286
1287PX_FORCE_INLINE Vec3V V3Sel(const BoolV c, const Vec3V a, const Vec3V b)
1288{
1289 ASSERT_ISVALIDVEC3V(_mm_or_ps(_mm_andnot_ps(c, b), _mm_and_ps(c, a)));
1290 return _mm_or_ps(a: _mm_andnot_ps(a: c, b: b), b: _mm_and_ps(a: c, b: a));
1291}
1292
1293PX_FORCE_INLINE BoolV V3IsGrtr(const Vec3V a, const Vec3V b)
1294{
1295 ASSERT_ISVALIDVEC3V(a);
1296 ASSERT_ISVALIDVEC3V(b);
1297 return _mm_cmpgt_ps(a: a, b: b);
1298}
1299
1300PX_FORCE_INLINE BoolV V3IsGrtrOrEq(const Vec3V a, const Vec3V b)
1301{
1302 ASSERT_ISVALIDVEC3V(a);
1303 ASSERT_ISVALIDVEC3V(b);
1304 return _mm_cmpge_ps(a: a, b: b);
1305}
1306
1307PX_FORCE_INLINE BoolV V3IsEq(const Vec3V a, const Vec3V b)
1308{
1309 ASSERT_ISVALIDVEC3V(a);
1310 ASSERT_ISVALIDVEC3V(b);
1311 return _mm_cmpeq_ps(a: a, b: b);
1312}
1313
1314PX_FORCE_INLINE Vec3V V3Max(const Vec3V a, const Vec3V b)
1315{
1316 ASSERT_ISVALIDVEC3V(a);
1317 ASSERT_ISVALIDVEC3V(b);
1318 return _mm_max_ps(a: a, b: b);
1319}
1320
1321PX_FORCE_INLINE Vec3V V3Min(const Vec3V a, const Vec3V b)
1322{
1323 ASSERT_ISVALIDVEC3V(a);
1324 ASSERT_ISVALIDVEC3V(b);
1325 return _mm_min_ps(a: a, b: b);
1326}
1327
1328PX_FORCE_INLINE FloatV V3ExtractMax(const Vec3V a)
1329{
1330 ASSERT_ISVALIDVEC3V(a);
1331 const __m128 shuf1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0));
1332 const __m128 shuf2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1));
1333 const __m128 shuf3 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2));
1334
1335 return _mm_max_ps(a: _mm_max_ps(a: shuf1, b: shuf2), b: shuf3);
1336}
1337
1338PX_FORCE_INLINE FloatV V3ExtractMin(const Vec3V a)
1339{
1340 ASSERT_ISVALIDVEC3V(a);
1341
1342 const __m128 shuf1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0));
1343 const __m128 shuf2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1));
1344 const __m128 shuf3 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2));
1345
1346 return _mm_min_ps(a: _mm_min_ps(a: shuf1, b: shuf2), b: shuf3);
1347}
1348
1349// return (a >= 0.0f) ? 1.0f : -1.0f;
1350PX_FORCE_INLINE Vec3V V3Sign(const Vec3V a)
1351{
1352 ASSERT_ISVALIDVEC3V(a);
1353 const __m128 zero = V3Zero();
1354 const __m128 one = V3One();
1355 const __m128 none = V3Neg(f: one);
1356 return V3Sel(c: V3IsGrtrOrEq(a, b: zero), a: one, b: none);
1357}
1358
1359PX_FORCE_INLINE Vec3V V3Clamp(const Vec3V a, const Vec3V minV, const Vec3V maxV)
1360{
1361 ASSERT_ISVALIDVEC3V(maxV);
1362 ASSERT_ISVALIDVEC3V(minV);
1363 return V3Max(a: V3Min(a, b: maxV), b: minV);
1364}
1365
1366PX_FORCE_INLINE PxU32 V3AllGrtr(const Vec3V a, const Vec3V b)
1367{
1368 ASSERT_ISVALIDVEC3V(a);
1369 ASSERT_ISVALIDVEC3V(b);
1370 return internalUnitSSE2Simd::BAllTrue3_R(a: V4IsGrtr(a, b));
1371}
1372
1373PX_FORCE_INLINE PxU32 V3AllGrtrOrEq(const Vec3V a, const Vec3V b)
1374{
1375 ASSERT_ISVALIDVEC3V(a);
1376 ASSERT_ISVALIDVEC3V(b);
1377 return internalUnitSSE2Simd::BAllTrue3_R(a: V4IsGrtrOrEq(a, b));
1378}
1379
1380PX_FORCE_INLINE PxU32 V3AllEq(const Vec3V a, const Vec3V b)
1381{
1382 ASSERT_ISVALIDVEC3V(a);
1383 ASSERT_ISVALIDVEC3V(b);
1384 return internalUnitSSE2Simd::BAllTrue3_R(a: V4IsEq(a, b));
1385}
1386
1387PX_FORCE_INLINE Vec3V V3Round(const Vec3V a)
1388{
1389 ASSERT_ISVALIDVEC3V(a);
1390#ifdef __SSE4_2__
1391 return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
1392#else
1393 // return _mm_round_ps(a, 0x0);
1394 const Vec3V half = V3Load(f: 0.5f);
1395 const __m128 signBit = _mm_cvtepi32_ps(a: _mm_srli_epi32(a: _mm_cvtps_epi32(a: a), count: 31));
1396 const Vec3V aRound = V3Sub(a: V3Add(a, b: half), b: signBit);
1397 __m128i tmp = _mm_cvttps_epi32(a: aRound);
1398 return _mm_cvtepi32_ps(a: tmp);
1399#endif
1400}
1401
1402PX_FORCE_INLINE Vec3V V3Sin(const Vec3V a)
1403{
1404 ASSERT_ISVALIDVEC3V(a);
1405 // Modulo the range of the given angles such that -XM_2PI <= Angles < XM_2PI
1406 const Vec4V recipTwoPi = V4LoadA(f: g_PXReciprocalTwoPi.f);
1407 const Vec4V twoPi = V4LoadA(f: g_PXTwoPi.f);
1408 const Vec3V tmp = V3Scale(a, b: recipTwoPi);
1409 const Vec3V b = V3Round(a: tmp);
1410 const Vec3V V1 = V3NegScaleSub(a: b, b: twoPi, c: a);
1411
1412 // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! -
1413 // V^15 / 15! + V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23! (for -PI <= V < PI)
1414 const Vec3V V2 = V3Mul(a: V1, b: V1);
1415 const Vec3V V3 = V3Mul(a: V2, b: V1);
1416 const Vec3V V5 = V3Mul(a: V3, b: V2);
1417 const Vec3V V7 = V3Mul(a: V5, b: V2);
1418 const Vec3V V9 = V3Mul(a: V7, b: V2);
1419 const Vec3V V11 = V3Mul(a: V9, b: V2);
1420 const Vec3V V13 = V3Mul(a: V11, b: V2);
1421 const Vec3V V15 = V3Mul(a: V13, b: V2);
1422 const Vec3V V17 = V3Mul(a: V15, b: V2);
1423 const Vec3V V19 = V3Mul(a: V17, b: V2);
1424 const Vec3V V21 = V3Mul(a: V19, b: V2);
1425 const Vec3V V23 = V3Mul(a: V21, b: V2);
1426
1427 const Vec4V sinCoefficients0 = V4LoadA(f: g_PXSinCoefficients0.f);
1428 const Vec4V sinCoefficients1 = V4LoadA(f: g_PXSinCoefficients1.f);
1429 const Vec4V sinCoefficients2 = V4LoadA(f: g_PXSinCoefficients2.f);
1430
1431 const FloatV S1 = V4GetY(f: sinCoefficients0);
1432 const FloatV S2 = V4GetZ(f: sinCoefficients0);
1433 const FloatV S3 = V4GetW(f: sinCoefficients0);
1434 const FloatV S4 = V4GetX(f: sinCoefficients1);
1435 const FloatV S5 = V4GetY(f: sinCoefficients1);
1436 const FloatV S6 = V4GetZ(f: sinCoefficients1);
1437 const FloatV S7 = V4GetW(f: sinCoefficients1);
1438 const FloatV S8 = V4GetX(f: sinCoefficients2);
1439 const FloatV S9 = V4GetY(f: sinCoefficients2);
1440 const FloatV S10 = V4GetZ(f: sinCoefficients2);
1441 const FloatV S11 = V4GetW(f: sinCoefficients2);
1442
1443 Vec3V Result;
1444 Result = V3ScaleAdd(a: V3, b: S1, c: V1);
1445 Result = V3ScaleAdd(a: V5, b: S2, c: Result);
1446 Result = V3ScaleAdd(a: V7, b: S3, c: Result);
1447 Result = V3ScaleAdd(a: V9, b: S4, c: Result);
1448 Result = V3ScaleAdd(a: V11, b: S5, c: Result);
1449 Result = V3ScaleAdd(a: V13, b: S6, c: Result);
1450 Result = V3ScaleAdd(a: V15, b: S7, c: Result);
1451 Result = V3ScaleAdd(a: V17, b: S8, c: Result);
1452 Result = V3ScaleAdd(a: V19, b: S9, c: Result);
1453 Result = V3ScaleAdd(a: V21, b: S10, c: Result);
1454 Result = V3ScaleAdd(a: V23, b: S11, c: Result);
1455
1456 ASSERT_ISVALIDVEC3V(Result);
1457 return Result;
1458}
1459
1460PX_FORCE_INLINE Vec3V V3Cos(const Vec3V a)
1461{
1462 ASSERT_ISVALIDVEC3V(a);
1463
1464 // Modulo the range of the given angles such that -XM_2PI <= Angles < XM_2PI
1465 const Vec4V recipTwoPi = V4LoadA(f: g_PXReciprocalTwoPi.f);
1466 const Vec4V twoPi = V4LoadA(f: g_PXTwoPi.f);
1467 const Vec3V tmp = V3Scale(a, b: recipTwoPi);
1468 const Vec3V b = V3Round(a: tmp);
1469 const Vec3V V1 = V3NegScaleSub(a: b, b: twoPi, c: a);
1470
1471 // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! + V^12 / 12! -
1472 // V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! (for -PI <= V < PI)
1473 const Vec3V V2 = V3Mul(a: V1, b: V1);
1474 const Vec3V V4 = V3Mul(a: V2, b: V2);
1475 const Vec3V V6 = V3Mul(a: V4, b: V2);
1476 const Vec3V V8 = V3Mul(a: V4, b: V4);
1477 const Vec3V V10 = V3Mul(a: V6, b: V4);
1478 const Vec3V V12 = V3Mul(a: V6, b: V6);
1479 const Vec3V V14 = V3Mul(a: V8, b: V6);
1480 const Vec3V V16 = V3Mul(a: V8, b: V8);
1481 const Vec3V V18 = V3Mul(a: V10, b: V8);
1482 const Vec3V V20 = V3Mul(a: V10, b: V10);
1483 const Vec3V V22 = V3Mul(a: V12, b: V10);
1484
1485 const Vec4V cosCoefficients0 = V4LoadA(f: g_PXCosCoefficients0.f);
1486 const Vec4V cosCoefficients1 = V4LoadA(f: g_PXCosCoefficients1.f);
1487 const Vec4V cosCoefficients2 = V4LoadA(f: g_PXCosCoefficients2.f);
1488
1489 const FloatV C1 = V4GetY(f: cosCoefficients0);
1490 const FloatV C2 = V4GetZ(f: cosCoefficients0);
1491 const FloatV C3 = V4GetW(f: cosCoefficients0);
1492 const FloatV C4 = V4GetX(f: cosCoefficients1);
1493 const FloatV C5 = V4GetY(f: cosCoefficients1);
1494 const FloatV C6 = V4GetZ(f: cosCoefficients1);
1495 const FloatV C7 = V4GetW(f: cosCoefficients1);
1496 const FloatV C8 = V4GetX(f: cosCoefficients2);
1497 const FloatV C9 = V4GetY(f: cosCoefficients2);
1498 const FloatV C10 = V4GetZ(f: cosCoefficients2);
1499 const FloatV C11 = V4GetW(f: cosCoefficients2);
1500
1501 Vec3V Result;
1502 Result = V3ScaleAdd(a: V2, b: C1, c: V3One());
1503 Result = V3ScaleAdd(a: V4, b: C2, c: Result);
1504 Result = V3ScaleAdd(a: V6, b: C3, c: Result);
1505 Result = V3ScaleAdd(a: V8, b: C4, c: Result);
1506 Result = V3ScaleAdd(a: V10, b: C5, c: Result);
1507 Result = V3ScaleAdd(a: V12, b: C6, c: Result);
1508 Result = V3ScaleAdd(a: V14, b: C7, c: Result);
1509 Result = V3ScaleAdd(a: V16, b: C8, c: Result);
1510 Result = V3ScaleAdd(a: V18, b: C9, c: Result);
1511 Result = V3ScaleAdd(a: V20, b: C10, c: Result);
1512 Result = V3ScaleAdd(a: V22, b: C11, c: Result);
1513
1514 ASSERT_ISVALIDVEC3V(Result);
1515 return Result;
1516}
1517
// Permute (x,y,z) -> (y,z,z); W lane is preserved.
PX_FORCE_INLINE Vec3V V3PermYZZ(const Vec3V a)
{
	ASSERT_ISVALIDVEC3V(a);
	return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 2, 1));
}

// Permute (x,y,z) -> (x,y,x); W lane is preserved.
PX_FORCE_INLINE Vec3V V3PermXYX(const Vec3V a)
{
	ASSERT_ISVALIDVEC3V(a);
	return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 1, 0));
}

// Permute (x,y,z) -> (y,z,x); W lane is preserved.
PX_FORCE_INLINE Vec3V V3PermYZX(const Vec3V a)
{
	ASSERT_ISVALIDVEC3V(a);
	return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1));
}

// Permute (x,y,z) -> (z,x,y); W lane is preserved.
PX_FORCE_INLINE Vec3V V3PermZXY(const Vec3V a)
{
	ASSERT_ISVALIDVEC3V(a);
	return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2));
}

// Permute (x,y,z) -> (z,z,y); W lane is preserved.
PX_FORCE_INLINE Vec3V V3PermZZY(const Vec3V a)
{
	ASSERT_ISVALIDVEC3V(a);
	return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 2, 2));
}

// Permute (x,y,z) -> (y,x,x); W lane is preserved.
PX_FORCE_INLINE Vec3V V3PermYXX(const Vec3V a)
{
	ASSERT_ISVALIDVEC3V(a);
	return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 0, 1));
}
1553
1554PX_FORCE_INLINE Vec3V V3Perm_Zero_1Z_0Y(const Vec3V v0, const Vec3V v1)
1555{
1556 ASSERT_ISVALIDVEC3V(v0);
1557 ASSERT_ISVALIDVEC3V(v1);
1558 return _mm_shuffle_ps(v1, v0, _MM_SHUFFLE(3, 1, 2, 3));
1559}
1560
1561PX_FORCE_INLINE Vec3V V3Perm_0Z_Zero_1X(const Vec3V v0, const Vec3V v1)
1562{
1563 ASSERT_ISVALIDVEC3V(v0);
1564 ASSERT_ISVALIDVEC3V(v1);
1565 return _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(3, 0, 3, 2));
1566}
1567
1568PX_FORCE_INLINE Vec3V V3Perm_1Y_0X_Zero(const Vec3V v0, const Vec3V v1)
1569{
1570 ASSERT_ISVALIDVEC3V(v0);
1571 ASSERT_ISVALIDVEC3V(v1);
1572 // There must be a better way to do this.
1573 Vec3V v2 = V3Zero();
1574 FloatV y1 = V3GetY(f: v1);
1575 FloatV x0 = V3GetX(f: v0);
1576 v2 = V3SetX(v: v2, f: y1);
1577 return V3SetY(v: v2, f: x0);
1578}
1579
1580PX_FORCE_INLINE FloatV V3SumElems(const Vec3V a)
1581{
1582 ASSERT_ISVALIDVEC3V(a);
1583#ifdef __SSE4_2__
1584 Vec3V r = _mm_hadd_ps(a, a);
1585 r = _mm_hadd_ps(r, r);
1586 return r;
1587#else
1588 __m128 shuf1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)); // z,y,x,w
1589 __m128 shuf2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)); // y,x,w,z
1590 __m128 shuf3 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)); // x,w,z,y
1591 return _mm_add_ps(a: _mm_add_ps(a: shuf1, b: shuf2), b: shuf3);
1592#endif
1593}
1594
1595PX_FORCE_INLINE PxU32 V3OutOfBounds(const Vec3V a, const Vec3V min, const Vec3V max)
1596{
1597 ASSERT_ISVALIDVEC3V(a);
1598 ASSERT_ISVALIDVEC3V(min);
1599 ASSERT_ISVALIDVEC3V(max);
1600 const BoolV c = BOr(a: V3IsGrtr(a, b: max), b: V3IsGrtr(a: min, b: a));
1601 return !BAllEqFFFF(a: c);
1602}
1603
1604PX_FORCE_INLINE PxU32 V3InBounds(const Vec3V a, const Vec3V min, const Vec3V max)
1605{
1606 ASSERT_ISVALIDVEC3V(a);
1607 ASSERT_ISVALIDVEC3V(min);
1608 ASSERT_ISVALIDVEC3V(max);
1609 const BoolV c = BAnd(a: V3IsGrtrOrEq(a, b: min), b: V3IsGrtrOrEq(a: max, b: a));
1610 return BAllEqTTTT(a: c);
1611}
1612
1613PX_FORCE_INLINE PxU32 V3OutOfBounds(const Vec3V a, const Vec3V bounds)
1614{
1615 ASSERT_ISVALIDVEC3V(a);
1616 ASSERT_ISVALIDVEC3V(bounds);
1617 return V3OutOfBounds(a, min: V3Neg(f: bounds), max: bounds);
1618}
1619
1620PX_FORCE_INLINE PxU32 V3InBounds(const Vec3V a, const Vec3V bounds)
1621{
1622 ASSERT_ISVALIDVEC3V(a);
1623 ASSERT_ISVALIDVEC3V(bounds)
1624 return V3InBounds(a, min: V3Neg(f: bounds), max: bounds);
1625}
1626
1627PX_FORCE_INLINE void V3Transpose(Vec3V& col0, Vec3V& col1, Vec3V& col2)
1628{
1629 ASSERT_ISVALIDVEC3V(col0);
1630 ASSERT_ISVALIDVEC3V(col1);
1631 ASSERT_ISVALIDVEC3V(col2);
1632
1633 const Vec3V col3 = _mm_setzero_ps();
1634 Vec3V tmp0 = _mm_unpacklo_ps(a: col0, b: col1);
1635 Vec3V tmp2 = _mm_unpacklo_ps(a: col2, b: col3);
1636 Vec3V tmp1 = _mm_unpackhi_ps(a: col0, b: col1);
1637 Vec3V tmp3 = _mm_unpackhi_ps(a: col2, b: col3);
1638 col0 = _mm_movelh_ps(a: tmp0, b: tmp2);
1639 col1 = _mm_movehl_ps(a: tmp2, b: tmp0);
1640 col2 = _mm_movelh_ps(a: tmp1, b: tmp3);
1641}
1642
1643//////////////////////////////////
1644// VEC4V
1645//////////////////////////////////
1646
1647PX_FORCE_INLINE Vec4V V4Splat(const FloatV f)
1648{
1649 ASSERT_ISVALIDFLOATV(f);
1650 // return _mm_shuffle_ps(f, f, _MM_SHUFFLE(0,0,0,0));
1651 return f;
1652}
1653
1654PX_FORCE_INLINE Vec4V V4Merge(const FloatV* const floatVArray)
1655{
1656 ASSERT_ISVALIDFLOATV(floatVArray[0]);
1657 ASSERT_ISVALIDFLOATV(floatVArray[1]);
1658 ASSERT_ISVALIDFLOATV(floatVArray[2]);
1659 ASSERT_ISVALIDFLOATV(floatVArray[3]);
1660 const __m128 xw = _mm_move_ss(a: floatVArray[1], b: floatVArray[0]); // y, y, y, x
1661 const __m128 yz = _mm_move_ss(a: floatVArray[2], b: floatVArray[3]); // z, z, z, w
1662 return _mm_shuffle_ps(xw, yz, _MM_SHUFFLE(0, 2, 1, 0));
1663}
1664
1665PX_FORCE_INLINE Vec4V V4Merge(const FloatVArg x, const FloatVArg y, const FloatVArg z, const FloatVArg w)
1666{
1667 ASSERT_ISVALIDFLOATV(x);
1668 ASSERT_ISVALIDFLOATV(y);
1669 ASSERT_ISVALIDFLOATV(z);
1670 ASSERT_ISVALIDFLOATV(w);
1671 const __m128 xw = _mm_move_ss(a: y, b: x); // y, y, y, x
1672 const __m128 yz = _mm_move_ss(a: z, b: w); // z, z, z, w
1673 return _mm_shuffle_ps(xw, yz, _MM_SHUFFLE(0, 2, 1, 0));
1674}
1675
1676PX_FORCE_INLINE Vec4V V4MergeW(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w)
1677{
1678 const Vec4V xz = _mm_unpackhi_ps(a: x, b: z);
1679 const Vec4V yw = _mm_unpackhi_ps(a: y, b: w);
1680 return _mm_unpackhi_ps(a: xz, b: yw);
1681}
1682
1683PX_FORCE_INLINE Vec4V V4MergeZ(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w)
1684{
1685 const Vec4V xz = _mm_unpackhi_ps(a: x, b: z);
1686 const Vec4V yw = _mm_unpackhi_ps(a: y, b: w);
1687 return _mm_unpacklo_ps(a: xz, b: yw);
1688}
1689
1690PX_FORCE_INLINE Vec4V V4MergeY(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w)
1691{
1692 const Vec4V xz = _mm_unpacklo_ps(a: x, b: z);
1693 const Vec4V yw = _mm_unpacklo_ps(a: y, b: w);
1694 return _mm_unpackhi_ps(a: xz, b: yw);
1695}
1696
1697PX_FORCE_INLINE Vec4V V4MergeX(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w)
1698{
1699 const Vec4V xz = _mm_unpacklo_ps(a: x, b: z);
1700 const Vec4V yw = _mm_unpacklo_ps(a: y, b: w);
1701 return _mm_unpacklo_ps(a: xz, b: yw);
1702}
1703
1704PX_FORCE_INLINE Vec4V V4UnpackXY(const Vec4VArg a, const Vec4VArg b)
1705{
1706 return _mm_unpacklo_ps(a: a, b: b);
1707}
1708
1709PX_FORCE_INLINE Vec4V V4UnpackZW(const Vec4VArg a, const Vec4VArg b)
1710{
1711 return _mm_unpackhi_ps(a: a, b: b);
1712}
1713
1714PX_FORCE_INLINE Vec4V V4UnitW()
1715{
1716 const PX_ALIGN(16, PxF32) w[4] = { 0.0f, 0.0f, 0.0f, 1.0f };
1717 const __m128 w128 = _mm_load_ps(p: w);
1718 return w128;
1719}
1720
1721PX_FORCE_INLINE Vec4V V4UnitX()
1722{
1723 const PX_ALIGN(16, PxF32) x[4] = { 1.0f, 0.0f, 0.0f, 0.0f };
1724 const __m128 x128 = _mm_load_ps(p: x);
1725 return x128;
1726}
1727
1728PX_FORCE_INLINE Vec4V V4UnitY()
1729{
1730 const PX_ALIGN(16, PxF32) y[4] = { 0.0f, 1.0f, 0.0f, 0.0f };
1731 const __m128 y128 = _mm_load_ps(p: y);
1732 return y128;
1733}
1734
1735PX_FORCE_INLINE Vec4V V4UnitZ()
1736{
1737 const PX_ALIGN(16, PxF32) z[4] = { 0.0f, 0.0f, 1.0f, 0.0f };
1738 const __m128 z128 = _mm_load_ps(p: z);
1739 return z128;
1740}
1741
// Splat the W component of a Vec4V into all lanes of a FloatV.
PX_FORCE_INLINE FloatV V4GetW(const Vec4V f)
{
	return _mm_shuffle_ps(f, f, _MM_SHUFFLE(3, 3, 3, 3));
}

// Splat the X component of a Vec4V into all lanes of a FloatV.
PX_FORCE_INLINE FloatV V4GetX(const Vec4V f)
{
	return _mm_shuffle_ps(f, f, _MM_SHUFFLE(0, 0, 0, 0));
}

// Splat the Y component of a Vec4V into all lanes of a FloatV.
PX_FORCE_INLINE FloatV V4GetY(const Vec4V f)
{
	return _mm_shuffle_ps(f, f, _MM_SHUFFLE(1, 1, 1, 1));
}

// Splat the Z component of a Vec4V into all lanes of a FloatV.
PX_FORCE_INLINE FloatV V4GetZ(const Vec4V f)
{
	return _mm_shuffle_ps(f, f, _MM_SHUFFLE(2, 2, 2, 2));
}
1761
1762PX_FORCE_INLINE Vec4V V4SetW(const Vec4V v, const FloatV f)
1763{
1764 ASSERT_ISVALIDFLOATV(f);
1765 return V4Sel(c: BTTTF(), a: v, b: f);
1766}
1767
1768PX_FORCE_INLINE Vec4V V4SetX(const Vec4V v, const FloatV f)
1769{
1770 ASSERT_ISVALIDFLOATV(f);
1771 return V4Sel(c: BFTTT(), a: v, b: f);
1772}
1773
1774PX_FORCE_INLINE Vec4V V4SetY(const Vec4V v, const FloatV f)
1775{
1776 ASSERT_ISVALIDFLOATV(f);
1777 return V4Sel(c: BTFTT(), a: v, b: f);
1778}
1779
1780PX_FORCE_INLINE Vec4V V4SetZ(const Vec4V v, const FloatV f)
1781{
1782 ASSERT_ISVALIDFLOATV(f);
1783 return V4Sel(c: BTTFT(), a: v, b: f);
1784}
1785
1786PX_FORCE_INLINE Vec4V V4ClearW(const Vec4V v)
1787{
1788#if !PX_EMSCRIPTEN
1789 return _mm_and_ps(a: v, b: V4LoadA(f: internalUnitSSE2Simd::gMaskXYZ));
1790#else
1791 return _mm_and_ps(v, (VecI32V&)internalUnitSSE2Simd::gMaskXYZ);
1792#endif
1793}
1794
// Permute (x,y,z,w) -> (y,x,w,z): swap within each pair.
PX_FORCE_INLINE Vec4V V4PermYXWZ(const Vec4V a)
{
	return _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1));
}

// Permute (x,y,z,w) -> (x,z,x,z): the even lanes, twice.
PX_FORCE_INLINE Vec4V V4PermXZXZ(const Vec4V a)
{
	return _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 0, 2, 0));
}

// Permute (x,y,z,w) -> (y,w,y,w): the odd lanes, twice.
PX_FORCE_INLINE Vec4V V4PermYWYW(const Vec4V a)
{
	return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 3, 1));
}

// Permute (x,y,z,w) -> (y,z,x,w): rotate x/y/z left, keep w.
PX_FORCE_INLINE Vec4V V4PermYZXW(const Vec4V a)
{
	return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1));
}

// Permute (x,y,z,w) -> (z,w,x,y): swap the 64-bit halves.
PX_FORCE_INLINE Vec4V V4PermZWXY(const Vec4V a)
{
	return _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2));
}

// Arbitrary compile-time permutation: result lane i takes the source lane given
// by the corresponding template parameter (x selects lane 0 of the result, etc.).
template <PxU8 x, PxU8 y, PxU8 z, PxU8 w>
PX_FORCE_INLINE Vec4V V4Perm(const Vec4V a)
{
	return _mm_shuffle_ps(a, a, _MM_SHUFFLE(w, z, y, x));
}
1825
1826PX_FORCE_INLINE Vec4V V4Zero()
1827{
1828 return V4Load(f: 0.0f);
1829}
1830
1831PX_FORCE_INLINE Vec4V V4One()
1832{
1833 return V4Load(f: 1.0f);
1834}
1835
1836PX_FORCE_INLINE Vec4V V4Eps()
1837{
1838 return V4Load(PX_EPS_REAL);
1839}
1840
1841PX_FORCE_INLINE Vec4V V4Neg(const Vec4V f)
1842{
1843 return _mm_sub_ps(a: _mm_setzero_ps(), b: f);
1844}
1845
1846PX_FORCE_INLINE Vec4V V4Add(const Vec4V a, const Vec4V b)
1847{
1848 return _mm_add_ps(a: a, b: b);
1849}
1850
1851PX_FORCE_INLINE Vec4V V4Sub(const Vec4V a, const Vec4V b)
1852{
1853 return _mm_sub_ps(a: a, b: b);
1854}
1855
1856PX_FORCE_INLINE Vec4V V4Scale(const Vec4V a, const FloatV b)
1857{
1858 return _mm_mul_ps(a: a, b: b);
1859}
1860
1861PX_FORCE_INLINE Vec4V V4Mul(const Vec4V a, const Vec4V b)
1862{
1863 return _mm_mul_ps(a: a, b: b);
1864}
1865
1866PX_FORCE_INLINE Vec4V V4ScaleInv(const Vec4V a, const FloatV b)
1867{
1868 ASSERT_ISVALIDFLOATV(b);
1869 return _mm_div_ps(a: a, b: b);
1870}
1871
1872PX_FORCE_INLINE Vec4V V4Div(const Vec4V a, const Vec4V b)
1873{
1874 return _mm_div_ps(a: a, b: b);
1875}
1876
1877PX_FORCE_INLINE Vec4V V4ScaleInvFast(const Vec4V a, const FloatV b)
1878{
1879 ASSERT_ISVALIDFLOATV(b);
1880 return _mm_mul_ps(a: a, b: _mm_rcp_ps(a: b));
1881}
1882
1883PX_FORCE_INLINE Vec4V V4DivFast(const Vec4V a, const Vec4V b)
1884{
1885 return _mm_mul_ps(a: a, b: _mm_rcp_ps(a: b));
1886}
1887
1888PX_FORCE_INLINE Vec4V V4Recip(const Vec4V a)
1889{
1890 return _mm_div_ps(a: V4One(), b: a);
1891}
1892
1893PX_FORCE_INLINE Vec4V V4RecipFast(const Vec4V a)
1894{
1895 return _mm_rcp_ps(a: a);
1896}
1897
1898PX_FORCE_INLINE Vec4V V4Rsqrt(const Vec4V a)
1899{
1900 return _mm_div_ps(a: V4One(), b: _mm_sqrt_ps(a: a));
1901}
1902
1903PX_FORCE_INLINE Vec4V V4RsqrtFast(const Vec4V a)
1904{
1905 return _mm_rsqrt_ps(a: a);
1906}
1907
1908PX_FORCE_INLINE Vec4V V4Sqrt(const Vec4V a)
1909{
1910 return _mm_sqrt_ps(a: a);
1911}
1912
1913PX_FORCE_INLINE Vec4V V4ScaleAdd(const Vec4V a, const FloatV b, const Vec4V c)
1914{
1915 ASSERT_ISVALIDFLOATV(b);
1916 return V4Add(a: V4Scale(a, b), b: c);
1917}
1918
1919PX_FORCE_INLINE Vec4V V4NegScaleSub(const Vec4V a, const FloatV b, const Vec4V c)
1920{
1921 ASSERT_ISVALIDFLOATV(b);
1922 return V4Sub(a: c, b: V4Scale(a, b));
1923}
1924
1925PX_FORCE_INLINE Vec4V V4MulAdd(const Vec4V a, const Vec4V b, const Vec4V c)
1926{
1927 return V4Add(a: V4Mul(a, b), b: c);
1928}
1929
1930PX_FORCE_INLINE Vec4V V4NegMulSub(const Vec4V a, const Vec4V b, const Vec4V c)
1931{
1932 return V4Sub(a: c, b: V4Mul(a, b));
1933}
1934
1935PX_FORCE_INLINE Vec4V V4Abs(const Vec4V a)
1936{
1937 return V4Max(a, b: V4Neg(f: a));
1938}
1939
1940PX_FORCE_INLINE FloatV V4SumElements(const Vec4V a)
1941{
1942#ifdef __SSE4_2__
1943 Vec4V r = _mm_hadd_ps(a, a);
1944 r = _mm_hadd_ps(r, r);
1945 return r;
1946#else
1947 const Vec4V xy = V4UnpackXY(a, b: a); // x,x,y,y
1948 const Vec4V zw = V4UnpackZW(a, b: a); // z,z,w,w
1949 const Vec4V xz_yw = V4Add(a: xy, b: zw); // x+z,x+z,y+w,y+w
1950 const FloatV xz = V4GetX(f: xz_yw); // x+z
1951 const FloatV yw = V4GetZ(f: xz_yw); // y+w
1952 return FAdd(a: xz, b: yw); // sum
1953#endif
1954}
1955
1956PX_FORCE_INLINE FloatV V4Dot(const Vec4V a, const Vec4V b)
1957{
1958#ifdef __SSE4_2__
1959 return _mm_dp_ps(a, b, 0xff);
1960#else
1961 const __m128 dot1 = _mm_mul_ps(a: a, b: b); // x,y,z,w
1962 const __m128 shuf1 = _mm_shuffle_ps(dot1, dot1, _MM_SHUFFLE(2, 1, 0, 3)); // w,x,y,z
1963 const __m128 shuf2 = _mm_shuffle_ps(dot1, dot1, _MM_SHUFFLE(1, 0, 3, 2)); // z,w,x,y
1964 const __m128 shuf3 = _mm_shuffle_ps(dot1, dot1, _MM_SHUFFLE(0, 3, 2, 1)); // y,z,w,x
1965 return _mm_add_ps(a: _mm_add_ps(a: shuf2, b: shuf3), b: _mm_add_ps(a: dot1, b: shuf1));
1966#endif
1967}
1968
1969PX_FORCE_INLINE FloatV V4Dot3(const Vec4V a, const Vec4V b)
1970{
1971#ifdef __SSE4_2__
1972 return _mm_dp_ps(a, b, 0x7f);
1973#else
1974 const __m128 dot1 = _mm_mul_ps(a: a, b: b); // w,z,y,x
1975 const __m128 shuf1 = _mm_shuffle_ps(dot1, dot1, _MM_SHUFFLE(0, 0, 0, 0)); // z,y,x,w
1976 const __m128 shuf2 = _mm_shuffle_ps(dot1, dot1, _MM_SHUFFLE(1, 1, 1, 1)); // y,x,w,z
1977 const __m128 shuf3 = _mm_shuffle_ps(dot1, dot1, _MM_SHUFFLE(2, 2, 2, 2)); // x,w,z,y
1978 return _mm_add_ps(a: _mm_add_ps(a: shuf1, b: shuf2), b: shuf3);
1979#endif
1980}
1981
1982PX_FORCE_INLINE Vec4V V4Cross(const Vec4V a, const Vec4V b)
1983{
1984 const __m128 r1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w
1985 const __m128 r2 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w
1986 const __m128 l1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w
1987 const __m128 l2 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w
1988 return _mm_sub_ps(a: _mm_mul_ps(a: l1, b: l2), b: _mm_mul_ps(a: r1, b: r2));
1989}
1990
1991PX_FORCE_INLINE FloatV V4Length(const Vec4V a)
1992{
1993 return _mm_sqrt_ps(a: V4Dot(a, b: a));
1994}
1995
1996PX_FORCE_INLINE FloatV V4LengthSq(const Vec4V a)
1997{
1998 return V4Dot(a, b: a);
1999}
2000
2001PX_FORCE_INLINE Vec4V V4Normalize(const Vec4V a)
2002{
2003 ASSERT_ISFINITELENGTH(a);
2004 return V4ScaleInv(a, b: _mm_sqrt_ps(a: V4Dot(a, b: a)));
2005}
2006
2007PX_FORCE_INLINE Vec4V V4NormalizeFast(const Vec4V a)
2008{
2009 ASSERT_ISFINITELENGTH(a);
2010 return V4ScaleInvFast(a, b: _mm_sqrt_ps(a: V4Dot(a, b: a)));
2011}
2012
2013PX_FORCE_INLINE Vec4V V4NormalizeSafe(const Vec4V a, const Vec3V unsafeReturnValue)
2014{
2015 const __m128 eps = V3Eps();
2016 const __m128 length = V4Length(a);
2017 const __m128 isGreaterThanZero = V4IsGrtr(a: length, b: eps);
2018 return V4Sel(c: isGreaterThanZero, a: V4ScaleInv(a, b: length), b: unsafeReturnValue);
2019}
2020
2021PX_FORCE_INLINE BoolV V4IsEqU32(const VecU32V a, const VecU32V b)
2022{
2023 return m128_I2F(n: _mm_cmpeq_epi32(a: m128_F2I(n: a), b: m128_F2I(n: b)));
2024}
2025
2026PX_FORCE_INLINE Vec4V V4Sel(const BoolV c, const Vec4V a, const Vec4V b)
2027{
2028 return _mm_or_ps(a: _mm_andnot_ps(a: c, b: b), b: _mm_and_ps(a: c, b: a));
2029}
2030
2031PX_FORCE_INLINE BoolV V4IsGrtr(const Vec4V a, const Vec4V b)
2032{
2033 return _mm_cmpgt_ps(a: a, b: b);
2034}
2035
2036PX_FORCE_INLINE BoolV V4IsGrtrOrEq(const Vec4V a, const Vec4V b)
2037{
2038 return _mm_cmpge_ps(a: a, b: b);
2039}
2040
2041PX_FORCE_INLINE BoolV V4IsEq(const Vec4V a, const Vec4V b)
2042{
2043 return _mm_cmpeq_ps(a: a, b: b);
2044}
2045
2046PX_FORCE_INLINE Vec4V V4Max(const Vec4V a, const Vec4V b)
2047{
2048 return _mm_max_ps(a: a, b: b);
2049}
2050
2051PX_FORCE_INLINE Vec4V V4Min(const Vec4V a, const Vec4V b)
2052{
2053 return _mm_min_ps(a: a, b: b);
2054}
2055
2056PX_FORCE_INLINE FloatV V4ExtractMax(const Vec4V a)
2057{
2058 const __m128 shuf1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 1, 0, 3));
2059 const __m128 shuf2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2));
2060 const __m128 shuf3 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 3, 2, 1));
2061
2062 return _mm_max_ps(a: _mm_max_ps(a: a, b: shuf1), b: _mm_max_ps(a: shuf2, b: shuf3));
2063}
2064
2065PX_FORCE_INLINE FloatV V4ExtractMin(const Vec4V a)
2066{
2067 const __m128 shuf1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 1, 0, 3));
2068 const __m128 shuf2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2));
2069 const __m128 shuf3 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 3, 2, 1));
2070
2071 return _mm_min_ps(a: _mm_min_ps(a: a, b: shuf1), b: _mm_min_ps(a: shuf2, b: shuf3));
2072}
2073
2074PX_FORCE_INLINE Vec4V V4Clamp(const Vec4V a, const Vec4V minV, const Vec4V maxV)
2075{
2076 return V4Max(a: V4Min(a, b: maxV), b: minV);
2077}
2078
2079PX_FORCE_INLINE PxU32 V4AllGrtr(const Vec4V a, const Vec4V b)
2080{
2081 return internalUnitSSE2Simd::BAllTrue4_R(a: V4IsGrtr(a, b));
2082}
2083
2084PX_FORCE_INLINE PxU32 V4AllGrtrOrEq(const Vec4V a, const Vec4V b)
2085{
2086 return internalUnitSSE2Simd::BAllTrue4_R(a: V4IsGrtrOrEq(a, b));
2087}
2088
2089PX_FORCE_INLINE PxU32 V4AllGrtrOrEq3(const Vec4V a, const Vec4V b)
2090{
2091 return internalUnitSSE2Simd::BAllTrue3_R(a: V4IsGrtrOrEq(a, b));
2092}
2093
2094PX_FORCE_INLINE PxU32 V4AllEq(const Vec4V a, const Vec4V b)
2095{
2096 return internalUnitSSE2Simd::BAllTrue4_R(a: V4IsEq(a, b));
2097}
2098
2099PX_FORCE_INLINE PxU32 V4AnyGrtr3(const Vec4V a, const Vec4V b)
2100{
2101 return internalUnitSSE2Simd::BAnyTrue3_R(a: V4IsGrtr(a, b));
2102}
2103
// Rounds each lane of a to the nearest integer, returned as floats.
// NOTE(review): the SSE4.1 path rounds halves to even; the SSE2 fallback
// rounds halves away from zero — results differ only for exact .5 inputs.
PX_FORCE_INLINE Vec4V V4Round(const Vec4V a)
{
#ifdef __SSE4_2__
	return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
#else
	// return _mm_round_ps(a, 0x0);
	// SSE2 fallback: add 0.5, subtract 1.0 for lanes whose nearest-int conversion
	// is negative, then truncate toward zero — net effect: round half away from zero.
	const Vec4V half = V4Load(f: 0.5f);
	// signBit is 1.0f per lane where cvtps(a) is negative, 0.0f otherwise
	// (sign bit shifted down to bit 0, then converted back to float).
	const __m128 signBit = _mm_cvtepi32_ps(a: _mm_srli_epi32(a: _mm_cvtps_epi32(a: a), count: 31));
	const Vec4V aRound = V4Sub(a: V4Add(a, b: half), b: signBit);
	// Truncate toward zero and convert back to float.
	__m128i tmp = _mm_cvttps_epi32(a: aRound);
	return _mm_cvtepi32_ps(a: tmp);
	// NOTE(review): lanes outside int32 range come back as 0x80000000 from the
	// conversions — presumably callers only round in-range values; confirm.
#endif
}
2117
// Approximates sin(a) per lane. The argument is first range-reduced to
// [-PI, PI) by subtracting the nearest multiple of 2*PI, then an odd
// polynomial up to V^23 is evaluated with coefficients from the
// g_PXSinCoefficients* tables.
PX_FORCE_INLINE Vec4V V4Sin(const Vec4V a)
{
	// Range reduction: V1 = a - round(a / 2PI) * 2PI, so V1 is in [-PI, PI).
	const Vec4V recipTwoPi = V4LoadA(f: g_PXReciprocalTwoPi.f);
	const Vec4V twoPi = V4LoadA(f: g_PXTwoPi.f);
	const Vec4V tmp = V4Mul(a, b: recipTwoPi);
	const Vec4V b = V4Round(a: tmp);
	const Vec4V V1 = V4NegMulSub(a: twoPi, b, c: a);

	// sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! -
	// V^15 / 15! + V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23! (for -PI <= V < PI)
	// Build the odd powers of the reduced argument.
	const Vec4V V2 = V4Mul(a: V1, b: V1);
	const Vec4V V3 = V4Mul(a: V2, b: V1);
	const Vec4V V5 = V4Mul(a: V3, b: V2);
	const Vec4V V7 = V4Mul(a: V5, b: V2);
	const Vec4V V9 = V4Mul(a: V7, b: V2);
	const Vec4V V11 = V4Mul(a: V9, b: V2);
	const Vec4V V13 = V4Mul(a: V11, b: V2);
	const Vec4V V15 = V4Mul(a: V13, b: V2);
	const Vec4V V17 = V4Mul(a: V15, b: V2);
	const Vec4V V19 = V4Mul(a: V17, b: V2);
	const Vec4V V21 = V4Mul(a: V19, b: V2);
	const Vec4V V23 = V4Mul(a: V21, b: V2);

	const Vec4V sinCoefficients0 = V4LoadA(f: g_PXSinCoefficients0.f);
	const Vec4V sinCoefficients1 = V4LoadA(f: g_PXSinCoefficients1.f);
	const Vec4V sinCoefficients2 = V4LoadA(f: g_PXSinCoefficients2.f);

	// Splat each polynomial coefficient (S1 multiplies V^3, S2 multiplies V^5, ...).
	const FloatV S1 = V4GetY(f: sinCoefficients0);
	const FloatV S2 = V4GetZ(f: sinCoefficients0);
	const FloatV S3 = V4GetW(f: sinCoefficients0);
	const FloatV S4 = V4GetX(f: sinCoefficients1);
	const FloatV S5 = V4GetY(f: sinCoefficients1);
	const FloatV S6 = V4GetZ(f: sinCoefficients1);
	const FloatV S7 = V4GetW(f: sinCoefficients1);
	const FloatV S8 = V4GetX(f: sinCoefficients2);
	const FloatV S9 = V4GetY(f: sinCoefficients2);
	const FloatV S10 = V4GetZ(f: sinCoefficients2);
	const FloatV S11 = V4GetW(f: sinCoefficients2);

	// Accumulate the series term by term, starting from the linear term V1.
	Vec4V Result;
	Result = V4MulAdd(a: S1, b: V3, c: V1);
	Result = V4MulAdd(a: S2, b: V5, c: Result);
	Result = V4MulAdd(a: S3, b: V7, c: Result);
	Result = V4MulAdd(a: S4, b: V9, c: Result);
	Result = V4MulAdd(a: S5, b: V11, c: Result);
	Result = V4MulAdd(a: S6, b: V13, c: Result);
	Result = V4MulAdd(a: S7, b: V15, c: Result);
	Result = V4MulAdd(a: S8, b: V17, c: Result);
	Result = V4MulAdd(a: S9, b: V19, c: Result);
	Result = V4MulAdd(a: S10, b: V21, c: Result);
	Result = V4MulAdd(a: S11, b: V23, c: Result);

	return Result;
}
2172
// Approximates cos(a) per lane. The argument is first range-reduced to
// [-PI, PI) by subtracting the nearest multiple of 2*PI, then an even
// polynomial up to V^22 is evaluated with coefficients from the
// g_PXCosCoefficients* tables.
PX_FORCE_INLINE Vec4V V4Cos(const Vec4V a)
{
	// Range reduction: V1 = a - round(a / 2PI) * 2PI, so V1 is in [-PI, PI).
	const Vec4V recipTwoPi = V4LoadA(f: g_PXReciprocalTwoPi.f);
	const Vec4V twoPi = V4LoadA(f: g_PXTwoPi.f);
	const Vec4V tmp = V4Mul(a, b: recipTwoPi);
	const Vec4V b = V4Round(a: tmp);
	const Vec4V V1 = V4NegMulSub(a: twoPi, b, c: a);

	// cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! + V^12 / 12! -
	// V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! (for -PI <= V < PI)
	// Build the even powers of the reduced argument.
	const Vec4V V2 = V4Mul(a: V1, b: V1);
	const Vec4V V4 = V4Mul(a: V2, b: V2);
	const Vec4V V6 = V4Mul(a: V4, b: V2);
	const Vec4V V8 = V4Mul(a: V4, b: V4);
	const Vec4V V10 = V4Mul(a: V6, b: V4);
	const Vec4V V12 = V4Mul(a: V6, b: V6);
	const Vec4V V14 = V4Mul(a: V8, b: V6);
	const Vec4V V16 = V4Mul(a: V8, b: V8);
	const Vec4V V18 = V4Mul(a: V10, b: V8);
	const Vec4V V20 = V4Mul(a: V10, b: V10);
	const Vec4V V22 = V4Mul(a: V12, b: V10);

	const Vec4V cosCoefficients0 = V4LoadA(f: g_PXCosCoefficients0.f);
	const Vec4V cosCoefficients1 = V4LoadA(f: g_PXCosCoefficients1.f);
	const Vec4V cosCoefficients2 = V4LoadA(f: g_PXCosCoefficients2.f);

	// Splat each polynomial coefficient (C1 multiplies V^2, C2 multiplies V^4, ...).
	const FloatV C1 = V4GetY(f: cosCoefficients0);
	const FloatV C2 = V4GetZ(f: cosCoefficients0);
	const FloatV C3 = V4GetW(f: cosCoefficients0);
	const FloatV C4 = V4GetX(f: cosCoefficients1);
	const FloatV C5 = V4GetY(f: cosCoefficients1);
	const FloatV C6 = V4GetZ(f: cosCoefficients1);
	const FloatV C7 = V4GetW(f: cosCoefficients1);
	const FloatV C8 = V4GetX(f: cosCoefficients2);
	const FloatV C9 = V4GetY(f: cosCoefficients2);
	const FloatV C10 = V4GetZ(f: cosCoefficients2);
	const FloatV C11 = V4GetW(f: cosCoefficients2);

	// Accumulate the series term by term, starting from the constant term 1.
	Vec4V Result;
	Result = V4MulAdd(a: C1, b: V2, c: V4One());
	Result = V4MulAdd(a: C2, b: V4, c: Result);
	Result = V4MulAdd(a: C3, b: V6, c: Result);
	Result = V4MulAdd(a: C4, b: V8, c: Result);
	Result = V4MulAdd(a: C5, b: V10, c: Result);
	Result = V4MulAdd(a: C6, b: V12, c: Result);
	Result = V4MulAdd(a: C7, b: V14, c: Result);
	Result = V4MulAdd(a: C8, b: V16, c: Result);
	Result = V4MulAdd(a: C9, b: V18, c: Result);
	Result = V4MulAdd(a: C10, b: V20, c: Result);
	Result = V4MulAdd(a: C11, b: V22, c: Result);

	return Result;
}
2226
2227PX_FORCE_INLINE void V4Transpose(Vec4V& col0, Vec4V& col1, Vec4V& col2, Vec4V& col3)
2228{
2229 Vec4V tmp0 = _mm_unpacklo_ps(a: col0, b: col1);
2230 Vec4V tmp2 = _mm_unpacklo_ps(a: col2, b: col3);
2231 Vec4V tmp1 = _mm_unpackhi_ps(a: col0, b: col1);
2232 Vec4V tmp3 = _mm_unpackhi_ps(a: col2, b: col3);
2233 col0 = _mm_movelh_ps(a: tmp0, b: tmp2);
2234 col1 = _mm_movehl_ps(a: tmp2, b: tmp0);
2235 col2 = _mm_movelh_ps(a: tmp1, b: tmp3);
2236 col3 = _mm_movehl_ps(a: tmp3, b: tmp1);
2237}
2238
2239//////////////////////////////////
2240// BoolV
2241//////////////////////////////////
2242
2243PX_FORCE_INLINE BoolV BFFFF()
2244{
2245 return _mm_setzero_ps();
2246}
2247
2248PX_FORCE_INLINE BoolV BFFFT()
2249{
2250 /*const PX_ALIGN(16, PxU32 f[4])={0,0,0,0xFFFFFFFF};
2251 const __m128 ffft=_mm_load_ps((float*)&f);
2252 return ffft;*/
2253 return m128_I2F(n: _mm_set_epi32(i3: -1, i2: 0, i1: 0, i0: 0));
2254}
2255
2256PX_FORCE_INLINE BoolV BFFTF()
2257{
2258 /*const PX_ALIGN(16, PxU32 f[4])={0,0,0xFFFFFFFF,0};
2259 const __m128 fftf=_mm_load_ps((float*)&f);
2260 return fftf;*/
2261 return m128_I2F(n: _mm_set_epi32(i3: 0, i2: -1, i1: 0, i0: 0));
2262}
2263
2264PX_FORCE_INLINE BoolV BFFTT()
2265{
2266 /*const PX_ALIGN(16, PxU32 f[4])={0,0,0xFFFFFFFF,0xFFFFFFFF};
2267 const __m128 fftt=_mm_load_ps((float*)&f);
2268 return fftt;*/
2269 return m128_I2F(n: _mm_set_epi32(i3: -1, i2: -1, i1: 0, i0: 0));
2270}
2271
2272PX_FORCE_INLINE BoolV BFTFF()
2273{
2274 /*const PX_ALIGN(16, PxU32 f[4])={0,0xFFFFFFFF,0,0};
2275 const __m128 ftff=_mm_load_ps((float*)&f);
2276 return ftff;*/
2277 return m128_I2F(n: _mm_set_epi32(i3: 0, i2: 0, i1: -1, i0: 0));
2278}
2279
2280PX_FORCE_INLINE BoolV BFTFT()
2281{
2282 /*const PX_ALIGN(16, PxU32 f[4])={0,0xFFFFFFFF,0,0xFFFFFFFF};
2283 const __m128 ftft=_mm_load_ps((float*)&f);
2284 return ftft;*/
2285 return m128_I2F(n: _mm_set_epi32(i3: -1, i2: 0, i1: -1, i0: 0));
2286}
2287
2288PX_FORCE_INLINE BoolV BFTTF()
2289{
2290 /*const PX_ALIGN(16, PxU32 f[4])={0,0xFFFFFFFF,0xFFFFFFFF,0};
2291 const __m128 fttf=_mm_load_ps((float*)&f);
2292 return fttf;*/
2293 return m128_I2F(n: _mm_set_epi32(i3: 0, i2: -1, i1: -1, i0: 0));
2294}
2295
2296PX_FORCE_INLINE BoolV BFTTT()
2297{
2298 /*const PX_ALIGN(16, PxU32 f[4])={0,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF};
2299 const __m128 fttt=_mm_load_ps((float*)&f);
2300 return fttt;*/
2301 return m128_I2F(n: _mm_set_epi32(i3: -1, i2: -1, i1: -1, i0: 0));
2302}
2303
2304PX_FORCE_INLINE BoolV BTFFF()
2305{
2306 // const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0,0,0};
2307 // const __m128 tfff=_mm_load_ps((float*)&f);
2308 // return tfff;
2309 return m128_I2F(n: _mm_set_epi32(i3: 0, i2: 0, i1: 0, i0: -1));
2310}
2311
2312PX_FORCE_INLINE BoolV BTFFT()
2313{
2314 /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0,0,0xFFFFFFFF};
2315 const __m128 tfft=_mm_load_ps((float*)&f);
2316 return tfft;*/
2317 return m128_I2F(n: _mm_set_epi32(i3: -1, i2: 0, i1: 0, i0: -1));
2318}
2319
2320PX_FORCE_INLINE BoolV BTFTF()
2321{
2322 /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0,0xFFFFFFFF,0};
2323 const __m128 tftf=_mm_load_ps((float*)&f);
2324 return tftf;*/
2325 return m128_I2F(n: _mm_set_epi32(i3: 0, i2: -1, i1: 0, i0: -1));
2326}
2327
2328PX_FORCE_INLINE BoolV BTFTT()
2329{
2330 /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0,0xFFFFFFFF,0xFFFFFFFF};
2331 const __m128 tftt=_mm_load_ps((float*)&f);
2332 return tftt;*/
2333 return m128_I2F(n: _mm_set_epi32(i3: -1, i2: -1, i1: 0, i0: -1));
2334}
2335
2336PX_FORCE_INLINE BoolV BTTFF()
2337{
2338 /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0xFFFFFFFF,0,0};
2339 const __m128 ttff=_mm_load_ps((float*)&f);
2340 return ttff;*/
2341 return m128_I2F(n: _mm_set_epi32(i3: 0, i2: 0, i1: -1, i0: -1));
2342}
2343
2344PX_FORCE_INLINE BoolV BTTFT()
2345{
2346 /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0xFFFFFFFF,0,0xFFFFFFFF};
2347 const __m128 ttft=_mm_load_ps((float*)&f);
2348 return ttft;*/
2349 return m128_I2F(n: _mm_set_epi32(i3: -1, i2: 0, i1: -1, i0: -1));
2350}
2351
2352PX_FORCE_INLINE BoolV BTTTF()
2353{
2354 /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0};
2355 const __m128 tttf=_mm_load_ps((float*)&f);
2356 return tttf;*/
2357 return m128_I2F(n: _mm_set_epi32(i3: 0, i2: -1, i1: -1, i0: -1));
2358}
2359
2360PX_FORCE_INLINE BoolV BTTTT()
2361{
2362 /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF};
2363 const __m128 tttt=_mm_load_ps((float*)&f);
2364 return tttt;*/
2365 return m128_I2F(n: _mm_set_epi32(i3: -1, i2: -1, i1: -1, i0: -1));
2366}
2367
2368PX_FORCE_INLINE BoolV BXMask()
2369{
2370 /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0,0,0};
2371 const __m128 tfff=_mm_load_ps((float*)&f);
2372 return tfff;*/
2373 return m128_I2F(n: _mm_set_epi32(i3: 0, i2: 0, i1: 0, i0: -1));
2374}
2375
2376PX_FORCE_INLINE BoolV BYMask()
2377{
2378 /*const PX_ALIGN(16, PxU32 f[4])={0,0xFFFFFFFF,0,0};
2379 const __m128 ftff=_mm_load_ps((float*)&f);
2380 return ftff;*/
2381 return m128_I2F(n: _mm_set_epi32(i3: 0, i2: 0, i1: -1, i0: 0));
2382}
2383
2384PX_FORCE_INLINE BoolV BZMask()
2385{
2386 /*const PX_ALIGN(16, PxU32 f[4])={0,0,0xFFFFFFFF,0};
2387 const __m128 fftf=_mm_load_ps((float*)&f);
2388 return fftf;*/
2389 return m128_I2F(n: _mm_set_epi32(i3: 0, i2: -1, i1: 0, i0: 0));
2390}
2391
2392PX_FORCE_INLINE BoolV BWMask()
2393{
2394 /*const PX_ALIGN(16, PxU32 f[4])={0,0,0,0xFFFFFFFF};
2395 const __m128 ffft=_mm_load_ps((float*)&f);
2396 return ffft;*/
2397 return m128_I2F(n: _mm_set_epi32(i3: -1, i2: 0, i1: 0, i0: 0));
2398}
2399
2400PX_FORCE_INLINE BoolV BGetX(const BoolV f)
2401{
2402 return _mm_shuffle_ps(f, f, _MM_SHUFFLE(0, 0, 0, 0));
2403}
2404
2405PX_FORCE_INLINE BoolV BGetY(const BoolV f)
2406{
2407 return _mm_shuffle_ps(f, f, _MM_SHUFFLE(1, 1, 1, 1));
2408}
2409
2410PX_FORCE_INLINE BoolV BGetZ(const BoolV f)
2411{
2412 return _mm_shuffle_ps(f, f, _MM_SHUFFLE(2, 2, 2, 2));
2413}
2414
2415PX_FORCE_INLINE BoolV BGetW(const BoolV f)
2416{
2417 return _mm_shuffle_ps(f, f, _MM_SHUFFLE(3, 3, 3, 3));
2418}
2419
2420PX_FORCE_INLINE BoolV BSetX(const BoolV v, const BoolV f)
2421{
2422 return V4Sel(c: BFTTT(), a: v, b: f);
2423}
2424
2425PX_FORCE_INLINE BoolV BSetY(const BoolV v, const BoolV f)
2426{
2427 return V4Sel(c: BTFTT(), a: v, b: f);
2428}
2429
2430PX_FORCE_INLINE BoolV BSetZ(const BoolV v, const BoolV f)
2431{
2432 return V4Sel(c: BTTFT(), a: v, b: f);
2433}
2434
2435PX_FORCE_INLINE BoolV BSetW(const BoolV v, const BoolV f)
2436{
2437 return V4Sel(c: BTTTF(), a: v, b: f);
2438}
2439
2440PX_FORCE_INLINE BoolV BAnd(const BoolV a, const BoolV b)
2441{
2442 return _mm_and_ps(a: a, b: b);
2443}
2444
2445PX_FORCE_INLINE BoolV BNot(const BoolV a)
2446{
2447 const BoolV bAllTrue(BTTTT());
2448 return _mm_xor_ps(a: a, b: bAllTrue);
2449}
2450
2451PX_FORCE_INLINE BoolV BAndNot(const BoolV a, const BoolV b)
2452{
2453 return _mm_andnot_ps(a: b, b: a);
2454}
2455
2456PX_FORCE_INLINE BoolV BOr(const BoolV a, const BoolV b)
2457{
2458 return _mm_or_ps(a: a, b: b);
2459}
2460
2461PX_FORCE_INLINE BoolV BAllTrue4(const BoolV a)
2462{
2463 const BoolV bTmp =
2464 _mm_and_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 0, 1)), _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 2, 3)));
2465 return _mm_and_ps(_mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(0, 0, 0, 0)),
2466 _mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(1, 1, 1, 1)));
2467}
2468
2469PX_FORCE_INLINE BoolV BAnyTrue4(const BoolV a)
2470{
2471 const BoolV bTmp =
2472 _mm_or_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 0, 1)), _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 2, 3)));
2473 return _mm_or_ps(_mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(0, 0, 0, 0)),
2474 _mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(1, 1, 1, 1)));
2475}
2476
2477PX_FORCE_INLINE BoolV BAllTrue3(const BoolV a)
2478{
2479 const BoolV bTmp =
2480 _mm_and_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 0, 1)), _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)));
2481 return _mm_and_ps(_mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(0, 0, 0, 0)),
2482 _mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(1, 1, 1, 1)));
2483}
2484
2485PX_FORCE_INLINE BoolV BAnyTrue3(const BoolV a)
2486{
2487 const BoolV bTmp =
2488 _mm_or_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 0, 1)), _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)));
2489 return _mm_or_ps(_mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(0, 0, 0, 0)),
2490 _mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(1, 1, 1, 1)));
2491}
2492
2493PX_FORCE_INLINE PxU32 BAllEq(const BoolV a, const BoolV b)
2494{
2495 const BoolV bTest = m128_I2F(n: _mm_cmpeq_epi32(a: m128_F2I(n: a), b: m128_F2I(n: b)));
2496 return internalUnitSSE2Simd::BAllTrue4_R(a: bTest);
2497}
2498
2499PX_FORCE_INLINE PxU32 BAllEqTTTT(const BoolV a)
2500{
2501 return PxU32(_mm_movemask_ps(a: a)==15);
2502}
2503
2504PX_FORCE_INLINE PxU32 BAllEqFFFF(const BoolV a)
2505{
2506 return PxU32(_mm_movemask_ps(a: a)==0);
2507}
2508
2509PX_FORCE_INLINE PxU32 BGetBitMask(const BoolV a)
2510{
2511 return PxU32(_mm_movemask_ps(a: a));
2512}
2513
2514//////////////////////////////////
2515// MAT33V
2516//////////////////////////////////
2517
2518PX_FORCE_INLINE Vec3V M33MulV3(const Mat33V& a, const Vec3V b)
2519{
2520 const FloatV x = V3GetX(f: b);
2521 const FloatV y = V3GetY(f: b);
2522 const FloatV z = V3GetZ(f: b);
2523 const Vec3V v0 = V3Scale(a: a.col0, b: x);
2524 const Vec3V v1 = V3Scale(a: a.col1, b: y);
2525 const Vec3V v2 = V3Scale(a: a.col2, b: z);
2526 const Vec3V v0PlusV1 = V3Add(a: v0, b: v1);
2527 return V3Add(a: v0PlusV1, b: v2);
2528}
2529
2530PX_FORCE_INLINE Vec3V M33TrnspsMulV3(const Mat33V& a, const Vec3V b)
2531{
2532 const FloatV x = V3Dot(a: a.col0, b);
2533 const FloatV y = V3Dot(a: a.col1, b);
2534 const FloatV z = V3Dot(a: a.col2, b);
2535 return V3Merge(x, y, z);
2536}
2537
2538PX_FORCE_INLINE Vec3V M33MulV3AddV3(const Mat33V& A, const Vec3V b, const Vec3V c)
2539{
2540 const FloatV x = V3GetX(f: b);
2541 const FloatV y = V3GetY(f: b);
2542 const FloatV z = V3GetZ(f: b);
2543 Vec3V result = V3ScaleAdd(a: A.col0, b: x, c);
2544 result = V3ScaleAdd(a: A.col1, b: y, c: result);
2545 return V3ScaleAdd(a: A.col2, b: z, c: result);
2546}
2547
2548PX_FORCE_INLINE Mat33V M33MulM33(const Mat33V& a, const Mat33V& b)
2549{
2550 return Mat33V(M33MulV3(a, b: b.col0), M33MulV3(a, b: b.col1), M33MulV3(a, b: b.col2));
2551}
2552
2553PX_FORCE_INLINE Mat33V M33Add(const Mat33V& a, const Mat33V& b)
2554{
2555 return Mat33V(V3Add(a: a.col0, b: b.col0), V3Add(a: a.col1, b: b.col1), V3Add(a: a.col2, b: b.col2));
2556}
2557
2558PX_FORCE_INLINE Mat33V M33Scale(const Mat33V& a, const FloatV& b)
2559{
2560 return Mat33V(V3Scale(a: a.col0, b), V3Scale(a: a.col1, b), V3Scale(a: a.col2, b));
2561}
2562
// Inverts a 3x3 matrix via the adjugate: the rows of the inverse are the
// column cross products divided by the determinant.
// NOTE(review): uses _mm_rcp_ps for 1/det — an approximation (~12-bit
// precision), so the result is not a full-precision inverse.
PX_FORCE_INLINE Mat33V M33Inverse(const Mat33V& a)
{
	const BoolV tfft = BTFFT();
	const BoolV tttf = BTTTF();
	const FloatV zero = FZero();
	const Vec3V cross01 = V3Cross(a: a.col0, b: a.col1);
	const Vec3V cross12 = V3Cross(a: a.col1, b: a.col2);
	const Vec3V cross20 = V3Cross(a: a.col2, b: a.col0);
	// det = (col0 x col1) . col2 (scalar triple product), splatted.
	const FloatV dot = V3Dot(a: cross01, b: a.col2);
	const FloatV invDet = _mm_rcp_ps(a: dot);
	// Shuffle the three cross products so that each output column gathers one
	// component from each of them (i.e. transpose the adjugate rows into columns).
	const Vec3V mergeh = _mm_unpacklo_ps(a: cross12, b: cross01);
	const Vec3V mergel = _mm_unpackhi_ps(a: cross12, b: cross01);
	Vec3V colInv0 = _mm_unpacklo_ps(a: mergeh, b: cross20);
	// Mask the w lane of colInv0 to zero (tttf keeps x,y,z).
	colInv0 = _mm_or_ps(a: _mm_andnot_ps(a: tttf, b: zero), b: _mm_and_ps(a: tttf, b: colInv0));
	const Vec3V zppd = _mm_shuffle_ps(mergeh, cross20, _MM_SHUFFLE(3, 0, 0, 2));
	const Vec3V pbwp = _mm_shuffle_ps(cross20, mergeh, _MM_SHUFFLE(3, 3, 1, 0));
	// NOTE(review): BTFFT() is re-evaluated here although tfft already holds it.
	const Vec3V colInv1 = _mm_or_ps(a: _mm_andnot_ps(a: BTFFT(), b: pbwp), b: _mm_and_ps(a: BTFFT(), b: zppd));
	const Vec3V xppd = _mm_shuffle_ps(mergel, cross20, _MM_SHUFFLE(3, 0, 0, 0));
	const Vec3V pcyp = _mm_shuffle_ps(cross20, mergel, _MM_SHUFFLE(3, 1, 2, 0));
	const Vec3V colInv2 = _mm_or_ps(a: _mm_andnot_ps(a: tfft, b: pcyp), b: _mm_and_ps(a: tfft, b: xppd));

	return Mat33V(_mm_mul_ps(a: colInv0, b: invDet), _mm_mul_ps(a: colInv1, b: invDet), _mm_mul_ps(a: colInv2, b: invDet));
}
2586
2587PX_FORCE_INLINE Mat33V M33Trnsps(const Mat33V& a)
2588{
2589 return Mat33V(V3Merge(x: V3GetX(f: a.col0), y: V3GetX(f: a.col1), z: V3GetX(f: a.col2)),
2590 V3Merge(x: V3GetY(f: a.col0), y: V3GetY(f: a.col1), z: V3GetY(f: a.col2)),
2591 V3Merge(x: V3GetZ(f: a.col0), y: V3GetZ(f: a.col1), z: V3GetZ(f: a.col2)));
2592}
2593
2594PX_FORCE_INLINE Mat33V M33Identity()
2595{
2596 return Mat33V(V3UnitX(), V3UnitY(), V3UnitZ());
2597}
2598
2599PX_FORCE_INLINE Mat33V M33Sub(const Mat33V& a, const Mat33V& b)
2600{
2601 return Mat33V(V3Sub(a: a.col0, b: b.col0), V3Sub(a: a.col1, b: b.col1), V3Sub(a: a.col2, b: b.col2));
2602}
2603
2604PX_FORCE_INLINE Mat33V M33Neg(const Mat33V& a)
2605{
2606 return Mat33V(V3Neg(f: a.col0), V3Neg(f: a.col1), V3Neg(f: a.col2));
2607}
2608
2609PX_FORCE_INLINE Mat33V M33Abs(const Mat33V& a)
2610{
2611 return Mat33V(V3Abs(a: a.col0), V3Abs(a: a.col1), V3Abs(a: a.col2));
2612}
2613
2614PX_FORCE_INLINE Mat33V PromoteVec3V(const Vec3V v)
2615{
2616 const BoolV bTFFF = BTFFF();
2617 const BoolV bFTFF = BFTFF();
2618 const BoolV bFFTF = BTFTF();
2619
2620 const Vec3V zero = V3Zero();
2621
2622 return Mat33V(V3Sel(c: bTFFF, a: v, b: zero), V3Sel(c: bFTFF, a: v, b: zero), V3Sel(c: bFFTF, a: v, b: zero));
2623}
2624
// Builds a diagonal matrix from d by multiplying d against the unit axes
// (each unit vector zeroes every lane except its own).
PX_FORCE_INLINE Mat33V M33Diagonal(const Vec3VArg d)
{
	// NOTE(review): the locals are declared FloatV although V3Mul produces a
	// vector result — presumably Vec3V was intended; compiles if both alias the
	// same SIMD register type. Confirm before changing.
	const FloatV x = V3Mul(a: V3UnitX(), b: d);
	const FloatV y = V3Mul(a: V3UnitY(), b: d);
	const FloatV z = V3Mul(a: V3UnitZ(), b: d);
	return Mat33V(x, y, z);
}
2632
2633//////////////////////////////////
2634// MAT34V
2635//////////////////////////////////
2636
2637PX_FORCE_INLINE Vec3V M34MulV3(const Mat34V& a, const Vec3V b)
2638{
2639 const FloatV x = V3GetX(f: b);
2640 const FloatV y = V3GetY(f: b);
2641 const FloatV z = V3GetZ(f: b);
2642 const Vec3V v0 = V3Scale(a: a.col0, b: x);
2643 const Vec3V v1 = V3Scale(a: a.col1, b: y);
2644 const Vec3V v2 = V3Scale(a: a.col2, b: z);
2645 const Vec3V v0PlusV1 = V3Add(a: v0, b: v1);
2646 const Vec3V v0PlusV1Plusv2 = V3Add(a: v0PlusV1, b: v2);
2647 return V3Add(a: v0PlusV1Plusv2, b: a.col3);
2648}
2649
2650PX_FORCE_INLINE Vec3V M34Mul33V3(const Mat34V& a, const Vec3V b)
2651{
2652 const FloatV x = V3GetX(f: b);
2653 const FloatV y = V3GetY(f: b);
2654 const FloatV z = V3GetZ(f: b);
2655 const Vec3V v0 = V3Scale(a: a.col0, b: x);
2656 const Vec3V v1 = V3Scale(a: a.col1, b: y);
2657 const Vec3V v2 = V3Scale(a: a.col2, b: z);
2658 const Vec3V v0PlusV1 = V3Add(a: v0, b: v1);
2659 return V3Add(a: v0PlusV1, b: v2);
2660}
2661
2662PX_FORCE_INLINE Vec3V M34TrnspsMul33V3(const Mat34V& a, const Vec3V b)
2663{
2664 const FloatV x = V3Dot(a: a.col0, b);
2665 const FloatV y = V3Dot(a: a.col1, b);
2666 const FloatV z = V3Dot(a: a.col2, b);
2667 return V3Merge(x, y, z);
2668}
2669
2670PX_FORCE_INLINE Mat34V M34MulM34(const Mat34V& a, const Mat34V& b)
2671{
2672 return Mat34V(M34Mul33V3(a, b: b.col0), M34Mul33V3(a, b: b.col1), M34Mul33V3(a, b: b.col2), M34MulV3(a, b: b.col3));
2673}
2674
2675PX_FORCE_INLINE Mat33V M34MulM33(const Mat34V& a, const Mat33V& b)
2676{
2677 return Mat33V(M34Mul33V3(a, b: b.col0), M34Mul33V3(a, b: b.col1), M34Mul33V3(a, b: b.col2));
2678}
2679
2680PX_FORCE_INLINE Mat33V M34Mul33MM34(const Mat34V& a, const Mat34V& b)
2681{
2682 return Mat33V(M34Mul33V3(a, b: b.col0), M34Mul33V3(a, b: b.col1), M34Mul33V3(a, b: b.col2));
2683}
2684
2685PX_FORCE_INLINE Mat34V M34Add(const Mat34V& a, const Mat34V& b)
2686{
2687 return Mat34V(V3Add(a: a.col0, b: b.col0), V3Add(a: a.col1, b: b.col1), V3Add(a: a.col2, b: b.col2), V3Add(a: a.col3, b: b.col3));
2688}
2689
2690PX_FORCE_INLINE Mat33V M34Trnsps33(const Mat34V& a)
2691{
2692 return Mat33V(V3Merge(x: V3GetX(f: a.col0), y: V3GetX(f: a.col1), z: V3GetX(f: a.col2)),
2693 V3Merge(x: V3GetY(f: a.col0), y: V3GetY(f: a.col1), z: V3GetY(f: a.col2)),
2694 V3Merge(x: V3GetZ(f: a.col0), y: V3GetZ(f: a.col1), z: V3GetZ(f: a.col2)));
2695}
2696
2697//////////////////////////////////
2698// MAT44V
2699//////////////////////////////////
2700
2701PX_FORCE_INLINE Vec4V M44MulV4(const Mat44V& a, const Vec4V b)
2702{
2703 const FloatV x = V4GetX(f: b);
2704 const FloatV y = V4GetY(f: b);
2705 const FloatV z = V4GetZ(f: b);
2706 const FloatV w = V4GetW(f: b);
2707
2708 const Vec4V v0 = V4Scale(a: a.col0, b: x);
2709 const Vec4V v1 = V4Scale(a: a.col1, b: y);
2710 const Vec4V v2 = V4Scale(a: a.col2, b: z);
2711 const Vec4V v3 = V4Scale(a: a.col3, b: w);
2712 const Vec4V v0PlusV1 = V4Add(a: v0, b: v1);
2713 const Vec4V v0PlusV1Plusv2 = V4Add(a: v0PlusV1, b: v2);
2714 return V4Add(a: v0PlusV1Plusv2, b: v3);
2715}
2716
2717PX_FORCE_INLINE Vec4V M44TrnspsMulV4(const Mat44V& a, const Vec4V b)
2718{
2719 PX_ALIGN(16, FloatV) dotProdArray[4] = { V4Dot(a: a.col0, b), V4Dot(a: a.col1, b), V4Dot(a: a.col2, b), V4Dot(a: a.col3, b) };
2720 return V4Merge(floatVArray: dotProdArray);
2721}
2722
2723PX_FORCE_INLINE Mat44V M44MulM44(const Mat44V& a, const Mat44V& b)
2724{
2725 return Mat44V(M44MulV4(a, b: b.col0), M44MulV4(a, b: b.col1), M44MulV4(a, b: b.col2), M44MulV4(a, b: b.col3));
2726}
2727
2728PX_FORCE_INLINE Mat44V M44Add(const Mat44V& a, const Mat44V& b)
2729{
2730 return Mat44V(V4Add(a: a.col0, b: b.col0), V4Add(a: a.col1, b: b.col1), V4Add(a: a.col2, b: b.col2), V4Add(a: a.col3, b: b.col3));
2731}
2732
2733PX_FORCE_INLINE Mat44V M44Trnsps(const Mat44V& a)
2734{
2735 const Vec4V v0 = _mm_unpacklo_ps(a: a.col0, b: a.col2);
2736 const Vec4V v1 = _mm_unpackhi_ps(a: a.col0, b: a.col2);
2737 const Vec4V v2 = _mm_unpacklo_ps(a: a.col1, b: a.col3);
2738 const Vec4V v3 = _mm_unpackhi_ps(a: a.col1, b: a.col3);
2739 return Mat44V(_mm_unpacklo_ps(a: v0, b: v2), _mm_unpackhi_ps(a: v0, b: v2), _mm_unpacklo_ps(a: v1, b: v3), _mm_unpackhi_ps(a: v1, b: v3));
2740}
2741
// 4x4 matrix inverse via Cramer's rule, following Intel application note
// AP-928 ("Streaming SIMD Extensions - Inverse of 4x4 Matrix"): compute the
// cofactors (minors) with shuffled products, divide by the determinant, and
// transpose at the end.
// NOTE(review): 1/det comes from _mm_rcp_ss without a Newton-Raphson step
// (the refinement is compiled out under '#if 0'), so precision is ~12 bits.
PX_FORCE_INLINE Mat44V M44Inverse(const Mat44V& a)
{
	__m128 minor0, minor1, minor2, minor3;
	__m128 row0, row1, row2, row3;
	__m128 det, tmp1;

	// NOTE(review): these zero-initializations are immediately overwritten below;
	// presumably they only exist to silence uninitialized-variable warnings.
	tmp1 = V4Zero();
	row1 = V4Zero();
	row3 = V4Zero();

	// Load the columns; rows 1 and 3 are pre-swapped (zwxy) as the AP-928 scheme requires.
	row0 = a.col0;
	row1 = _mm_shuffle_ps(a.col1, a.col1, _MM_SHUFFLE(1, 0, 3, 2));
	row2 = a.col2;
	row3 = _mm_shuffle_ps(a.col3, a.col3, _MM_SHUFFLE(1, 0, 3, 2));

	// Each of the following sections accumulates one set of 2x2 sub-determinant
	// products (tmp1) into the four cofactor vectors. 0xB1 swaps within pairs
	// (yxwz); 0x4E swaps the halves (zwxy).
	tmp1 = _mm_mul_ps(a: row2, b: row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor0 = _mm_mul_ps(a: row1, b: tmp1);
	minor1 = _mm_mul_ps(a: row0, b: tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor0 = _mm_sub_ps(a: _mm_mul_ps(a: row1, b: tmp1), b: minor0);
	minor1 = _mm_sub_ps(a: _mm_mul_ps(a: row0, b: tmp1), b: minor1);
	minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E);

	tmp1 = _mm_mul_ps(a: row1, b: row2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor0 = _mm_add_ps(a: _mm_mul_ps(a: row3, b: tmp1), b: minor0);
	minor3 = _mm_mul_ps(a: row0, b: tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor0 = _mm_sub_ps(a: minor0, b: _mm_mul_ps(a: row3, b: tmp1));
	minor3 = _mm_sub_ps(a: _mm_mul_ps(a: row0, b: tmp1), b: minor3);
	minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E);

	tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), b: row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	row2 = _mm_shuffle_ps(row2, row2, 0x4E);
	minor0 = _mm_add_ps(a: _mm_mul_ps(a: row2, b: tmp1), b: minor0);
	minor2 = _mm_mul_ps(a: row0, b: tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor0 = _mm_sub_ps(a: minor0, b: _mm_mul_ps(a: row2, b: tmp1));
	minor2 = _mm_sub_ps(a: _mm_mul_ps(a: row0, b: tmp1), b: minor2);
	minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E);

	tmp1 = _mm_mul_ps(a: row0, b: row1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor2 = _mm_add_ps(a: _mm_mul_ps(a: row3, b: tmp1), b: minor2);
	minor3 = _mm_sub_ps(a: _mm_mul_ps(a: row2, b: tmp1), b: minor3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor2 = _mm_sub_ps(a: _mm_mul_ps(a: row3, b: tmp1), b: minor2);
	minor3 = _mm_sub_ps(a: minor3, b: _mm_mul_ps(a: row2, b: tmp1));

	tmp1 = _mm_mul_ps(a: row0, b: row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor1 = _mm_sub_ps(a: minor1, b: _mm_mul_ps(a: row2, b: tmp1));
	minor2 = _mm_add_ps(a: _mm_mul_ps(a: row1, b: tmp1), b: minor2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor1 = _mm_add_ps(a: _mm_mul_ps(a: row2, b: tmp1), b: minor1);
	minor2 = _mm_sub_ps(a: minor2, b: _mm_mul_ps(a: row1, b: tmp1));

	tmp1 = _mm_mul_ps(a: row0, b: row2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor1 = _mm_add_ps(a: _mm_mul_ps(a: row3, b: tmp1), b: minor1);
	minor3 = _mm_sub_ps(a: minor3, b: _mm_mul_ps(a: row1, b: tmp1));
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor1 = _mm_sub_ps(a: minor1, b: _mm_mul_ps(a: row3, b: tmp1));
	minor3 = _mm_add_ps(a: _mm_mul_ps(a: row1, b: tmp1), b: minor3);

	// Determinant = dot(row0, minor0), horizontally summed into lane 0, then
	// its (approximate) reciprocal splatted to all lanes.
	det = _mm_mul_ps(a: row0, b: minor0);
	det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), b: det);
	det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), b: det);
	tmp1 = _mm_rcp_ss(a: det);
#if 0
	det = _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1)));
	det = _mm_shuffle_ps(det, det, 0x00);
#else
	det = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(0, 0, 0, 0));
#endif

	// Scale the cofactors by 1/det; the cofactor layout is the transpose of the
	// inverse, so transpose once at the end.
	minor0 = _mm_mul_ps(a: det, b: minor0);
	minor1 = _mm_mul_ps(a: det, b: minor1);
	minor2 = _mm_mul_ps(a: det, b: minor2);
	minor3 = _mm_mul_ps(a: det, b: minor3);
	Mat44V invTrans(minor0, minor1, minor2, minor3);
	return M44Trnsps(a: invTrans);
}
2827
2828PX_FORCE_INLINE Vec4V V4LoadXYZW(const PxF32& x, const PxF32& y, const PxF32& z, const PxF32& w)
2829{
2830 return _mm_set_ps(z: w, y: z, x: y, w: x);
2831}
2832
2833/*
2834// AP: work in progress - use proper SSE intrinsics where possible
2835PX_FORCE_INLINE VecU16V V4U32PK(VecU32V a, VecU32V b)
2836{
2837 VecU16V result;
2838 result.m128_u16[0] = PxU16(PxClamp<PxU32>((a).m128_u32[0], 0, 0xFFFF));
2839 result.m128_u16[1] = PxU16(PxClamp<PxU32>((a).m128_u32[1], 0, 0xFFFF));
2840 result.m128_u16[2] = PxU16(PxClamp<PxU32>((a).m128_u32[2], 0, 0xFFFF));
2841 result.m128_u16[3] = PxU16(PxClamp<PxU32>((a).m128_u32[3], 0, 0xFFFF));
2842 result.m128_u16[4] = PxU16(PxClamp<PxU32>((b).m128_u32[0], 0, 0xFFFF));
2843 result.m128_u16[5] = PxU16(PxClamp<PxU32>((b).m128_u32[1], 0, 0xFFFF));
2844 result.m128_u16[6] = PxU16(PxClamp<PxU32>((b).m128_u32[2], 0, 0xFFFF));
2845 result.m128_u16[7] = PxU16(PxClamp<PxU32>((b).m128_u32[3], 0, 0xFFFF));
2846 return result;
2847}
2848*/
2849
2850PX_FORCE_INLINE VecU32V V4U32Sel(const BoolV c, const VecU32V a, const VecU32V b)
2851{
2852 return m128_I2F(n: _mm_or_si128(a: _mm_andnot_si128(a: m128_F2I(n: c), b: m128_F2I(n: b)), b: _mm_and_si128(a: m128_F2I(n: c), b: m128_F2I(n: a))));
2853}
2854
2855PX_FORCE_INLINE VecU32V V4U32or(VecU32V a, VecU32V b)
2856{
2857 return m128_I2F(n: _mm_or_si128(a: m128_F2I(n: a), b: m128_F2I(n: b)));
2858}
2859
2860PX_FORCE_INLINE VecU32V V4U32xor(VecU32V a, VecU32V b)
2861{
2862 return m128_I2F(n: _mm_xor_si128(a: m128_F2I(n: a), b: m128_F2I(n: b)));
2863}
2864
2865PX_FORCE_INLINE VecU32V V4U32and(VecU32V a, VecU32V b)
2866{
2867 return m128_I2F(n: _mm_and_si128(a: m128_F2I(n: a), b: m128_F2I(n: b)));
2868}
2869
2870PX_FORCE_INLINE VecU32V V4U32Andc(VecU32V a, VecU32V b)
2871{
2872 return m128_I2F(n: _mm_andnot_si128(a: m128_F2I(n: b), b: m128_F2I(n: a)));
2873}
2874
2875/*
2876PX_FORCE_INLINE VecU16V V4U16Or(VecU16V a, VecU16V b)
2877{
2878 return m128_I2F(_mm_or_si128(m128_F2I(a), m128_F2I(b)));
2879}
2880*/
2881
2882/*
2883PX_FORCE_INLINE VecU16V V4U16And(VecU16V a, VecU16V b)
2884{
2885 return m128_I2F(_mm_and_si128(m128_F2I(a), m128_F2I(b)));
2886}
2887*/
2888
2889/*
2890PX_FORCE_INLINE VecU16V V4U16Andc(VecU16V a, VecU16V b)
2891{
2892 return m128_I2F(_mm_andnot_si128(m128_F2I(b), m128_F2I(a)));
2893}
2894*/
2895
2896PX_FORCE_INLINE VecI32V I4Load(const PxI32 i)
2897{
2898 return m128_F2I(n: _mm_load1_ps(p: reinterpret_cast<const PxF32*>(&i)));
2899}
2900
2901PX_FORCE_INLINE VecI32V I4LoadU(const PxI32* i)
2902{
2903 return m128_F2I(n: _mm_loadu_ps(p: reinterpret_cast<const PxF32*>(i)));
2904}
2905
2906PX_FORCE_INLINE VecI32V I4LoadA(const PxI32* i)
2907{
2908 return m128_F2I(n: _mm_load_ps(p: reinterpret_cast<const PxF32*>(i)));
2909}
2910
2911PX_FORCE_INLINE VecI32V VecI32V_Add(const VecI32VArg a, const VecI32VArg b)
2912{
2913 return _mm_add_epi32(a: a, b: b);
2914}
2915
2916PX_FORCE_INLINE VecI32V VecI32V_Sub(const VecI32VArg a, const VecI32VArg b)
2917{
2918 return _mm_sub_epi32(a: a, b: b);
2919}
2920
2921PX_FORCE_INLINE BoolV VecI32V_IsGrtr(const VecI32VArg a, const VecI32VArg b)
2922{
2923 return m128_I2F(n: _mm_cmpgt_epi32(a: a, b: b));
2924}
2925
2926PX_FORCE_INLINE BoolV VecI32V_IsEq(const VecI32VArg a, const VecI32VArg b)
2927{
2928 return m128_I2F(n: _mm_cmpeq_epi32(a: a, b: b));
2929}
2930
2931PX_FORCE_INLINE VecI32V V4I32Sel(const BoolV c, const VecI32V a, const VecI32V b)
2932{
2933 return _mm_or_si128(a: _mm_andnot_si128(a: m128_F2I(n: c), b: b), b: _mm_and_si128(a: m128_F2I(n: c), b: a));
2934}
2935
2936PX_FORCE_INLINE VecI32V VecI32V_Zero()
2937{
2938 return _mm_setzero_si128();
2939}
2940
2941PX_FORCE_INLINE VecI32V VecI32V_One()
2942{
2943 return I4Load(i: 1);
2944}
2945
2946PX_FORCE_INLINE VecI32V VecI32V_Two()
2947{
2948 return I4Load(i: 2);
2949}
2950
2951PX_FORCE_INLINE VecI32V VecI32V_MinusOne()
2952{
2953 return I4Load(i: -1);
2954}
2955
2956PX_FORCE_INLINE VecU32V U4Zero()
2957{
2958 return U4Load(i: 0);
2959}
2960
2961PX_FORCE_INLINE VecU32V U4One()
2962{
2963 return U4Load(i: 1);
2964}
2965
2966PX_FORCE_INLINE VecU32V U4Two()
2967{
2968 return U4Load(i: 2);
2969}
2970
2971PX_FORCE_INLINE VecI32V VecI32V_Sel(const BoolV c, const VecI32VArg a, const VecI32VArg b)
2972{
2973 return _mm_or_si128(a: _mm_andnot_si128(a: m128_F2I(n: c), b: b), b: _mm_and_si128(a: m128_F2I(n: c), b: a));
2974}
2975
2976PX_FORCE_INLINE VecShiftV VecI32V_PrepareShift(const VecI32VArg shift)
2977{
2978 VecShiftV s;
2979 s.shift = VecI32V_Sel(c: BTFFF(), a: shift, b: VecI32V_Zero());
2980 return s;
2981}
2982
2983PX_FORCE_INLINE VecI32V VecI32V_LeftShift(const VecI32VArg a, const VecShiftVArg count)
2984{
2985 return _mm_sll_epi32(a: a, count: count.shift);
2986}
2987
2988PX_FORCE_INLINE VecI32V VecI32V_RightShift(const VecI32VArg a, const VecShiftVArg count)
2989{
2990 return _mm_srl_epi32(a: a, count: count.shift);
2991}
2992
2993PX_FORCE_INLINE VecI32V VecI32V_And(const VecI32VArg a, const VecI32VArg b)
2994{
2995 return _mm_and_si128(a: a, b: b);
2996}
2997
2998PX_FORCE_INLINE VecI32V VecI32V_Or(const VecI32VArg a, const VecI32VArg b)
2999{
3000 return _mm_or_si128(a: a, b: b);
3001}
3002
3003PX_FORCE_INLINE VecI32V VecI32V_GetX(const VecI32VArg a)
3004{
3005 return m128_F2I(_mm_shuffle_ps(m128_I2F(a), m128_I2F(a), _MM_SHUFFLE(0, 0, 0, 0)));
3006}
3007
3008PX_FORCE_INLINE VecI32V VecI32V_GetY(const VecI32VArg a)
3009{
3010 return m128_F2I(_mm_shuffle_ps(m128_I2F(a), m128_I2F(a), _MM_SHUFFLE(1, 1, 1, 1)));
3011}
3012
3013PX_FORCE_INLINE VecI32V VecI32V_GetZ(const VecI32VArg a)
3014{
3015 return m128_F2I(_mm_shuffle_ps(m128_I2F(a), m128_I2F(a), _MM_SHUFFLE(2, 2, 2, 2)));
3016}
3017
3018PX_FORCE_INLINE VecI32V VecI32V_GetW(const VecI32VArg a)
3019{
3020 return m128_F2I(_mm_shuffle_ps(m128_I2F(a), m128_I2F(a), _MM_SHUFFLE(3, 3, 3, 3)));
3021}
3022
3023PX_FORCE_INLINE void PxI32_From_VecI32V(const VecI32VArg a, PxI32* i)
3024{
3025 _mm_store_ss(p: reinterpret_cast<PxF32*>(i), a: m128_I2F(n: a));
3026}
3027
3028PX_FORCE_INLINE VecI32V VecI32V_Merge(const VecI32VArg x, const VecI32VArg y, const VecI32VArg z, const VecI32VArg w)
3029{
3030 const __m128 xw = _mm_move_ss(a: m128_I2F(n: y), b: m128_I2F(n: x)); // y, y, y, x
3031 const __m128 yz = _mm_move_ss(a: m128_I2F(n: z), b: m128_I2F(n: w)); // z, z, z, w
3032 return m128_F2I(_mm_shuffle_ps(xw, yz, _MM_SHUFFLE(0, 2, 1, 0)));
3033}
3034
3035PX_FORCE_INLINE VecI32V VecI32V_From_BoolV(const BoolVArg a)
3036{
3037 return m128_F2I(n: a);
3038}
3039
3040PX_FORCE_INLINE VecU32V VecU32V_From_BoolV(const BoolVArg a)
3041{
3042 return a;
3043}
3044
3045/*
3046template<int a> PX_FORCE_INLINE VecI32V V4ISplat()
3047{
3048 VecI32V result;
3049 result.m128_i32[0] = a;
3050 result.m128_i32[1] = a;
3051 result.m128_i32[2] = a;
3052 result.m128_i32[3] = a;
3053 return result;
3054}
3055
3056template<PxU32 a> PX_FORCE_INLINE VecU32V V4USplat()
3057{
3058 VecU32V result;
3059 result.m128_u32[0] = a;
3060 result.m128_u32[1] = a;
3061 result.m128_u32[2] = a;
3062 result.m128_u32[3] = a;
3063 return result;
3064}
3065*/
3066
3067/*
3068PX_FORCE_INLINE void V4U16StoreAligned(VecU16V val, VecU16V* address)
3069{
3070 *address = val;
3071}
3072*/
3073
3074PX_FORCE_INLINE void V4U32StoreAligned(VecU32V val, VecU32V* address)
3075{
3076 *address = val;
3077}
3078
3079PX_FORCE_INLINE Vec4V V4LoadAligned(Vec4V* addr)
3080{
3081 return *addr;
3082}
3083
3084PX_FORCE_INLINE Vec4V V4LoadUnaligned(Vec4V* addr)
3085{
3086 return V4LoadU(f: reinterpret_cast<float*>(addr));
3087}
3088
3089PX_FORCE_INLINE Vec4V V4Andc(const Vec4V a, const VecU32V b)
3090{
3091 VecU32V result32(a);
3092 result32 = V4U32Andc(a: result32, b);
3093 return Vec4V(result32);
3094}
3095
3096PX_FORCE_INLINE VecU32V V4IsGrtrV32u(const Vec4V a, const Vec4V b)
3097{
3098 return V4IsGrtr(a, b);
3099}
3100
3101PX_FORCE_INLINE VecU16V V4U16LoadAligned(VecU16V* addr)
3102{
3103 return *addr;
3104}
3105
3106PX_FORCE_INLINE VecU16V V4U16LoadUnaligned(VecU16V* addr)
3107{
3108 return *addr;
3109}
3110
// Lane-wise unsigned 16-bit compare a > b, done per-lane in scalar code.
// NOTE: each result lane is 1 or 0 (the bool converted to u16), NOT the all-ones/all-zeros
// masks produced by the SIMD compare intrinsics — callers must not treat this as a bit mask.
PX_FORCE_INLINE VecU16V V4U16CompareGt(VecU16V a, VecU16V b)
{
	// _mm_cmpgt_epi16 doesn't work for unsigned values unfortunately
	// return m128_I2F(_mm_cmpgt_epi16(m128_F2I(a), m128_F2I(b)));
	VecU16V result;
	result.m128_u16[0] = (a).m128_u16[0] > (b).m128_u16[0];
	result.m128_u16[1] = (a).m128_u16[1] > (b).m128_u16[1];
	result.m128_u16[2] = (a).m128_u16[2] > (b).m128_u16[2];
	result.m128_u16[3] = (a).m128_u16[3] > (b).m128_u16[3];
	result.m128_u16[4] = (a).m128_u16[4] > (b).m128_u16[4];
	result.m128_u16[5] = (a).m128_u16[5] > (b).m128_u16[5];
	result.m128_u16[6] = (a).m128_u16[6] > (b).m128_u16[6];
	result.m128_u16[7] = (a).m128_u16[7] > (b).m128_u16[7];
	return result;
}
3126
3127PX_FORCE_INLINE VecU16V V4I16CompareGt(VecU16V a, VecU16V b)
3128{
3129 return m128_I2F(n: _mm_cmpgt_epi16(a: m128_F2I(n: a), b: m128_F2I(n: b)));
3130}
3131
3132PX_FORCE_INLINE Vec4V Vec4V_From_VecU32V(VecU32V a)
3133{
3134 Vec4V result = V4LoadXYZW(x: PxF32(a.m128_u32[0]), y: PxF32(a.m128_u32[1]), z: PxF32(a.m128_u32[2]), w: PxF32(a.m128_u32[3]));
3135 return result;
3136}
3137
3138PX_FORCE_INLINE Vec4V Vec4V_From_VecI32V(VecI32V in)
3139{
3140 return _mm_cvtepi32_ps(a: in);
3141}
3142
3143PX_FORCE_INLINE VecI32V VecI32V_From_Vec4V(Vec4V a)
3144{
3145 return _mm_cvttps_epi32(a: a);
3146}
3147
3148PX_FORCE_INLINE Vec4V Vec4V_ReinterpretFrom_VecU32V(VecU32V a)
3149{
3150 return Vec4V(a);
3151}
3152
3153PX_FORCE_INLINE Vec4V Vec4V_ReinterpretFrom_VecI32V(VecI32V a)
3154{
3155 return m128_I2F(n: a);
3156}
3157
3158PX_FORCE_INLINE VecU32V VecU32V_ReinterpretFrom_Vec4V(Vec4V a)
3159{
3160 return VecU32V(a);
3161}
3162
3163PX_FORCE_INLINE VecI32V VecI32V_ReinterpretFrom_Vec4V(Vec4V a)
3164{
3165 return m128_F2I(n: a);
3166}
3167
3168/*
3169template<int index> PX_FORCE_INLINE BoolV BSplatElement(BoolV a)
3170{
3171 BoolV result;
3172 result[0] = result[1] = result[2] = result[3] = a[index];
3173 return result;
3174}
3175*/
3176
3177template <int index>
3178BoolV BSplatElement(BoolV a)
3179{
3180 float* data = reinterpret_cast<float*>(&a);
3181 return V4Load(f: data[index]);
3182}
3183
3184template <int index>
3185PX_FORCE_INLINE VecU32V V4U32SplatElement(VecU32V a)
3186{
3187 VecU32V result;
3188 result.m128_u32[0] = result.m128_u32[1] = result.m128_u32[2] = result.m128_u32[3] = a.m128_u32[index];
3189 return result;
3190}
3191
3192template <int index>
3193PX_FORCE_INLINE Vec4V V4SplatElement(Vec4V a)
3194{
3195 float* data = reinterpret_cast<float*>(&a);
3196 return V4Load(f: data[index]);
3197}
3198
3199PX_FORCE_INLINE VecU32V U4LoadXYZW(PxU32 x, PxU32 y, PxU32 z, PxU32 w)
3200{
3201 VecU32V result;
3202 result.m128_u32[0] = x;
3203 result.m128_u32[1] = y;
3204 result.m128_u32[2] = z;
3205 result.m128_u32[3] = w;
3206 return result;
3207}
3208
3209PX_FORCE_INLINE Vec4V V4Ceil(const Vec4V in)
3210{
3211 UnionM128 a(in);
3212 return V4LoadXYZW(x: PxCeil(a: a.m128_f32[0]), y: PxCeil(a: a.m128_f32[1]), z: PxCeil(a: a.m128_f32[2]), w: PxCeil(a: a.m128_f32[3]));
3213}
3214
3215PX_FORCE_INLINE Vec4V V4Floor(const Vec4V in)
3216{
3217 UnionM128 a(in);
3218 return V4LoadXYZW(x: PxFloor(a: a.m128_f32[0]), y: PxFloor(a: a.m128_f32[1]), z: PxFloor(a: a.m128_f32[2]), w: PxFloor(a: a.m128_f32[3]));
3219}
3220
3221PX_FORCE_INLINE VecU32V V4ConvertToU32VSaturate(const Vec4V in, PxU32 power)
3222{
3223 PX_ASSERT(power == 0 && "Non-zero power not supported in convertToU32VSaturate");
3224 PX_UNUSED(power); // prevent warning in release builds
3225 PxF32 ffffFFFFasFloat = PxF32(0xFFFF0000);
3226 UnionM128 a(in);
3227 VecU32V result;
3228 result.m128_u32[0] = PxU32(PxClamp<PxF32>(v: (a).m128_f32[0], lo: 0.0f, hi: ffffFFFFasFloat));
3229 result.m128_u32[1] = PxU32(PxClamp<PxF32>(v: (a).m128_f32[1], lo: 0.0f, hi: ffffFFFFasFloat));
3230 result.m128_u32[2] = PxU32(PxClamp<PxF32>(v: (a).m128_f32[2], lo: 0.0f, hi: ffffFFFFasFloat));
3231 result.m128_u32[3] = PxU32(PxClamp<PxF32>(v: (a).m128_f32[3], lo: 0.0f, hi: ffffFFFFasFloat));
3232 return result;
3233}
3234
3235} // namespace aos
3236} // namespace shdfnd
3237} // namespace physx
3238
3239#endif // PSFOUNDATION_PSUNIXSSE2INLINEAOS_H
3240

source code of qtquick3dphysics/src/3rdparty/PhysX/source/foundation/include/unix/sse2/PsUnixSse2InlineAoS.h