1 | // |
2 | // Redistribution and use in source and binary forms, with or without |
3 | // modification, are permitted provided that the following conditions |
4 | // are met: |
5 | // * Redistributions of source code must retain the above copyright |
6 | // notice, this list of conditions and the following disclaimer. |
7 | // * Redistributions in binary form must reproduce the above copyright |
8 | // notice, this list of conditions and the following disclaimer in the |
9 | // documentation and/or other materials provided with the distribution. |
10 | // * Neither the name of NVIDIA CORPORATION nor the names of its |
11 | // contributors may be used to endorse or promote products derived |
12 | // from this software without specific prior written permission. |
13 | // |
14 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY |
15 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
16 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
17 | // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR |
18 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
19 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
20 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
21 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
22 | // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
23 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
24 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
25 | // |
26 | // Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved. |
27 | // Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. |
28 | // Copyright (c) 2001-2004 NovodeX AG. All rights reserved. |
29 | |
30 | #ifndef PSFOUNDATION_PSUNIXSSE2INLINEAOS_H |
31 | #define PSFOUNDATION_PSUNIXSSE2INLINEAOS_H |
32 | |
33 | #if !COMPILE_VECTOR_INTRINSICS |
34 | #error Vector intrinsics should not be included when using scalar implementation. |
35 | #endif |
36 | |
37 | #ifdef __SSE4_2__ |
38 | #include "smmintrin.h" |
39 | #endif |
40 | |
41 | #include "../../PsVecMathSSE.h" |
42 | |
43 | namespace physx |
44 | { |
45 | namespace shdfnd |
46 | { |
47 | namespace aos |
48 | { |
49 | |
// Floating-point class flag values. They are OR-ed together to build the mask used by the
// isFinite* assert helpers in this file.
#define PX_FPCLASS_SNAN 0x0001 // signaling NaN
#define PX_FPCLASS_QNAN 0x0002 // quiet NaN
#define PX_FPCLASS_NINF 0x0004 // negative infinity
#define PX_FPCLASS_PINF 0x0200 // positive infinity
54 | |
55 | PX_FORCE_INLINE __m128 m128_I2F(__m128i n) |
56 | { |
57 | return _mm_castsi128_ps(a: n); |
58 | } |
59 | PX_FORCE_INLINE __m128i m128_F2I(__m128 n) |
60 | { |
61 | return _mm_castps_si128(a: n); |
62 | } |
63 | |
64 | ////////////////////////////////////////////////////////////////////// |
65 | //Test that Vec3V and FloatV are legal |
66 | ////////////////////////////////////////////////////////////////////// |
67 | |
68 | #define FLOAT_COMPONENTS_EQUAL_THRESHOLD 0.01f |
69 | PX_FORCE_INLINE static bool isValidFloatV(const FloatV a) |
70 | { |
71 | const PxF32 x = V4ReadX(v: a); |
72 | const PxF32 y = V4ReadY(v: a); |
73 | const PxF32 z = V4ReadZ(v: a); |
74 | const PxF32 w = V4ReadW(v: a); |
75 | |
76 | if ( |
77 | (PxAbs(a: x - y) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) && |
78 | (PxAbs(a: x - z) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) && |
79 | (PxAbs(a: x - w) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) |
80 | ) |
81 | { |
82 | return true; |
83 | } |
84 | |
85 | if ( |
86 | (PxAbs(a: (x - y) / x) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) && |
87 | (PxAbs(a: (x - z) / x) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) && |
88 | (PxAbs(a: (x - w) / x) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) |
89 | ) |
90 | { |
91 | return true; |
92 | } |
93 | |
94 | return false; |
95 | } |
96 | |
97 | PX_FORCE_INLINE bool isValidVec3V(const Vec3V a) |
98 | { |
99 | PX_ALIGN(16, PxF32 f[4]); |
100 | V4StoreA(a, f); |
101 | return (f[3] == 0.0f); |
102 | } |
103 | |
104 | PX_FORCE_INLINE bool isFiniteLength(const Vec3V a) |
105 | { |
106 | return !FAllEq(a: V4LengthSq(a), b: FZero()); |
107 | } |
108 | |
109 | PX_FORCE_INLINE bool isAligned16(void* a) |
110 | { |
111 | return(0 == (size_t(a) & 0x0f)); |
112 | } |
113 | |
// ASSERT_ISFINITELENGTH is deactivated because a lot of code calls a simd normalisation
// function with zero length but then ignores the result.

#if !PX_DEBUG
// Non-debug builds: every check expands to nothing.
#define ASSERT_ISVALIDVEC3V(a)
#define ASSERT_ISVALIDFLOATV(a)
#define ASSERT_ISALIGNED16(a)
#define ASSERT_ISFINITELENGTH(a)
#else
// Debug builds: forward to PX_ASSERT with the matching predicate.
#define ASSERT_ISVALIDVEC3V(a) PX_ASSERT(isValidVec3V(a))
#define ASSERT_ISVALIDFLOATV(a) PX_ASSERT(isValidFloatV(a))
#define ASSERT_ISALIGNED16(a) PX_ASSERT(isAligned16(reinterpret_cast<void*>(a)))
#define ASSERT_ISFINITELENGTH(a) // PX_ASSERT(isFiniteLength(a))
#endif
127 | |
128 | |
129 | namespace internalUnitSSE2Simd |
130 | { |
131 | PX_FORCE_INLINE PxU32 BAllTrue4_R(const BoolV a) |
132 | { |
133 | const PxI32 moveMask = _mm_movemask_ps(a: a); |
134 | return PxU32(moveMask == 0xf); |
135 | } |
136 | |
137 | PX_FORCE_INLINE PxU32 BAllTrue3_R(const BoolV a) |
138 | { |
139 | const PxI32 moveMask = _mm_movemask_ps(a: a); |
140 | return PxU32((moveMask & 0x7) == 0x7); |
141 | } |
142 | |
143 | PX_FORCE_INLINE PxU32 BAnyTrue4_R(const BoolV a) |
144 | { |
145 | const PxI32 moveMask = _mm_movemask_ps(a: a); |
146 | return PxU32(moveMask != 0x0); |
147 | } |
148 | |
149 | PX_FORCE_INLINE PxU32 BAnyTrue3_R(const BoolV a) |
150 | { |
151 | const PxI32 moveMask = _mm_movemask_ps(a: a); |
152 | return PxU32((moveMask & 0x7) != 0x0); |
153 | } |
154 | |
155 | PX_FORCE_INLINE PxU32 FiniteTestEq(const Vec4V a, const Vec4V b) |
156 | { |
157 | // This is a bit of a bodge. |
158 | //_mm_comieq_ss returns 1 if either value is nan so we need to re-cast a and b with true encoded as a non-nan |
159 | // number. |
160 | // There must be a better way of doing this in sse. |
161 | const BoolV one = FOne(); |
162 | const BoolV zero = FZero(); |
163 | const BoolV a1 = V4Sel(c: a, a: one, b: zero); |
164 | const BoolV b1 = V4Sel(c: b, a: one, b: zero); |
165 | return ( |
166 | _mm_comieq_ss(a: a1, b: b1) && |
167 | _mm_comieq_ss(_mm_shuffle_ps(a1, a1, _MM_SHUFFLE(1, 1, 1, 1)), _mm_shuffle_ps(b1, b1, _MM_SHUFFLE(1, 1, 1, 1))) && |
168 | _mm_comieq_ss(_mm_shuffle_ps(a1, a1, _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(b1, b1, _MM_SHUFFLE(2, 2, 2, 2))) && |
169 | _mm_comieq_ss(_mm_shuffle_ps(a1, a1, _MM_SHUFFLE(3, 3, 3, 3)), _mm_shuffle_ps(b1, b1, _MM_SHUFFLE(3, 3, 3, 3)))); |
170 | } |
171 | |
172 | #if !PX_EMSCRIPTEN |
173 | const PX_ALIGN(16, PxF32 gMaskXYZ[4]) = { physx::PxUnionCast<PxF32>(b: 0xffffffff), physx::PxUnionCast<PxF32>(b: 0xffffffff), |
174 | physx::PxUnionCast<PxF32>(b: 0xffffffff), 0 }; |
175 | #else |
176 | // emscripten doesn't like the PxUnionCast data structure |
177 | // the following is what windows and xbox does -- using these for emscripten |
178 | const PX_ALIGN(16, PxU32 gMaskXYZ[4]) = { 0xffffffff, 0xffffffff, 0xffffffff, 0 }; |
179 | #endif |
180 | } |
181 | |
182 | namespace _VecMathTests |
183 | { |
184 | // PT: this function returns an invalid Vec3V (W!=0.0f) just for unit-testing 'isValidVec3V' |
185 | PX_FORCE_INLINE Vec3V getInvalidVec3V() |
186 | { |
187 | const float f = 1.0f; |
188 | return _mm_load1_ps(p: &f); |
189 | } |
190 | |
191 | PX_FORCE_INLINE bool allElementsEqualFloatV(const FloatV a, const FloatV b) |
192 | { |
193 | ASSERT_ISVALIDFLOATV(a); |
194 | ASSERT_ISVALIDFLOATV(b); |
195 | return _mm_comieq_ss(a: a, b: b) != 0; |
196 | } |
197 | |
198 | PX_FORCE_INLINE bool allElementsEqualVec3V(const Vec3V a, const Vec3V b) |
199 | { |
200 | return V3AllEq(a, b) != 0; |
201 | } |
202 | |
203 | PX_FORCE_INLINE bool allElementsEqualVec4V(const Vec4V a, const Vec4V b) |
204 | { |
205 | return V4AllEq(a, b) != 0; |
206 | } |
207 | |
208 | PX_FORCE_INLINE bool allElementsEqualBoolV(const BoolV a, const BoolV b) |
209 | { |
210 | return internalUnitSSE2Simd::BAllTrue4_R(a: VecI32V_IsEq(a: m128_F2I(n: a), b: m128_F2I(n: b))) != 0; |
211 | } |
212 | |
213 | PX_FORCE_INLINE bool allElementsEqualVecU32V(const VecU32V a, const VecU32V b) |
214 | { |
215 | return internalUnitSSE2Simd::BAllTrue4_R(a: V4IsEqU32(a, b)) != 0; |
216 | } |
217 | |
218 | PX_FORCE_INLINE bool allElementsEqualVecI32V(const VecI32V a, const VecI32V b) |
219 | { |
220 | BoolV c = m128_I2F(n: _mm_cmpeq_epi32(a: a, b: b)); |
221 | return internalUnitSSE2Simd::BAllTrue4_R(a: c) != 0; |
222 | } |
223 | |
224 | #define VECMATH_AOS_EPSILON (1e-3f) |
225 | |
226 | PX_FORCE_INLINE bool allElementsNearEqualFloatV(const FloatV a, const FloatV b) |
227 | { |
228 | ASSERT_ISVALIDFLOATV(a); |
229 | ASSERT_ISVALIDFLOATV(b); |
230 | const FloatV c = FSub(a, b); |
231 | const FloatV minError = FLoad(f: -VECMATH_AOS_EPSILON); |
232 | const FloatV maxError = FLoad(VECMATH_AOS_EPSILON); |
233 | return _mm_comigt_ss(a: c, b: minError) && _mm_comilt_ss(a: c, b: maxError); |
234 | } |
235 | |
236 | PX_FORCE_INLINE bool allElementsNearEqualVec3V(const Vec3V a, const Vec3V b) |
237 | { |
238 | const Vec3V c = V3Sub(a, b); |
239 | const Vec3V minError = V3Load(f: -VECMATH_AOS_EPSILON); |
240 | const Vec3V maxError = V3Load(VECMATH_AOS_EPSILON); |
241 | return (_mm_comigt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)), b: minError) && |
242 | _mm_comilt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)), b: maxError) && |
243 | _mm_comigt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1)), b: minError) && |
244 | _mm_comilt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1)), b: maxError) && |
245 | _mm_comigt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2)), b: minError) && |
246 | _mm_comilt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2)), b: maxError)); |
247 | } |
248 | |
249 | PX_FORCE_INLINE bool allElementsNearEqualVec4V(const Vec4V a, const Vec4V b) |
250 | { |
251 | const Vec4V c = V4Sub(a, b); |
252 | const Vec4V minError = V4Load(f: -VECMATH_AOS_EPSILON); |
253 | const Vec4V maxError = V4Load(VECMATH_AOS_EPSILON); |
254 | return (_mm_comigt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)), b: minError) && |
255 | _mm_comilt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)), b: maxError) && |
256 | _mm_comigt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1)), b: minError) && |
257 | _mm_comilt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1)), b: maxError) && |
258 | _mm_comigt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2)), b: minError) && |
259 | _mm_comilt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2)), b: maxError) && |
260 | _mm_comigt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 3, 3, 3)), b: minError) && |
261 | _mm_comilt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 3, 3, 3)), b: maxError)); |
262 | } |
263 | } |
264 | |
265 | ///////////////////////////////////////////////////////////////////// |
266 | ////FUNCTIONS USED ONLY FOR ASSERTS IN VECTORISED IMPLEMENTATIONS |
267 | ///////////////////////////////////////////////////////////////////// |
268 | |
269 | PX_FORCE_INLINE bool isFiniteFloatV(const FloatV a) |
270 | { |
271 | PxF32 badNumber = |
272 | physx::PxUnionCast<PxF32, PxU32>(PX_FPCLASS_SNAN | PX_FPCLASS_QNAN | PX_FPCLASS_NINF | PX_FPCLASS_PINF); |
273 | const FloatV vBadNum = FLoad(f: badNumber); |
274 | const BoolV vMask = BAnd(a: vBadNum, b: a); |
275 | return internalUnitSSE2Simd::FiniteTestEq(a: vMask, b: BFFFF()) == 1; |
276 | } |
277 | |
278 | PX_FORCE_INLINE bool isFiniteVec3V(const Vec3V a) |
279 | { |
280 | PxF32 badNumber = |
281 | physx::PxUnionCast<PxF32, PxU32>(PX_FPCLASS_SNAN | PX_FPCLASS_QNAN | PX_FPCLASS_NINF | PX_FPCLASS_PINF); |
282 | const Vec3V vBadNum = V3Load(f: badNumber); |
283 | const BoolV vMask = BAnd(a: BAnd(a: vBadNum, b: a), b: BTTTF()); |
284 | return internalUnitSSE2Simd::FiniteTestEq(a: vMask, b: BFFFF()) == 1; |
285 | } |
286 | |
287 | PX_FORCE_INLINE bool isFiniteVec4V(const Vec4V a) |
288 | { |
289 | /*Vec4V a; |
290 | PX_ALIGN(16, PxF32 f[4]); |
291 | F32Array_Aligned_From_Vec4V(a, f); |
292 | return PxIsFinite(f[0]) |
293 | && PxIsFinite(f[1]) |
294 | && PxIsFinite(f[2]) |
295 | && PxIsFinite(f[3]);*/ |
296 | |
297 | PxF32 badNumber = |
298 | physx::PxUnionCast<PxF32, PxU32>(PX_FPCLASS_SNAN | PX_FPCLASS_QNAN | PX_FPCLASS_NINF | PX_FPCLASS_PINF); |
299 | const Vec4V vBadNum = V4Load(f: badNumber); |
300 | const BoolV vMask = BAnd(a: vBadNum, b: a); |
301 | |
302 | return internalUnitSSE2Simd::FiniteTestEq(a: vMask, b: BFFFF()) == 1; |
303 | } |
304 | |
305 | PX_FORCE_INLINE bool hasZeroElementinFloatV(const FloatV a) |
306 | { |
307 | ASSERT_ISVALIDFLOATV(a); |
308 | return _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)), b: FZero()) ? true : false; |
309 | } |
310 | |
311 | PX_FORCE_INLINE bool hasZeroElementInVec3V(const Vec3V a) |
312 | { |
313 | return (_mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)), b: FZero()) || |
314 | _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)), b: FZero()) || |
315 | _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)), b: FZero())); |
316 | } |
317 | |
318 | PX_FORCE_INLINE bool hasZeroElementInVec4V(const Vec4V a) |
319 | { |
320 | return (_mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)), b: FZero()) || |
321 | _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)), b: FZero()) || |
322 | _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)), b: FZero()) || |
323 | _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 3, 3, 3)), b: FZero())); |
324 | } |
325 | |
326 | ///////////////////////////////////////////////////////////////////// |
327 | ////VECTORISED FUNCTION IMPLEMENTATIONS |
328 | ///////////////////////////////////////////////////////////////////// |
329 | |
330 | PX_FORCE_INLINE FloatV FLoad(const PxF32 f) |
331 | { |
332 | return _mm_load1_ps(p: &f); |
333 | } |
334 | |
335 | PX_FORCE_INLINE Vec3V V3Load(const PxF32 f) |
336 | { |
337 | return _mm_set_ps(z: 0.0f, y: f, x: f, w: f); |
338 | } |
339 | |
340 | PX_FORCE_INLINE Vec4V V4Load(const PxF32 f) |
341 | { |
342 | return _mm_load1_ps(p: &f); |
343 | } |
344 | |
345 | PX_FORCE_INLINE BoolV BLoad(const bool f) |
346 | { |
347 | const PxU32 i = -PxI32(f); |
348 | return _mm_load1_ps(p: reinterpret_cast<const float*>(&i)); |
349 | } |
350 | |
351 | PX_FORCE_INLINE Vec3V V3LoadA(const PxVec3& f) |
352 | { |
353 | ASSERT_ISALIGNED16(const_cast<PxVec3*>(&f)); |
354 | #if !PX_EMSCRIPTEN |
355 | return _mm_and_ps(a: reinterpret_cast<const Vec3V&>(f), b: V4LoadA(f: internalUnitSSE2Simd::gMaskXYZ)); |
356 | #else |
357 | return _mm_and_ps((Vec3V&)f, (VecI32V&)internalUnitSSE2Simd::gMaskXYZ); |
358 | #endif |
359 | } |
360 | |
361 | PX_FORCE_INLINE Vec3V V3LoadU(const PxVec3& f) |
362 | { |
363 | return _mm_set_ps(z: 0.0f, y: f.z, x: f.y, w: f.x); |
364 | } |
365 | |
366 | PX_FORCE_INLINE Vec3V V3LoadUnsafeA(const PxVec3& f) |
367 | { |
368 | ASSERT_ISALIGNED16(const_cast<PxVec3*>(&f)); |
369 | return _mm_set_ps(z: 0.0f, y: f.z, x: f.y, w: f.x); |
370 | } |
371 | |
372 | PX_FORCE_INLINE Vec3V V3LoadA(const PxF32* const f) |
373 | { |
374 | ASSERT_ISALIGNED16(const_cast<PxF32*>(f)); |
375 | #if !PX_EMSCRIPTEN |
376 | return _mm_and_ps(a: V4LoadA(f), b: V4LoadA(f: internalUnitSSE2Simd::gMaskXYZ)); |
377 | #else |
378 | return _mm_and_ps((Vec3V&)*f, (VecI32V&)internalUnitSSE2Simd::gMaskXYZ); |
379 | #endif |
380 | } |
381 | |
382 | PX_FORCE_INLINE Vec3V V3LoadU(const PxF32* const i) |
383 | { |
384 | return _mm_set_ps(z: 0.0f, y: i[2], x: i[1], w: i[0]); |
385 | } |
386 | |
387 | PX_FORCE_INLINE Vec3V Vec3V_From_Vec4V(Vec4V v) |
388 | { |
389 | return V4ClearW(v); |
390 | } |
391 | |
392 | PX_FORCE_INLINE Vec3V Vec3V_From_Vec4V_WUndefined(const Vec4V v) |
393 | { |
394 | return v; |
395 | } |
396 | |
397 | PX_FORCE_INLINE Vec4V Vec4V_From_Vec3V(Vec3V f) |
398 | { |
399 | ASSERT_ISVALIDVEC3V(f); |
400 | return f; // ok if it is implemented as the same type. |
401 | } |
402 | |
403 | PX_FORCE_INLINE Vec4V Vec4V_From_PxVec3_WUndefined(const PxVec3& f) |
404 | { |
405 | return _mm_set_ps(z: 0.0f, y: f.z, x: f.y, w: f.x); |
406 | } |
407 | |
408 | PX_FORCE_INLINE Vec4V Vec4V_From_FloatV(FloatV f) |
409 | { |
410 | return f; |
411 | } |
412 | |
413 | PX_FORCE_INLINE Vec3V Vec3V_From_FloatV(FloatV f) |
414 | { |
415 | ASSERT_ISVALIDFLOATV(f); |
416 | return Vec3V_From_Vec4V(v: Vec4V_From_FloatV(f)); |
417 | } |
418 | |
419 | PX_FORCE_INLINE Vec3V Vec3V_From_FloatV_WUndefined(FloatV f) |
420 | { |
421 | ASSERT_ISVALIDVEC3V(f); |
422 | return Vec3V_From_Vec4V_WUndefined(v: Vec4V_From_FloatV(f)); |
423 | } |
424 | |
425 | PX_FORCE_INLINE Mat33V Mat33V_From_PxMat33(const PxMat33& m) |
426 | { |
427 | return Mat33V(V3LoadU(f: m.column0), V3LoadU(f: m.column1), V3LoadU(f: m.column2)); |
428 | } |
429 | |
430 | PX_FORCE_INLINE void PxMat33_From_Mat33V(const Mat33V& m, PxMat33& out) |
431 | { |
432 | V3StoreU(a: m.col0, f&: out.column0); |
433 | V3StoreU(a: m.col1, f&: out.column1); |
434 | V3StoreU(a: m.col2, f&: out.column2); |
435 | } |
436 | |
437 | PX_FORCE_INLINE Vec4V V4LoadA(const PxF32* const f) |
438 | { |
439 | ASSERT_ISALIGNED16(const_cast<PxF32*>(f)); |
440 | return _mm_load_ps(p: f); |
441 | } |
442 | |
443 | PX_FORCE_INLINE void V4StoreA(Vec4V a, PxF32* f) |
444 | { |
445 | ASSERT_ISALIGNED16(f); |
446 | _mm_store_ps(p: f, a: a); |
447 | } |
448 | |
449 | PX_FORCE_INLINE void V4StoreU(const Vec4V a, PxF32* f) |
450 | { |
451 | _mm_storeu_ps(p: f, a: a); |
452 | } |
453 | |
454 | PX_FORCE_INLINE void BStoreA(const BoolV a, PxU32* f) |
455 | { |
456 | ASSERT_ISALIGNED16(f); |
457 | _mm_store_ps(p: reinterpret_cast<PxF32*>(f), a: a); |
458 | } |
459 | |
460 | PX_FORCE_INLINE void U4StoreA(const VecU32V uv, PxU32* u) |
461 | { |
462 | ASSERT_ISALIGNED16(u); |
463 | _mm_store_ps(p: reinterpret_cast<float*>(u), a: uv); |
464 | } |
465 | |
466 | PX_FORCE_INLINE void I4StoreA(const VecI32V iv, PxI32* i) |
467 | { |
468 | ASSERT_ISALIGNED16(i); |
469 | _mm_store_ps(p: reinterpret_cast<float*>(i), a: m128_I2F(n: iv)); |
470 | } |
471 | |
472 | PX_FORCE_INLINE Vec4V V4LoadU(const PxF32* const f) |
473 | { |
474 | return _mm_loadu_ps(p: f); |
475 | } |
476 | |
477 | PX_FORCE_INLINE BoolV BLoad(const bool* const f) |
478 | { |
479 | const PX_ALIGN(16, PxI32) b[4] = { -PxI32(f[0]), -PxI32(f[1]), -PxI32(f[2]), -PxI32(f[3]) }; |
480 | return _mm_load_ps(p: reinterpret_cast<const float*>(&b)); |
481 | } |
482 | |
483 | PX_FORCE_INLINE void FStore(const FloatV a, PxF32* PX_RESTRICT f) |
484 | { |
485 | ASSERT_ISVALIDFLOATV(a); |
486 | _mm_store_ss(p: f, a: a); |
487 | } |
488 | |
489 | PX_FORCE_INLINE void V3StoreA(const Vec3V a, PxVec3& f) |
490 | { |
491 | ASSERT_ISALIGNED16(&f); |
492 | PX_ALIGN(16, PxF32) f2[4]; |
493 | _mm_store_ps(p: f2, a: a); |
494 | f = PxVec3(f2[0], f2[1], f2[2]); |
495 | } |
496 | |
497 | PX_FORCE_INLINE void V3StoreU(const Vec3V a, PxVec3& f) |
498 | { |
499 | PX_ALIGN(16, PxF32) f2[4]; |
500 | _mm_store_ps(p: f2, a: a); |
501 | f = PxVec3(f2[0], f2[1], f2[2]); |
502 | } |
503 | |
504 | PX_FORCE_INLINE void Store_From_BoolV(const BoolV b, PxU32* b2) |
505 | { |
506 | _mm_store_ss(p: reinterpret_cast<PxF32*>(b2), a: b); |
507 | } |
508 | |
509 | PX_FORCE_INLINE VecU32V U4Load(const PxU32 i) |
510 | { |
511 | return _mm_load1_ps(p: reinterpret_cast<const PxF32*>(&i)); |
512 | } |
513 | |
514 | PX_FORCE_INLINE VecU32V U4LoadU(const PxU32* i) |
515 | { |
516 | return _mm_loadu_ps(p: reinterpret_cast<const PxF32*>(i)); |
517 | } |
518 | |
519 | PX_FORCE_INLINE VecU32V U4LoadA(const PxU32* i) |
520 | { |
521 | ASSERT_ISALIGNED16(const_cast<PxU32*>(i)); |
522 | return _mm_load_ps(p: reinterpret_cast<const PxF32*>(i)); |
523 | } |
524 | |
525 | ////////////////////////////////// |
526 | // FLOATV |
527 | ////////////////////////////////// |
528 | |
529 | PX_FORCE_INLINE FloatV FZero() |
530 | { |
531 | return FLoad(f: 0.0f); |
532 | } |
533 | |
534 | PX_FORCE_INLINE FloatV FOne() |
535 | { |
536 | return FLoad(f: 1.0f); |
537 | } |
538 | |
539 | PX_FORCE_INLINE FloatV FHalf() |
540 | { |
541 | return FLoad(f: 0.5f); |
542 | } |
543 | |
544 | PX_FORCE_INLINE FloatV FEps() |
545 | { |
546 | return FLoad(PX_EPS_REAL); |
547 | } |
548 | |
549 | PX_FORCE_INLINE FloatV FEps6() |
550 | { |
551 | return FLoad(f: 1e-6f); |
552 | } |
553 | |
554 | PX_FORCE_INLINE FloatV FMax() |
555 | { |
556 | return FLoad(PX_MAX_REAL); |
557 | } |
558 | |
559 | PX_FORCE_INLINE FloatV FNegMax() |
560 | { |
561 | return FLoad(f: -PX_MAX_REAL); |
562 | } |
563 | |
564 | PX_FORCE_INLINE FloatV IZero() |
565 | { |
566 | const PxU32 zero = 0; |
567 | return _mm_load1_ps(p: reinterpret_cast<const PxF32*>(&zero)); |
568 | } |
569 | |
570 | PX_FORCE_INLINE FloatV IOne() |
571 | { |
572 | const PxU32 one = 1; |
573 | return _mm_load1_ps(p: reinterpret_cast<const PxF32*>(&one)); |
574 | } |
575 | |
576 | PX_FORCE_INLINE FloatV ITwo() |
577 | { |
578 | const PxU32 two = 2; |
579 | return _mm_load1_ps(p: reinterpret_cast<const PxF32*>(&two)); |
580 | } |
581 | |
582 | PX_FORCE_INLINE FloatV IThree() |
583 | { |
584 | const PxU32 three = 3; |
585 | return _mm_load1_ps(p: reinterpret_cast<const PxF32*>(&three)); |
586 | } |
587 | |
588 | PX_FORCE_INLINE FloatV IFour() |
589 | { |
590 | PxU32 four = 4; |
591 | return _mm_load1_ps(p: reinterpret_cast<const PxF32*>(&four)); |
592 | } |
593 | |
594 | PX_FORCE_INLINE FloatV FNeg(const FloatV f) |
595 | { |
596 | ASSERT_ISVALIDFLOATV(f); |
597 | return _mm_sub_ps(a: _mm_setzero_ps(), b: f); |
598 | } |
599 | |
600 | PX_FORCE_INLINE FloatV FAdd(const FloatV a, const FloatV b) |
601 | { |
602 | ASSERT_ISVALIDFLOATV(a); |
603 | ASSERT_ISVALIDFLOATV(b); |
604 | /* |
605 | if(!isValidFloatV(a)) |
606 | { |
607 | assert(false); |
608 | } |
609 | if(!isValidFloatV(b)) |
610 | { |
611 | assert(false); |
612 | } |
613 | */ |
614 | return _mm_add_ps(a: a, b: b); |
615 | } |
616 | |
617 | PX_FORCE_INLINE FloatV FSub(const FloatV a, const FloatV b) |
618 | { |
619 | ASSERT_ISVALIDFLOATV(a); |
620 | ASSERT_ISVALIDFLOATV(b); |
621 | return _mm_sub_ps(a: a, b: b); |
622 | } |
623 | |
624 | PX_FORCE_INLINE FloatV FMul(const FloatV a, const FloatV b) |
625 | { |
626 | ASSERT_ISVALIDFLOATV(a); |
627 | ASSERT_ISVALIDFLOATV(b); |
628 | return _mm_mul_ps(a: a, b: b); |
629 | } |
630 | |
631 | PX_FORCE_INLINE FloatV FDiv(const FloatV a, const FloatV b) |
632 | { |
633 | ASSERT_ISVALIDFLOATV(a); |
634 | ASSERT_ISVALIDFLOATV(b); |
635 | return _mm_div_ps(a: a, b: b); |
636 | } |
637 | |
638 | PX_FORCE_INLINE FloatV FDivFast(const FloatV a, const FloatV b) |
639 | { |
640 | ASSERT_ISVALIDFLOATV(a); |
641 | ASSERT_ISVALIDFLOATV(b); |
642 | return _mm_mul_ps(a: a, b: _mm_rcp_ps(a: b)); |
643 | } |
644 | |
645 | PX_FORCE_INLINE FloatV FRecip(const FloatV a) |
646 | { |
647 | ASSERT_ISVALIDFLOATV(a); |
648 | return _mm_div_ps(a: FOne(), b: a); |
649 | } |
650 | |
651 | PX_FORCE_INLINE FloatV FRecipFast(const FloatV a) |
652 | { |
653 | ASSERT_ISVALIDFLOATV(a); |
654 | return _mm_rcp_ps(a: a); |
655 | } |
656 | |
657 | PX_FORCE_INLINE FloatV FRsqrt(const FloatV a) |
658 | { |
659 | ASSERT_ISVALIDFLOATV(a); |
660 | return _mm_div_ps(a: FOne(), b: _mm_sqrt_ps(a: a)); |
661 | } |
662 | |
663 | PX_FORCE_INLINE FloatV FSqrt(const FloatV a) |
664 | { |
665 | ASSERT_ISVALIDFLOATV(a); |
666 | return _mm_sqrt_ps(a: a); |
667 | } |
668 | |
669 | PX_FORCE_INLINE FloatV FRsqrtFast(const FloatV a) |
670 | { |
671 | ASSERT_ISVALIDFLOATV(a); |
672 | return _mm_rsqrt_ps(a: a); |
673 | } |
674 | |
675 | PX_FORCE_INLINE FloatV FScaleAdd(const FloatV a, const FloatV b, const FloatV c) |
676 | { |
677 | ASSERT_ISVALIDFLOATV(a); |
678 | ASSERT_ISVALIDFLOATV(b); |
679 | ASSERT_ISVALIDFLOATV(c); |
680 | return FAdd(a: FMul(a, b), b: c); |
681 | } |
682 | |
683 | PX_FORCE_INLINE FloatV FNegScaleSub(const FloatV a, const FloatV b, const FloatV c) |
684 | { |
685 | ASSERT_ISVALIDFLOATV(a); |
686 | ASSERT_ISVALIDFLOATV(b); |
687 | ASSERT_ISVALIDFLOATV(c); |
688 | return FSub(a: c, b: FMul(a, b)); |
689 | } |
690 | |
691 | PX_FORCE_INLINE FloatV FAbs(const FloatV a) |
692 | { |
693 | ASSERT_ISVALIDFLOATV(a); |
694 | PX_ALIGN(16, const PxU32) absMask[4] = { 0x7fFFffFF, 0x7fFFffFF, 0x7fFFffFF, 0x7fFFffFF }; |
695 | return _mm_and_ps(a: a, b: _mm_load_ps(p: reinterpret_cast<const PxF32*>(absMask))); |
696 | } |
697 | |
698 | PX_FORCE_INLINE FloatV FSel(const BoolV c, const FloatV a, const FloatV b) |
699 | { |
700 | PX_ASSERT(_VecMathTests::allElementsEqualBoolV(c,BTTTT()) || |
701 | _VecMathTests::allElementsEqualBoolV(c,BFFFF())); |
702 | ASSERT_ISVALIDFLOATV(_mm_or_ps(_mm_andnot_ps(c, b), _mm_and_ps(c, a))); |
703 | return _mm_or_ps(a: _mm_andnot_ps(a: c, b: b), b: _mm_and_ps(a: c, b: a)); |
704 | } |
705 | |
706 | PX_FORCE_INLINE BoolV FIsGrtr(const FloatV a, const FloatV b) |
707 | { |
708 | ASSERT_ISVALIDFLOATV(a); |
709 | ASSERT_ISVALIDFLOATV(b); |
710 | return _mm_cmpgt_ps(a: a, b: b); |
711 | } |
712 | |
713 | PX_FORCE_INLINE BoolV FIsGrtrOrEq(const FloatV a, const FloatV b) |
714 | { |
715 | ASSERT_ISVALIDFLOATV(a); |
716 | ASSERT_ISVALIDFLOATV(b); |
717 | return _mm_cmpge_ps(a: a, b: b); |
718 | } |
719 | |
720 | PX_FORCE_INLINE BoolV FIsEq(const FloatV a, const FloatV b) |
721 | { |
722 | ASSERT_ISVALIDFLOATV(a); |
723 | ASSERT_ISVALIDFLOATV(b); |
724 | return _mm_cmpeq_ps(a: a, b: b); |
725 | } |
726 | |
727 | PX_FORCE_INLINE FloatV FMax(const FloatV a, const FloatV b) |
728 | { |
729 | ASSERT_ISVALIDFLOATV(a); |
730 | ASSERT_ISVALIDFLOATV(b); |
731 | return _mm_max_ps(a: a, b: b); |
732 | } |
733 | |
734 | PX_FORCE_INLINE FloatV FMin(const FloatV a, const FloatV b) |
735 | { |
736 | ASSERT_ISVALIDFLOATV(a); |
737 | ASSERT_ISVALIDFLOATV(b); |
738 | return _mm_min_ps(a: a, b: b); |
739 | } |
740 | |
741 | PX_FORCE_INLINE FloatV FClamp(const FloatV a, const FloatV minV, const FloatV maxV) |
742 | { |
743 | ASSERT_ISVALIDFLOATV(minV); |
744 | ASSERT_ISVALIDFLOATV(maxV); |
745 | return _mm_max_ps(a: _mm_min_ps(a: a, b: maxV), b: minV); |
746 | } |
747 | |
748 | PX_FORCE_INLINE PxU32 FAllGrtr(const FloatV a, const FloatV b) |
749 | { |
750 | ASSERT_ISVALIDFLOATV(a); |
751 | ASSERT_ISVALIDFLOATV(b); |
752 | return _mm_comigt_ss(a: a, b: b); |
753 | } |
754 | |
755 | PX_FORCE_INLINE PxU32 FAllGrtrOrEq(const FloatV a, const FloatV b) |
756 | { |
757 | ASSERT_ISVALIDFLOATV(a); |
758 | ASSERT_ISVALIDFLOATV(b); |
759 | return _mm_comige_ss(a: a, b: b); |
760 | } |
761 | |
762 | PX_FORCE_INLINE PxU32 FAllEq(const FloatV a, const FloatV b) |
763 | { |
764 | ASSERT_ISVALIDFLOATV(a); |
765 | ASSERT_ISVALIDFLOATV(b); |
766 | return _mm_comieq_ss(a: a, b: b); |
767 | } |
768 | |
769 | PX_FORCE_INLINE FloatV FRound(const FloatV a) |
770 | { |
771 | ASSERT_ISVALIDFLOATV(a); |
772 | #ifdef __SSE4_2__ |
773 | return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); |
774 | #else |
775 | // return _mm_round_ps(a, 0x0); |
776 | const FloatV half = FLoad(f: 0.5f); |
777 | const __m128 signBit = _mm_cvtepi32_ps(a: _mm_srli_epi32(a: _mm_cvtps_epi32(a: a), count: 31)); |
778 | const FloatV aRound = FSub(a: FAdd(a, b: half), b: signBit); |
779 | __m128i tmp = _mm_cvttps_epi32(a: aRound); |
780 | return _mm_cvtepi32_ps(a: tmp); |
781 | #endif |
782 | } |
783 | |
784 | PX_FORCE_INLINE FloatV FSin(const FloatV a) |
785 | { |
786 | ASSERT_ISVALIDFLOATV(a); |
787 | |
788 | // Modulo the range of the given angles such that -XM_2PI <= Angles < XM_2PI |
789 | const FloatV recipTwoPi = V4LoadA(f: g_PXReciprocalTwoPi.f); |
790 | const FloatV twoPi = V4LoadA(f: g_PXTwoPi.f); |
791 | const FloatV tmp = FMul(a, b: recipTwoPi); |
792 | const FloatV b = FRound(a: tmp); |
793 | const FloatV V1 = FNegScaleSub(a: twoPi, b, c: a); |
794 | |
795 | // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! - |
796 | // V^15 / 15! + V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23! (for -PI <= V < PI) |
797 | const FloatV V2 = FMul(a: V1, b: V1); |
798 | const FloatV V3 = FMul(a: V2, b: V1); |
799 | const FloatV V5 = FMul(a: V3, b: V2); |
800 | const FloatV V7 = FMul(a: V5, b: V2); |
801 | const FloatV V9 = FMul(a: V7, b: V2); |
802 | const FloatV V11 = FMul(a: V9, b: V2); |
803 | const FloatV V13 = FMul(a: V11, b: V2); |
804 | const FloatV V15 = FMul(a: V13, b: V2); |
805 | const FloatV V17 = FMul(a: V15, b: V2); |
806 | const FloatV V19 = FMul(a: V17, b: V2); |
807 | const FloatV V21 = FMul(a: V19, b: V2); |
808 | const FloatV V23 = FMul(a: V21, b: V2); |
809 | |
810 | const Vec4V sinCoefficients0 = V4LoadA(f: g_PXSinCoefficients0.f); |
811 | const Vec4V sinCoefficients1 = V4LoadA(f: g_PXSinCoefficients1.f); |
812 | const Vec4V sinCoefficients2 = V4LoadA(f: g_PXSinCoefficients2.f); |
813 | |
814 | const FloatV S1 = V4GetY(f: sinCoefficients0); |
815 | const FloatV S2 = V4GetZ(f: sinCoefficients0); |
816 | const FloatV S3 = V4GetW(f: sinCoefficients0); |
817 | const FloatV S4 = V4GetX(f: sinCoefficients1); |
818 | const FloatV S5 = V4GetY(f: sinCoefficients1); |
819 | const FloatV S6 = V4GetZ(f: sinCoefficients1); |
820 | const FloatV S7 = V4GetW(f: sinCoefficients1); |
821 | const FloatV S8 = V4GetX(f: sinCoefficients2); |
822 | const FloatV S9 = V4GetY(f: sinCoefficients2); |
823 | const FloatV S10 = V4GetZ(f: sinCoefficients2); |
824 | const FloatV S11 = V4GetW(f: sinCoefficients2); |
825 | |
826 | FloatV Result; |
827 | Result = FScaleAdd(a: S1, b: V3, c: V1); |
828 | Result = FScaleAdd(a: S2, b: V5, c: Result); |
829 | Result = FScaleAdd(a: S3, b: V7, c: Result); |
830 | Result = FScaleAdd(a: S4, b: V9, c: Result); |
831 | Result = FScaleAdd(a: S5, b: V11, c: Result); |
832 | Result = FScaleAdd(a: S6, b: V13, c: Result); |
833 | Result = FScaleAdd(a: S7, b: V15, c: Result); |
834 | Result = FScaleAdd(a: S8, b: V17, c: Result); |
835 | Result = FScaleAdd(a: S9, b: V19, c: Result); |
836 | Result = FScaleAdd(a: S10, b: V21, c: Result); |
837 | Result = FScaleAdd(a: S11, b: V23, c: Result); |
838 | |
839 | return Result; |
840 | } |
841 | |
842 | PX_FORCE_INLINE FloatV FCos(const FloatV a) |
843 | { |
844 | ASSERT_ISVALIDFLOATV(a); |
845 | |
846 | // Modulo the range of the given angles such that -XM_2PI <= Angles < XM_2PI |
847 | const FloatV recipTwoPi = V4LoadA(f: g_PXReciprocalTwoPi.f); |
848 | const FloatV twoPi = V4LoadA(f: g_PXTwoPi.f); |
849 | const FloatV tmp = FMul(a, b: recipTwoPi); |
850 | const FloatV b = FRound(a: tmp); |
851 | const FloatV V1 = FNegScaleSub(a: twoPi, b, c: a); |
852 | |
853 | // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! + V^12 / 12! - |
854 | // V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! (for -PI <= V < PI) |
855 | const FloatV V2 = FMul(a: V1, b: V1); |
856 | const FloatV V4 = FMul(a: V2, b: V2); |
857 | const FloatV V6 = FMul(a: V4, b: V2); |
858 | const FloatV V8 = FMul(a: V4, b: V4); |
859 | const FloatV V10 = FMul(a: V6, b: V4); |
860 | const FloatV V12 = FMul(a: V6, b: V6); |
861 | const FloatV V14 = FMul(a: V8, b: V6); |
862 | const FloatV V16 = FMul(a: V8, b: V8); |
863 | const FloatV V18 = FMul(a: V10, b: V8); |
864 | const FloatV V20 = FMul(a: V10, b: V10); |
865 | const FloatV V22 = FMul(a: V12, b: V10); |
866 | |
867 | const Vec4V cosCoefficients0 = V4LoadA(f: g_PXCosCoefficients0.f); |
868 | const Vec4V cosCoefficients1 = V4LoadA(f: g_PXCosCoefficients1.f); |
869 | const Vec4V cosCoefficients2 = V4LoadA(f: g_PXCosCoefficients2.f); |
870 | |
871 | const FloatV C1 = V4GetY(f: cosCoefficients0); |
872 | const FloatV C2 = V4GetZ(f: cosCoefficients0); |
873 | const FloatV C3 = V4GetW(f: cosCoefficients0); |
874 | const FloatV C4 = V4GetX(f: cosCoefficients1); |
875 | const FloatV C5 = V4GetY(f: cosCoefficients1); |
876 | const FloatV C6 = V4GetZ(f: cosCoefficients1); |
877 | const FloatV C7 = V4GetW(f: cosCoefficients1); |
878 | const FloatV C8 = V4GetX(f: cosCoefficients2); |
879 | const FloatV C9 = V4GetY(f: cosCoefficients2); |
880 | const FloatV C10 = V4GetZ(f: cosCoefficients2); |
881 | const FloatV C11 = V4GetW(f: cosCoefficients2); |
882 | |
883 | FloatV Result; |
884 | Result = FScaleAdd(a: C1, b: V2, c: V4One()); |
885 | Result = FScaleAdd(a: C2, b: V4, c: Result); |
886 | Result = FScaleAdd(a: C3, b: V6, c: Result); |
887 | Result = FScaleAdd(a: C4, b: V8, c: Result); |
888 | Result = FScaleAdd(a: C5, b: V10, c: Result); |
889 | Result = FScaleAdd(a: C6, b: V12, c: Result); |
890 | Result = FScaleAdd(a: C7, b: V14, c: Result); |
891 | Result = FScaleAdd(a: C8, b: V16, c: Result); |
892 | Result = FScaleAdd(a: C9, b: V18, c: Result); |
893 | Result = FScaleAdd(a: C10, b: V20, c: Result); |
894 | Result = FScaleAdd(a: C11, b: V22, c: Result); |
895 | |
896 | return Result; |
897 | } |
898 | |
899 | PX_FORCE_INLINE PxU32 FOutOfBounds(const FloatV a, const FloatV min, const FloatV max) |
900 | { |
901 | ASSERT_ISVALIDFLOATV(a); |
902 | ASSERT_ISVALIDFLOATV(min); |
903 | ASSERT_ISVALIDFLOATV(max); |
904 | const BoolV c = BOr(a: FIsGrtr(a, b: max), b: FIsGrtr(a: min, b: a)); |
905 | return !BAllEqFFFF(a: c); |
906 | } |
907 | |
908 | PX_FORCE_INLINE PxU32 FInBounds(const FloatV a, const FloatV min, const FloatV max) |
909 | { |
910 | ASSERT_ISVALIDFLOATV(a); |
911 | ASSERT_ISVALIDFLOATV(min); |
912 | ASSERT_ISVALIDFLOATV(max) |
913 | const BoolV c = BAnd(a: FIsGrtrOrEq(a, b: min), b: FIsGrtrOrEq(a: max, b: a)); |
914 | return BAllEqTTTT(a: c); |
915 | } |
916 | |
917 | PX_FORCE_INLINE PxU32 FOutOfBounds(const FloatV a, const FloatV bounds) |
918 | { |
919 | ASSERT_ISVALIDFLOATV(a); |
920 | ASSERT_ISVALIDFLOATV(bounds); |
921 | return FOutOfBounds(a, min: FNeg(f: bounds), max: bounds); |
922 | } |
923 | |
924 | PX_FORCE_INLINE PxU32 FInBounds(const FloatV a, const FloatV bounds) |
925 | { |
926 | ASSERT_ISVALIDFLOATV(a); |
927 | ASSERT_ISVALIDFLOATV(bounds); |
928 | return FInBounds(a, min: FNeg(f: bounds), max: bounds); |
929 | } |
930 | |
931 | ////////////////////////////////// |
932 | // VEC3V |
933 | ////////////////////////////////// |
934 | |
935 | PX_FORCE_INLINE Vec3V V3Splat(const FloatV f) |
936 | { |
937 | ASSERT_ISVALIDFLOATV(f); |
938 | const __m128 zero = FZero(); |
939 | const __m128 fff0 = _mm_move_ss(a: f, b: zero); |
940 | return _mm_shuffle_ps(fff0, fff0, _MM_SHUFFLE(0, 1, 2, 3)); |
941 | } |
942 | |
943 | PX_FORCE_INLINE Vec3V V3Merge(const FloatVArg x, const FloatVArg y, const FloatVArg z) |
944 | { |
945 | ASSERT_ISVALIDFLOATV(x); |
946 | ASSERT_ISVALIDFLOATV(y); |
947 | ASSERT_ISVALIDFLOATV(z); |
948 | // static on zero causes compiler crash on x64 debug_opt |
949 | const __m128 zero = FZero(); |
950 | const __m128 xy = _mm_move_ss(a: x, b: y); |
951 | const __m128 z0 = _mm_move_ss(a: zero, b: z); |
952 | |
953 | return _mm_shuffle_ps(xy, z0, _MM_SHUFFLE(1, 0, 0, 1)); |
954 | } |
955 | |
956 | PX_FORCE_INLINE Vec3V V3UnitX() |
957 | { |
958 | const PX_ALIGN(16, PxF32) x[4] = { 1.0f, 0.0f, 0.0f, 0.0f }; |
959 | const __m128 x128 = _mm_load_ps(p: x); |
960 | return x128; |
961 | } |
962 | |
963 | PX_FORCE_INLINE Vec3V V3UnitY() |
964 | { |
965 | const PX_ALIGN(16, PxF32) y[4] = { 0.0f, 1.0f, 0.0f, 0.0f }; |
966 | const __m128 y128 = _mm_load_ps(p: y); |
967 | return y128; |
968 | } |
969 | |
970 | PX_FORCE_INLINE Vec3V V3UnitZ() |
971 | { |
972 | const PX_ALIGN(16, PxF32) z[4] = { 0.0f, 0.0f, 1.0f, 0.0f }; |
973 | const __m128 z128 = _mm_load_ps(p: z); |
974 | return z128; |
975 | } |
976 | |
977 | PX_FORCE_INLINE FloatV V3GetX(const Vec3V f) |
978 | { |
979 | ASSERT_ISVALIDVEC3V(f); |
980 | return _mm_shuffle_ps(f, f, _MM_SHUFFLE(0, 0, 0, 0)); |
981 | } |
982 | |
983 | PX_FORCE_INLINE FloatV V3GetY(const Vec3V f) |
984 | { |
985 | ASSERT_ISVALIDVEC3V(f) |
986 | return _mm_shuffle_ps(f, f, _MM_SHUFFLE(1, 1, 1, 1)); |
987 | } |
988 | |
989 | PX_FORCE_INLINE FloatV V3GetZ(const Vec3V f) |
990 | { |
991 | ASSERT_ISVALIDVEC3V(f); |
992 | return _mm_shuffle_ps(f, f, _MM_SHUFFLE(2, 2, 2, 2)); |
993 | } |
994 | |
995 | PX_FORCE_INLINE Vec3V V3SetX(const Vec3V v, const FloatV f) |
996 | { |
997 | ASSERT_ISVALIDVEC3V(v); |
998 | ASSERT_ISVALIDFLOATV(f); |
999 | return V4Sel(c: BFTTT(), a: v, b: f); |
1000 | } |
1001 | |
1002 | PX_FORCE_INLINE Vec3V V3SetY(const Vec3V v, const FloatV f) |
1003 | { |
1004 | ASSERT_ISVALIDVEC3V(v); |
1005 | ASSERT_ISVALIDFLOATV(f); |
1006 | return V4Sel(c: BTFTT(), a: v, b: f); |
1007 | } |
1008 | |
1009 | PX_FORCE_INLINE Vec3V V3SetZ(const Vec3V v, const FloatV f) |
1010 | { |
1011 | ASSERT_ISVALIDVEC3V(v); |
1012 | ASSERT_ISVALIDFLOATV(f); |
1013 | return V4Sel(c: BTTFT(), a: v, b: f); |
1014 | } |
1015 | |
1016 | PX_FORCE_INLINE Vec3V V3ColX(const Vec3V a, const Vec3V b, const Vec3V c) |
1017 | { |
1018 | ASSERT_ISVALIDVEC3V(a); |
1019 | ASSERT_ISVALIDVEC3V(b); |
1020 | ASSERT_ISVALIDVEC3V(c); |
1021 | Vec3V r = _mm_shuffle_ps(a, c, _MM_SHUFFLE(3, 0, 3, 0)); |
1022 | return V3SetY(v: r, f: V3GetX(f: b)); |
1023 | } |
1024 | |
1025 | PX_FORCE_INLINE Vec3V V3ColY(const Vec3V a, const Vec3V b, const Vec3V c) |
1026 | { |
1027 | ASSERT_ISVALIDVEC3V(a); |
1028 | ASSERT_ISVALIDVEC3V(b); |
1029 | ASSERT_ISVALIDVEC3V(c) |
1030 | Vec3V r = _mm_shuffle_ps(a, c, _MM_SHUFFLE(3, 1, 3, 1)); |
1031 | return V3SetY(v: r, f: V3GetY(f: b)); |
1032 | } |
1033 | |
1034 | PX_FORCE_INLINE Vec3V V3ColZ(const Vec3V a, const Vec3V b, const Vec3V c) |
1035 | { |
1036 | ASSERT_ISVALIDVEC3V(a); |
1037 | ASSERT_ISVALIDVEC3V(b); |
1038 | ASSERT_ISVALIDVEC3V(c); |
1039 | Vec3V r = _mm_shuffle_ps(a, c, _MM_SHUFFLE(3, 2, 3, 2)); |
1040 | return V3SetY(v: r, f: V3GetZ(f: b)); |
1041 | } |
1042 | |
1043 | PX_FORCE_INLINE Vec3V V3Zero() |
1044 | { |
1045 | return V3Load(f: 0.0f); |
1046 | } |
1047 | |
1048 | PX_FORCE_INLINE Vec3V V3Eps() |
1049 | { |
1050 | return V3Load(PX_EPS_REAL); |
1051 | } |
1052 | PX_FORCE_INLINE Vec3V V3One() |
1053 | { |
1054 | return V3Load(f: 1.0f); |
1055 | } |
1056 | |
1057 | PX_FORCE_INLINE Vec3V V3Neg(const Vec3V f) |
1058 | { |
1059 | ASSERT_ISVALIDVEC3V(f); |
1060 | return _mm_sub_ps(a: _mm_setzero_ps(), b: f); |
1061 | } |
1062 | |
1063 | PX_FORCE_INLINE Vec3V V3Add(const Vec3V a, const Vec3V b) |
1064 | { |
1065 | ASSERT_ISVALIDVEC3V(a); |
1066 | ASSERT_ISVALIDVEC3V(b); |
1067 | return _mm_add_ps(a: a, b: b); |
1068 | } |
1069 | |
1070 | PX_FORCE_INLINE Vec3V V3Sub(const Vec3V a, const Vec3V b) |
1071 | { |
1072 | ASSERT_ISVALIDVEC3V(a); |
1073 | ASSERT_ISVALIDVEC3V(b); |
1074 | return _mm_sub_ps(a: a, b: b); |
1075 | } |
1076 | |
1077 | PX_FORCE_INLINE Vec3V V3Scale(const Vec3V a, const FloatV b) |
1078 | { |
1079 | ASSERT_ISVALIDVEC3V(a); |
1080 | ASSERT_ISVALIDFLOATV(b); |
1081 | return _mm_mul_ps(a: a, b: b); |
1082 | } |
1083 | |
1084 | PX_FORCE_INLINE Vec3V V3Mul(const Vec3V a, const Vec3V b) |
1085 | { |
1086 | ASSERT_ISVALIDVEC3V(a); |
1087 | ASSERT_ISVALIDVEC3V(b); |
1088 | return _mm_mul_ps(a: a, b: b); |
1089 | } |
1090 | |
1091 | PX_FORCE_INLINE Vec3V V3ScaleInv(const Vec3V a, const FloatV b) |
1092 | { |
1093 | ASSERT_ISVALIDVEC3V(a); |
1094 | ASSERT_ISVALIDFLOATV(b); |
1095 | return _mm_div_ps(a: a, b: b); |
1096 | } |
1097 | |
1098 | PX_FORCE_INLINE Vec3V V3Div(const Vec3V a, const Vec3V b) |
1099 | { |
1100 | ASSERT_ISVALIDVEC3V(a); |
1101 | ASSERT_ISVALIDVEC3V(b); |
1102 | return V4ClearW(v: _mm_div_ps(a: a, b: b)); |
1103 | } |
1104 | |
1105 | PX_FORCE_INLINE Vec3V V3ScaleInvFast(const Vec3V a, const FloatV b) |
1106 | { |
1107 | ASSERT_ISVALIDVEC3V(a); |
1108 | ASSERT_ISVALIDFLOATV(b); |
1109 | return _mm_mul_ps(a: a, b: _mm_rcp_ps(a: b)); |
1110 | } |
1111 | |
1112 | PX_FORCE_INLINE Vec3V V3DivFast(const Vec3V a, const Vec3V b) |
1113 | { |
1114 | ASSERT_ISVALIDVEC3V(a); |
1115 | ASSERT_ISVALIDVEC3V(b); |
1116 | return V4ClearW(v: _mm_mul_ps(a: a, b: _mm_rcp_ps(a: b))); |
1117 | } |
1118 | |
1119 | PX_FORCE_INLINE Vec3V V3Recip(const Vec3V a) |
1120 | { |
1121 | ASSERT_ISVALIDVEC3V(a); |
1122 | const __m128 zero = V3Zero(); |
1123 | const __m128 tttf = BTTTF(); |
1124 | const __m128 recipA = _mm_div_ps(a: V3One(), b: a); |
1125 | return V4Sel(c: tttf, a: recipA, b: zero); |
1126 | } |
1127 | |
1128 | PX_FORCE_INLINE Vec3V V3RecipFast(const Vec3V a) |
1129 | { |
1130 | ASSERT_ISVALIDVEC3V(a); |
1131 | const __m128 zero = V3Zero(); |
1132 | const __m128 tttf = BTTTF(); |
1133 | const __m128 recipA = _mm_rcp_ps(a: a); |
1134 | return V4Sel(c: tttf, a: recipA, b: zero); |
1135 | } |
1136 | |
1137 | PX_FORCE_INLINE Vec3V V3Rsqrt(const Vec3V a) |
1138 | { |
1139 | ASSERT_ISVALIDVEC3V(a); |
1140 | const __m128 zero = V3Zero(); |
1141 | const __m128 tttf = BTTTF(); |
1142 | const __m128 recipA = _mm_div_ps(a: V3One(), b: _mm_sqrt_ps(a: a)); |
1143 | return V4Sel(c: tttf, a: recipA, b: zero); |
1144 | } |
1145 | |
1146 | PX_FORCE_INLINE Vec3V V3RsqrtFast(const Vec3V a) |
1147 | { |
1148 | ASSERT_ISVALIDVEC3V(a); |
1149 | const __m128 zero = V3Zero(); |
1150 | const __m128 tttf = BTTTF(); |
1151 | const __m128 recipA = _mm_rsqrt_ps(a: a); |
1152 | return V4Sel(c: tttf, a: recipA, b: zero); |
1153 | } |
1154 | |
1155 | PX_FORCE_INLINE Vec3V V3ScaleAdd(const Vec3V a, const FloatV b, const Vec3V c) |
1156 | { |
1157 | ASSERT_ISVALIDVEC3V(a); |
1158 | ASSERT_ISVALIDFLOATV(b); |
1159 | ASSERT_ISVALIDVEC3V(c); |
1160 | return V3Add(a: V3Scale(a, b), b: c); |
1161 | } |
1162 | |
1163 | PX_FORCE_INLINE Vec3V V3NegScaleSub(const Vec3V a, const FloatV b, const Vec3V c) |
1164 | { |
1165 | ASSERT_ISVALIDVEC3V(a); |
1166 | ASSERT_ISVALIDFLOATV(b); |
1167 | ASSERT_ISVALIDVEC3V(c); |
1168 | return V3Sub(a: c, b: V3Scale(a, b)); |
1169 | } |
1170 | |
1171 | PX_FORCE_INLINE Vec3V V3MulAdd(const Vec3V a, const Vec3V b, const Vec3V c) |
1172 | { |
1173 | ASSERT_ISVALIDVEC3V(a); |
1174 | ASSERT_ISVALIDVEC3V(b); |
1175 | ASSERT_ISVALIDVEC3V(c); |
1176 | return V3Add(a: V3Mul(a, b), b: c); |
1177 | } |
1178 | |
1179 | PX_FORCE_INLINE Vec3V V3NegMulSub(const Vec3V a, const Vec3V b, const Vec3V c) |
1180 | { |
1181 | ASSERT_ISVALIDVEC3V(a); |
1182 | ASSERT_ISVALIDVEC3V(b); |
1183 | ASSERT_ISVALIDVEC3V(c); |
1184 | return V3Sub(a: c, b: V3Mul(a, b)); |
1185 | } |
1186 | |
1187 | PX_FORCE_INLINE Vec3V V3Abs(const Vec3V a) |
1188 | { |
1189 | ASSERT_ISVALIDVEC3V(a); |
1190 | return V3Max(a, b: V3Neg(f: a)); |
1191 | } |
1192 | |
1193 | PX_FORCE_INLINE FloatV V3Dot(const Vec3V a, const Vec3V b) |
1194 | { |
1195 | ASSERT_ISVALIDVEC3V(a); |
1196 | ASSERT_ISVALIDVEC3V(b); |
1197 | #ifdef __SSE4_2__ |
1198 | return _mm_dp_ps(a, b, 0x7f); |
1199 | #else |
1200 | const __m128 t0 = _mm_mul_ps(a: a, b: b); // aw*bw | az*bz | ay*by | ax*bx |
1201 | const __m128 t1 = _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(1,0,3,2)); // ay*by | ax*bx | aw*bw | az*bz |
1202 | const __m128 t2 = _mm_add_ps(a: t0, b: t1); // ay*by + aw*bw | ax*bx + az*bz | aw*bw + ay*by | az*bz + ax*bx |
1203 | const __m128 t3 = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2,3,0,1)); // ax*bx + az*bz | ay*by + aw*bw | az*bz + ax*bx | aw*bw + ay*by |
1204 | return _mm_add_ps(a: t3, b: t2); // ax*bx + az*bz + ay*by + aw*bw |
1205 | // ay*by + aw*bw + ax*bx + az*bz |
1206 | // az*bz + ax*bx + aw*bw + ay*by |
1207 | // aw*bw + ay*by + az*bz + ax*bx |
1208 | #endif |
1209 | } |
1210 | |
1211 | PX_FORCE_INLINE Vec3V V3Cross(const Vec3V a, const Vec3V b) |
1212 | { |
1213 | ASSERT_ISVALIDVEC3V(a); |
1214 | ASSERT_ISVALIDVEC3V(b); |
1215 | const __m128 r1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w |
1216 | const __m128 r2 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w |
1217 | const __m128 l1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w |
1218 | const __m128 l2 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w |
1219 | return _mm_sub_ps(a: _mm_mul_ps(a: l1, b: l2), b: _mm_mul_ps(a: r1, b: r2)); |
1220 | } |
1221 | |
1222 | PX_FORCE_INLINE VecCrossV V3PrepareCross(const Vec3V a) |
1223 | { |
1224 | ASSERT_ISVALIDVEC3V(a); |
1225 | VecCrossV v; |
1226 | v.mR1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w |
1227 | v.mL1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w |
1228 | return v; |
1229 | } |
1230 | |
1231 | PX_FORCE_INLINE Vec3V V3Cross(const VecCrossV& a, const Vec3V b) |
1232 | { |
1233 | ASSERT_ISVALIDVEC3V(b); |
1234 | const __m128 r2 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w |
1235 | const __m128 l2 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w |
1236 | return _mm_sub_ps(a: _mm_mul_ps(a: a.mL1, b: l2), b: _mm_mul_ps(a: a.mR1, b: r2)); |
1237 | } |
1238 | |
1239 | PX_FORCE_INLINE Vec3V V3Cross(const Vec3V a, const VecCrossV& b) |
1240 | { |
1241 | ASSERT_ISVALIDVEC3V(a); |
1242 | const __m128 r2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w |
1243 | const __m128 l2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w |
1244 | return _mm_sub_ps(a: _mm_mul_ps(a: b.mR1, b: r2), b: _mm_mul_ps(a: b.mL1, b: l2)); |
1245 | } |
1246 | |
1247 | PX_FORCE_INLINE Vec3V V3Cross(const VecCrossV& a, const VecCrossV& b) |
1248 | { |
1249 | return _mm_sub_ps(a: _mm_mul_ps(a: a.mL1, b: b.mR1), b: _mm_mul_ps(a: a.mR1, b: b.mL1)); |
1250 | } |
1251 | |
1252 | PX_FORCE_INLINE FloatV V3Length(const Vec3V a) |
1253 | { |
1254 | ASSERT_ISVALIDVEC3V(a); |
1255 | return _mm_sqrt_ps(a: V3Dot(a, b: a)); |
1256 | } |
1257 | |
1258 | PX_FORCE_INLINE FloatV V3LengthSq(const Vec3V a) |
1259 | { |
1260 | ASSERT_ISVALIDVEC3V(a); |
1261 | return V3Dot(a, b: a); |
1262 | } |
1263 | |
1264 | PX_FORCE_INLINE Vec3V V3Normalize(const Vec3V a) |
1265 | { |
1266 | ASSERT_ISVALIDVEC3V(a); |
1267 | ASSERT_ISFINITELENGTH(a); |
1268 | return V3ScaleInv(a, b: _mm_sqrt_ps(a: V3Dot(a, b: a))); |
1269 | } |
1270 | |
1271 | PX_FORCE_INLINE Vec3V V3NormalizeFast(const Vec3V a) |
1272 | { |
1273 | ASSERT_ISVALIDVEC3V(a); |
1274 | ASSERT_ISFINITELENGTH(a); |
1275 | return V3Scale(a, b: _mm_rsqrt_ps(a: V3Dot(a, b: a))); |
1276 | } |
1277 | |
1278 | PX_FORCE_INLINE Vec3V V3NormalizeSafe(const Vec3V a, const Vec3V unsafeReturnValue) |
1279 | { |
1280 | ASSERT_ISVALIDVEC3V(a); |
1281 | const __m128 eps = V3Eps(); |
1282 | const __m128 length = V3Length(a); |
1283 | const __m128 isGreaterThanZero = FIsGrtr(a: length, b: eps); |
1284 | return V3Sel(c: isGreaterThanZero, a: V3ScaleInv(a, b: length), b: unsafeReturnValue); |
1285 | } |
1286 | |
1287 | PX_FORCE_INLINE Vec3V V3Sel(const BoolV c, const Vec3V a, const Vec3V b) |
1288 | { |
1289 | ASSERT_ISVALIDVEC3V(_mm_or_ps(_mm_andnot_ps(c, b), _mm_and_ps(c, a))); |
1290 | return _mm_or_ps(a: _mm_andnot_ps(a: c, b: b), b: _mm_and_ps(a: c, b: a)); |
1291 | } |
1292 | |
1293 | PX_FORCE_INLINE BoolV V3IsGrtr(const Vec3V a, const Vec3V b) |
1294 | { |
1295 | ASSERT_ISVALIDVEC3V(a); |
1296 | ASSERT_ISVALIDVEC3V(b); |
1297 | return _mm_cmpgt_ps(a: a, b: b); |
1298 | } |
1299 | |
1300 | PX_FORCE_INLINE BoolV V3IsGrtrOrEq(const Vec3V a, const Vec3V b) |
1301 | { |
1302 | ASSERT_ISVALIDVEC3V(a); |
1303 | ASSERT_ISVALIDVEC3V(b); |
1304 | return _mm_cmpge_ps(a: a, b: b); |
1305 | } |
1306 | |
1307 | PX_FORCE_INLINE BoolV V3IsEq(const Vec3V a, const Vec3V b) |
1308 | { |
1309 | ASSERT_ISVALIDVEC3V(a); |
1310 | ASSERT_ISVALIDVEC3V(b); |
1311 | return _mm_cmpeq_ps(a: a, b: b); |
1312 | } |
1313 | |
1314 | PX_FORCE_INLINE Vec3V V3Max(const Vec3V a, const Vec3V b) |
1315 | { |
1316 | ASSERT_ISVALIDVEC3V(a); |
1317 | ASSERT_ISVALIDVEC3V(b); |
1318 | return _mm_max_ps(a: a, b: b); |
1319 | } |
1320 | |
1321 | PX_FORCE_INLINE Vec3V V3Min(const Vec3V a, const Vec3V b) |
1322 | { |
1323 | ASSERT_ISVALIDVEC3V(a); |
1324 | ASSERT_ISVALIDVEC3V(b); |
1325 | return _mm_min_ps(a: a, b: b); |
1326 | } |
1327 | |
1328 | PX_FORCE_INLINE FloatV (const Vec3V a) |
1329 | { |
1330 | ASSERT_ISVALIDVEC3V(a); |
1331 | const __m128 shuf1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)); |
1332 | const __m128 shuf2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)); |
1333 | const __m128 shuf3 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)); |
1334 | |
1335 | return _mm_max_ps(a: _mm_max_ps(a: shuf1, b: shuf2), b: shuf3); |
1336 | } |
1337 | |
1338 | PX_FORCE_INLINE FloatV (const Vec3V a) |
1339 | { |
1340 | ASSERT_ISVALIDVEC3V(a); |
1341 | |
1342 | const __m128 shuf1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)); |
1343 | const __m128 shuf2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)); |
1344 | const __m128 shuf3 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)); |
1345 | |
1346 | return _mm_min_ps(a: _mm_min_ps(a: shuf1, b: shuf2), b: shuf3); |
1347 | } |
1348 | |
1349 | // return (a >= 0.0f) ? 1.0f : -1.0f; |
1350 | PX_FORCE_INLINE Vec3V V3Sign(const Vec3V a) |
1351 | { |
1352 | ASSERT_ISVALIDVEC3V(a); |
1353 | const __m128 zero = V3Zero(); |
1354 | const __m128 one = V3One(); |
1355 | const __m128 none = V3Neg(f: one); |
1356 | return V3Sel(c: V3IsGrtrOrEq(a, b: zero), a: one, b: none); |
1357 | } |
1358 | |
1359 | PX_FORCE_INLINE Vec3V V3Clamp(const Vec3V a, const Vec3V minV, const Vec3V maxV) |
1360 | { |
1361 | ASSERT_ISVALIDVEC3V(maxV); |
1362 | ASSERT_ISVALIDVEC3V(minV); |
1363 | return V3Max(a: V3Min(a, b: maxV), b: minV); |
1364 | } |
1365 | |
1366 | PX_FORCE_INLINE PxU32 V3AllGrtr(const Vec3V a, const Vec3V b) |
1367 | { |
1368 | ASSERT_ISVALIDVEC3V(a); |
1369 | ASSERT_ISVALIDVEC3V(b); |
1370 | return internalUnitSSE2Simd::BAllTrue3_R(a: V4IsGrtr(a, b)); |
1371 | } |
1372 | |
1373 | PX_FORCE_INLINE PxU32 V3AllGrtrOrEq(const Vec3V a, const Vec3V b) |
1374 | { |
1375 | ASSERT_ISVALIDVEC3V(a); |
1376 | ASSERT_ISVALIDVEC3V(b); |
1377 | return internalUnitSSE2Simd::BAllTrue3_R(a: V4IsGrtrOrEq(a, b)); |
1378 | } |
1379 | |
1380 | PX_FORCE_INLINE PxU32 V3AllEq(const Vec3V a, const Vec3V b) |
1381 | { |
1382 | ASSERT_ISVALIDVEC3V(a); |
1383 | ASSERT_ISVALIDVEC3V(b); |
1384 | return internalUnitSSE2Simd::BAllTrue3_R(a: V4IsEq(a, b)); |
1385 | } |
1386 | |
1387 | PX_FORCE_INLINE Vec3V V3Round(const Vec3V a) |
1388 | { |
1389 | ASSERT_ISVALIDVEC3V(a); |
1390 | #ifdef __SSE4_2__ |
1391 | return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); |
1392 | #else |
1393 | // return _mm_round_ps(a, 0x0); |
1394 | const Vec3V half = V3Load(f: 0.5f); |
1395 | const __m128 signBit = _mm_cvtepi32_ps(a: _mm_srli_epi32(a: _mm_cvtps_epi32(a: a), count: 31)); |
1396 | const Vec3V aRound = V3Sub(a: V3Add(a, b: half), b: signBit); |
1397 | __m128i tmp = _mm_cvttps_epi32(a: aRound); |
1398 | return _mm_cvtepi32_ps(a: tmp); |
1399 | #endif |
1400 | } |
1401 | |
1402 | PX_FORCE_INLINE Vec3V V3Sin(const Vec3V a) |
1403 | { |
1404 | ASSERT_ISVALIDVEC3V(a); |
1405 | // Modulo the range of the given angles such that -XM_2PI <= Angles < XM_2PI |
1406 | const Vec4V recipTwoPi = V4LoadA(f: g_PXReciprocalTwoPi.f); |
1407 | const Vec4V twoPi = V4LoadA(f: g_PXTwoPi.f); |
1408 | const Vec3V tmp = V3Scale(a, b: recipTwoPi); |
1409 | const Vec3V b = V3Round(a: tmp); |
1410 | const Vec3V V1 = V3NegScaleSub(a: b, b: twoPi, c: a); |
1411 | |
1412 | // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! - |
1413 | // V^15 / 15! + V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23! (for -PI <= V < PI) |
1414 | const Vec3V V2 = V3Mul(a: V1, b: V1); |
1415 | const Vec3V V3 = V3Mul(a: V2, b: V1); |
1416 | const Vec3V V5 = V3Mul(a: V3, b: V2); |
1417 | const Vec3V V7 = V3Mul(a: V5, b: V2); |
1418 | const Vec3V V9 = V3Mul(a: V7, b: V2); |
1419 | const Vec3V V11 = V3Mul(a: V9, b: V2); |
1420 | const Vec3V V13 = V3Mul(a: V11, b: V2); |
1421 | const Vec3V V15 = V3Mul(a: V13, b: V2); |
1422 | const Vec3V V17 = V3Mul(a: V15, b: V2); |
1423 | const Vec3V V19 = V3Mul(a: V17, b: V2); |
1424 | const Vec3V V21 = V3Mul(a: V19, b: V2); |
1425 | const Vec3V V23 = V3Mul(a: V21, b: V2); |
1426 | |
1427 | const Vec4V sinCoefficients0 = V4LoadA(f: g_PXSinCoefficients0.f); |
1428 | const Vec4V sinCoefficients1 = V4LoadA(f: g_PXSinCoefficients1.f); |
1429 | const Vec4V sinCoefficients2 = V4LoadA(f: g_PXSinCoefficients2.f); |
1430 | |
1431 | const FloatV S1 = V4GetY(f: sinCoefficients0); |
1432 | const FloatV S2 = V4GetZ(f: sinCoefficients0); |
1433 | const FloatV S3 = V4GetW(f: sinCoefficients0); |
1434 | const FloatV S4 = V4GetX(f: sinCoefficients1); |
1435 | const FloatV S5 = V4GetY(f: sinCoefficients1); |
1436 | const FloatV S6 = V4GetZ(f: sinCoefficients1); |
1437 | const FloatV S7 = V4GetW(f: sinCoefficients1); |
1438 | const FloatV S8 = V4GetX(f: sinCoefficients2); |
1439 | const FloatV S9 = V4GetY(f: sinCoefficients2); |
1440 | const FloatV S10 = V4GetZ(f: sinCoefficients2); |
1441 | const FloatV S11 = V4GetW(f: sinCoefficients2); |
1442 | |
1443 | Vec3V Result; |
1444 | Result = V3ScaleAdd(a: V3, b: S1, c: V1); |
1445 | Result = V3ScaleAdd(a: V5, b: S2, c: Result); |
1446 | Result = V3ScaleAdd(a: V7, b: S3, c: Result); |
1447 | Result = V3ScaleAdd(a: V9, b: S4, c: Result); |
1448 | Result = V3ScaleAdd(a: V11, b: S5, c: Result); |
1449 | Result = V3ScaleAdd(a: V13, b: S6, c: Result); |
1450 | Result = V3ScaleAdd(a: V15, b: S7, c: Result); |
1451 | Result = V3ScaleAdd(a: V17, b: S8, c: Result); |
1452 | Result = V3ScaleAdd(a: V19, b: S9, c: Result); |
1453 | Result = V3ScaleAdd(a: V21, b: S10, c: Result); |
1454 | Result = V3ScaleAdd(a: V23, b: S11, c: Result); |
1455 | |
1456 | ASSERT_ISVALIDVEC3V(Result); |
1457 | return Result; |
1458 | } |
1459 | |
1460 | PX_FORCE_INLINE Vec3V V3Cos(const Vec3V a) |
1461 | { |
1462 | ASSERT_ISVALIDVEC3V(a); |
1463 | |
1464 | // Modulo the range of the given angles such that -XM_2PI <= Angles < XM_2PI |
1465 | const Vec4V recipTwoPi = V4LoadA(f: g_PXReciprocalTwoPi.f); |
1466 | const Vec4V twoPi = V4LoadA(f: g_PXTwoPi.f); |
1467 | const Vec3V tmp = V3Scale(a, b: recipTwoPi); |
1468 | const Vec3V b = V3Round(a: tmp); |
1469 | const Vec3V V1 = V3NegScaleSub(a: b, b: twoPi, c: a); |
1470 | |
1471 | // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! + V^12 / 12! - |
1472 | // V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! (for -PI <= V < PI) |
1473 | const Vec3V V2 = V3Mul(a: V1, b: V1); |
1474 | const Vec3V V4 = V3Mul(a: V2, b: V2); |
1475 | const Vec3V V6 = V3Mul(a: V4, b: V2); |
1476 | const Vec3V V8 = V3Mul(a: V4, b: V4); |
1477 | const Vec3V V10 = V3Mul(a: V6, b: V4); |
1478 | const Vec3V V12 = V3Mul(a: V6, b: V6); |
1479 | const Vec3V V14 = V3Mul(a: V8, b: V6); |
1480 | const Vec3V V16 = V3Mul(a: V8, b: V8); |
1481 | const Vec3V V18 = V3Mul(a: V10, b: V8); |
1482 | const Vec3V V20 = V3Mul(a: V10, b: V10); |
1483 | const Vec3V V22 = V3Mul(a: V12, b: V10); |
1484 | |
1485 | const Vec4V cosCoefficients0 = V4LoadA(f: g_PXCosCoefficients0.f); |
1486 | const Vec4V cosCoefficients1 = V4LoadA(f: g_PXCosCoefficients1.f); |
1487 | const Vec4V cosCoefficients2 = V4LoadA(f: g_PXCosCoefficients2.f); |
1488 | |
1489 | const FloatV C1 = V4GetY(f: cosCoefficients0); |
1490 | const FloatV C2 = V4GetZ(f: cosCoefficients0); |
1491 | const FloatV C3 = V4GetW(f: cosCoefficients0); |
1492 | const FloatV C4 = V4GetX(f: cosCoefficients1); |
1493 | const FloatV C5 = V4GetY(f: cosCoefficients1); |
1494 | const FloatV C6 = V4GetZ(f: cosCoefficients1); |
1495 | const FloatV C7 = V4GetW(f: cosCoefficients1); |
1496 | const FloatV C8 = V4GetX(f: cosCoefficients2); |
1497 | const FloatV C9 = V4GetY(f: cosCoefficients2); |
1498 | const FloatV C10 = V4GetZ(f: cosCoefficients2); |
1499 | const FloatV C11 = V4GetW(f: cosCoefficients2); |
1500 | |
1501 | Vec3V Result; |
1502 | Result = V3ScaleAdd(a: V2, b: C1, c: V3One()); |
1503 | Result = V3ScaleAdd(a: V4, b: C2, c: Result); |
1504 | Result = V3ScaleAdd(a: V6, b: C3, c: Result); |
1505 | Result = V3ScaleAdd(a: V8, b: C4, c: Result); |
1506 | Result = V3ScaleAdd(a: V10, b: C5, c: Result); |
1507 | Result = V3ScaleAdd(a: V12, b: C6, c: Result); |
1508 | Result = V3ScaleAdd(a: V14, b: C7, c: Result); |
1509 | Result = V3ScaleAdd(a: V16, b: C8, c: Result); |
1510 | Result = V3ScaleAdd(a: V18, b: C9, c: Result); |
1511 | Result = V3ScaleAdd(a: V20, b: C10, c: Result); |
1512 | Result = V3ScaleAdd(a: V22, b: C11, c: Result); |
1513 | |
1514 | ASSERT_ISVALIDVEC3V(Result); |
1515 | return Result; |
1516 | } |
1517 | |
1518 | PX_FORCE_INLINE Vec3V V3PermYZZ(const Vec3V a) |
1519 | { |
1520 | ASSERT_ISVALIDVEC3V(a); |
1521 | return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 2, 1)); |
1522 | } |
1523 | |
1524 | PX_FORCE_INLINE Vec3V V3PermXYX(const Vec3V a) |
1525 | { |
1526 | ASSERT_ISVALIDVEC3V(a); |
1527 | return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 1, 0)); |
1528 | } |
1529 | |
1530 | PX_FORCE_INLINE Vec3V V3PermYZX(const Vec3V a) |
1531 | { |
1532 | ASSERT_ISVALIDVEC3V(a); |
1533 | return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); |
1534 | } |
1535 | |
1536 | PX_FORCE_INLINE Vec3V V3PermZXY(const Vec3V a) |
1537 | { |
1538 | ASSERT_ISVALIDVEC3V(a); |
1539 | return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)); |
1540 | } |
1541 | |
1542 | PX_FORCE_INLINE Vec3V V3PermZZY(const Vec3V a) |
1543 | { |
1544 | ASSERT_ISVALIDVEC3V(a); |
1545 | return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 2, 2)); |
1546 | } |
1547 | |
1548 | PX_FORCE_INLINE Vec3V V3PermYXX(const Vec3V a) |
1549 | { |
1550 | ASSERT_ISVALIDVEC3V(a); |
1551 | return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 0, 1)); |
1552 | } |
1553 | |
1554 | PX_FORCE_INLINE Vec3V V3Perm_Zero_1Z_0Y(const Vec3V v0, const Vec3V v1) |
1555 | { |
1556 | ASSERT_ISVALIDVEC3V(v0); |
1557 | ASSERT_ISVALIDVEC3V(v1); |
1558 | return _mm_shuffle_ps(v1, v0, _MM_SHUFFLE(3, 1, 2, 3)); |
1559 | } |
1560 | |
1561 | PX_FORCE_INLINE Vec3V V3Perm_0Z_Zero_1X(const Vec3V v0, const Vec3V v1) |
1562 | { |
1563 | ASSERT_ISVALIDVEC3V(v0); |
1564 | ASSERT_ISVALIDVEC3V(v1); |
1565 | return _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(3, 0, 3, 2)); |
1566 | } |
1567 | |
1568 | PX_FORCE_INLINE Vec3V V3Perm_1Y_0X_Zero(const Vec3V v0, const Vec3V v1) |
1569 | { |
1570 | ASSERT_ISVALIDVEC3V(v0); |
1571 | ASSERT_ISVALIDVEC3V(v1); |
1572 | // There must be a better way to do this. |
1573 | Vec3V v2 = V3Zero(); |
1574 | FloatV y1 = V3GetY(f: v1); |
1575 | FloatV x0 = V3GetX(f: v0); |
1576 | v2 = V3SetX(v: v2, f: y1); |
1577 | return V3SetY(v: v2, f: x0); |
1578 | } |
1579 | |
1580 | PX_FORCE_INLINE FloatV V3SumElems(const Vec3V a) |
1581 | { |
1582 | ASSERT_ISVALIDVEC3V(a); |
1583 | #ifdef __SSE4_2__ |
1584 | Vec3V r = _mm_hadd_ps(a, a); |
1585 | r = _mm_hadd_ps(r, r); |
1586 | return r; |
1587 | #else |
1588 | __m128 shuf1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)); // z,y,x,w |
1589 | __m128 shuf2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)); // y,x,w,z |
1590 | __m128 shuf3 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)); // x,w,z,y |
1591 | return _mm_add_ps(a: _mm_add_ps(a: shuf1, b: shuf2), b: shuf3); |
1592 | #endif |
1593 | } |
1594 | |
1595 | PX_FORCE_INLINE PxU32 V3OutOfBounds(const Vec3V a, const Vec3V min, const Vec3V max) |
1596 | { |
1597 | ASSERT_ISVALIDVEC3V(a); |
1598 | ASSERT_ISVALIDVEC3V(min); |
1599 | ASSERT_ISVALIDVEC3V(max); |
1600 | const BoolV c = BOr(a: V3IsGrtr(a, b: max), b: V3IsGrtr(a: min, b: a)); |
1601 | return !BAllEqFFFF(a: c); |
1602 | } |
1603 | |
1604 | PX_FORCE_INLINE PxU32 V3InBounds(const Vec3V a, const Vec3V min, const Vec3V max) |
1605 | { |
1606 | ASSERT_ISVALIDVEC3V(a); |
1607 | ASSERT_ISVALIDVEC3V(min); |
1608 | ASSERT_ISVALIDVEC3V(max); |
1609 | const BoolV c = BAnd(a: V3IsGrtrOrEq(a, b: min), b: V3IsGrtrOrEq(a: max, b: a)); |
1610 | return BAllEqTTTT(a: c); |
1611 | } |
1612 | |
1613 | PX_FORCE_INLINE PxU32 V3OutOfBounds(const Vec3V a, const Vec3V bounds) |
1614 | { |
1615 | ASSERT_ISVALIDVEC3V(a); |
1616 | ASSERT_ISVALIDVEC3V(bounds); |
1617 | return V3OutOfBounds(a, min: V3Neg(f: bounds), max: bounds); |
1618 | } |
1619 | |
1620 | PX_FORCE_INLINE PxU32 V3InBounds(const Vec3V a, const Vec3V bounds) |
1621 | { |
1622 | ASSERT_ISVALIDVEC3V(a); |
1623 | ASSERT_ISVALIDVEC3V(bounds) |
1624 | return V3InBounds(a, min: V3Neg(f: bounds), max: bounds); |
1625 | } |
1626 | |
1627 | PX_FORCE_INLINE void V3Transpose(Vec3V& col0, Vec3V& col1, Vec3V& col2) |
1628 | { |
1629 | ASSERT_ISVALIDVEC3V(col0); |
1630 | ASSERT_ISVALIDVEC3V(col1); |
1631 | ASSERT_ISVALIDVEC3V(col2); |
1632 | |
1633 | const Vec3V col3 = _mm_setzero_ps(); |
1634 | Vec3V tmp0 = _mm_unpacklo_ps(a: col0, b: col1); |
1635 | Vec3V tmp2 = _mm_unpacklo_ps(a: col2, b: col3); |
1636 | Vec3V tmp1 = _mm_unpackhi_ps(a: col0, b: col1); |
1637 | Vec3V tmp3 = _mm_unpackhi_ps(a: col2, b: col3); |
1638 | col0 = _mm_movelh_ps(a: tmp0, b: tmp2); |
1639 | col1 = _mm_movehl_ps(a: tmp2, b: tmp0); |
1640 | col2 = _mm_movelh_ps(a: tmp1, b: tmp3); |
1641 | } |
1642 | |
1643 | ////////////////////////////////// |
1644 | // VEC4V |
1645 | ////////////////////////////////// |
1646 | |
1647 | PX_FORCE_INLINE Vec4V V4Splat(const FloatV f) |
1648 | { |
1649 | ASSERT_ISVALIDFLOATV(f); |
1650 | // return _mm_shuffle_ps(f, f, _MM_SHUFFLE(0,0,0,0)); |
1651 | return f; |
1652 | } |
1653 | |
1654 | PX_FORCE_INLINE Vec4V V4Merge(const FloatV* const floatVArray) |
1655 | { |
1656 | ASSERT_ISVALIDFLOATV(floatVArray[0]); |
1657 | ASSERT_ISVALIDFLOATV(floatVArray[1]); |
1658 | ASSERT_ISVALIDFLOATV(floatVArray[2]); |
1659 | ASSERT_ISVALIDFLOATV(floatVArray[3]); |
1660 | const __m128 xw = _mm_move_ss(a: floatVArray[1], b: floatVArray[0]); // y, y, y, x |
1661 | const __m128 yz = _mm_move_ss(a: floatVArray[2], b: floatVArray[3]); // z, z, z, w |
1662 | return _mm_shuffle_ps(xw, yz, _MM_SHUFFLE(0, 2, 1, 0)); |
1663 | } |
1664 | |
1665 | PX_FORCE_INLINE Vec4V V4Merge(const FloatVArg x, const FloatVArg y, const FloatVArg z, const FloatVArg w) |
1666 | { |
1667 | ASSERT_ISVALIDFLOATV(x); |
1668 | ASSERT_ISVALIDFLOATV(y); |
1669 | ASSERT_ISVALIDFLOATV(z); |
1670 | ASSERT_ISVALIDFLOATV(w); |
1671 | const __m128 xw = _mm_move_ss(a: y, b: x); // y, y, y, x |
1672 | const __m128 yz = _mm_move_ss(a: z, b: w); // z, z, z, w |
1673 | return _mm_shuffle_ps(xw, yz, _MM_SHUFFLE(0, 2, 1, 0)); |
1674 | } |
1675 | |
1676 | PX_FORCE_INLINE Vec4V V4MergeW(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w) |
1677 | { |
1678 | const Vec4V xz = _mm_unpackhi_ps(a: x, b: z); |
1679 | const Vec4V yw = _mm_unpackhi_ps(a: y, b: w); |
1680 | return _mm_unpackhi_ps(a: xz, b: yw); |
1681 | } |
1682 | |
1683 | PX_FORCE_INLINE Vec4V V4MergeZ(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w) |
1684 | { |
1685 | const Vec4V xz = _mm_unpackhi_ps(a: x, b: z); |
1686 | const Vec4V yw = _mm_unpackhi_ps(a: y, b: w); |
1687 | return _mm_unpacklo_ps(a: xz, b: yw); |
1688 | } |
1689 | |
1690 | PX_FORCE_INLINE Vec4V V4MergeY(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w) |
1691 | { |
1692 | const Vec4V xz = _mm_unpacklo_ps(a: x, b: z); |
1693 | const Vec4V yw = _mm_unpacklo_ps(a: y, b: w); |
1694 | return _mm_unpackhi_ps(a: xz, b: yw); |
1695 | } |
1696 | |
1697 | PX_FORCE_INLINE Vec4V V4MergeX(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w) |
1698 | { |
1699 | const Vec4V xz = _mm_unpacklo_ps(a: x, b: z); |
1700 | const Vec4V yw = _mm_unpacklo_ps(a: y, b: w); |
1701 | return _mm_unpacklo_ps(a: xz, b: yw); |
1702 | } |
1703 | |
1704 | PX_FORCE_INLINE Vec4V V4UnpackXY(const Vec4VArg a, const Vec4VArg b) |
1705 | { |
1706 | return _mm_unpacklo_ps(a: a, b: b); |
1707 | } |
1708 | |
1709 | PX_FORCE_INLINE Vec4V V4UnpackZW(const Vec4VArg a, const Vec4VArg b) |
1710 | { |
1711 | return _mm_unpackhi_ps(a: a, b: b); |
1712 | } |
1713 | |
1714 | PX_FORCE_INLINE Vec4V V4UnitW() |
1715 | { |
1716 | const PX_ALIGN(16, PxF32) w[4] = { 0.0f, 0.0f, 0.0f, 1.0f }; |
1717 | const __m128 w128 = _mm_load_ps(p: w); |
1718 | return w128; |
1719 | } |
1720 | |
1721 | PX_FORCE_INLINE Vec4V V4UnitX() |
1722 | { |
1723 | const PX_ALIGN(16, PxF32) x[4] = { 1.0f, 0.0f, 0.0f, 0.0f }; |
1724 | const __m128 x128 = _mm_load_ps(p: x); |
1725 | return x128; |
1726 | } |
1727 | |
1728 | PX_FORCE_INLINE Vec4V V4UnitY() |
1729 | { |
1730 | const PX_ALIGN(16, PxF32) y[4] = { 0.0f, 1.0f, 0.0f, 0.0f }; |
1731 | const __m128 y128 = _mm_load_ps(p: y); |
1732 | return y128; |
1733 | } |
1734 | |
1735 | PX_FORCE_INLINE Vec4V V4UnitZ() |
1736 | { |
1737 | const PX_ALIGN(16, PxF32) z[4] = { 0.0f, 0.0f, 1.0f, 0.0f }; |
1738 | const __m128 z128 = _mm_load_ps(p: z); |
1739 | return z128; |
1740 | } |
1741 | |
1742 | PX_FORCE_INLINE FloatV V4GetW(const Vec4V f) |
1743 | { |
1744 | return _mm_shuffle_ps(f, f, _MM_SHUFFLE(3, 3, 3, 3)); |
1745 | } |
1746 | |
1747 | PX_FORCE_INLINE FloatV V4GetX(const Vec4V f) |
1748 | { |
1749 | return _mm_shuffle_ps(f, f, _MM_SHUFFLE(0, 0, 0, 0)); |
1750 | } |
1751 | |
1752 | PX_FORCE_INLINE FloatV V4GetY(const Vec4V f) |
1753 | { |
1754 | return _mm_shuffle_ps(f, f, _MM_SHUFFLE(1, 1, 1, 1)); |
1755 | } |
1756 | |
1757 | PX_FORCE_INLINE FloatV V4GetZ(const Vec4V f) |
1758 | { |
1759 | return _mm_shuffle_ps(f, f, _MM_SHUFFLE(2, 2, 2, 2)); |
1760 | } |
1761 | |
1762 | PX_FORCE_INLINE Vec4V V4SetW(const Vec4V v, const FloatV f) |
1763 | { |
1764 | ASSERT_ISVALIDFLOATV(f); |
1765 | return V4Sel(c: BTTTF(), a: v, b: f); |
1766 | } |
1767 | |
1768 | PX_FORCE_INLINE Vec4V V4SetX(const Vec4V v, const FloatV f) |
1769 | { |
1770 | ASSERT_ISVALIDFLOATV(f); |
1771 | return V4Sel(c: BFTTT(), a: v, b: f); |
1772 | } |
1773 | |
1774 | PX_FORCE_INLINE Vec4V V4SetY(const Vec4V v, const FloatV f) |
1775 | { |
1776 | ASSERT_ISVALIDFLOATV(f); |
1777 | return V4Sel(c: BTFTT(), a: v, b: f); |
1778 | } |
1779 | |
1780 | PX_FORCE_INLINE Vec4V V4SetZ(const Vec4V v, const FloatV f) |
1781 | { |
1782 | ASSERT_ISVALIDFLOATV(f); |
1783 | return V4Sel(c: BTTFT(), a: v, b: f); |
1784 | } |
1785 | |
1786 | PX_FORCE_INLINE Vec4V V4ClearW(const Vec4V v) |
1787 | { |
1788 | #if !PX_EMSCRIPTEN |
1789 | return _mm_and_ps(a: v, b: V4LoadA(f: internalUnitSSE2Simd::gMaskXYZ)); |
1790 | #else |
1791 | return _mm_and_ps(v, (VecI32V&)internalUnitSSE2Simd::gMaskXYZ); |
1792 | #endif |
1793 | } |
1794 | |
1795 | PX_FORCE_INLINE Vec4V V4PermYXWZ(const Vec4V a) |
1796 | { |
1797 | return _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)); |
1798 | } |
1799 | |
1800 | PX_FORCE_INLINE Vec4V V4PermXZXZ(const Vec4V a) |
1801 | { |
1802 | return _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 0, 2, 0)); |
1803 | } |
1804 | |
1805 | PX_FORCE_INLINE Vec4V V4PermYWYW(const Vec4V a) |
1806 | { |
1807 | return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 3, 1)); |
1808 | } |
1809 | |
1810 | PX_FORCE_INLINE Vec4V V4PermYZXW(const Vec4V a) |
1811 | { |
1812 | return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); |
1813 | } |
1814 | |
1815 | PX_FORCE_INLINE Vec4V V4PermZWXY(const Vec4V a) |
1816 | { |
1817 | return _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2)); |
1818 | } |
1819 | |
1820 | template <PxU8 x, PxU8 y, PxU8 z, PxU8 w> |
1821 | PX_FORCE_INLINE Vec4V V4Perm(const Vec4V a) |
1822 | { |
1823 | return _mm_shuffle_ps(a, a, _MM_SHUFFLE(w, z, y, x)); |
1824 | } |
1825 | |
1826 | PX_FORCE_INLINE Vec4V V4Zero() |
1827 | { |
1828 | return V4Load(f: 0.0f); |
1829 | } |
1830 | |
1831 | PX_FORCE_INLINE Vec4V V4One() |
1832 | { |
1833 | return V4Load(f: 1.0f); |
1834 | } |
1835 | |
1836 | PX_FORCE_INLINE Vec4V V4Eps() |
1837 | { |
1838 | return V4Load(PX_EPS_REAL); |
1839 | } |
1840 | |
1841 | PX_FORCE_INLINE Vec4V V4Neg(const Vec4V f) |
1842 | { |
1843 | return _mm_sub_ps(a: _mm_setzero_ps(), b: f); |
1844 | } |
1845 | |
1846 | PX_FORCE_INLINE Vec4V V4Add(const Vec4V a, const Vec4V b) |
1847 | { |
1848 | return _mm_add_ps(a: a, b: b); |
1849 | } |
1850 | |
1851 | PX_FORCE_INLINE Vec4V V4Sub(const Vec4V a, const Vec4V b) |
1852 | { |
1853 | return _mm_sub_ps(a: a, b: b); |
1854 | } |
1855 | |
1856 | PX_FORCE_INLINE Vec4V V4Scale(const Vec4V a, const FloatV b) |
1857 | { |
1858 | return _mm_mul_ps(a: a, b: b); |
1859 | } |
1860 | |
1861 | PX_FORCE_INLINE Vec4V V4Mul(const Vec4V a, const Vec4V b) |
1862 | { |
1863 | return _mm_mul_ps(a: a, b: b); |
1864 | } |
1865 | |
1866 | PX_FORCE_INLINE Vec4V V4ScaleInv(const Vec4V a, const FloatV b) |
1867 | { |
1868 | ASSERT_ISVALIDFLOATV(b); |
1869 | return _mm_div_ps(a: a, b: b); |
1870 | } |
1871 | |
1872 | PX_FORCE_INLINE Vec4V V4Div(const Vec4V a, const Vec4V b) |
1873 | { |
1874 | return _mm_div_ps(a: a, b: b); |
1875 | } |
1876 | |
1877 | PX_FORCE_INLINE Vec4V V4ScaleInvFast(const Vec4V a, const FloatV b) |
1878 | { |
1879 | ASSERT_ISVALIDFLOATV(b); |
1880 | return _mm_mul_ps(a: a, b: _mm_rcp_ps(a: b)); |
1881 | } |
1882 | |
1883 | PX_FORCE_INLINE Vec4V V4DivFast(const Vec4V a, const Vec4V b) |
1884 | { |
1885 | return _mm_mul_ps(a: a, b: _mm_rcp_ps(a: b)); |
1886 | } |
1887 | |
1888 | PX_FORCE_INLINE Vec4V V4Recip(const Vec4V a) |
1889 | { |
1890 | return _mm_div_ps(a: V4One(), b: a); |
1891 | } |
1892 | |
1893 | PX_FORCE_INLINE Vec4V V4RecipFast(const Vec4V a) |
1894 | { |
1895 | return _mm_rcp_ps(a: a); |
1896 | } |
1897 | |
1898 | PX_FORCE_INLINE Vec4V V4Rsqrt(const Vec4V a) |
1899 | { |
1900 | return _mm_div_ps(a: V4One(), b: _mm_sqrt_ps(a: a)); |
1901 | } |
1902 | |
1903 | PX_FORCE_INLINE Vec4V V4RsqrtFast(const Vec4V a) |
1904 | { |
1905 | return _mm_rsqrt_ps(a: a); |
1906 | } |
1907 | |
1908 | PX_FORCE_INLINE Vec4V V4Sqrt(const Vec4V a) |
1909 | { |
1910 | return _mm_sqrt_ps(a: a); |
1911 | } |
1912 | |
1913 | PX_FORCE_INLINE Vec4V V4ScaleAdd(const Vec4V a, const FloatV b, const Vec4V c) |
1914 | { |
1915 | ASSERT_ISVALIDFLOATV(b); |
1916 | return V4Add(a: V4Scale(a, b), b: c); |
1917 | } |
1918 | |
1919 | PX_FORCE_INLINE Vec4V V4NegScaleSub(const Vec4V a, const FloatV b, const Vec4V c) |
1920 | { |
1921 | ASSERT_ISVALIDFLOATV(b); |
1922 | return V4Sub(a: c, b: V4Scale(a, b)); |
1923 | } |
1924 | |
1925 | PX_FORCE_INLINE Vec4V V4MulAdd(const Vec4V a, const Vec4V b, const Vec4V c) |
1926 | { |
1927 | return V4Add(a: V4Mul(a, b), b: c); |
1928 | } |
1929 | |
1930 | PX_FORCE_INLINE Vec4V V4NegMulSub(const Vec4V a, const Vec4V b, const Vec4V c) |
1931 | { |
1932 | return V4Sub(a: c, b: V4Mul(a, b)); |
1933 | } |
1934 | |
1935 | PX_FORCE_INLINE Vec4V V4Abs(const Vec4V a) |
1936 | { |
1937 | return V4Max(a, b: V4Neg(f: a)); |
1938 | } |
1939 | |
1940 | PX_FORCE_INLINE FloatV V4SumElements(const Vec4V a) |
1941 | { |
1942 | #ifdef __SSE4_2__ |
1943 | Vec4V r = _mm_hadd_ps(a, a); |
1944 | r = _mm_hadd_ps(r, r); |
1945 | return r; |
1946 | #else |
1947 | const Vec4V xy = V4UnpackXY(a, b: a); // x,x,y,y |
1948 | const Vec4V zw = V4UnpackZW(a, b: a); // z,z,w,w |
1949 | const Vec4V xz_yw = V4Add(a: xy, b: zw); // x+z,x+z,y+w,y+w |
1950 | const FloatV xz = V4GetX(f: xz_yw); // x+z |
1951 | const FloatV yw = V4GetZ(f: xz_yw); // y+w |
1952 | return FAdd(a: xz, b: yw); // sum |
1953 | #endif |
1954 | } |
1955 | |
1956 | PX_FORCE_INLINE FloatV V4Dot(const Vec4V a, const Vec4V b) |
1957 | { |
1958 | #ifdef __SSE4_2__ |
1959 | return _mm_dp_ps(a, b, 0xff); |
1960 | #else |
1961 | const __m128 dot1 = _mm_mul_ps(a: a, b: b); // x,y,z,w |
1962 | const __m128 shuf1 = _mm_shuffle_ps(dot1, dot1, _MM_SHUFFLE(2, 1, 0, 3)); // w,x,y,z |
1963 | const __m128 shuf2 = _mm_shuffle_ps(dot1, dot1, _MM_SHUFFLE(1, 0, 3, 2)); // z,w,x,y |
1964 | const __m128 shuf3 = _mm_shuffle_ps(dot1, dot1, _MM_SHUFFLE(0, 3, 2, 1)); // y,z,w,x |
1965 | return _mm_add_ps(a: _mm_add_ps(a: shuf2, b: shuf3), b: _mm_add_ps(a: dot1, b: shuf1)); |
1966 | #endif |
1967 | } |
1968 | |
1969 | PX_FORCE_INLINE FloatV V4Dot3(const Vec4V a, const Vec4V b) |
1970 | { |
1971 | #ifdef __SSE4_2__ |
1972 | return _mm_dp_ps(a, b, 0x7f); |
1973 | #else |
1974 | const __m128 dot1 = _mm_mul_ps(a: a, b: b); // w,z,y,x |
1975 | const __m128 shuf1 = _mm_shuffle_ps(dot1, dot1, _MM_SHUFFLE(0, 0, 0, 0)); // z,y,x,w |
1976 | const __m128 shuf2 = _mm_shuffle_ps(dot1, dot1, _MM_SHUFFLE(1, 1, 1, 1)); // y,x,w,z |
1977 | const __m128 shuf3 = _mm_shuffle_ps(dot1, dot1, _MM_SHUFFLE(2, 2, 2, 2)); // x,w,z,y |
1978 | return _mm_add_ps(a: _mm_add_ps(a: shuf1, b: shuf2), b: shuf3); |
1979 | #endif |
1980 | } |
1981 | |
1982 | PX_FORCE_INLINE Vec4V V4Cross(const Vec4V a, const Vec4V b) |
1983 | { |
1984 | const __m128 r1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w |
1985 | const __m128 r2 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w |
1986 | const __m128 l1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w |
1987 | const __m128 l2 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w |
1988 | return _mm_sub_ps(a: _mm_mul_ps(a: l1, b: l2), b: _mm_mul_ps(a: r1, b: r2)); |
1989 | } |
1990 | |
1991 | PX_FORCE_INLINE FloatV V4Length(const Vec4V a) |
1992 | { |
1993 | return _mm_sqrt_ps(a: V4Dot(a, b: a)); |
1994 | } |
1995 | |
1996 | PX_FORCE_INLINE FloatV V4LengthSq(const Vec4V a) |
1997 | { |
1998 | return V4Dot(a, b: a); |
1999 | } |
2000 | |
2001 | PX_FORCE_INLINE Vec4V V4Normalize(const Vec4V a) |
2002 | { |
2003 | ASSERT_ISFINITELENGTH(a); |
2004 | return V4ScaleInv(a, b: _mm_sqrt_ps(a: V4Dot(a, b: a))); |
2005 | } |
2006 | |
2007 | PX_FORCE_INLINE Vec4V V4NormalizeFast(const Vec4V a) |
2008 | { |
2009 | ASSERT_ISFINITELENGTH(a); |
2010 | return V4ScaleInvFast(a, b: _mm_sqrt_ps(a: V4Dot(a, b: a))); |
2011 | } |
2012 | |
2013 | PX_FORCE_INLINE Vec4V V4NormalizeSafe(const Vec4V a, const Vec3V unsafeReturnValue) |
2014 | { |
2015 | const __m128 eps = V3Eps(); |
2016 | const __m128 length = V4Length(a); |
2017 | const __m128 isGreaterThanZero = V4IsGrtr(a: length, b: eps); |
2018 | return V4Sel(c: isGreaterThanZero, a: V4ScaleInv(a, b: length), b: unsafeReturnValue); |
2019 | } |
2020 | |
2021 | PX_FORCE_INLINE BoolV V4IsEqU32(const VecU32V a, const VecU32V b) |
2022 | { |
2023 | return m128_I2F(n: _mm_cmpeq_epi32(a: m128_F2I(n: a), b: m128_F2I(n: b))); |
2024 | } |
2025 | |
2026 | PX_FORCE_INLINE Vec4V V4Sel(const BoolV c, const Vec4V a, const Vec4V b) |
2027 | { |
2028 | return _mm_or_ps(a: _mm_andnot_ps(a: c, b: b), b: _mm_and_ps(a: c, b: a)); |
2029 | } |
2030 | |
2031 | PX_FORCE_INLINE BoolV V4IsGrtr(const Vec4V a, const Vec4V b) |
2032 | { |
2033 | return _mm_cmpgt_ps(a: a, b: b); |
2034 | } |
2035 | |
2036 | PX_FORCE_INLINE BoolV V4IsGrtrOrEq(const Vec4V a, const Vec4V b) |
2037 | { |
2038 | return _mm_cmpge_ps(a: a, b: b); |
2039 | } |
2040 | |
2041 | PX_FORCE_INLINE BoolV V4IsEq(const Vec4V a, const Vec4V b) |
2042 | { |
2043 | return _mm_cmpeq_ps(a: a, b: b); |
2044 | } |
2045 | |
2046 | PX_FORCE_INLINE Vec4V V4Max(const Vec4V a, const Vec4V b) |
2047 | { |
2048 | return _mm_max_ps(a: a, b: b); |
2049 | } |
2050 | |
2051 | PX_FORCE_INLINE Vec4V V4Min(const Vec4V a, const Vec4V b) |
2052 | { |
2053 | return _mm_min_ps(a: a, b: b); |
2054 | } |
2055 | |
2056 | PX_FORCE_INLINE FloatV (const Vec4V a) |
2057 | { |
2058 | const __m128 shuf1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 1, 0, 3)); |
2059 | const __m128 shuf2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2)); |
2060 | const __m128 shuf3 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 3, 2, 1)); |
2061 | |
2062 | return _mm_max_ps(a: _mm_max_ps(a: a, b: shuf1), b: _mm_max_ps(a: shuf2, b: shuf3)); |
2063 | } |
2064 | |
2065 | PX_FORCE_INLINE FloatV (const Vec4V a) |
2066 | { |
2067 | const __m128 shuf1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 1, 0, 3)); |
2068 | const __m128 shuf2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2)); |
2069 | const __m128 shuf3 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 3, 2, 1)); |
2070 | |
2071 | return _mm_min_ps(a: _mm_min_ps(a: a, b: shuf1), b: _mm_min_ps(a: shuf2, b: shuf3)); |
2072 | } |
2073 | |
2074 | PX_FORCE_INLINE Vec4V V4Clamp(const Vec4V a, const Vec4V minV, const Vec4V maxV) |
2075 | { |
2076 | return V4Max(a: V4Min(a, b: maxV), b: minV); |
2077 | } |
2078 | |
2079 | PX_FORCE_INLINE PxU32 V4AllGrtr(const Vec4V a, const Vec4V b) |
2080 | { |
2081 | return internalUnitSSE2Simd::BAllTrue4_R(a: V4IsGrtr(a, b)); |
2082 | } |
2083 | |
2084 | PX_FORCE_INLINE PxU32 V4AllGrtrOrEq(const Vec4V a, const Vec4V b) |
2085 | { |
2086 | return internalUnitSSE2Simd::BAllTrue4_R(a: V4IsGrtrOrEq(a, b)); |
2087 | } |
2088 | |
2089 | PX_FORCE_INLINE PxU32 V4AllGrtrOrEq3(const Vec4V a, const Vec4V b) |
2090 | { |
2091 | return internalUnitSSE2Simd::BAllTrue3_R(a: V4IsGrtrOrEq(a, b)); |
2092 | } |
2093 | |
2094 | PX_FORCE_INLINE PxU32 V4AllEq(const Vec4V a, const Vec4V b) |
2095 | { |
2096 | return internalUnitSSE2Simd::BAllTrue4_R(a: V4IsEq(a, b)); |
2097 | } |
2098 | |
2099 | PX_FORCE_INLINE PxU32 V4AnyGrtr3(const Vec4V a, const Vec4V b) |
2100 | { |
2101 | return internalUnitSSE2Simd::BAnyTrue3_R(a: V4IsGrtr(a, b)); |
2102 | } |
2103 | |
2104 | PX_FORCE_INLINE Vec4V V4Round(const Vec4V a) |
2105 | { |
2106 | #ifdef __SSE4_2__ |
2107 | return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); |
2108 | #else |
2109 | // return _mm_round_ps(a, 0x0); |
2110 | const Vec4V half = V4Load(f: 0.5f); |
2111 | const __m128 signBit = _mm_cvtepi32_ps(a: _mm_srli_epi32(a: _mm_cvtps_epi32(a: a), count: 31)); |
2112 | const Vec4V aRound = V4Sub(a: V4Add(a, b: half), b: signBit); |
2113 | __m128i tmp = _mm_cvttps_epi32(a: aRound); |
2114 | return _mm_cvtepi32_ps(a: tmp); |
2115 | #endif |
2116 | } |
2117 | |
2118 | PX_FORCE_INLINE Vec4V V4Sin(const Vec4V a) |
2119 | { |
2120 | const Vec4V recipTwoPi = V4LoadA(f: g_PXReciprocalTwoPi.f); |
2121 | const Vec4V twoPi = V4LoadA(f: g_PXTwoPi.f); |
2122 | const Vec4V tmp = V4Mul(a, b: recipTwoPi); |
2123 | const Vec4V b = V4Round(a: tmp); |
2124 | const Vec4V V1 = V4NegMulSub(a: twoPi, b, c: a); |
2125 | |
2126 | // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! - |
2127 | // V^15 / 15! + V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23! (for -PI <= V < PI) |
2128 | const Vec4V V2 = V4Mul(a: V1, b: V1); |
2129 | const Vec4V V3 = V4Mul(a: V2, b: V1); |
2130 | const Vec4V V5 = V4Mul(a: V3, b: V2); |
2131 | const Vec4V V7 = V4Mul(a: V5, b: V2); |
2132 | const Vec4V V9 = V4Mul(a: V7, b: V2); |
2133 | const Vec4V V11 = V4Mul(a: V9, b: V2); |
2134 | const Vec4V V13 = V4Mul(a: V11, b: V2); |
2135 | const Vec4V V15 = V4Mul(a: V13, b: V2); |
2136 | const Vec4V V17 = V4Mul(a: V15, b: V2); |
2137 | const Vec4V V19 = V4Mul(a: V17, b: V2); |
2138 | const Vec4V V21 = V4Mul(a: V19, b: V2); |
2139 | const Vec4V V23 = V4Mul(a: V21, b: V2); |
2140 | |
2141 | const Vec4V sinCoefficients0 = V4LoadA(f: g_PXSinCoefficients0.f); |
2142 | const Vec4V sinCoefficients1 = V4LoadA(f: g_PXSinCoefficients1.f); |
2143 | const Vec4V sinCoefficients2 = V4LoadA(f: g_PXSinCoefficients2.f); |
2144 | |
2145 | const FloatV S1 = V4GetY(f: sinCoefficients0); |
2146 | const FloatV S2 = V4GetZ(f: sinCoefficients0); |
2147 | const FloatV S3 = V4GetW(f: sinCoefficients0); |
2148 | const FloatV S4 = V4GetX(f: sinCoefficients1); |
2149 | const FloatV S5 = V4GetY(f: sinCoefficients1); |
2150 | const FloatV S6 = V4GetZ(f: sinCoefficients1); |
2151 | const FloatV S7 = V4GetW(f: sinCoefficients1); |
2152 | const FloatV S8 = V4GetX(f: sinCoefficients2); |
2153 | const FloatV S9 = V4GetY(f: sinCoefficients2); |
2154 | const FloatV S10 = V4GetZ(f: sinCoefficients2); |
2155 | const FloatV S11 = V4GetW(f: sinCoefficients2); |
2156 | |
2157 | Vec4V Result; |
2158 | Result = V4MulAdd(a: S1, b: V3, c: V1); |
2159 | Result = V4MulAdd(a: S2, b: V5, c: Result); |
2160 | Result = V4MulAdd(a: S3, b: V7, c: Result); |
2161 | Result = V4MulAdd(a: S4, b: V9, c: Result); |
2162 | Result = V4MulAdd(a: S5, b: V11, c: Result); |
2163 | Result = V4MulAdd(a: S6, b: V13, c: Result); |
2164 | Result = V4MulAdd(a: S7, b: V15, c: Result); |
2165 | Result = V4MulAdd(a: S8, b: V17, c: Result); |
2166 | Result = V4MulAdd(a: S9, b: V19, c: Result); |
2167 | Result = V4MulAdd(a: S10, b: V21, c: Result); |
2168 | Result = V4MulAdd(a: S11, b: V23, c: Result); |
2169 | |
2170 | return Result; |
2171 | } |
2172 | |
2173 | PX_FORCE_INLINE Vec4V V4Cos(const Vec4V a) |
2174 | { |
2175 | const Vec4V recipTwoPi = V4LoadA(f: g_PXReciprocalTwoPi.f); |
2176 | const Vec4V twoPi = V4LoadA(f: g_PXTwoPi.f); |
2177 | const Vec4V tmp = V4Mul(a, b: recipTwoPi); |
2178 | const Vec4V b = V4Round(a: tmp); |
2179 | const Vec4V V1 = V4NegMulSub(a: twoPi, b, c: a); |
2180 | |
2181 | // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! + V^12 / 12! - |
2182 | // V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! (for -PI <= V < PI) |
2183 | const Vec4V V2 = V4Mul(a: V1, b: V1); |
2184 | const Vec4V V4 = V4Mul(a: V2, b: V2); |
2185 | const Vec4V V6 = V4Mul(a: V4, b: V2); |
2186 | const Vec4V V8 = V4Mul(a: V4, b: V4); |
2187 | const Vec4V V10 = V4Mul(a: V6, b: V4); |
2188 | const Vec4V V12 = V4Mul(a: V6, b: V6); |
2189 | const Vec4V V14 = V4Mul(a: V8, b: V6); |
2190 | const Vec4V V16 = V4Mul(a: V8, b: V8); |
2191 | const Vec4V V18 = V4Mul(a: V10, b: V8); |
2192 | const Vec4V V20 = V4Mul(a: V10, b: V10); |
2193 | const Vec4V V22 = V4Mul(a: V12, b: V10); |
2194 | |
2195 | const Vec4V cosCoefficients0 = V4LoadA(f: g_PXCosCoefficients0.f); |
2196 | const Vec4V cosCoefficients1 = V4LoadA(f: g_PXCosCoefficients1.f); |
2197 | const Vec4V cosCoefficients2 = V4LoadA(f: g_PXCosCoefficients2.f); |
2198 | |
2199 | const FloatV C1 = V4GetY(f: cosCoefficients0); |
2200 | const FloatV C2 = V4GetZ(f: cosCoefficients0); |
2201 | const FloatV C3 = V4GetW(f: cosCoefficients0); |
2202 | const FloatV C4 = V4GetX(f: cosCoefficients1); |
2203 | const FloatV C5 = V4GetY(f: cosCoefficients1); |
2204 | const FloatV C6 = V4GetZ(f: cosCoefficients1); |
2205 | const FloatV C7 = V4GetW(f: cosCoefficients1); |
2206 | const FloatV C8 = V4GetX(f: cosCoefficients2); |
2207 | const FloatV C9 = V4GetY(f: cosCoefficients2); |
2208 | const FloatV C10 = V4GetZ(f: cosCoefficients2); |
2209 | const FloatV C11 = V4GetW(f: cosCoefficients2); |
2210 | |
2211 | Vec4V Result; |
2212 | Result = V4MulAdd(a: C1, b: V2, c: V4One()); |
2213 | Result = V4MulAdd(a: C2, b: V4, c: Result); |
2214 | Result = V4MulAdd(a: C3, b: V6, c: Result); |
2215 | Result = V4MulAdd(a: C4, b: V8, c: Result); |
2216 | Result = V4MulAdd(a: C5, b: V10, c: Result); |
2217 | Result = V4MulAdd(a: C6, b: V12, c: Result); |
2218 | Result = V4MulAdd(a: C7, b: V14, c: Result); |
2219 | Result = V4MulAdd(a: C8, b: V16, c: Result); |
2220 | Result = V4MulAdd(a: C9, b: V18, c: Result); |
2221 | Result = V4MulAdd(a: C10, b: V20, c: Result); |
2222 | Result = V4MulAdd(a: C11, b: V22, c: Result); |
2223 | |
2224 | return Result; |
2225 | } |
2226 | |
2227 | PX_FORCE_INLINE void V4Transpose(Vec4V& col0, Vec4V& col1, Vec4V& col2, Vec4V& col3) |
2228 | { |
2229 | Vec4V tmp0 = _mm_unpacklo_ps(a: col0, b: col1); |
2230 | Vec4V tmp2 = _mm_unpacklo_ps(a: col2, b: col3); |
2231 | Vec4V tmp1 = _mm_unpackhi_ps(a: col0, b: col1); |
2232 | Vec4V tmp3 = _mm_unpackhi_ps(a: col2, b: col3); |
2233 | col0 = _mm_movelh_ps(a: tmp0, b: tmp2); |
2234 | col1 = _mm_movehl_ps(a: tmp2, b: tmp0); |
2235 | col2 = _mm_movelh_ps(a: tmp1, b: tmp3); |
2236 | col3 = _mm_movehl_ps(a: tmp3, b: tmp1); |
2237 | } |
2238 | |
2239 | ////////////////////////////////// |
2240 | // BoolV |
2241 | ////////////////////////////////// |
2242 | |
2243 | PX_FORCE_INLINE BoolV BFFFF() |
2244 | { |
2245 | return _mm_setzero_ps(); |
2246 | } |
2247 | |
2248 | PX_FORCE_INLINE BoolV BFFFT() |
2249 | { |
2250 | /*const PX_ALIGN(16, PxU32 f[4])={0,0,0,0xFFFFFFFF}; |
2251 | const __m128 ffft=_mm_load_ps((float*)&f); |
2252 | return ffft;*/ |
2253 | return m128_I2F(n: _mm_set_epi32(i3: -1, i2: 0, i1: 0, i0: 0)); |
2254 | } |
2255 | |
2256 | PX_FORCE_INLINE BoolV BFFTF() |
2257 | { |
2258 | /*const PX_ALIGN(16, PxU32 f[4])={0,0,0xFFFFFFFF,0}; |
2259 | const __m128 fftf=_mm_load_ps((float*)&f); |
2260 | return fftf;*/ |
2261 | return m128_I2F(n: _mm_set_epi32(i3: 0, i2: -1, i1: 0, i0: 0)); |
2262 | } |
2263 | |
2264 | PX_FORCE_INLINE BoolV BFFTT() |
2265 | { |
2266 | /*const PX_ALIGN(16, PxU32 f[4])={0,0,0xFFFFFFFF,0xFFFFFFFF}; |
2267 | const __m128 fftt=_mm_load_ps((float*)&f); |
2268 | return fftt;*/ |
2269 | return m128_I2F(n: _mm_set_epi32(i3: -1, i2: -1, i1: 0, i0: 0)); |
2270 | } |
2271 | |
2272 | PX_FORCE_INLINE BoolV BFTFF() |
2273 | { |
2274 | /*const PX_ALIGN(16, PxU32 f[4])={0,0xFFFFFFFF,0,0}; |
2275 | const __m128 ftff=_mm_load_ps((float*)&f); |
2276 | return ftff;*/ |
2277 | return m128_I2F(n: _mm_set_epi32(i3: 0, i2: 0, i1: -1, i0: 0)); |
2278 | } |
2279 | |
2280 | PX_FORCE_INLINE BoolV BFTFT() |
2281 | { |
2282 | /*const PX_ALIGN(16, PxU32 f[4])={0,0xFFFFFFFF,0,0xFFFFFFFF}; |
2283 | const __m128 ftft=_mm_load_ps((float*)&f); |
2284 | return ftft;*/ |
2285 | return m128_I2F(n: _mm_set_epi32(i3: -1, i2: 0, i1: -1, i0: 0)); |
2286 | } |
2287 | |
2288 | PX_FORCE_INLINE BoolV BFTTF() |
2289 | { |
2290 | /*const PX_ALIGN(16, PxU32 f[4])={0,0xFFFFFFFF,0xFFFFFFFF,0}; |
2291 | const __m128 fttf=_mm_load_ps((float*)&f); |
2292 | return fttf;*/ |
2293 | return m128_I2F(n: _mm_set_epi32(i3: 0, i2: -1, i1: -1, i0: 0)); |
2294 | } |
2295 | |
2296 | PX_FORCE_INLINE BoolV BFTTT() |
2297 | { |
2298 | /*const PX_ALIGN(16, PxU32 f[4])={0,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF}; |
2299 | const __m128 fttt=_mm_load_ps((float*)&f); |
2300 | return fttt;*/ |
2301 | return m128_I2F(n: _mm_set_epi32(i3: -1, i2: -1, i1: -1, i0: 0)); |
2302 | } |
2303 | |
2304 | PX_FORCE_INLINE BoolV BTFFF() |
2305 | { |
2306 | // const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0,0,0}; |
2307 | // const __m128 tfff=_mm_load_ps((float*)&f); |
2308 | // return tfff; |
2309 | return m128_I2F(n: _mm_set_epi32(i3: 0, i2: 0, i1: 0, i0: -1)); |
2310 | } |
2311 | |
2312 | PX_FORCE_INLINE BoolV BTFFT() |
2313 | { |
2314 | /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0,0,0xFFFFFFFF}; |
2315 | const __m128 tfft=_mm_load_ps((float*)&f); |
2316 | return tfft;*/ |
2317 | return m128_I2F(n: _mm_set_epi32(i3: -1, i2: 0, i1: 0, i0: -1)); |
2318 | } |
2319 | |
2320 | PX_FORCE_INLINE BoolV BTFTF() |
2321 | { |
2322 | /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0,0xFFFFFFFF,0}; |
2323 | const __m128 tftf=_mm_load_ps((float*)&f); |
2324 | return tftf;*/ |
2325 | return m128_I2F(n: _mm_set_epi32(i3: 0, i2: -1, i1: 0, i0: -1)); |
2326 | } |
2327 | |
2328 | PX_FORCE_INLINE BoolV BTFTT() |
2329 | { |
2330 | /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0,0xFFFFFFFF,0xFFFFFFFF}; |
2331 | const __m128 tftt=_mm_load_ps((float*)&f); |
2332 | return tftt;*/ |
2333 | return m128_I2F(n: _mm_set_epi32(i3: -1, i2: -1, i1: 0, i0: -1)); |
2334 | } |
2335 | |
2336 | PX_FORCE_INLINE BoolV BTTFF() |
2337 | { |
2338 | /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0xFFFFFFFF,0,0}; |
2339 | const __m128 ttff=_mm_load_ps((float*)&f); |
2340 | return ttff;*/ |
2341 | return m128_I2F(n: _mm_set_epi32(i3: 0, i2: 0, i1: -1, i0: -1)); |
2342 | } |
2343 | |
2344 | PX_FORCE_INLINE BoolV BTTFT() |
2345 | { |
2346 | /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0xFFFFFFFF,0,0xFFFFFFFF}; |
2347 | const __m128 ttft=_mm_load_ps((float*)&f); |
2348 | return ttft;*/ |
2349 | return m128_I2F(n: _mm_set_epi32(i3: -1, i2: 0, i1: -1, i0: -1)); |
2350 | } |
2351 | |
2352 | PX_FORCE_INLINE BoolV BTTTF() |
2353 | { |
2354 | /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0}; |
2355 | const __m128 tttf=_mm_load_ps((float*)&f); |
2356 | return tttf;*/ |
2357 | return m128_I2F(n: _mm_set_epi32(i3: 0, i2: -1, i1: -1, i0: -1)); |
2358 | } |
2359 | |
2360 | PX_FORCE_INLINE BoolV BTTTT() |
2361 | { |
2362 | /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF}; |
2363 | const __m128 tttt=_mm_load_ps((float*)&f); |
2364 | return tttt;*/ |
2365 | return m128_I2F(n: _mm_set_epi32(i3: -1, i2: -1, i1: -1, i0: -1)); |
2366 | } |
2367 | |
2368 | PX_FORCE_INLINE BoolV BXMask() |
2369 | { |
2370 | /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0,0,0}; |
2371 | const __m128 tfff=_mm_load_ps((float*)&f); |
2372 | return tfff;*/ |
2373 | return m128_I2F(n: _mm_set_epi32(i3: 0, i2: 0, i1: 0, i0: -1)); |
2374 | } |
2375 | |
2376 | PX_FORCE_INLINE BoolV BYMask() |
2377 | { |
2378 | /*const PX_ALIGN(16, PxU32 f[4])={0,0xFFFFFFFF,0,0}; |
2379 | const __m128 ftff=_mm_load_ps((float*)&f); |
2380 | return ftff;*/ |
2381 | return m128_I2F(n: _mm_set_epi32(i3: 0, i2: 0, i1: -1, i0: 0)); |
2382 | } |
2383 | |
2384 | PX_FORCE_INLINE BoolV BZMask() |
2385 | { |
2386 | /*const PX_ALIGN(16, PxU32 f[4])={0,0,0xFFFFFFFF,0}; |
2387 | const __m128 fftf=_mm_load_ps((float*)&f); |
2388 | return fftf;*/ |
2389 | return m128_I2F(n: _mm_set_epi32(i3: 0, i2: -1, i1: 0, i0: 0)); |
2390 | } |
2391 | |
2392 | PX_FORCE_INLINE BoolV BWMask() |
2393 | { |
2394 | /*const PX_ALIGN(16, PxU32 f[4])={0,0,0,0xFFFFFFFF}; |
2395 | const __m128 ffft=_mm_load_ps((float*)&f); |
2396 | return ffft;*/ |
2397 | return m128_I2F(n: _mm_set_epi32(i3: -1, i2: 0, i1: 0, i0: 0)); |
2398 | } |
2399 | |
2400 | PX_FORCE_INLINE BoolV BGetX(const BoolV f) |
2401 | { |
2402 | return _mm_shuffle_ps(f, f, _MM_SHUFFLE(0, 0, 0, 0)); |
2403 | } |
2404 | |
2405 | PX_FORCE_INLINE BoolV BGetY(const BoolV f) |
2406 | { |
2407 | return _mm_shuffle_ps(f, f, _MM_SHUFFLE(1, 1, 1, 1)); |
2408 | } |
2409 | |
2410 | PX_FORCE_INLINE BoolV BGetZ(const BoolV f) |
2411 | { |
2412 | return _mm_shuffle_ps(f, f, _MM_SHUFFLE(2, 2, 2, 2)); |
2413 | } |
2414 | |
2415 | PX_FORCE_INLINE BoolV BGetW(const BoolV f) |
2416 | { |
2417 | return _mm_shuffle_ps(f, f, _MM_SHUFFLE(3, 3, 3, 3)); |
2418 | } |
2419 | |
2420 | PX_FORCE_INLINE BoolV BSetX(const BoolV v, const BoolV f) |
2421 | { |
2422 | return V4Sel(c: BFTTT(), a: v, b: f); |
2423 | } |
2424 | |
2425 | PX_FORCE_INLINE BoolV BSetY(const BoolV v, const BoolV f) |
2426 | { |
2427 | return V4Sel(c: BTFTT(), a: v, b: f); |
2428 | } |
2429 | |
2430 | PX_FORCE_INLINE BoolV BSetZ(const BoolV v, const BoolV f) |
2431 | { |
2432 | return V4Sel(c: BTTFT(), a: v, b: f); |
2433 | } |
2434 | |
2435 | PX_FORCE_INLINE BoolV BSetW(const BoolV v, const BoolV f) |
2436 | { |
2437 | return V4Sel(c: BTTTF(), a: v, b: f); |
2438 | } |
2439 | |
2440 | PX_FORCE_INLINE BoolV BAnd(const BoolV a, const BoolV b) |
2441 | { |
2442 | return _mm_and_ps(a: a, b: b); |
2443 | } |
2444 | |
2445 | PX_FORCE_INLINE BoolV BNot(const BoolV a) |
2446 | { |
2447 | const BoolV bAllTrue(BTTTT()); |
2448 | return _mm_xor_ps(a: a, b: bAllTrue); |
2449 | } |
2450 | |
2451 | PX_FORCE_INLINE BoolV BAndNot(const BoolV a, const BoolV b) |
2452 | { |
2453 | return _mm_andnot_ps(a: b, b: a); |
2454 | } |
2455 | |
2456 | PX_FORCE_INLINE BoolV BOr(const BoolV a, const BoolV b) |
2457 | { |
2458 | return _mm_or_ps(a: a, b: b); |
2459 | } |
2460 | |
2461 | PX_FORCE_INLINE BoolV BAllTrue4(const BoolV a) |
2462 | { |
2463 | const BoolV bTmp = |
2464 | _mm_and_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 0, 1)), _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 2, 3))); |
2465 | return _mm_and_ps(_mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(0, 0, 0, 0)), |
2466 | _mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(1, 1, 1, 1))); |
2467 | } |
2468 | |
2469 | PX_FORCE_INLINE BoolV BAnyTrue4(const BoolV a) |
2470 | { |
2471 | const BoolV bTmp = |
2472 | _mm_or_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 0, 1)), _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 2, 3))); |
2473 | return _mm_or_ps(_mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(0, 0, 0, 0)), |
2474 | _mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(1, 1, 1, 1))); |
2475 | } |
2476 | |
2477 | PX_FORCE_INLINE BoolV BAllTrue3(const BoolV a) |
2478 | { |
2479 | const BoolV bTmp = |
2480 | _mm_and_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 0, 1)), _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2))); |
2481 | return _mm_and_ps(_mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(0, 0, 0, 0)), |
2482 | _mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(1, 1, 1, 1))); |
2483 | } |
2484 | |
2485 | PX_FORCE_INLINE BoolV BAnyTrue3(const BoolV a) |
2486 | { |
2487 | const BoolV bTmp = |
2488 | _mm_or_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 0, 1)), _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2))); |
2489 | return _mm_or_ps(_mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(0, 0, 0, 0)), |
2490 | _mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(1, 1, 1, 1))); |
2491 | } |
2492 | |
2493 | PX_FORCE_INLINE PxU32 BAllEq(const BoolV a, const BoolV b) |
2494 | { |
2495 | const BoolV bTest = m128_I2F(n: _mm_cmpeq_epi32(a: m128_F2I(n: a), b: m128_F2I(n: b))); |
2496 | return internalUnitSSE2Simd::BAllTrue4_R(a: bTest); |
2497 | } |
2498 | |
2499 | PX_FORCE_INLINE PxU32 BAllEqTTTT(const BoolV a) |
2500 | { |
2501 | return PxU32(_mm_movemask_ps(a: a)==15); |
2502 | } |
2503 | |
2504 | PX_FORCE_INLINE PxU32 BAllEqFFFF(const BoolV a) |
2505 | { |
2506 | return PxU32(_mm_movemask_ps(a: a)==0); |
2507 | } |
2508 | |
2509 | PX_FORCE_INLINE PxU32 BGetBitMask(const BoolV a) |
2510 | { |
2511 | return PxU32(_mm_movemask_ps(a: a)); |
2512 | } |
2513 | |
2514 | ////////////////////////////////// |
2515 | // MAT33V |
2516 | ////////////////////////////////// |
2517 | |
2518 | PX_FORCE_INLINE Vec3V M33MulV3(const Mat33V& a, const Vec3V b) |
2519 | { |
2520 | const FloatV x = V3GetX(f: b); |
2521 | const FloatV y = V3GetY(f: b); |
2522 | const FloatV z = V3GetZ(f: b); |
2523 | const Vec3V v0 = V3Scale(a: a.col0, b: x); |
2524 | const Vec3V v1 = V3Scale(a: a.col1, b: y); |
2525 | const Vec3V v2 = V3Scale(a: a.col2, b: z); |
2526 | const Vec3V v0PlusV1 = V3Add(a: v0, b: v1); |
2527 | return V3Add(a: v0PlusV1, b: v2); |
2528 | } |
2529 | |
2530 | PX_FORCE_INLINE Vec3V M33TrnspsMulV3(const Mat33V& a, const Vec3V b) |
2531 | { |
2532 | const FloatV x = V3Dot(a: a.col0, b); |
2533 | const FloatV y = V3Dot(a: a.col1, b); |
2534 | const FloatV z = V3Dot(a: a.col2, b); |
2535 | return V3Merge(x, y, z); |
2536 | } |
2537 | |
2538 | PX_FORCE_INLINE Vec3V M33MulV3AddV3(const Mat33V& A, const Vec3V b, const Vec3V c) |
2539 | { |
2540 | const FloatV x = V3GetX(f: b); |
2541 | const FloatV y = V3GetY(f: b); |
2542 | const FloatV z = V3GetZ(f: b); |
2543 | Vec3V result = V3ScaleAdd(a: A.col0, b: x, c); |
2544 | result = V3ScaleAdd(a: A.col1, b: y, c: result); |
2545 | return V3ScaleAdd(a: A.col2, b: z, c: result); |
2546 | } |
2547 | |
2548 | PX_FORCE_INLINE Mat33V M33MulM33(const Mat33V& a, const Mat33V& b) |
2549 | { |
2550 | return Mat33V(M33MulV3(a, b: b.col0), M33MulV3(a, b: b.col1), M33MulV3(a, b: b.col2)); |
2551 | } |
2552 | |
2553 | PX_FORCE_INLINE Mat33V M33Add(const Mat33V& a, const Mat33V& b) |
2554 | { |
2555 | return Mat33V(V3Add(a: a.col0, b: b.col0), V3Add(a: a.col1, b: b.col1), V3Add(a: a.col2, b: b.col2)); |
2556 | } |
2557 | |
2558 | PX_FORCE_INLINE Mat33V M33Scale(const Mat33V& a, const FloatV& b) |
2559 | { |
2560 | return Mat33V(V3Scale(a: a.col0, b), V3Scale(a: a.col1, b), V3Scale(a: a.col2, b)); |
2561 | } |
2562 | |
2563 | PX_FORCE_INLINE Mat33V M33Inverse(const Mat33V& a) |
2564 | { |
2565 | const BoolV tfft = BTFFT(); |
2566 | const BoolV tttf = BTTTF(); |
2567 | const FloatV zero = FZero(); |
2568 | const Vec3V cross01 = V3Cross(a: a.col0, b: a.col1); |
2569 | const Vec3V cross12 = V3Cross(a: a.col1, b: a.col2); |
2570 | const Vec3V cross20 = V3Cross(a: a.col2, b: a.col0); |
2571 | const FloatV dot = V3Dot(a: cross01, b: a.col2); |
2572 | const FloatV invDet = _mm_rcp_ps(a: dot); |
2573 | const Vec3V mergeh = _mm_unpacklo_ps(a: cross12, b: cross01); |
2574 | const Vec3V mergel = _mm_unpackhi_ps(a: cross12, b: cross01); |
2575 | Vec3V colInv0 = _mm_unpacklo_ps(a: mergeh, b: cross20); |
2576 | colInv0 = _mm_or_ps(a: _mm_andnot_ps(a: tttf, b: zero), b: _mm_and_ps(a: tttf, b: colInv0)); |
2577 | const Vec3V zppd = _mm_shuffle_ps(mergeh, cross20, _MM_SHUFFLE(3, 0, 0, 2)); |
2578 | const Vec3V pbwp = _mm_shuffle_ps(cross20, mergeh, _MM_SHUFFLE(3, 3, 1, 0)); |
2579 | const Vec3V colInv1 = _mm_or_ps(a: _mm_andnot_ps(a: BTFFT(), b: pbwp), b: _mm_and_ps(a: BTFFT(), b: zppd)); |
2580 | const Vec3V xppd = _mm_shuffle_ps(mergel, cross20, _MM_SHUFFLE(3, 0, 0, 0)); |
2581 | const Vec3V pcyp = _mm_shuffle_ps(cross20, mergel, _MM_SHUFFLE(3, 1, 2, 0)); |
2582 | const Vec3V colInv2 = _mm_or_ps(a: _mm_andnot_ps(a: tfft, b: pcyp), b: _mm_and_ps(a: tfft, b: xppd)); |
2583 | |
2584 | return Mat33V(_mm_mul_ps(a: colInv0, b: invDet), _mm_mul_ps(a: colInv1, b: invDet), _mm_mul_ps(a: colInv2, b: invDet)); |
2585 | } |
2586 | |
2587 | PX_FORCE_INLINE Mat33V M33Trnsps(const Mat33V& a) |
2588 | { |
2589 | return Mat33V(V3Merge(x: V3GetX(f: a.col0), y: V3GetX(f: a.col1), z: V3GetX(f: a.col2)), |
2590 | V3Merge(x: V3GetY(f: a.col0), y: V3GetY(f: a.col1), z: V3GetY(f: a.col2)), |
2591 | V3Merge(x: V3GetZ(f: a.col0), y: V3GetZ(f: a.col1), z: V3GetZ(f: a.col2))); |
2592 | } |
2593 | |
2594 | PX_FORCE_INLINE Mat33V M33Identity() |
2595 | { |
2596 | return Mat33V(V3UnitX(), V3UnitY(), V3UnitZ()); |
2597 | } |
2598 | |
2599 | PX_FORCE_INLINE Mat33V M33Sub(const Mat33V& a, const Mat33V& b) |
2600 | { |
2601 | return Mat33V(V3Sub(a: a.col0, b: b.col0), V3Sub(a: a.col1, b: b.col1), V3Sub(a: a.col2, b: b.col2)); |
2602 | } |
2603 | |
2604 | PX_FORCE_INLINE Mat33V M33Neg(const Mat33V& a) |
2605 | { |
2606 | return Mat33V(V3Neg(f: a.col0), V3Neg(f: a.col1), V3Neg(f: a.col2)); |
2607 | } |
2608 | |
2609 | PX_FORCE_INLINE Mat33V M33Abs(const Mat33V& a) |
2610 | { |
2611 | return Mat33V(V3Abs(a: a.col0), V3Abs(a: a.col1), V3Abs(a: a.col2)); |
2612 | } |
2613 | |
2614 | PX_FORCE_INLINE Mat33V PromoteVec3V(const Vec3V v) |
2615 | { |
2616 | const BoolV bTFFF = BTFFF(); |
2617 | const BoolV bFTFF = BFTFF(); |
2618 | const BoolV bFFTF = BTFTF(); |
2619 | |
2620 | const Vec3V zero = V3Zero(); |
2621 | |
2622 | return Mat33V(V3Sel(c: bTFFF, a: v, b: zero), V3Sel(c: bFTFF, a: v, b: zero), V3Sel(c: bFFTF, a: v, b: zero)); |
2623 | } |
2624 | |
2625 | PX_FORCE_INLINE Mat33V M33Diagonal(const Vec3VArg d) |
2626 | { |
2627 | const FloatV x = V3Mul(a: V3UnitX(), b: d); |
2628 | const FloatV y = V3Mul(a: V3UnitY(), b: d); |
2629 | const FloatV z = V3Mul(a: V3UnitZ(), b: d); |
2630 | return Mat33V(x, y, z); |
2631 | } |
2632 | |
2633 | ////////////////////////////////// |
2634 | // MAT34V |
2635 | ////////////////////////////////// |
2636 | |
2637 | PX_FORCE_INLINE Vec3V M34MulV3(const Mat34V& a, const Vec3V b) |
2638 | { |
2639 | const FloatV x = V3GetX(f: b); |
2640 | const FloatV y = V3GetY(f: b); |
2641 | const FloatV z = V3GetZ(f: b); |
2642 | const Vec3V v0 = V3Scale(a: a.col0, b: x); |
2643 | const Vec3V v1 = V3Scale(a: a.col1, b: y); |
2644 | const Vec3V v2 = V3Scale(a: a.col2, b: z); |
2645 | const Vec3V v0PlusV1 = V3Add(a: v0, b: v1); |
2646 | const Vec3V v0PlusV1Plusv2 = V3Add(a: v0PlusV1, b: v2); |
2647 | return V3Add(a: v0PlusV1Plusv2, b: a.col3); |
2648 | } |
2649 | |
2650 | PX_FORCE_INLINE Vec3V M34Mul33V3(const Mat34V& a, const Vec3V b) |
2651 | { |
2652 | const FloatV x = V3GetX(f: b); |
2653 | const FloatV y = V3GetY(f: b); |
2654 | const FloatV z = V3GetZ(f: b); |
2655 | const Vec3V v0 = V3Scale(a: a.col0, b: x); |
2656 | const Vec3V v1 = V3Scale(a: a.col1, b: y); |
2657 | const Vec3V v2 = V3Scale(a: a.col2, b: z); |
2658 | const Vec3V v0PlusV1 = V3Add(a: v0, b: v1); |
2659 | return V3Add(a: v0PlusV1, b: v2); |
2660 | } |
2661 | |
2662 | PX_FORCE_INLINE Vec3V M34TrnspsMul33V3(const Mat34V& a, const Vec3V b) |
2663 | { |
2664 | const FloatV x = V3Dot(a: a.col0, b); |
2665 | const FloatV y = V3Dot(a: a.col1, b); |
2666 | const FloatV z = V3Dot(a: a.col2, b); |
2667 | return V3Merge(x, y, z); |
2668 | } |
2669 | |
2670 | PX_FORCE_INLINE Mat34V M34MulM34(const Mat34V& a, const Mat34V& b) |
2671 | { |
2672 | return Mat34V(M34Mul33V3(a, b: b.col0), M34Mul33V3(a, b: b.col1), M34Mul33V3(a, b: b.col2), M34MulV3(a, b: b.col3)); |
2673 | } |
2674 | |
2675 | PX_FORCE_INLINE Mat33V M34MulM33(const Mat34V& a, const Mat33V& b) |
2676 | { |
2677 | return Mat33V(M34Mul33V3(a, b: b.col0), M34Mul33V3(a, b: b.col1), M34Mul33V3(a, b: b.col2)); |
2678 | } |
2679 | |
2680 | PX_FORCE_INLINE Mat33V M34Mul33MM34(const Mat34V& a, const Mat34V& b) |
2681 | { |
2682 | return Mat33V(M34Mul33V3(a, b: b.col0), M34Mul33V3(a, b: b.col1), M34Mul33V3(a, b: b.col2)); |
2683 | } |
2684 | |
2685 | PX_FORCE_INLINE Mat34V M34Add(const Mat34V& a, const Mat34V& b) |
2686 | { |
2687 | return Mat34V(V3Add(a: a.col0, b: b.col0), V3Add(a: a.col1, b: b.col1), V3Add(a: a.col2, b: b.col2), V3Add(a: a.col3, b: b.col3)); |
2688 | } |
2689 | |
2690 | PX_FORCE_INLINE Mat33V M34Trnsps33(const Mat34V& a) |
2691 | { |
2692 | return Mat33V(V3Merge(x: V3GetX(f: a.col0), y: V3GetX(f: a.col1), z: V3GetX(f: a.col2)), |
2693 | V3Merge(x: V3GetY(f: a.col0), y: V3GetY(f: a.col1), z: V3GetY(f: a.col2)), |
2694 | V3Merge(x: V3GetZ(f: a.col0), y: V3GetZ(f: a.col1), z: V3GetZ(f: a.col2))); |
2695 | } |
2696 | |
2697 | ////////////////////////////////// |
2698 | // MAT44V |
2699 | ////////////////////////////////// |
2700 | |
2701 | PX_FORCE_INLINE Vec4V M44MulV4(const Mat44V& a, const Vec4V b) |
2702 | { |
2703 | const FloatV x = V4GetX(f: b); |
2704 | const FloatV y = V4GetY(f: b); |
2705 | const FloatV z = V4GetZ(f: b); |
2706 | const FloatV w = V4GetW(f: b); |
2707 | |
2708 | const Vec4V v0 = V4Scale(a: a.col0, b: x); |
2709 | const Vec4V v1 = V4Scale(a: a.col1, b: y); |
2710 | const Vec4V v2 = V4Scale(a: a.col2, b: z); |
2711 | const Vec4V v3 = V4Scale(a: a.col3, b: w); |
2712 | const Vec4V v0PlusV1 = V4Add(a: v0, b: v1); |
2713 | const Vec4V v0PlusV1Plusv2 = V4Add(a: v0PlusV1, b: v2); |
2714 | return V4Add(a: v0PlusV1Plusv2, b: v3); |
2715 | } |
2716 | |
2717 | PX_FORCE_INLINE Vec4V M44TrnspsMulV4(const Mat44V& a, const Vec4V b) |
2718 | { |
2719 | PX_ALIGN(16, FloatV) dotProdArray[4] = { V4Dot(a: a.col0, b), V4Dot(a: a.col1, b), V4Dot(a: a.col2, b), V4Dot(a: a.col3, b) }; |
2720 | return V4Merge(floatVArray: dotProdArray); |
2721 | } |
2722 | |
2723 | PX_FORCE_INLINE Mat44V M44MulM44(const Mat44V& a, const Mat44V& b) |
2724 | { |
2725 | return Mat44V(M44MulV4(a, b: b.col0), M44MulV4(a, b: b.col1), M44MulV4(a, b: b.col2), M44MulV4(a, b: b.col3)); |
2726 | } |
2727 | |
2728 | PX_FORCE_INLINE Mat44V M44Add(const Mat44V& a, const Mat44V& b) |
2729 | { |
2730 | return Mat44V(V4Add(a: a.col0, b: b.col0), V4Add(a: a.col1, b: b.col1), V4Add(a: a.col2, b: b.col2), V4Add(a: a.col3, b: b.col3)); |
2731 | } |
2732 | |
2733 | PX_FORCE_INLINE Mat44V M44Trnsps(const Mat44V& a) |
2734 | { |
2735 | const Vec4V v0 = _mm_unpacklo_ps(a: a.col0, b: a.col2); |
2736 | const Vec4V v1 = _mm_unpackhi_ps(a: a.col0, b: a.col2); |
2737 | const Vec4V v2 = _mm_unpacklo_ps(a: a.col1, b: a.col3); |
2738 | const Vec4V v3 = _mm_unpackhi_ps(a: a.col1, b: a.col3); |
2739 | return Mat44V(_mm_unpacklo_ps(a: v0, b: v2), _mm_unpackhi_ps(a: v0, b: v2), _mm_unpacklo_ps(a: v1, b: v3), _mm_unpackhi_ps(a: v1, b: v3)); |
2740 | } |
2741 | |
2742 | PX_FORCE_INLINE Mat44V M44Inverse(const Mat44V& a) |
2743 | { |
2744 | __m128 minor0, minor1, minor2, minor3; |
2745 | __m128 row0, row1, row2, row3; |
2746 | __m128 det, tmp1; |
2747 | |
2748 | tmp1 = V4Zero(); |
2749 | row1 = V4Zero(); |
2750 | row3 = V4Zero(); |
2751 | |
2752 | row0 = a.col0; |
2753 | row1 = _mm_shuffle_ps(a.col1, a.col1, _MM_SHUFFLE(1, 0, 3, 2)); |
2754 | row2 = a.col2; |
2755 | row3 = _mm_shuffle_ps(a.col3, a.col3, _MM_SHUFFLE(1, 0, 3, 2)); |
2756 | |
2757 | tmp1 = _mm_mul_ps(a: row2, b: row3); |
2758 | tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); |
2759 | minor0 = _mm_mul_ps(a: row1, b: tmp1); |
2760 | minor1 = _mm_mul_ps(a: row0, b: tmp1); |
2761 | tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); |
2762 | minor0 = _mm_sub_ps(a: _mm_mul_ps(a: row1, b: tmp1), b: minor0); |
2763 | minor1 = _mm_sub_ps(a: _mm_mul_ps(a: row0, b: tmp1), b: minor1); |
2764 | minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E); |
2765 | |
2766 | tmp1 = _mm_mul_ps(a: row1, b: row2); |
2767 | tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); |
2768 | minor0 = _mm_add_ps(a: _mm_mul_ps(a: row3, b: tmp1), b: minor0); |
2769 | minor3 = _mm_mul_ps(a: row0, b: tmp1); |
2770 | tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); |
2771 | minor0 = _mm_sub_ps(a: minor0, b: _mm_mul_ps(a: row3, b: tmp1)); |
2772 | minor3 = _mm_sub_ps(a: _mm_mul_ps(a: row0, b: tmp1), b: minor3); |
2773 | minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E); |
2774 | |
2775 | tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), b: row3); |
2776 | tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); |
2777 | row2 = _mm_shuffle_ps(row2, row2, 0x4E); |
2778 | minor0 = _mm_add_ps(a: _mm_mul_ps(a: row2, b: tmp1), b: minor0); |
2779 | minor2 = _mm_mul_ps(a: row0, b: tmp1); |
2780 | tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); |
2781 | minor0 = _mm_sub_ps(a: minor0, b: _mm_mul_ps(a: row2, b: tmp1)); |
2782 | minor2 = _mm_sub_ps(a: _mm_mul_ps(a: row0, b: tmp1), b: minor2); |
2783 | minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E); |
2784 | |
2785 | tmp1 = _mm_mul_ps(a: row0, b: row1); |
2786 | tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); |
2787 | minor2 = _mm_add_ps(a: _mm_mul_ps(a: row3, b: tmp1), b: minor2); |
2788 | minor3 = _mm_sub_ps(a: _mm_mul_ps(a: row2, b: tmp1), b: minor3); |
2789 | tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); |
2790 | minor2 = _mm_sub_ps(a: _mm_mul_ps(a: row3, b: tmp1), b: minor2); |
2791 | minor3 = _mm_sub_ps(a: minor3, b: _mm_mul_ps(a: row2, b: tmp1)); |
2792 | |
2793 | tmp1 = _mm_mul_ps(a: row0, b: row3); |
2794 | tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); |
2795 | minor1 = _mm_sub_ps(a: minor1, b: _mm_mul_ps(a: row2, b: tmp1)); |
2796 | minor2 = _mm_add_ps(a: _mm_mul_ps(a: row1, b: tmp1), b: minor2); |
2797 | tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); |
2798 | minor1 = _mm_add_ps(a: _mm_mul_ps(a: row2, b: tmp1), b: minor1); |
2799 | minor2 = _mm_sub_ps(a: minor2, b: _mm_mul_ps(a: row1, b: tmp1)); |
2800 | |
2801 | tmp1 = _mm_mul_ps(a: row0, b: row2); |
2802 | tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); |
2803 | minor1 = _mm_add_ps(a: _mm_mul_ps(a: row3, b: tmp1), b: minor1); |
2804 | minor3 = _mm_sub_ps(a: minor3, b: _mm_mul_ps(a: row1, b: tmp1)); |
2805 | tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); |
2806 | minor1 = _mm_sub_ps(a: minor1, b: _mm_mul_ps(a: row3, b: tmp1)); |
2807 | minor3 = _mm_add_ps(a: _mm_mul_ps(a: row1, b: tmp1), b: minor3); |
2808 | |
2809 | det = _mm_mul_ps(a: row0, b: minor0); |
2810 | det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), b: det); |
2811 | det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), b: det); |
2812 | tmp1 = _mm_rcp_ss(a: det); |
2813 | #if 0 |
2814 | det = _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1))); |
2815 | det = _mm_shuffle_ps(det, det, 0x00); |
2816 | #else |
2817 | det = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(0, 0, 0, 0)); |
2818 | #endif |
2819 | |
2820 | minor0 = _mm_mul_ps(a: det, b: minor0); |
2821 | minor1 = _mm_mul_ps(a: det, b: minor1); |
2822 | minor2 = _mm_mul_ps(a: det, b: minor2); |
2823 | minor3 = _mm_mul_ps(a: det, b: minor3); |
2824 | Mat44V invTrans(minor0, minor1, minor2, minor3); |
2825 | return M44Trnsps(a: invTrans); |
2826 | } |
2827 | |
2828 | PX_FORCE_INLINE Vec4V V4LoadXYZW(const PxF32& x, const PxF32& y, const PxF32& z, const PxF32& w) |
2829 | { |
2830 | return _mm_set_ps(z: w, y: z, x: y, w: x); |
2831 | } |
2832 | |
2833 | /* |
2834 | // AP: work in progress - use proper SSE intrinsics where possible |
2835 | PX_FORCE_INLINE VecU16V V4U32PK(VecU32V a, VecU32V b) |
2836 | { |
2837 | VecU16V result; |
2838 | result.m128_u16[0] = PxU16(PxClamp<PxU32>((a).m128_u32[0], 0, 0xFFFF)); |
2839 | result.m128_u16[1] = PxU16(PxClamp<PxU32>((a).m128_u32[1], 0, 0xFFFF)); |
2840 | result.m128_u16[2] = PxU16(PxClamp<PxU32>((a).m128_u32[2], 0, 0xFFFF)); |
2841 | result.m128_u16[3] = PxU16(PxClamp<PxU32>((a).m128_u32[3], 0, 0xFFFF)); |
2842 | result.m128_u16[4] = PxU16(PxClamp<PxU32>((b).m128_u32[0], 0, 0xFFFF)); |
2843 | result.m128_u16[5] = PxU16(PxClamp<PxU32>((b).m128_u32[1], 0, 0xFFFF)); |
2844 | result.m128_u16[6] = PxU16(PxClamp<PxU32>((b).m128_u32[2], 0, 0xFFFF)); |
2845 | result.m128_u16[7] = PxU16(PxClamp<PxU32>((b).m128_u32[3], 0, 0xFFFF)); |
2846 | return result; |
2847 | } |
2848 | */ |
2849 | |
2850 | PX_FORCE_INLINE VecU32V V4U32Sel(const BoolV c, const VecU32V a, const VecU32V b) |
2851 | { |
2852 | return m128_I2F(n: _mm_or_si128(a: _mm_andnot_si128(a: m128_F2I(n: c), b: m128_F2I(n: b)), b: _mm_and_si128(a: m128_F2I(n: c), b: m128_F2I(n: a)))); |
2853 | } |
2854 | |
2855 | PX_FORCE_INLINE VecU32V V4U32or(VecU32V a, VecU32V b) |
2856 | { |
2857 | return m128_I2F(n: _mm_or_si128(a: m128_F2I(n: a), b: m128_F2I(n: b))); |
2858 | } |
2859 | |
2860 | PX_FORCE_INLINE VecU32V V4U32xor(VecU32V a, VecU32V b) |
2861 | { |
2862 | return m128_I2F(n: _mm_xor_si128(a: m128_F2I(n: a), b: m128_F2I(n: b))); |
2863 | } |
2864 | |
2865 | PX_FORCE_INLINE VecU32V V4U32and(VecU32V a, VecU32V b) |
2866 | { |
2867 | return m128_I2F(n: _mm_and_si128(a: m128_F2I(n: a), b: m128_F2I(n: b))); |
2868 | } |
2869 | |
2870 | PX_FORCE_INLINE VecU32V V4U32Andc(VecU32V a, VecU32V b) |
2871 | { |
2872 | return m128_I2F(n: _mm_andnot_si128(a: m128_F2I(n: b), b: m128_F2I(n: a))); |
2873 | } |
2874 | |
2875 | /* |
2876 | PX_FORCE_INLINE VecU16V V4U16Or(VecU16V a, VecU16V b) |
2877 | { |
2878 | return m128_I2F(_mm_or_si128(m128_F2I(a), m128_F2I(b))); |
2879 | } |
2880 | */ |
2881 | |
2882 | /* |
2883 | PX_FORCE_INLINE VecU16V V4U16And(VecU16V a, VecU16V b) |
2884 | { |
2885 | return m128_I2F(_mm_and_si128(m128_F2I(a), m128_F2I(b))); |
2886 | } |
2887 | */ |
2888 | |
2889 | /* |
2890 | PX_FORCE_INLINE VecU16V V4U16Andc(VecU16V a, VecU16V b) |
2891 | { |
2892 | return m128_I2F(_mm_andnot_si128(m128_F2I(b), m128_F2I(a))); |
2893 | } |
2894 | */ |
2895 | |
2896 | PX_FORCE_INLINE VecI32V I4Load(const PxI32 i) |
2897 | { |
2898 | return m128_F2I(n: _mm_load1_ps(p: reinterpret_cast<const PxF32*>(&i))); |
2899 | } |
2900 | |
2901 | PX_FORCE_INLINE VecI32V I4LoadU(const PxI32* i) |
2902 | { |
2903 | return m128_F2I(n: _mm_loadu_ps(p: reinterpret_cast<const PxF32*>(i))); |
2904 | } |
2905 | |
2906 | PX_FORCE_INLINE VecI32V I4LoadA(const PxI32* i) |
2907 | { |
2908 | return m128_F2I(n: _mm_load_ps(p: reinterpret_cast<const PxF32*>(i))); |
2909 | } |
2910 | |
2911 | PX_FORCE_INLINE VecI32V VecI32V_Add(const VecI32VArg a, const VecI32VArg b) |
2912 | { |
2913 | return _mm_add_epi32(a: a, b: b); |
2914 | } |
2915 | |
2916 | PX_FORCE_INLINE VecI32V VecI32V_Sub(const VecI32VArg a, const VecI32VArg b) |
2917 | { |
2918 | return _mm_sub_epi32(a: a, b: b); |
2919 | } |
2920 | |
2921 | PX_FORCE_INLINE BoolV VecI32V_IsGrtr(const VecI32VArg a, const VecI32VArg b) |
2922 | { |
2923 | return m128_I2F(n: _mm_cmpgt_epi32(a: a, b: b)); |
2924 | } |
2925 | |
2926 | PX_FORCE_INLINE BoolV VecI32V_IsEq(const VecI32VArg a, const VecI32VArg b) |
2927 | { |
2928 | return m128_I2F(n: _mm_cmpeq_epi32(a: a, b: b)); |
2929 | } |
2930 | |
2931 | PX_FORCE_INLINE VecI32V V4I32Sel(const BoolV c, const VecI32V a, const VecI32V b) |
2932 | { |
2933 | return _mm_or_si128(a: _mm_andnot_si128(a: m128_F2I(n: c), b: b), b: _mm_and_si128(a: m128_F2I(n: c), b: a)); |
2934 | } |
2935 | |
2936 | PX_FORCE_INLINE VecI32V VecI32V_Zero() |
2937 | { |
2938 | return _mm_setzero_si128(); |
2939 | } |
2940 | |
2941 | PX_FORCE_INLINE VecI32V VecI32V_One() |
2942 | { |
2943 | return I4Load(i: 1); |
2944 | } |
2945 | |
2946 | PX_FORCE_INLINE VecI32V VecI32V_Two() |
2947 | { |
2948 | return I4Load(i: 2); |
2949 | } |
2950 | |
2951 | PX_FORCE_INLINE VecI32V VecI32V_MinusOne() |
2952 | { |
2953 | return I4Load(i: -1); |
2954 | } |
2955 | |
2956 | PX_FORCE_INLINE VecU32V U4Zero() |
2957 | { |
2958 | return U4Load(i: 0); |
2959 | } |
2960 | |
2961 | PX_FORCE_INLINE VecU32V U4One() |
2962 | { |
2963 | return U4Load(i: 1); |
2964 | } |
2965 | |
2966 | PX_FORCE_INLINE VecU32V U4Two() |
2967 | { |
2968 | return U4Load(i: 2); |
2969 | } |
2970 | |
2971 | PX_FORCE_INLINE VecI32V VecI32V_Sel(const BoolV c, const VecI32VArg a, const VecI32VArg b) |
2972 | { |
2973 | return _mm_or_si128(a: _mm_andnot_si128(a: m128_F2I(n: c), b: b), b: _mm_and_si128(a: m128_F2I(n: c), b: a)); |
2974 | } |
2975 | |
2976 | PX_FORCE_INLINE VecShiftV VecI32V_PrepareShift(const VecI32VArg shift) |
2977 | { |
2978 | VecShiftV s; |
2979 | s.shift = VecI32V_Sel(c: BTFFF(), a: shift, b: VecI32V_Zero()); |
2980 | return s; |
2981 | } |
2982 | |
2983 | PX_FORCE_INLINE VecI32V VecI32V_LeftShift(const VecI32VArg a, const VecShiftVArg count) |
2984 | { |
2985 | return _mm_sll_epi32(a: a, count: count.shift); |
2986 | } |
2987 | |
2988 | PX_FORCE_INLINE VecI32V VecI32V_RightShift(const VecI32VArg a, const VecShiftVArg count) |
2989 | { |
2990 | return _mm_srl_epi32(a: a, count: count.shift); |
2991 | } |
2992 | |
2993 | PX_FORCE_INLINE VecI32V VecI32V_And(const VecI32VArg a, const VecI32VArg b) |
2994 | { |
2995 | return _mm_and_si128(a: a, b: b); |
2996 | } |
2997 | |
2998 | PX_FORCE_INLINE VecI32V VecI32V_Or(const VecI32VArg a, const VecI32VArg b) |
2999 | { |
3000 | return _mm_or_si128(a: a, b: b); |
3001 | } |
3002 | |
3003 | PX_FORCE_INLINE VecI32V VecI32V_GetX(const VecI32VArg a) |
3004 | { |
3005 | return m128_F2I(_mm_shuffle_ps(m128_I2F(a), m128_I2F(a), _MM_SHUFFLE(0, 0, 0, 0))); |
3006 | } |
3007 | |
3008 | PX_FORCE_INLINE VecI32V VecI32V_GetY(const VecI32VArg a) |
3009 | { |
3010 | return m128_F2I(_mm_shuffle_ps(m128_I2F(a), m128_I2F(a), _MM_SHUFFLE(1, 1, 1, 1))); |
3011 | } |
3012 | |
3013 | PX_FORCE_INLINE VecI32V VecI32V_GetZ(const VecI32VArg a) |
3014 | { |
3015 | return m128_F2I(_mm_shuffle_ps(m128_I2F(a), m128_I2F(a), _MM_SHUFFLE(2, 2, 2, 2))); |
3016 | } |
3017 | |
3018 | PX_FORCE_INLINE VecI32V VecI32V_GetW(const VecI32VArg a) |
3019 | { |
3020 | return m128_F2I(_mm_shuffle_ps(m128_I2F(a), m128_I2F(a), _MM_SHUFFLE(3, 3, 3, 3))); |
3021 | } |
3022 | |
3023 | PX_FORCE_INLINE void PxI32_From_VecI32V(const VecI32VArg a, PxI32* i) |
3024 | { |
3025 | _mm_store_ss(p: reinterpret_cast<PxF32*>(i), a: m128_I2F(n: a)); |
3026 | } |
3027 | |
3028 | PX_FORCE_INLINE VecI32V VecI32V_Merge(const VecI32VArg x, const VecI32VArg y, const VecI32VArg z, const VecI32VArg w) |
3029 | { |
3030 | const __m128 xw = _mm_move_ss(a: m128_I2F(n: y), b: m128_I2F(n: x)); // y, y, y, x |
3031 | const __m128 yz = _mm_move_ss(a: m128_I2F(n: z), b: m128_I2F(n: w)); // z, z, z, w |
3032 | return m128_F2I(_mm_shuffle_ps(xw, yz, _MM_SHUFFLE(0, 2, 1, 0))); |
3033 | } |
3034 | |
3035 | PX_FORCE_INLINE VecI32V VecI32V_From_BoolV(const BoolVArg a) |
3036 | { |
3037 | return m128_F2I(n: a); |
3038 | } |
3039 | |
3040 | PX_FORCE_INLINE VecU32V VecU32V_From_BoolV(const BoolVArg a) |
3041 | { |
3042 | return a; |
3043 | } |
3044 | |
3045 | /* |
3046 | template<int a> PX_FORCE_INLINE VecI32V V4ISplat() |
3047 | { |
3048 | VecI32V result; |
3049 | result.m128_i32[0] = a; |
3050 | result.m128_i32[1] = a; |
3051 | result.m128_i32[2] = a; |
3052 | result.m128_i32[3] = a; |
3053 | return result; |
3054 | } |
3055 | |
3056 | template<PxU32 a> PX_FORCE_INLINE VecU32V V4USplat() |
3057 | { |
3058 | VecU32V result; |
3059 | result.m128_u32[0] = a; |
3060 | result.m128_u32[1] = a; |
3061 | result.m128_u32[2] = a; |
3062 | result.m128_u32[3] = a; |
3063 | return result; |
3064 | } |
3065 | */ |
3066 | |
3067 | /* |
3068 | PX_FORCE_INLINE void V4U16StoreAligned(VecU16V val, VecU16V* address) |
3069 | { |
3070 | *address = val; |
3071 | } |
3072 | */ |
3073 | |
3074 | PX_FORCE_INLINE void V4U32StoreAligned(VecU32V val, VecU32V* address) |
3075 | { |
3076 | *address = val; |
3077 | } |
3078 | |
3079 | PX_FORCE_INLINE Vec4V V4LoadAligned(Vec4V* addr) |
3080 | { |
3081 | return *addr; |
3082 | } |
3083 | |
3084 | PX_FORCE_INLINE Vec4V V4LoadUnaligned(Vec4V* addr) |
3085 | { |
3086 | return V4LoadU(f: reinterpret_cast<float*>(addr)); |
3087 | } |
3088 | |
3089 | PX_FORCE_INLINE Vec4V V4Andc(const Vec4V a, const VecU32V b) |
3090 | { |
3091 | VecU32V result32(a); |
3092 | result32 = V4U32Andc(a: result32, b); |
3093 | return Vec4V(result32); |
3094 | } |
3095 | |
3096 | PX_FORCE_INLINE VecU32V V4IsGrtrV32u(const Vec4V a, const Vec4V b) |
3097 | { |
3098 | return V4IsGrtr(a, b); |
3099 | } |
3100 | |
3101 | PX_FORCE_INLINE VecU16V V4U16LoadAligned(VecU16V* addr) |
3102 | { |
3103 | return *addr; |
3104 | } |
3105 | |
3106 | PX_FORCE_INLINE VecU16V V4U16LoadUnaligned(VecU16V* addr) |
3107 | { |
3108 | return *addr; |
3109 | } |
3110 | |
3111 | PX_FORCE_INLINE VecU16V V4U16CompareGt(VecU16V a, VecU16V b) |
3112 | { |
3113 | // _mm_cmpgt_epi16 doesn't work for unsigned values unfortunately |
3114 | // return m128_I2F(_mm_cmpgt_epi16(m128_F2I(a), m128_F2I(b))); |
3115 | VecU16V result; |
3116 | result.m128_u16[0] = (a).m128_u16[0] > (b).m128_u16[0]; |
3117 | result.m128_u16[1] = (a).m128_u16[1] > (b).m128_u16[1]; |
3118 | result.m128_u16[2] = (a).m128_u16[2] > (b).m128_u16[2]; |
3119 | result.m128_u16[3] = (a).m128_u16[3] > (b).m128_u16[3]; |
3120 | result.m128_u16[4] = (a).m128_u16[4] > (b).m128_u16[4]; |
3121 | result.m128_u16[5] = (a).m128_u16[5] > (b).m128_u16[5]; |
3122 | result.m128_u16[6] = (a).m128_u16[6] > (b).m128_u16[6]; |
3123 | result.m128_u16[7] = (a).m128_u16[7] > (b).m128_u16[7]; |
3124 | return result; |
3125 | } |
3126 | |
3127 | PX_FORCE_INLINE VecU16V V4I16CompareGt(VecU16V a, VecU16V b) |
3128 | { |
3129 | return m128_I2F(n: _mm_cmpgt_epi16(a: m128_F2I(n: a), b: m128_F2I(n: b))); |
3130 | } |
3131 | |
3132 | PX_FORCE_INLINE Vec4V Vec4V_From_VecU32V(VecU32V a) |
3133 | { |
3134 | Vec4V result = V4LoadXYZW(x: PxF32(a.m128_u32[0]), y: PxF32(a.m128_u32[1]), z: PxF32(a.m128_u32[2]), w: PxF32(a.m128_u32[3])); |
3135 | return result; |
3136 | } |
3137 | |
3138 | PX_FORCE_INLINE Vec4V Vec4V_From_VecI32V(VecI32V in) |
3139 | { |
3140 | return _mm_cvtepi32_ps(a: in); |
3141 | } |
3142 | |
3143 | PX_FORCE_INLINE VecI32V VecI32V_From_Vec4V(Vec4V a) |
3144 | { |
3145 | return _mm_cvttps_epi32(a: a); |
3146 | } |
3147 | |
3148 | PX_FORCE_INLINE Vec4V Vec4V_ReinterpretFrom_VecU32V(VecU32V a) |
3149 | { |
3150 | return Vec4V(a); |
3151 | } |
3152 | |
3153 | PX_FORCE_INLINE Vec4V Vec4V_ReinterpretFrom_VecI32V(VecI32V a) |
3154 | { |
3155 | return m128_I2F(n: a); |
3156 | } |
3157 | |
3158 | PX_FORCE_INLINE VecU32V VecU32V_ReinterpretFrom_Vec4V(Vec4V a) |
3159 | { |
3160 | return VecU32V(a); |
3161 | } |
3162 | |
3163 | PX_FORCE_INLINE VecI32V VecI32V_ReinterpretFrom_Vec4V(Vec4V a) |
3164 | { |
3165 | return m128_F2I(n: a); |
3166 | } |
3167 | |
3168 | /* |
3169 | template<int index> PX_FORCE_INLINE BoolV BSplatElement(BoolV a) |
3170 | { |
3171 | BoolV result; |
3172 | result[0] = result[1] = result[2] = result[3] = a[index]; |
3173 | return result; |
3174 | } |
3175 | */ |
3176 | |
3177 | template <int index> |
3178 | BoolV BSplatElement(BoolV a) |
3179 | { |
3180 | float* data = reinterpret_cast<float*>(&a); |
3181 | return V4Load(f: data[index]); |
3182 | } |
3183 | |
3184 | template <int index> |
3185 | PX_FORCE_INLINE VecU32V V4U32SplatElement(VecU32V a) |
3186 | { |
3187 | VecU32V result; |
3188 | result.m128_u32[0] = result.m128_u32[1] = result.m128_u32[2] = result.m128_u32[3] = a.m128_u32[index]; |
3189 | return result; |
3190 | } |
3191 | |
3192 | template <int index> |
3193 | PX_FORCE_INLINE Vec4V V4SplatElement(Vec4V a) |
3194 | { |
3195 | float* data = reinterpret_cast<float*>(&a); |
3196 | return V4Load(f: data[index]); |
3197 | } |
3198 | |
3199 | PX_FORCE_INLINE VecU32V U4LoadXYZW(PxU32 x, PxU32 y, PxU32 z, PxU32 w) |
3200 | { |
3201 | VecU32V result; |
3202 | result.m128_u32[0] = x; |
3203 | result.m128_u32[1] = y; |
3204 | result.m128_u32[2] = z; |
3205 | result.m128_u32[3] = w; |
3206 | return result; |
3207 | } |
3208 | |
3209 | PX_FORCE_INLINE Vec4V V4Ceil(const Vec4V in) |
3210 | { |
3211 | UnionM128 a(in); |
3212 | return V4LoadXYZW(x: PxCeil(a: a.m128_f32[0]), y: PxCeil(a: a.m128_f32[1]), z: PxCeil(a: a.m128_f32[2]), w: PxCeil(a: a.m128_f32[3])); |
3213 | } |
3214 | |
3215 | PX_FORCE_INLINE Vec4V V4Floor(const Vec4V in) |
3216 | { |
3217 | UnionM128 a(in); |
3218 | return V4LoadXYZW(x: PxFloor(a: a.m128_f32[0]), y: PxFloor(a: a.m128_f32[1]), z: PxFloor(a: a.m128_f32[2]), w: PxFloor(a: a.m128_f32[3])); |
3219 | } |
3220 | |
3221 | PX_FORCE_INLINE VecU32V V4ConvertToU32VSaturate(const Vec4V in, PxU32 power) |
3222 | { |
3223 | PX_ASSERT(power == 0 && "Non-zero power not supported in convertToU32VSaturate" ); |
3224 | PX_UNUSED(power); // prevent warning in release builds |
3225 | PxF32 ffffFFFFasFloat = PxF32(0xFFFF0000); |
3226 | UnionM128 a(in); |
3227 | VecU32V result; |
3228 | result.m128_u32[0] = PxU32(PxClamp<PxF32>(v: (a).m128_f32[0], lo: 0.0f, hi: ffffFFFFasFloat)); |
3229 | result.m128_u32[1] = PxU32(PxClamp<PxF32>(v: (a).m128_f32[1], lo: 0.0f, hi: ffffFFFFasFloat)); |
3230 | result.m128_u32[2] = PxU32(PxClamp<PxF32>(v: (a).m128_f32[2], lo: 0.0f, hi: ffffFFFFasFloat)); |
3231 | result.m128_u32[3] = PxU32(PxClamp<PxF32>(v: (a).m128_f32[3], lo: 0.0f, hi: ffffFFFFasFloat)); |
3232 | return result; |
3233 | } |
3234 | |
3235 | } // namespace aos |
3236 | } // namespace shdfnd |
3237 | } // namespace physx |
3238 | |
3239 | #endif // PSFOUNDATION_PSUNIXSSE2INLINEAOS_H |
3240 | |