1 | // Copyright (C) 2016 Paul Lemire <paul.lemire350@gmail.com> |
2 | // SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only |
3 | |
4 | #ifndef QT3DCORE_MATRIX4X4_SSE_P_H |
5 | #define QT3DCORE_MATRIX4X4_SSE_P_H |
6 | |
7 | // |
8 | // W A R N I N G |
9 | // ------------- |
10 | // |
11 | // This file is not part of the Qt3D API. It exists purely as an |
12 | // implementation detail. This header file may change from version to |
13 | // version without notice, or even be removed. |
14 | // |
15 | // We mean it. |
16 | // |
17 | |
18 | #include <Qt3DCore/private/vector4d_p.h> |
19 | #include <Qt3DCore/private/vector3d_p.h> |
20 | #include <private/qsimd_p.h> |
21 | #include <QMatrix4x4> |
22 | |
23 | #if defined(__AVX2__) |
24 | #include "matrix4x4_avx2_p.h" |
25 | #elif defined(__SSE2__) |
26 | |
27 | QT_BEGIN_NAMESPACE |
28 | |
29 | namespace Qt3DCore { |
30 | |
31 | class Matrix4x4_SSE |
32 | { |
33 | public: |
34 | |
35 | Q_ALWAYS_INLINE Matrix4x4_SSE() { setToIdentity(); } |
36 | explicit Q_ALWAYS_INLINE Matrix4x4_SSE(Qt::Initialization) {} |
37 | |
    // QMatrix4x4::constData() returns data in column-major order
    explicit Q_ALWAYS_INLINE Matrix4x4_SSE(const QMatrix4x4 &mat)
    {
        // The data may not be 16-byte aligned, so use unaligned loads
        const float *data = mat.constData();
        m_col1 = _mm_loadu_ps(data);
        m_col2 = _mm_loadu_ps(data + 4);
        m_col3 = _mm_loadu_ps(data + 8);
        m_col4 = _mm_loadu_ps(data + 12);
47 | } |
48 | |
    // Assumes data is 16-byte aligned (and in column-major order)
    explicit Q_ALWAYS_INLINE Matrix4x4_SSE(float *data)
    {
        m_col1 = _mm_load_ps(data);
        m_col2 = _mm_load_ps(data + 4);
        m_col3 = _mm_load_ps(data + 8);
        m_col4 = _mm_load_ps(data + 12);
56 | } |
57 | |
    // Arguments are given in row-major order, but we store in column-major order
59 | explicit Q_ALWAYS_INLINE Matrix4x4_SSE(float m11, float m12, float m13, float m14, |
60 | float m21, float m22, float m23, float m24, |
61 | float m31, float m32, float m33, float m34, |
62 | float m41, float m42, float m43, float m44) |
63 | { |
        m_col1 = _mm_set_ps(m41, m31, m21, m11);
        m_col2 = _mm_set_ps(m42, m32, m22, m12);
        m_col3 = _mm_set_ps(m43, m33, m23, m13);
        m_col4 = _mm_set_ps(m44, m34, m24, m14);
68 | } |
69 | |
70 | Q_ALWAYS_INLINE void setToIdentity() |
71 | { |
        m_col1 = _mm_set_ss(1.0f);
        m_col2 = _mm_set_ps(0.0f, 0.0f, 1.0f, 0.0f);
        m_col3 = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f);
        m_col4 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
76 | } |
77 | |
78 | Q_ALWAYS_INLINE Matrix4x4_SSE operator*(const Matrix4x4_SSE &other) const |
79 | { |
80 | Matrix4x4_SSE c(Qt::Uninitialized); |
81 | |
82 | const __m128 c1 = m_col1; |
83 | const __m128 c2 = m_col2; |
84 | const __m128 c3 = m_col3; |
85 | const __m128 c4 = m_col4; |
86 | |
        // c11, c21, c31, c41 (first result column = n11*c1 + n21*c2 + n31*c3 + n41*c4)
        // 1) (m11 x n11), (m21 x n11), (m31 x n11), (m41 x n11)
        // 2) + (m12 x n21), + (m22 x n21), + (m32 x n21), + (m42 x n21)
        // 3) + (m13 x n31), + (m23 x n31), + (m33 x n31), + (m43 x n31)
        // 4) + (m14 x n41), + (m24 x n41), + (m34 x n41), + (m44 x n41)
        __m128 tmp = _mm_mul_ps(_mm_set1_ps(other.m11()), c1);
        tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m21()), c2), tmp);
        tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m31()), c3), tmp);
        c.m_col1 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m41()), c4), tmp);
96 | |
        // c12, c22, c32, c42
        tmp = _mm_mul_ps(_mm_set1_ps(other.m12()), c1);
        tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m22()), c2), tmp);
        tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m32()), c3), tmp);
        c.m_col2 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m42()), c4), tmp);
102 | |
        // c13, c23, c33, c43
        tmp = _mm_mul_ps(_mm_set1_ps(other.m13()), c1);
        tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m23()), c2), tmp);
        tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m33()), c3), tmp);
        c.m_col3 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m43()), c4), tmp);
108 | |
        // c14, c24, c34, c44
        tmp = _mm_mul_ps(_mm_set1_ps(other.m14()), c1);
        tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m24()), c2), tmp);
        tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m34()), c3), tmp);
        c.m_col4 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m44()), c4), tmp);
114 | |
115 | return c; |
116 | } |
117 | |
118 | Q_ALWAYS_INLINE Matrix4x4_SSE operator-(const Matrix4x4_SSE &other) const |
119 | { |
120 | Matrix4x4_SSE c(Qt::Uninitialized); |
121 | |
        c.m_col1 = _mm_sub_ps(m_col1, other.m_col1);
        c.m_col2 = _mm_sub_ps(m_col2, other.m_col2);
        c.m_col3 = _mm_sub_ps(m_col3, other.m_col3);
        c.m_col4 = _mm_sub_ps(m_col4, other.m_col4);
126 | |
127 | return c; |
128 | } |
129 | |
130 | Q_ALWAYS_INLINE Matrix4x4_SSE operator+(const Matrix4x4_SSE &other) const |
131 | { |
132 | Matrix4x4_SSE c(Qt::Uninitialized); |
133 | |
        c.m_col1 = _mm_add_ps(m_col1, other.m_col1);
        c.m_col2 = _mm_add_ps(m_col2, other.m_col2);
        c.m_col3 = _mm_add_ps(m_col3, other.m_col3);
        c.m_col4 = _mm_add_ps(m_col4, other.m_col4);
138 | |
139 | return c; |
140 | } |
141 | |
142 | Q_ALWAYS_INLINE Matrix4x4_SSE &operator*=(const Matrix4x4_SSE &other) |
143 | { |
144 | *this = *this * other; |
145 | return *this; |
146 | } |
147 | |
148 | Q_ALWAYS_INLINE Matrix4x4_SSE &operator-=(const Matrix4x4_SSE &other) |
149 | { |
150 | *this = *this - other; |
151 | return *this; |
152 | } |
153 | |
154 | Q_ALWAYS_INLINE Matrix4x4_SSE &operator+=(const Matrix4x4_SSE &other) |
155 | { |
156 | *this = *this + other; |
157 | return *this; |
158 | } |
159 | |
160 | Q_ALWAYS_INLINE Matrix4x4_SSE transposed() const |
161 | { |
162 | Matrix4x4_SSE c(Qt::Uninitialized); |
163 | |
164 | // ~113 instructions |
165 | // 0b11011101 == 0xdd |
166 | // 0b10001000 == 0x88 |
167 | const __m128 tmp1 = _mm_shuffle_ps(m_col1, m_col2, 0xdd); |
168 | const __m128 tmp2 = _mm_shuffle_ps(m_col1, m_col2, 0x88); |
169 | const __m128 tmp3 = _mm_shuffle_ps(m_col3, m_col4, 0xdd); |
170 | const __m128 tmp4 = _mm_shuffle_ps(m_col3, m_col4, 0x88); |
171 | c.m_col1 = _mm_shuffle_ps(tmp2, tmp4, 0x88); |
172 | c.m_col2 = _mm_shuffle_ps(tmp1, tmp3, 0x88); |
173 | c.m_col3 = _mm_shuffle_ps(tmp2, tmp4, 0xdd); |
174 | c.m_col4 = _mm_shuffle_ps(tmp1, tmp3, 0xdd); |
175 | |
176 | return c; |
177 | } |
178 | |
179 | Q_ALWAYS_INLINE Matrix4x4_SSE inverted() const |
180 | { |
181 | // TO DO: Optimize |
182 | const QMatrix4x4 mat = toQMatrix4x4(); |
183 | return Matrix4x4_SSE(mat.inverted()); |
184 | } |
185 | |
186 | Q_ALWAYS_INLINE bool operator==(const Matrix4x4_SSE &other) const |
187 | { |
188 | // 0b1111 == 0xf |
        return (_mm_movemask_ps(_mm_cmpeq_ps(m_col1, other.m_col1)) == 0xf &&
                _mm_movemask_ps(_mm_cmpeq_ps(m_col2, other.m_col2)) == 0xf &&
                _mm_movemask_ps(_mm_cmpeq_ps(m_col3, other.m_col3)) == 0xf &&
                _mm_movemask_ps(_mm_cmpeq_ps(m_col4, other.m_col4)) == 0xf);
193 | } |
194 | |
195 | Q_ALWAYS_INLINE bool operator!=(const Matrix4x4_SSE &other) const |
196 | { |
197 | return !(*this == other); |
198 | } |
199 | |
    Q_ALWAYS_INLINE float m11() const { return _mm_cvtss_f32(m_col1); }
    Q_ALWAYS_INLINE float m12() const { return _mm_cvtss_f32(m_col2); }
    Q_ALWAYS_INLINE float m13() const { return _mm_cvtss_f32(m_col3); }
    Q_ALWAYS_INLINE float m14() const { return _mm_cvtss_f32(m_col4); }
204 | |
205 | Q_ALWAYS_INLINE float m21() const |
206 | { |
207 | // 0b01010101 = 0x55 |
208 | return _mm_cvtss_f32(_mm_shuffle_ps(m_col1, m_col1, 0x55)); |
209 | } |
210 | Q_ALWAYS_INLINE float m22() const |
211 | { |
212 | // 0b01010101 = 0x55 |
213 | return _mm_cvtss_f32(_mm_shuffle_ps(m_col2, m_col2, 0x55)); |
214 | } |
215 | Q_ALWAYS_INLINE float m23() const |
216 | { |
217 | // 0b01010101 = 0x55 |
218 | return _mm_cvtss_f32(_mm_shuffle_ps(m_col3, m_col3, 0x55)); |
219 | } |
220 | Q_ALWAYS_INLINE float m24() const |
221 | { |
222 | // 0b01010101 = 0x55 |
223 | return _mm_cvtss_f32(_mm_shuffle_ps(m_col4, m_col4, 0x55)); |
224 | } |
225 | |
226 | Q_ALWAYS_INLINE float m31() const |
227 | { |
228 | // 0b10101010 = 0xaa |
229 | return _mm_cvtss_f32(_mm_shuffle_ps(m_col1, m_col1, 0xaa)); |
230 | } |
231 | Q_ALWAYS_INLINE float m32() const |
232 | { |
233 | // 0b10101010 = 0xaa |
234 | return _mm_cvtss_f32(_mm_shuffle_ps(m_col2, m_col2, 0xaa)); |
235 | } |
236 | Q_ALWAYS_INLINE float m33() const |
237 | { |
238 | // 0b10101010 = 0xaa |
239 | return _mm_cvtss_f32(_mm_shuffle_ps(m_col3, m_col3, 0xaa)); |
240 | } |
241 | Q_ALWAYS_INLINE float m34() const |
242 | { |
243 | // 0b10101010 = 0xaa |
244 | return _mm_cvtss_f32(_mm_shuffle_ps(m_col4, m_col4, 0xaa)); |
245 | } |
246 | |
247 | Q_ALWAYS_INLINE float m41() const |
248 | { |
249 | // 0b11111111 = 0xff |
250 | return _mm_cvtss_f32(_mm_shuffle_ps(m_col1, m_col1, 0xff)); |
251 | } |
252 | Q_ALWAYS_INLINE float m42() const |
253 | { |
254 | // 0b11111111 = 0xff |
255 | return _mm_cvtss_f32(_mm_shuffle_ps(m_col2, m_col2, 0xff)); |
256 | } |
257 | Q_ALWAYS_INLINE float m43() const |
258 | { |
259 | // 0b11111111 = 0xff |
260 | return _mm_cvtss_f32(_mm_shuffle_ps(m_col3, m_col3, 0xff)); |
261 | } |
262 | Q_ALWAYS_INLINE float m44() const |
263 | { |
264 | // 0b11111111 = 0xff |
265 | return _mm_cvtss_f32(_mm_shuffle_ps(m_col4, m_col4, 0xff)); |
266 | } |
267 | |
268 | Q_ALWAYS_INLINE Vector4D row(int index) const |
269 | { |
270 | switch (index) { |
271 | case 0: |
272 | return Vector4D(m11(), m12(), m13(), m14()); |
273 | case 1: |
274 | return Vector4D(m21(), m22(), m23(), m24()); |
275 | case 2: |
276 | return Vector4D(m31(), m32(), m33(), m34()); |
277 | case 3: |
278 | return Vector4D(m41(), m42(), m43(), m44()); |
279 | default: |
280 | Q_UNREACHABLE_RETURN(Vector4D()); |
281 | } |
282 | } |
283 | |
284 | Q_ALWAYS_INLINE Vector4D column(int index) const |
285 | { |
286 | Vector4D c(Qt::Uninitialized); |
287 | switch (index) { |
288 | case 0: |
289 | c.m_xyzw = m_col1; |
290 | break; |
291 | case 1: |
292 | c.m_xyzw = m_col2; |
293 | break; |
294 | case 2: |
295 | c.m_xyzw = m_col3; |
296 | break; |
297 | case 3: |
298 | c.m_xyzw = m_col4; |
299 | break; |
300 | default: |
301 | Q_UNREACHABLE_RETURN(Vector4D()); |
302 | } |
303 | return c; |
304 | } |
305 | |
306 | Q_ALWAYS_INLINE float operator()(int row, int column) const { |
        return this->row(row)[column];
308 | } |
309 | |
310 | Q_ALWAYS_INLINE QMatrix4x4 toQMatrix4x4() const { return QMatrix4x4(m11(), m12(), m13(), m14(), |
311 | m21(), m22(), m23(), m24(), |
312 | m31(), m32(), m33(), m34(), |
313 | m41(), m42(), m43(), m44()); } |
314 | |
315 | Q_ALWAYS_INLINE Vector3D_SSE map(const Vector3D_SSE &point) const |
316 | { |
317 | return *this * point; |
318 | } |
319 | |
320 | Q_ALWAYS_INLINE Vector4D_SSE map(const Vector4D_SSE &point) const |
321 | { |
322 | return *this * point; |
323 | } |
324 | |
325 | Q_ALWAYS_INLINE Vector3D_SSE mapVector(const Vector3D_SSE &vector) const |
326 | { |
327 | const Vector3D_SSE row1(m11(), m12(), m13()); |
328 | const Vector3D_SSE row2(m21(), m22(), m23()); |
329 | const Vector3D_SSE row3(m31(), m32(), m33()); |
330 | |
        return Vector3D(Vector3D_SSE::dotProduct(row1, vector),
                        Vector3D_SSE::dotProduct(row2, vector),
                        Vector3D_SSE::dotProduct(row3, vector));
334 | } |
335 | |
336 | friend Q_ALWAYS_INLINE Vector4D operator*(const Vector4D &vector, const Matrix4x4_SSE &matrix); |
337 | friend Q_ALWAYS_INLINE Vector4D operator*(const Matrix4x4_SSE &matrix, const Vector4D &vector); |
338 | |
339 | friend Q_ALWAYS_INLINE Vector3D operator*(const Vector3D &vector, const Matrix4x4_SSE &matrix); |
340 | friend Q_ALWAYS_INLINE Vector3D operator*(const Matrix4x4_SSE &matrix, const Vector3D &vector); |
341 | |
342 | friend Q_3DCORE_PRIVATE_EXPORT QDebug operator<<(QDebug dbg, const Matrix4x4_SSE &m); |
343 | |
344 | private: |
    // Internally we store the matrix as indicated below
    // Q_DECL_ALIGN(16) // aligned on a 16-byte boundary for SSE (column major)
347 | // struct |
348 | // { |
349 | // float m_m11, m_m21, m_m31, m_m41; |
350 | // float m_m12, m_m22, m_m32, m_m42; |
351 | // float m_m13, m_m23, m_m33, m_m43; |
352 | // float m_m14, m_m24, m_m34, m_m44; |
353 | // }; |
354 | // struct |
355 | // { |
356 | // float m[16]; |
357 | // }; |
358 | __m128 m_col1; |
359 | __m128 m_col2; |
360 | __m128 m_col3; |
361 | __m128 m_col4; |
362 | }; |
363 | |
364 | Q_ALWAYS_INLINE Vector4D operator*(const Vector4D &vector, const Matrix4x4_SSE &matrix) |
365 | { |
    const __m128 vCol1 = _mm_mul_ps(matrix.m_col1, vector.m_xyzw);
    const __m128 vCol2 = _mm_mul_ps(matrix.m_col2, vector.m_xyzw);
    const __m128 vCol3 = _mm_mul_ps(matrix.m_col3, vector.m_xyzw);
    const __m128 vCol4 = _mm_mul_ps(matrix.m_col4, vector.m_xyzw);

372 | // 0b01000100 == 0x44 |
373 | // 0b11101110 == 0xee |
374 | |
375 | // vCol1.x, vCol1.y, vCol2.x, vCol2.y |
376 | __m128 tmp1 = _mm_shuffle_ps(vCol1, vCol2, 0x44); |
377 | // vCol1.z, vCol1.w, vCol2.z, vCol2.w |
378 | __m128 tmp2 = _mm_shuffle_ps(vCol1, vCol2, 0xee); |
379 | |
380 | // vCol1.x + vCol1.z, vCol1.y + vCol1.w, vCol2.x + vCol2.z, vCol2.y + vCol2.w, |
    const __m128 tmpSum01 = _mm_add_ps(tmp1, tmp2);
382 | |
383 | // vCol3.x, vCol3.y, vCol4.x, vCol4.y |
384 | tmp1 = _mm_shuffle_ps(vCol3, vCol4, 0x44); |
385 | // vCol3.z, vCol3.w, vCol4.z, vCol4.w |
386 | tmp2 = _mm_shuffle_ps(vCol3, vCol4, 0xee); |
387 | |
388 | // vCol3.x + vCol3.z, vCol3.y + vCol3.w, vCol4.x + vCol4.z, vCol4.y + vCol4.w, |
    const __m128 tmpSum02 = _mm_add_ps(tmp1, tmp2);
390 | |
391 | // 0b10001000 == 0x88 |
392 | // 0b11011101 == 0xdd |
393 | |
394 | // vCol1.x + vCol1.z, vCol2.x + vCol2.z, vCol3.x + vCol3.z, vCol4.x + vCol4.z, |
395 | tmp1 = _mm_shuffle_ps(tmpSum01, tmpSum02, 0x88); |
396 | // vCol1.y + vCol1.w, vCol2.y + vCol2.w, vCol3.y + vCol3.w, vCol4.y + vCol4.w, |
397 | tmp2 = _mm_shuffle_ps(tmpSum01, tmpSum02, 0xdd); |
398 | |
399 | Vector4D v(Qt::Uninitialized); |
    v.m_xyzw = _mm_add_ps(tmp1, tmp2);
401 | return v; |
402 | } |
403 | |
404 | Q_ALWAYS_INLINE Vector4D operator*(const Matrix4x4_SSE &matrix, const Vector4D &vector) |
405 | { |
406 | const Matrix4x4_SSE transposed = matrix.transposed(); |
407 | return vector * transposed; |
408 | } |
409 | |
410 | Q_ALWAYS_INLINE Vector3D operator*(const Vector3D &vector, const Matrix4x4_SSE &matrix) |
411 | { |
    const __m128 vec4 = _mm_set_ps(1.0f, vector.z(), vector.y(), vector.x());
413 | |
    const __m128 vCol1 = _mm_mul_ps(matrix.m_col1, vec4);
    const __m128 vCol2 = _mm_mul_ps(matrix.m_col2, vec4);
    const __m128 vCol3 = _mm_mul_ps(matrix.m_col3, vec4);
    const __m128 vCol4 = _mm_mul_ps(matrix.m_col4, vec4);
418 | |
419 | // 0b01000100 == 0x44 |
420 | // 0b11101110 == 0xee |
421 | |
422 | // vCol1.x, vCol1.y, vCol2.x, vCol2.y |
423 | __m128 tmp1 = _mm_shuffle_ps(vCol1, vCol2, 0x44); |
424 | // vCol1.z, vCol1.w, vCol2.z, vCol2.w |
425 | __m128 tmp2 = _mm_shuffle_ps(vCol1, vCol2, 0xee); |
426 | |
427 | // vCol1.x + vCol1.z, vCol1.y + vCol1.w, vCol2.x + vCol2.z, vCol2.y + vCol2.w, |
    const __m128 tmpSum01 = _mm_add_ps(tmp1, tmp2);
429 | |
430 | // vCol3.x, vCol3.y, vCol4.x, vCol4.y |
431 | tmp1 = _mm_shuffle_ps(vCol3, vCol4, 0x44); |
432 | // vCol3.z, vCol3.w, vCol4.z, vCol4.w |
433 | tmp2 = _mm_shuffle_ps(vCol3, vCol4, 0xee); |
434 | |
435 | // vCol3.x + vCol3.z, vCol3.y + vCol3.w, vCol4.x + vCol4.z, vCol4.y + vCol4.w, |
    const __m128 tmpSum02 = _mm_add_ps(tmp1, tmp2);
437 | |
438 | // 0b10001000 == 0x88 |
439 | // 0b11011101 == 0xdd |
440 | |
441 | // vCol1.x + vCol1.z, vCol2.x + vCol2.z, vCol3.x + vCol3.z, vCol4.x + vCol4.z, |
442 | tmp1 = _mm_shuffle_ps(tmpSum01, tmpSum02, 0x88); |
443 | // vCol1.y + vCol1.w, vCol2.y + vCol2.w, vCol3.y + vCol3.w, vCol4.y + vCol4.w, |
444 | tmp2 = _mm_shuffle_ps(tmpSum01, tmpSum02, 0xdd); |
445 | |
    const __m128 result = _mm_add_ps(tmp1, tmp2);
447 | // 0b11111111 = 0xff |
448 | const __m128 divisor = _mm_shuffle_ps(result, result, 0xff); |
449 | Vector3D v(Qt::Uninitialized); |
    v.m_xyzw = _mm_div_ps(result, divisor);
451 | return v; |
452 | } |
453 | |
454 | Q_ALWAYS_INLINE Vector3D operator*(const Matrix4x4_SSE &matrix, const Vector3D &vector) |
455 | { |
456 | const Matrix4x4_SSE transposed = matrix.transposed(); |
457 | return vector * transposed; |
458 | } |
459 | |
460 | } // Qt3DCore |
461 | |
462 | |
463 | Q_DECLARE_TYPEINFO(Qt3DCore::Matrix4x4_SSE, Q_PRIMITIVE_TYPE); |
464 | |
465 | QT_END_NAMESPACE |
466 | |
467 | Q_DECLARE_METATYPE(Qt3DCore::Matrix4x4_SSE) |
468 | |
469 | #endif // __SSE2__ |
470 | |
471 | #endif // QT3DCORE_MATRIX4X4_SSE_P_H |
472 | |