/****************************************************************************
**
** Copyright (C) 2016 Paul Lemire <paul.lemire350@gmail.com>
** Contact: https://www.qt.io/licensing/
**
** This file is part of the Qt3D module of the Qt Toolkit.
**
** $QT_BEGIN_LICENSE:LGPL$
** Commercial License Usage
** Licensees holding valid commercial Qt licenses may use this file in
** accordance with the commercial license agreement provided with the
** Software or, alternatively, in accordance with the terms contained in
** a written agreement between you and The Qt Company. For licensing terms
** and conditions see https://www.qt.io/terms-conditions. For further
** information use the contact form at https://www.qt.io/contact-us.
**
** GNU Lesser General Public License Usage
** Alternatively, this file may be used under the terms of the GNU Lesser
** General Public License version 3 as published by the Free Software
** Foundation and appearing in the file LICENSE.LGPL3 included in the
** packaging of this file. Please review the following information to
** ensure the GNU Lesser General Public License version 3 requirements
** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
**
** GNU General Public License Usage
** Alternatively, this file may be used under the terms of the GNU
** General Public License version 2.0 or (at your option) the GNU General
** Public license version 3 or any later version approved by the KDE Free
** Qt Foundation. The licenses are as published by the Free Software
** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
** included in the packaging of this file. Please review the following
** information to ensure the GNU General Public License requirements will
** be met: https://www.gnu.org/licenses/gpl-2.0.html and
** https://www.gnu.org/licenses/gpl-3.0.html.
**
** $QT_END_LICENSE$
**
****************************************************************************/

#ifndef QT3DCORE_MATRIX4X4_SSE_P_H
#define QT3DCORE_MATRIX4X4_SSE_P_H

//
// W A R N I N G
// -------------
//
// This file is not part of the Qt3D API. It exists purely as an
// implementation detail. This header file may change from version to
// version without notice, or even be removed.
//
// We mean it.
//

#include <Qt3DCore/private/vector4d_p.h>
#include <Qt3DCore/private/vector3d_p.h>
#include <private/qsimd_p.h>
#include <QMatrix4x4>

#ifdef QT_COMPILER_SUPPORTS_SSE2

QT_BEGIN_NAMESPACE

namespace Qt3DCore {

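// SSE2-backed 4x4 float matrix. The four columns live in __m128 registers
// (column-major, matching QMatrix4x4::constData()), and the mIJ() accessors
// return the element at row I, column J, following QMatrix4x4's naming.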
class Matrix4x4_SSE
{
public:

    Q_ALWAYS_INLINE Matrix4x4_SSE() { setToIdentity(); }
    explicit Q_ALWAYS_INLINE Matrix4x4_SSE(Qt::Initialization) {}

    // QMatrix4x4::constData returns data in column-major order
    explicit Q_ALWAYS_INLINE Matrix4x4_SSE(const QMatrix4x4 &mat)
    {
        // data may not be properly aligned, so use unaligned loads
        const float *data = mat.constData();
        m_col1 = _mm_loadu_ps(data);
        m_col2 = _mm_loadu_ps(data + 4);
        m_col3 = _mm_loadu_ps(data + 8);
        m_col4 = _mm_loadu_ps(data + 12);
    }

    // Assumes data is 16-byte aligned (and in column-major order)
    explicit Q_ALWAYS_INLINE Matrix4x4_SSE(float *data)
    {
        m_col1 = _mm_load_ps(data);
        m_col2 = _mm_load_ps(data + 4);
        m_col3 = _mm_load_ps(data + 8);
        m_col4 = _mm_load_ps(data + 12);
    }

    // Arguments are given in row-major order, but we store in column-major order
    explicit Q_ALWAYS_INLINE Matrix4x4_SSE(float m11, float m12, float m13, float m14,
                                           float m21, float m22, float m23, float m24,
                                           float m31, float m32, float m33, float m34,
                                           float m41, float m42, float m43, float m44)
    {
        m_col1 = _mm_set_ps(m41, m31, m21, m11);
        m_col2 = _mm_set_ps(m42, m32, m22, m12);
        m_col3 = _mm_set_ps(m43, m33, m23, m13);
        m_col4 = _mm_set_ps(m44, m34, m24, m14);
    }

    Q_ALWAYS_INLINE void setToIdentity()
    {
        m_col1 = _mm_set_ss(1.0f);
        m_col2 = _mm_set_ps(0.0f, 0.0f, 1.0f, 0.0f);
        m_col3 = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f);
        m_col4 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
    }

    Q_ALWAYS_INLINE Matrix4x4_SSE operator*(const Matrix4x4_SSE &other) const
    {
        Matrix4x4_SSE c(Qt::Uninitialized);

        const __m128 c1 = m_col1;
        const __m128 c2 = m_col2;
        const __m128 c3 = m_col3;
        const __m128 c4 = m_col4;

        // Column j of the result is a linear combination of this matrix's
        // columns, weighted by column j of other (cij = sum_k mik * nkj):
        // col_j(c) = n1j * col1(m) + n2j * col2(m) + n3j * col3(m) + n4j * col4(m)

        // Result column 1: c11, c21, c31, c41
        __m128 tmp = _mm_mul_ps(_mm_set1_ps(other.m11()), c1);
        tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m21()), c2), tmp);
        tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m31()), c3), tmp);
        c.m_col1 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m41()), c4), tmp);

        // Result column 2: c12, c22, c32, c42
        tmp = _mm_mul_ps(_mm_set1_ps(other.m12()), c1);
        tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m22()), c2), tmp);
        tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m32()), c3), tmp);
        c.m_col2 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m42()), c4), tmp);

        // Result column 3: c13, c23, c33, c43
        tmp = _mm_mul_ps(_mm_set1_ps(other.m13()), c1);
        tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m23()), c2), tmp);
        tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m33()), c3), tmp);
        c.m_col3 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m43()), c4), tmp);

        // Result column 4: c14, c24, c34, c44
        tmp = _mm_mul_ps(_mm_set1_ps(other.m14()), c1);
        tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m24()), c2), tmp);
        tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m34()), c3), tmp);
        c.m_col4 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m44()), c4), tmp);

        return c;
    }

    Q_ALWAYS_INLINE Matrix4x4_SSE operator-(const Matrix4x4_SSE &other) const
    {
        Matrix4x4_SSE c(Qt::Uninitialized);

        c.m_col1 = _mm_sub_ps(m_col1, other.m_col1);
        c.m_col2 = _mm_sub_ps(m_col2, other.m_col2);
        c.m_col3 = _mm_sub_ps(m_col3, other.m_col3);
        c.m_col4 = _mm_sub_ps(m_col4, other.m_col4);

        return c;
    }

    Q_ALWAYS_INLINE Matrix4x4_SSE operator+(const Matrix4x4_SSE &other) const
    {
        Matrix4x4_SSE c(Qt::Uninitialized);

        c.m_col1 = _mm_add_ps(m_col1, other.m_col1);
        c.m_col2 = _mm_add_ps(m_col2, other.m_col2);
        c.m_col3 = _mm_add_ps(m_col3, other.m_col3);
        c.m_col4 = _mm_add_ps(m_col4, other.m_col4);

        return c;
    }

    Q_ALWAYS_INLINE Matrix4x4_SSE &operator*=(const Matrix4x4_SSE &other)
    {
        *this = *this * other;
        return *this;
    }

    Q_ALWAYS_INLINE Matrix4x4_SSE &operator-=(const Matrix4x4_SSE &other)
    {
        *this = *this - other;
        return *this;
    }

    Q_ALWAYS_INLINE Matrix4x4_SSE &operator+=(const Matrix4x4_SSE &other)
    {
        *this = *this + other;
        return *this;
    }

    Q_ALWAYS_INLINE Matrix4x4_SSE transposed() const
    {
        Matrix4x4_SSE c(Qt::Uninitialized);

        // ~113 instructions
        // 0b11011101 == 0xdd
        // 0b10001000 == 0x88
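        // tmp2/tmp4 gather lanes 0 and 2 of each pair of adjacent columns,
        // tmp1/tmp3 gather lanes 1 and 3; the second round of shuffles then
        // assembles each original row into a column of the result.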
        const __m128 tmp1 = _mm_shuffle_ps(m_col1, m_col2, 0xdd);
        const __m128 tmp2 = _mm_shuffle_ps(m_col1, m_col2, 0x88);
        const __m128 tmp3 = _mm_shuffle_ps(m_col3, m_col4, 0xdd);
        const __m128 tmp4 = _mm_shuffle_ps(m_col3, m_col4, 0x88);
        c.m_col1 = _mm_shuffle_ps(tmp2, tmp4, 0x88);
        c.m_col2 = _mm_shuffle_ps(tmp1, tmp3, 0x88);
        c.m_col3 = _mm_shuffle_ps(tmp2, tmp4, 0xdd);
        c.m_col4 = _mm_shuffle_ps(tmp1, tmp3, 0xdd);

        return c;
    }

    Q_ALWAYS_INLINE Matrix4x4_SSE inverted() const
    {
        // TO DO: Optimize
        const QMatrix4x4 mat = toQMatrix4x4();
        return Matrix4x4_SSE(mat.inverted());
    }

    Q_ALWAYS_INLINE bool operator==(const Matrix4x4_SSE &other) const
    {
        // 0b1111 == 0xf
        return (_mm_movemask_ps(_mm_cmpeq_ps(m_col1, other.m_col1)) == 0xf &&
                _mm_movemask_ps(_mm_cmpeq_ps(m_col2, other.m_col2)) == 0xf &&
                _mm_movemask_ps(_mm_cmpeq_ps(m_col3, other.m_col3)) == 0xf &&
                _mm_movemask_ps(_mm_cmpeq_ps(m_col4, other.m_col4)) == 0xf);
    }

    Q_ALWAYS_INLINE bool operator!=(const Matrix4x4_SSE &other) const
    {
        return !(*this == other);
    }

    Q_ALWAYS_INLINE float m11() const { return _mm_cvtss_f32(m_col1); }
    Q_ALWAYS_INLINE float m12() const { return _mm_cvtss_f32(m_col2); }
    Q_ALWAYS_INLINE float m13() const { return _mm_cvtss_f32(m_col3); }
    Q_ALWAYS_INLINE float m14() const { return _mm_cvtss_f32(m_col4); }

    Q_ALWAYS_INLINE float m21() const
    {
        // 0b01010101 = 0x55
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col1, m_col1, 0x55));
    }
    Q_ALWAYS_INLINE float m22() const
    {
        // 0b01010101 = 0x55
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col2, m_col2, 0x55));
    }
    Q_ALWAYS_INLINE float m23() const
    {
        // 0b01010101 = 0x55
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col3, m_col3, 0x55));
    }
    Q_ALWAYS_INLINE float m24() const
    {
        // 0b01010101 = 0x55
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col4, m_col4, 0x55));
    }

    Q_ALWAYS_INLINE float m31() const
    {
        // 0b10101010 = 0xaa
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col1, m_col1, 0xaa));
    }
    Q_ALWAYS_INLINE float m32() const
    {
        // 0b10101010 = 0xaa
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col2, m_col2, 0xaa));
    }
    Q_ALWAYS_INLINE float m33() const
    {
        // 0b10101010 = 0xaa
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col3, m_col3, 0xaa));
    }
    Q_ALWAYS_INLINE float m34() const
    {
        // 0b10101010 = 0xaa
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col4, m_col4, 0xaa));
    }

    Q_ALWAYS_INLINE float m41() const
    {
        // 0b11111111 = 0xff
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col1, m_col1, 0xff));
    }
    Q_ALWAYS_INLINE float m42() const
    {
        // 0b11111111 = 0xff
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col2, m_col2, 0xff));
    }
    Q_ALWAYS_INLINE float m43() const
    {
        // 0b11111111 = 0xff
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col3, m_col3, 0xff));
    }
    Q_ALWAYS_INLINE float m44() const
    {
        // 0b11111111 = 0xff
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col4, m_col4, 0xff));
    }

    Q_ALWAYS_INLINE Vector4D row(int index) const
    {
        switch (index) {
        case 0:
            return Vector4D(m11(), m12(), m13(), m14());
        case 1:
            return Vector4D(m21(), m22(), m23(), m24());
        case 2:
            return Vector4D(m31(), m32(), m33(), m34());
        case 3:
            return Vector4D(m41(), m42(), m43(), m44());
        default:
            Q_UNREACHABLE();
            return Vector4D();
        }
    }

    Q_ALWAYS_INLINE Vector4D column(int index) const
    {
        Vector4D c(Qt::Uninitialized);
        switch (index) {
        case 0:
            c.m_xyzw = m_col1;
            break;
        case 1:
            c.m_xyzw = m_col2;
            break;
        case 2:
            c.m_xyzw = m_col3;
            break;
        case 3:
            c.m_xyzw = m_col4;
            break;
        default:
            Q_UNREACHABLE();
            return Vector4D();
        }
        return c;
    }

    Q_ALWAYS_INLINE QMatrix4x4 toQMatrix4x4() const { return QMatrix4x4(m11(), m12(), m13(), m14(),
                                                                        m21(), m22(), m23(), m24(),
                                                                        m31(), m32(), m33(), m34(),
                                                                        m41(), m42(), m43(), m44()); }

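    // map() applies the full matrix, translation included; the Vector3D
    // overload additionally performs the perspective divide by w.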
    Q_ALWAYS_INLINE Vector3D_SSE map(const Vector3D_SSE &point) const
    {
        return *this * point;
    }

    Q_ALWAYS_INLINE Vector4D_SSE map(const Vector4D_SSE &point) const
    {
        return *this * point;
    }

    Q_ALWAYS_INLINE Vector3D_SSE mapVector(const Vector3D_SSE &vector) const
    {
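        // Only the upper-left 3x3 block (rotation/scale) is applied;
        // translation and any projective terms are ignored.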
        const Vector3D_SSE row1(m11(), m12(), m13());
        const Vector3D_SSE row2(m21(), m22(), m23());
        const Vector3D_SSE row3(m31(), m32(), m33());

        return Vector3D(Vector3D_SSE::dotProduct(row1, vector),
                        Vector3D_SSE::dotProduct(row2, vector),
                        Vector3D_SSE::dotProduct(row3, vector));
    }

    friend Q_ALWAYS_INLINE Vector4D operator*(const Vector4D &vector, const Matrix4x4_SSE &matrix);
    friend Q_ALWAYS_INLINE Vector4D operator*(const Matrix4x4_SSE &matrix, const Vector4D &vector);

    friend Q_ALWAYS_INLINE Vector3D operator*(const Vector3D &vector, const Matrix4x4_SSE &matrix);
    friend Q_ALWAYS_INLINE Vector3D operator*(const Matrix4x4_SSE &matrix, const Vector3D &vector);

    friend Q_3DCORE_PRIVATE_EXPORT QDebug operator<<(QDebug dbg, const Matrix4x4_SSE &m);

private:
    // Internally we store the matrix as indicated below
    // Q_DECL_ALIGN(16) // aligned on a 16-byte boundary for SSE (column major)
    // struct
    // {
    //     float m_m11, m_m21, m_m31, m_m41;
    //     float m_m12, m_m22, m_m32, m_m42;
    //     float m_m13, m_m23, m_m33, m_m43;
    //     float m_m14, m_m24, m_m34, m_m44;
    // };
    // struct
    // {
    //     float m[16];
    // };
    __m128 m_col1;
    __m128 m_col2;
    __m128 m_col3;
    __m128 m_col4;
};

Q_ALWAYS_INLINE Vector4D operator*(const Vector4D &vector, const Matrix4x4_SSE &matrix)
{
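    // Row vector times matrix: component i of the result is
    // dot(column i of the matrix, vector), built with horizontal adds.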
    const __m128 vCol1 = _mm_mul_ps(matrix.m_col1, vector.m_xyzw);
    const __m128 vCol2 = _mm_mul_ps(matrix.m_col2, vector.m_xyzw);
    const __m128 vCol3 = _mm_mul_ps(matrix.m_col3, vector.m_xyzw);
    const __m128 vCol4 = _mm_mul_ps(matrix.m_col4, vector.m_xyzw);

    // 0b01000100 == 0x44
    // 0b11101110 == 0xee

    // vCol1.x, vCol1.y, vCol2.x, vCol2.y
    __m128 tmp1 = _mm_shuffle_ps(vCol1, vCol2, 0x44);
    // vCol1.z, vCol1.w, vCol2.z, vCol2.w
    __m128 tmp2 = _mm_shuffle_ps(vCol1, vCol2, 0xee);

    // vCol1.x + vCol1.z, vCol1.y + vCol1.w, vCol2.x + vCol2.z, vCol2.y + vCol2.w
    const __m128 tmpSum01 = _mm_add_ps(tmp1, tmp2);

    // vCol3.x, vCol3.y, vCol4.x, vCol4.y
    tmp1 = _mm_shuffle_ps(vCol3, vCol4, 0x44);
    // vCol3.z, vCol3.w, vCol4.z, vCol4.w
    tmp2 = _mm_shuffle_ps(vCol3, vCol4, 0xee);

    // vCol3.x + vCol3.z, vCol3.y + vCol3.w, vCol4.x + vCol4.z, vCol4.y + vCol4.w
    const __m128 tmpSum02 = _mm_add_ps(tmp1, tmp2);

    // 0b10001000 == 0x88
    // 0b11011101 == 0xdd

    // vCol1.x + vCol1.z, vCol2.x + vCol2.z, vCol3.x + vCol3.z, vCol4.x + vCol4.z
    tmp1 = _mm_shuffle_ps(tmpSum01, tmpSum02, 0x88);
    // vCol1.y + vCol1.w, vCol2.y + vCol2.w, vCol3.y + vCol3.w, vCol4.y + vCol4.w
    tmp2 = _mm_shuffle_ps(tmpSum01, tmpSum02, 0xdd);

    Vector4D v(Qt::Uninitialized);
    v.m_xyzw = _mm_add_ps(tmp1, tmp2);
    return v;
}

Q_ALWAYS_INLINE Vector4D operator*(const Matrix4x4_SSE &matrix, const Vector4D &vector)
{
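    // M * v is computed as v * transpose(M), reusing the row-vector path above.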
    const Matrix4x4_SSE transposed = matrix.transposed();
    return vector * transposed;
}

Q_ALWAYS_INLINE Vector3D operator*(const Vector3D &vector, const Matrix4x4_SSE &matrix)
{
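    // Promote the 3D point to homogeneous coordinates (x, y, z, 1)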
    const __m128 vec4 = _mm_set_ps(1.0f, vector.z(), vector.y(), vector.x());

    const __m128 vCol1 = _mm_mul_ps(matrix.m_col1, vec4);
    const __m128 vCol2 = _mm_mul_ps(matrix.m_col2, vec4);
    const __m128 vCol3 = _mm_mul_ps(matrix.m_col3, vec4);
    const __m128 vCol4 = _mm_mul_ps(matrix.m_col4, vec4);

    // 0b01000100 == 0x44
    // 0b11101110 == 0xee

    // vCol1.x, vCol1.y, vCol2.x, vCol2.y
    __m128 tmp1 = _mm_shuffle_ps(vCol1, vCol2, 0x44);
    // vCol1.z, vCol1.w, vCol2.z, vCol2.w
    __m128 tmp2 = _mm_shuffle_ps(vCol1, vCol2, 0xee);

    // vCol1.x + vCol1.z, vCol1.y + vCol1.w, vCol2.x + vCol2.z, vCol2.y + vCol2.w
    const __m128 tmpSum01 = _mm_add_ps(tmp1, tmp2);

    // vCol3.x, vCol3.y, vCol4.x, vCol4.y
    tmp1 = _mm_shuffle_ps(vCol3, vCol4, 0x44);
    // vCol3.z, vCol3.w, vCol4.z, vCol4.w
    tmp2 = _mm_shuffle_ps(vCol3, vCol4, 0xee);

    // vCol3.x + vCol3.z, vCol3.y + vCol3.w, vCol4.x + vCol4.z, vCol4.y + vCol4.w
    const __m128 tmpSum02 = _mm_add_ps(tmp1, tmp2);

    // 0b10001000 == 0x88
    // 0b11011101 == 0xdd

    // vCol1.x + vCol1.z, vCol2.x + vCol2.z, vCol3.x + vCol3.z, vCol4.x + vCol4.z
    tmp1 = _mm_shuffle_ps(tmpSum01, tmpSum02, 0x88);
    // vCol1.y + vCol1.w, vCol2.y + vCol2.w, vCol3.y + vCol3.w, vCol4.y + vCol4.w
    tmp2 = _mm_shuffle_ps(tmpSum01, tmpSum02, 0xdd);

    const __m128 result = _mm_add_ps(tmp1, tmp2);
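    // Perspective divide: broadcast the resulting w and divide x, y, z by it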
    // 0b11111111 = 0xff
    const __m128 divisor = _mm_shuffle_ps(result, result, 0xff);
    Vector3D v(Qt::Uninitialized);
    v.m_xyzw = _mm_div_ps(result, divisor);
    return v;
}

Q_ALWAYS_INLINE Vector3D operator*(const Matrix4x4_SSE &matrix, const Vector3D &vector)
{
    const Matrix4x4_SSE transposed = matrix.transposed();
    return vector * transposed;
}

} // Qt3DCore

Q_DECLARE_TYPEINFO(Qt3DCore::Matrix4x4_SSE, Q_PRIMITIVE_TYPE);

QT_END_NAMESPACE

Q_DECLARE_METATYPE(Qt3DCore::Matrix4x4_SSE)

#endif // QT_COMPILER_SUPPORTS_SSE2

#endif // QT3DCORE_MATRIX4X4_SSE_P_H