1/****************************************************************************
2**
3** Copyright (C) 2016 Paul Lemire <paul.lemire350@gmail.com>
4** Contact: https://www.qt.io/licensing/
5**
6** This file is part of the Qt3D module of the Qt Toolkit.
7**
8** $QT_BEGIN_LICENSE:LGPL$
9** Commercial License Usage
10** Licensees holding valid commercial Qt licenses may use this file in
11** accordance with the commercial license agreement provided with the
12** Software or, alternatively, in accordance with the terms contained in
13** a written agreement between you and The Qt Company. For licensing terms
14** and conditions see https://www.qt.io/terms-conditions. For further
15** information use the contact form at https://www.qt.io/contact-us.
16**
17** GNU Lesser General Public License Usage
18** Alternatively, this file may be used under the terms of the GNU Lesser
19** General Public License version 3 as published by the Free Software
20** Foundation and appearing in the file LICENSE.LGPL3 included in the
21** packaging of this file. Please review the following information to
22** ensure the GNU Lesser General Public License version 3 requirements
23** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
24**
25** GNU General Public License Usage
26** Alternatively, this file may be used under the terms of the GNU
27** General Public License version 2.0 or (at your option) the GNU General
28** Public license version 3 or any later version approved by the KDE Free
29** Qt Foundation. The licenses are as published by the Free Software
30** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
31** included in the packaging of this file. Please review the following
32** information to ensure the GNU General Public License requirements will
33** be met: https://www.gnu.org/licenses/gpl-2.0.html and
34** https://www.gnu.org/licenses/gpl-3.0.html.
35**
36** $QT_END_LICENSE$
37**
38****************************************************************************/
39
40#ifndef QT3DCORE_MATRIX4X4_SSE_P_H
41#define QT3DCORE_MATRIX4X4_SSE_P_H
42
43//
44// W A R N I N G
45// -------------
46//
47// This file is not part of the Qt3D API. It exists purely as an
48// implementation detail. This header file may change from version to
49// version without notice, or even be removed.
50//
51// We mean it.
52//
53
54#include <Qt3DCore/private/vector4d_p.h>
55#include <Qt3DCore/private/vector3d_p.h>
56#include <private/qsimd_p.h>
57#include <QMatrix4x4>
58
59#ifdef QT_COMPILER_SUPPORTS_SSE2
60
61QT_BEGIN_NAMESPACE
62
63namespace Qt3DCore {
64
65class Matrix4x4_SSE
66{
67public:
68
69 Q_ALWAYS_INLINE Matrix4x4_SSE() { setToIdentity(); }
70 explicit Q_ALWAYS_INLINE Matrix4x4_SSE(Qt::Initialization) {}
71
72 // QMatrix4x4::constData returns in column major order
73 explicit Q_ALWAYS_INLINE Matrix4x4_SSE(const QMatrix4x4 &mat)
74 {
75 // data may not be properly aligned, using unaligned loads
76 const float *data = mat.constData();
77 m_col1 = _mm_loadu_ps(data);
78 m_col2 = _mm_loadu_ps(data + 4);
79 m_col3 = _mm_loadu_ps(data + 8);
80 m_col4 = _mm_loadu_ps(data + 12);
81 }
82
83 // Assumes data is 16 bytes aligned (and in column major order)
84 explicit Q_ALWAYS_INLINE Matrix4x4_SSE(float *data)
85 {
86 m_col1 = _mm_load_ps(data);
87 m_col2 = _mm_load_ps(data + 4);
88 m_col3 = _mm_load_ps(data + 8);
89 m_col4 = _mm_load_ps(data + 12);
90 }
91
92 // In (row major) but we store in column major order
93 explicit Q_ALWAYS_INLINE Matrix4x4_SSE(float m11, float m12, float m13, float m14,
94 float m21, float m22, float m23, float m24,
95 float m31, float m32, float m33, float m34,
96 float m41, float m42, float m43, float m44)
97 {
98 m_col1 = _mm_set_ps(m41, m31, m21, m11);
99 m_col2 = _mm_set_ps(m42, m32, m22, m12);
100 m_col3 = _mm_set_ps(m43, m33, m23, m13);
101 m_col4 = _mm_set_ps(m44, m34, m24, m14);
102 }
103
104 Q_ALWAYS_INLINE void setToIdentity()
105 {
106 m_col1 = _mm_set_ss(1.0f);
107 m_col2 = _mm_set_ps(0.0f, 0.0f, 1.0f, 0.0f);
108 m_col3 = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f);
109 m_col4 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
110 }
111
112 Q_ALWAYS_INLINE Matrix4x4_SSE operator*(const Matrix4x4_SSE &other) const
113 {
114 Matrix4x4_SSE c(Qt::Uninitialized);
115
116 const __m128 c1 = m_col1;
117 const __m128 c2 = m_col2;
118 const __m128 c3 = m_col3;
119 const __m128 c4 = m_col4;
120
121 // c11, c21, c31, c41
122 // 1) (m11 x n11), (m11 x n21), (m11 x n31), (m11 x n41)
123 // 2) (m11 x n11) + (m21 x n12), (m11 x n21) + (m21 x n22), (m11 x n31) + (m21 x n32), (m11 x n41) + (m21 x n42)
124 // 3) (m11 x n11) + (m21 x n21) + (m31 x n13), (m11 x n21) + (m21 x n22) + (m31 x n 23), (m11 x n31) + (m21 x n32) + (m31 x n33), (m11 x n41) + (m21 x n42) (m31 x n43)
125 // 4) (m11 x n11) + (m21 x n21) + (m31 x n13) + (m41 x n14), (m11 x n21) + (m21 x n22) + (m31 x n 23) + (m41 x n24), (m11 x n31) + (m21 x n32) + (m31 x n33) + (m41 x n34), (m11 x n41) + (m21 x n42) (m31 x n43) + (m41 x n44)
126 __m128 tmp = _mm_mul_ps(_mm_set1_ps(other.m11()), c1);
127 tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m21()), c2), tmp);
128 tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m31()), c3), tmp);
129 c.m_col1 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m41()), c4), tmp);
130
131 // c21, c22, c23, c24
132 tmp = _mm_mul_ps(_mm_set1_ps(other.m12()), c1);
133 tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m22()), c2), tmp);
134 tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m32()), c3), tmp);
135 c.m_col2 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m42()), c4), tmp);
136
137 // c31, c32, c33, c34
138 tmp = _mm_mul_ps(_mm_set1_ps(other.m13()), c1);
139 tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m23()), c2), tmp);
140 tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m33()), c3), tmp);
141 c.m_col3 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m43()), c4), tmp);
142
143 // c41, c42, c43, c44
144 tmp = _mm_mul_ps(_mm_set1_ps(other.m14()), c1);
145 tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m24()), c2), tmp);
146 tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m34()), c3), tmp);
147 c.m_col4 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m44()), c4), tmp);
148
149 return c;
150 }
151
152 Q_ALWAYS_INLINE Matrix4x4_SSE operator-(const Matrix4x4_SSE &other) const
153 {
154 Matrix4x4_SSE c(Qt::Uninitialized);
155
156 c.m_col1 = _mm_sub_ps(m_col1, other.m_col1);
157 c.m_col2 = _mm_sub_ps(m_col2, other.m_col2);
158 c.m_col3 = _mm_sub_ps(m_col3, other.m_col3);
159 c.m_col4 = _mm_sub_ps(m_col4, other.m_col4);
160
161 return c;
162 }
163
164 Q_ALWAYS_INLINE Matrix4x4_SSE operator+(const Matrix4x4_SSE &other) const
165 {
166 Matrix4x4_SSE c(Qt::Uninitialized);
167
168 c.m_col1 = _mm_add_ps(m_col1, other.m_col1);
169 c.m_col2 = _mm_add_ps(m_col2, other.m_col2);
170 c.m_col3 = _mm_add_ps(m_col3, other.m_col3);
171 c.m_col4 = _mm_add_ps(m_col4, other.m_col4);
172
173 return c;
174 }
175
176 Q_ALWAYS_INLINE Matrix4x4_SSE &operator*=(const Matrix4x4_SSE &other)
177 {
178 *this = *this * other;
179 return *this;
180 }
181
182 Q_ALWAYS_INLINE Matrix4x4_SSE &operator-=(const Matrix4x4_SSE &other)
183 {
184 *this = *this - other;
185 return *this;
186 }
187
188 Q_ALWAYS_INLINE Matrix4x4_SSE &operator+=(const Matrix4x4_SSE &other)
189 {
190 *this = *this + other;
191 return *this;
192 }
193
194 Q_ALWAYS_INLINE Matrix4x4_SSE transposed() const
195 {
196 Matrix4x4_SSE c(Qt::Uninitialized);
197
198 // ~113 instructions
199 // 0b11011101 == 0xdd
200 // 0b10001000 == 0x88
201 const __m128 tmp1 = _mm_shuffle_ps(m_col1, m_col2, 0xdd);
202 const __m128 tmp2 = _mm_shuffle_ps(m_col1, m_col2, 0x88);
203 const __m128 tmp3 = _mm_shuffle_ps(m_col3, m_col4, 0xdd);
204 const __m128 tmp4 = _mm_shuffle_ps(m_col3, m_col4, 0x88);
205 c.m_col1 = _mm_shuffle_ps(tmp2, tmp4, 0x88);
206 c.m_col2 = _mm_shuffle_ps(tmp1, tmp3, 0x88);
207 c.m_col3 = _mm_shuffle_ps(tmp2, tmp4, 0xdd);
208 c.m_col4 = _mm_shuffle_ps(tmp1, tmp3, 0xdd);
209
210 return c;
211 }
212
213 Q_ALWAYS_INLINE Matrix4x4_SSE inverted() const
214 {
215 // TO DO: Optimize
216 const QMatrix4x4 mat = toQMatrix4x4();
217 return Matrix4x4_SSE(mat.inverted());
218 }
219
220 Q_ALWAYS_INLINE bool operator==(const Matrix4x4_SSE &other) const
221 {
222 // 0b1111 == 0xf
223 return (_mm_movemask_ps(_mm_cmpeq_ps(m_col1, other.m_col1)) == 0xf &&
224 _mm_movemask_ps(_mm_cmpeq_ps(m_col2, other.m_col2)) == 0xf &&
225 _mm_movemask_ps(_mm_cmpeq_ps(m_col3, other.m_col3)) == 0xf &&
226 _mm_movemask_ps(_mm_cmpeq_ps(m_col4, other.m_col4)) == 0xf);
227 }
228
229 Q_ALWAYS_INLINE bool operator!=(const Matrix4x4_SSE &other) const
230 {
231 return !(*this == other);
232 }
233
234 Q_ALWAYS_INLINE float m11() const { return _mm_cvtss_f32(m_col1); }
235 Q_ALWAYS_INLINE float m12() const { return _mm_cvtss_f32(m_col2); }
236 Q_ALWAYS_INLINE float m13() const { return _mm_cvtss_f32(m_col3); }
237 Q_ALWAYS_INLINE float m14() const { return _mm_cvtss_f32(m_col4); }
238
239 Q_ALWAYS_INLINE float m21() const
240 {
241 // 0b01010101 = 0x55
242 return _mm_cvtss_f32(_mm_shuffle_ps(m_col1, m_col1, 0x55));
243 }
244 Q_ALWAYS_INLINE float m22() const
245 {
246 // 0b01010101 = 0x55
247 return _mm_cvtss_f32(_mm_shuffle_ps(m_col2, m_col2, 0x55));
248 }
249 Q_ALWAYS_INLINE float m23() const
250 {
251 // 0b01010101 = 0x55
252 return _mm_cvtss_f32(_mm_shuffle_ps(m_col3, m_col3, 0x55));
253 }
254 Q_ALWAYS_INLINE float m24() const
255 {
256 // 0b01010101 = 0x55
257 return _mm_cvtss_f32(_mm_shuffle_ps(m_col4, m_col4, 0x55));
258 }
259
260 Q_ALWAYS_INLINE float m31() const
261 {
262 // 0b10101010 = 0xaa
263 return _mm_cvtss_f32(_mm_shuffle_ps(m_col1, m_col1, 0xaa));
264 }
265 Q_ALWAYS_INLINE float m32() const
266 {
267 // 0b10101010 = 0xaa
268 return _mm_cvtss_f32(_mm_shuffle_ps(m_col2, m_col2, 0xaa));
269 }
270 Q_ALWAYS_INLINE float m33() const
271 {
272 // 0b10101010 = 0xaa
273 return _mm_cvtss_f32(_mm_shuffle_ps(m_col3, m_col3, 0xaa));
274 }
275 Q_ALWAYS_INLINE float m34() const
276 {
277 // 0b10101010 = 0xaa
278 return _mm_cvtss_f32(_mm_shuffle_ps(m_col4, m_col4, 0xaa));
279 }
280
281 Q_ALWAYS_INLINE float m41() const
282 {
283 // 0b11111111 = 0xff
284 return _mm_cvtss_f32(_mm_shuffle_ps(m_col1, m_col1, 0xff));
285 }
286 Q_ALWAYS_INLINE float m42() const
287 {
288 // 0b11111111 = 0xff
289 return _mm_cvtss_f32(_mm_shuffle_ps(m_col2, m_col2, 0xff));
290 }
291 Q_ALWAYS_INLINE float m43() const
292 {
293 // 0b11111111 = 0xff
294 return _mm_cvtss_f32(_mm_shuffle_ps(m_col3, m_col3, 0xff));
295 }
296 Q_ALWAYS_INLINE float m44() const
297 {
298 // 0b11111111 = 0xff
299 return _mm_cvtss_f32(_mm_shuffle_ps(m_col4, m_col4, 0xff));
300 }
301
302 Q_ALWAYS_INLINE Vector4D row(int index) const
303 {
304 switch (index) {
305 case 0:
306 return Vector4D(m11(), m12(), m13(), m14());
307 case 1:
308 return Vector4D(m21(), m22(), m23(), m24());
309 case 2:
310 return Vector4D(m31(), m32(), m33(), m34());
311 case 3:
312 return Vector4D(m41(), m42(), m43(), m44());
313 default:
314 Q_UNREACHABLE();
315 return Vector4D();
316 }
317 }
318
319 Q_ALWAYS_INLINE Vector4D column(int index) const
320 {
321 Vector4D c(Qt::Uninitialized);
322 switch (index) {
323 case 0:
324 c.m_xyzw = m_col1;
325 break;
326 case 1:
327 c.m_xyzw = m_col2;
328 break;
329 case 2:
330 c.m_xyzw = m_col3;
331 break;
332 case 3:
333 c.m_xyzw = m_col4;
334 break;
335 default:
336 Q_UNREACHABLE();
337 return Vector4D();
338 }
339 return c;
340 }
341
342 Q_ALWAYS_INLINE QMatrix4x4 toQMatrix4x4() const { return QMatrix4x4(m11(), m12(), m13(), m14(),
343 m21(), m22(), m23(), m24(),
344 m31(), m32(), m33(), m34(),
345 m41(), m42(), m43(), m44()); }
346
347 Q_ALWAYS_INLINE Vector3D_SSE map(const Vector3D_SSE &point) const
348 {
349 return *this * point;
350 }
351
352 Q_ALWAYS_INLINE Vector4D_SSE map(const Vector4D_SSE &point) const
353 {
354 return *this * point;
355 }
356
357 Q_ALWAYS_INLINE Vector3D_SSE mapVector(const Vector3D_SSE &vector) const
358 {
359 const Vector3D_SSE row1(m11(), m12(), m13());
360 const Vector3D_SSE row2(m21(), m22(), m23());
361 const Vector3D_SSE row3(m31(), m32(), m33());
362
363 return Vector3D(Vector3D_SSE::dotProduct(row1, vector),
364 Vector3D_SSE::dotProduct(row2, vector),
365 Vector3D_SSE::dotProduct(row3, vector));
366 }
367
368 friend Q_ALWAYS_INLINE Vector4D operator*(const Vector4D &vector, const Matrix4x4_SSE &matrix);
369 friend Q_ALWAYS_INLINE Vector4D operator*(const Matrix4x4_SSE &matrix, const Vector4D &vector);
370
371 friend Q_ALWAYS_INLINE Vector3D operator*(const Vector3D &vector, const Matrix4x4_SSE &matrix);
372 friend Q_ALWAYS_INLINE Vector3D operator*(const Matrix4x4_SSE &matrix, const Vector3D &vector);
373
374 friend Q_3DCORE_PRIVATE_EXPORT QDebug operator<<(QDebug dbg, const Matrix4x4_SSE &m);
375
376private:
377 // Internally we will store the matrix as indicated below
378 // Q_DECL_ALIGN(16) // aligned on 16 bytes boundary for SSE (column major)
379 // struct
380 // {
381 // float m_m11, m_m21, m_m31, m_m41;
382 // float m_m12, m_m22, m_m32, m_m42;
383 // float m_m13, m_m23, m_m33, m_m43;
384 // float m_m14, m_m24, m_m34, m_m44;
385 // };
386 // struct
387 // {
388 // float m[16];
389 // };
390 __m128 m_col1;
391 __m128 m_col2;
392 __m128 m_col3;
393 __m128 m_col4;
394};
395
396Q_ALWAYS_INLINE Vector4D operator*(const Vector4D &vector, const Matrix4x4_SSE &matrix)
397{
398 const __m128 vCol1 = _mm_mul_ps(matrix.m_col1, vector.m_xyzw);
399 const __m128 vCol2 = _mm_mul_ps(matrix.m_col2, vector.m_xyzw);
400 const __m128 vCol3 = _mm_mul_ps(matrix.m_col3, vector.m_xyzw);
401 const __m128 vCol4 = _mm_mul_ps(matrix.m_col4, vector.m_xyzw);
402
403
404 // 0b01000100 == 0x44
405 // 0b11101110 == 0xee
406
407 // vCol1.x, vCol1.y, vCol2.x, vCol2.y
408 __m128 tmp1 = _mm_shuffle_ps(vCol1, vCol2, 0x44);
409 // vCol1.z, vCol1.w, vCol2.z, vCol2.w
410 __m128 tmp2 = _mm_shuffle_ps(vCol1, vCol2, 0xee);
411
412 // vCol1.x + vCol1.z, vCol1.y + vCol1.w, vCol2.x + vCol2.z, vCol2.y + vCol2.w,
413 const __m128 tmpSum01 = _mm_add_ps(tmp1, tmp2);
414
415 // vCol3.x, vCol3.y, vCol4.x, vCol4.y
416 tmp1 = _mm_shuffle_ps(vCol3, vCol4, 0x44);
417 // vCol3.z, vCol3.w, vCol4.z, vCol4.w
418 tmp2 = _mm_shuffle_ps(vCol3, vCol4, 0xee);
419
420 // vCol3.x + vCol3.z, vCol3.y + vCol3.w, vCol4.x + vCol4.z, vCol4.y + vCol4.w,
421 const __m128 tmpSum02 = _mm_add_ps(tmp1, tmp2);
422
423 // 0b10001000 == 0x88
424 // 0b11011101 == 0xdd
425
426 // vCol1.x + vCol1.z, vCol2.x + vCol2.z, vCol3.x + vCol3.z, vCol4.x + vCol4.z,
427 tmp1 = _mm_shuffle_ps(tmpSum01, tmpSum02, 0x88);
428 // vCol1.y + vCol1.w, vCol2.y + vCol2.w, vCol3.y + vCol3.w, vCol4.y + vCol4.w,
429 tmp2 = _mm_shuffle_ps(tmpSum01, tmpSum02, 0xdd);
430
431 Vector4D v(Qt::Uninitialized);
432 v.m_xyzw = _mm_add_ps(tmp1, tmp2);
433 return v;
434}
435
436Q_ALWAYS_INLINE Vector4D operator*(const Matrix4x4_SSE &matrix, const Vector4D &vector)
437{
438 const Matrix4x4_SSE transposed = matrix.transposed();
439 return vector * transposed;
440}
441
442Q_ALWAYS_INLINE Vector3D operator*(const Vector3D &vector, const Matrix4x4_SSE &matrix)
443{
444 const __m128 vec4 = _mm_set_ps(1.0f, vector.z(), vector.y(), vector.x());
445
446 const __m128 vCol1 = _mm_mul_ps(matrix.m_col1, vec4);
447 const __m128 vCol2 = _mm_mul_ps(matrix.m_col2, vec4);
448 const __m128 vCol3 = _mm_mul_ps(matrix.m_col3, vec4);
449 const __m128 vCol4 = _mm_mul_ps(matrix.m_col4, vec4);
450
451 // 0b01000100 == 0x44
452 // 0b11101110 == 0xee
453
454 // vCol1.x, vCol1.y, vCol2.x, vCol2.y
455 __m128 tmp1 = _mm_shuffle_ps(vCol1, vCol2, 0x44);
456 // vCol1.z, vCol1.w, vCol2.z, vCol2.w
457 __m128 tmp2 = _mm_shuffle_ps(vCol1, vCol2, 0xee);
458
459 // vCol1.x + vCol1.z, vCol1.y + vCol1.w, vCol2.x + vCol2.z, vCol2.y + vCol2.w,
460 const __m128 tmpSum01 = _mm_add_ps(tmp1, tmp2);
461
462 // vCol3.x, vCol3.y, vCol4.x, vCol4.y
463 tmp1 = _mm_shuffle_ps(vCol3, vCol4, 0x44);
464 // vCol3.z, vCol3.w, vCol4.z, vCol4.w
465 tmp2 = _mm_shuffle_ps(vCol3, vCol4, 0xee);
466
467 // vCol3.x + vCol3.z, vCol3.y + vCol3.w, vCol4.x + vCol4.z, vCol4.y + vCol4.w,
468 const __m128 tmpSum02 = _mm_add_ps(tmp1, tmp2);
469
470 // 0b10001000 == 0x88
471 // 0b11011101 == 0xdd
472
473 // vCol1.x + vCol1.z, vCol2.x + vCol2.z, vCol3.x + vCol3.z, vCol4.x + vCol4.z,
474 tmp1 = _mm_shuffle_ps(tmpSum01, tmpSum02, 0x88);
475 // vCol1.y + vCol1.w, vCol2.y + vCol2.w, vCol3.y + vCol3.w, vCol4.y + vCol4.w,
476 tmp2 = _mm_shuffle_ps(tmpSum01, tmpSum02, 0xdd);
477
478 const __m128 result = _mm_add_ps(tmp1, tmp2);
479 // 0b11111111 = 0xff
480 const __m128 divisor = _mm_shuffle_ps(result, result, 0xff);
481 Vector3D v(Qt::Uninitialized);
482 v.m_xyzw = _mm_div_ps(result, divisor);
483 return v;
484}
485
486Q_ALWAYS_INLINE Vector3D operator*(const Matrix4x4_SSE &matrix, const Vector3D &vector)
487{
488 const Matrix4x4_SSE transposed = matrix.transposed();
489 return vector * transposed;
490}
491
492} // Qt3DCore
493
494
495Q_DECLARE_TYPEINFO(Qt3DCore::Matrix4x4_SSE, Q_PRIMITIVE_TYPE);
496
497QT_END_NAMESPACE
498
499Q_DECLARE_METATYPE(Qt3DCore::Matrix4x4_SSE)
500
501#endif // QT_COMPILER_SUPPORTS_SSE2
502
503#endif // QT3DCORE_MATRIX4X4_SSE_P_H
504

// source code of qt3d/src/core/transforms/matrix4x4_sse_p.h