| 1 | /**************************************************************************** |
| 2 | ** |
| 3 | ** Copyright (C) 2016 Paul Lemire <paul.lemire350@gmail.com> |
| 4 | ** Contact: https://www.qt.io/licensing/ |
| 5 | ** |
| 6 | ** This file is part of the Qt3D module of the Qt Toolkit. |
| 7 | ** |
| 8 | ** $QT_BEGIN_LICENSE:LGPL$ |
| 9 | ** Commercial License Usage |
| 10 | ** Licensees holding valid commercial Qt licenses may use this file in |
| 11 | ** accordance with the commercial license agreement provided with the |
| 12 | ** Software or, alternatively, in accordance with the terms contained in |
| 13 | ** a written agreement between you and The Qt Company. For licensing terms |
| 14 | ** and conditions see https://www.qt.io/terms-conditions. For further |
| 15 | ** information use the contact form at https://www.qt.io/contact-us. |
| 16 | ** |
| 17 | ** GNU Lesser General Public License Usage |
| 18 | ** Alternatively, this file may be used under the terms of the GNU Lesser |
| 19 | ** General Public License version 3 as published by the Free Software |
| 20 | ** Foundation and appearing in the file LICENSE.LGPL3 included in the |
| 21 | ** packaging of this file. Please review the following information to |
| 22 | ** ensure the GNU Lesser General Public License version 3 requirements |
| 23 | ** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. |
| 24 | ** |
| 25 | ** GNU General Public License Usage |
| 26 | ** Alternatively, this file may be used under the terms of the GNU |
| 27 | ** General Public License version 2.0 or (at your option) the GNU General |
| 28 | ** Public license version 3 or any later version approved by the KDE Free |
| 29 | ** Qt Foundation. The licenses are as published by the Free Software |
| 30 | ** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 |
| 31 | ** included in the packaging of this file. Please review the following |
| 32 | ** information to ensure the GNU General Public License requirements will |
| 33 | ** be met: https://www.gnu.org/licenses/gpl-2.0.html and |
| 34 | ** https://www.gnu.org/licenses/gpl-3.0.html. |
| 35 | ** |
| 36 | ** $QT_END_LICENSE$ |
| 37 | ** |
| 38 | ****************************************************************************/ |
| 39 | |
| 40 | #ifndef QT3DCORE_MATRIX4X4_SSE_P_H |
| 41 | #define QT3DCORE_MATRIX4X4_SSE_P_H |
| 42 | |
| 43 | // |
| 44 | // W A R N I N G |
| 45 | // ------------- |
| 46 | // |
| 47 | // This file is not part of the Qt3D API. It exists purely as an |
| 48 | // implementation detail. This header file may change from version to |
| 49 | // version without notice, or even be removed. |
| 50 | // |
| 51 | // We mean it. |
| 52 | // |
| 53 | |
| 54 | #include <Qt3DCore/private/vector4d_p.h> |
| 55 | #include <Qt3DCore/private/vector3d_p.h> |
| 56 | #include <private/qsimd_p.h> |
| 57 | #include <QMatrix4x4> |
| 58 | |
| 59 | #ifdef QT_COMPILER_SUPPORTS_SSE2 |
| 60 | |
| 61 | QT_BEGIN_NAMESPACE |
| 62 | |
| 63 | namespace Qt3DCore { |
| 64 | |
| 65 | class Matrix4x4_SSE |
| 66 | { |
| 67 | public: |
| 68 | |
| 69 | Q_ALWAYS_INLINE Matrix4x4_SSE() { setToIdentity(); } |
| 70 | explicit Q_ALWAYS_INLINE Matrix4x4_SSE(Qt::Initialization) {} |
| 71 | |
| 72 | // QMatrix4x4::constData returns in column major order |
| 73 | explicit Q_ALWAYS_INLINE Matrix4x4_SSE(const QMatrix4x4 &mat) |
| 74 | { |
| 75 | // data may not be properly aligned, using unaligned loads |
| 76 | const float *data = mat.constData(); |
| 77 | m_col1 = _mm_loadu_ps(p: data); |
| 78 | m_col2 = _mm_loadu_ps(p: data + 4); |
| 79 | m_col3 = _mm_loadu_ps(p: data + 8); |
| 80 | m_col4 = _mm_loadu_ps(p: data + 12); |
| 81 | } |
| 82 | |
| 83 | // Assumes data is 16 bytes aligned (and in column major order) |
| 84 | explicit Q_ALWAYS_INLINE Matrix4x4_SSE(float *data) |
| 85 | { |
| 86 | m_col1 = _mm_load_ps(p: data); |
| 87 | m_col2 = _mm_load_ps(p: data + 4); |
| 88 | m_col3 = _mm_load_ps(p: data + 8); |
| 89 | m_col4 = _mm_load_ps(p: data + 12); |
| 90 | } |
| 91 | |
| 92 | // In (row major) but we store in column major order |
| 93 | explicit Q_ALWAYS_INLINE Matrix4x4_SSE(float m11, float m12, float m13, float m14, |
| 94 | float m21, float m22, float m23, float m24, |
| 95 | float m31, float m32, float m33, float m34, |
| 96 | float m41, float m42, float m43, float m44) |
| 97 | { |
| 98 | m_col1 = _mm_set_ps(z: m41, y: m31, x: m21, w: m11); |
| 99 | m_col2 = _mm_set_ps(z: m42, y: m32, x: m22, w: m12); |
| 100 | m_col3 = _mm_set_ps(z: m43, y: m33, x: m23, w: m13); |
| 101 | m_col4 = _mm_set_ps(z: m44, y: m34, x: m24, w: m14); |
| 102 | } |
| 103 | |
| 104 | Q_ALWAYS_INLINE void setToIdentity() |
| 105 | { |
| 106 | m_col1 = _mm_set_ss(w: 1.0f); |
| 107 | m_col2 = _mm_set_ps(z: 0.0f, y: 0.0f, x: 1.0f, w: 0.0f); |
| 108 | m_col3 = _mm_set_ps(z: 0.0f, y: 1.0f, x: 0.0f, w: 0.0f); |
| 109 | m_col4 = _mm_set_ps(z: 1.0f, y: 0.0f, x: 0.0f, w: 0.0f); |
| 110 | } |
| 111 | |
| 112 | Q_ALWAYS_INLINE Matrix4x4_SSE operator*(const Matrix4x4_SSE &other) const |
| 113 | { |
| 114 | Matrix4x4_SSE c(Qt::Uninitialized); |
| 115 | |
| 116 | const __m128 c1 = m_col1; |
| 117 | const __m128 c2 = m_col2; |
| 118 | const __m128 c3 = m_col3; |
| 119 | const __m128 c4 = m_col4; |
| 120 | |
| 121 | // c11, c21, c31, c41 |
| 122 | // 1) (m11 x n11), (m11 x n21), (m11 x n31), (m11 x n41) |
| 123 | // 2) (m11 x n11) + (m21 x n12), (m11 x n21) + (m21 x n22), (m11 x n31) + (m21 x n32), (m11 x n41) + (m21 x n42) |
| 124 | // 3) (m11 x n11) + (m21 x n21) + (m31 x n13), (m11 x n21) + (m21 x n22) + (m31 x n 23), (m11 x n31) + (m21 x n32) + (m31 x n33), (m11 x n41) + (m21 x n42) (m31 x n43) |
| 125 | // 4) (m11 x n11) + (m21 x n21) + (m31 x n13) + (m41 x n14), (m11 x n21) + (m21 x n22) + (m31 x n 23) + (m41 x n24), (m11 x n31) + (m21 x n32) + (m31 x n33) + (m41 x n34), (m11 x n41) + (m21 x n42) (m31 x n43) + (m41 x n44) |
| 126 | __m128 tmp = _mm_mul_ps(a: _mm_set1_ps(w: other.m11()), b: c1); |
| 127 | tmp = _mm_add_ps(a: _mm_mul_ps(a: _mm_set1_ps(w: other.m21()), b: c2), b: tmp); |
| 128 | tmp = _mm_add_ps(a: _mm_mul_ps(a: _mm_set1_ps(w: other.m31()), b: c3), b: tmp); |
| 129 | c.m_col1 = _mm_add_ps(a: _mm_mul_ps(a: _mm_set1_ps(w: other.m41()), b: c4), b: tmp); |
| 130 | |
| 131 | // c21, c22, c23, c24 |
| 132 | tmp = _mm_mul_ps(a: _mm_set1_ps(w: other.m12()), b: c1); |
| 133 | tmp = _mm_add_ps(a: _mm_mul_ps(a: _mm_set1_ps(w: other.m22()), b: c2), b: tmp); |
| 134 | tmp = _mm_add_ps(a: _mm_mul_ps(a: _mm_set1_ps(w: other.m32()), b: c3), b: tmp); |
| 135 | c.m_col2 = _mm_add_ps(a: _mm_mul_ps(a: _mm_set1_ps(w: other.m42()), b: c4), b: tmp); |
| 136 | |
| 137 | // c31, c32, c33, c34 |
| 138 | tmp = _mm_mul_ps(a: _mm_set1_ps(w: other.m13()), b: c1); |
| 139 | tmp = _mm_add_ps(a: _mm_mul_ps(a: _mm_set1_ps(w: other.m23()), b: c2), b: tmp); |
| 140 | tmp = _mm_add_ps(a: _mm_mul_ps(a: _mm_set1_ps(w: other.m33()), b: c3), b: tmp); |
| 141 | c.m_col3 = _mm_add_ps(a: _mm_mul_ps(a: _mm_set1_ps(w: other.m43()), b: c4), b: tmp); |
| 142 | |
| 143 | // c41, c42, c43, c44 |
| 144 | tmp = _mm_mul_ps(a: _mm_set1_ps(w: other.m14()), b: c1); |
| 145 | tmp = _mm_add_ps(a: _mm_mul_ps(a: _mm_set1_ps(w: other.m24()), b: c2), b: tmp); |
| 146 | tmp = _mm_add_ps(a: _mm_mul_ps(a: _mm_set1_ps(w: other.m34()), b: c3), b: tmp); |
| 147 | c.m_col4 = _mm_add_ps(a: _mm_mul_ps(a: _mm_set1_ps(w: other.m44()), b: c4), b: tmp); |
| 148 | |
| 149 | return c; |
| 150 | } |
| 151 | |
| 152 | Q_ALWAYS_INLINE Matrix4x4_SSE operator-(const Matrix4x4_SSE &other) const |
| 153 | { |
| 154 | Matrix4x4_SSE c(Qt::Uninitialized); |
| 155 | |
| 156 | c.m_col1 = _mm_sub_ps(a: m_col1, b: other.m_col1); |
| 157 | c.m_col2 = _mm_sub_ps(a: m_col2, b: other.m_col2); |
| 158 | c.m_col3 = _mm_sub_ps(a: m_col3, b: other.m_col3); |
| 159 | c.m_col4 = _mm_sub_ps(a: m_col4, b: other.m_col4); |
| 160 | |
| 161 | return c; |
| 162 | } |
| 163 | |
| 164 | Q_ALWAYS_INLINE Matrix4x4_SSE operator+(const Matrix4x4_SSE &other) const |
| 165 | { |
| 166 | Matrix4x4_SSE c(Qt::Uninitialized); |
| 167 | |
| 168 | c.m_col1 = _mm_add_ps(a: m_col1, b: other.m_col1); |
| 169 | c.m_col2 = _mm_add_ps(a: m_col2, b: other.m_col2); |
| 170 | c.m_col3 = _mm_add_ps(a: m_col3, b: other.m_col3); |
| 171 | c.m_col4 = _mm_add_ps(a: m_col4, b: other.m_col4); |
| 172 | |
| 173 | return c; |
| 174 | } |
| 175 | |
| 176 | Q_ALWAYS_INLINE Matrix4x4_SSE &operator*=(const Matrix4x4_SSE &other) |
| 177 | { |
| 178 | *this = *this * other; |
| 179 | return *this; |
| 180 | } |
| 181 | |
| 182 | Q_ALWAYS_INLINE Matrix4x4_SSE &operator-=(const Matrix4x4_SSE &other) |
| 183 | { |
| 184 | *this = *this - other; |
| 185 | return *this; |
| 186 | } |
| 187 | |
| 188 | Q_ALWAYS_INLINE Matrix4x4_SSE &operator+=(const Matrix4x4_SSE &other) |
| 189 | { |
| 190 | *this = *this + other; |
| 191 | return *this; |
| 192 | } |
| 193 | |
| 194 | Q_ALWAYS_INLINE Matrix4x4_SSE transposed() const |
| 195 | { |
| 196 | Matrix4x4_SSE c(Qt::Uninitialized); |
| 197 | |
| 198 | // ~113 instructions |
| 199 | // 0b11011101 == 0xdd |
| 200 | // 0b10001000 == 0x88 |
| 201 | const __m128 tmp1 = _mm_shuffle_ps(m_col1, m_col2, 0xdd); |
| 202 | const __m128 tmp2 = _mm_shuffle_ps(m_col1, m_col2, 0x88); |
| 203 | const __m128 tmp3 = _mm_shuffle_ps(m_col3, m_col4, 0xdd); |
| 204 | const __m128 tmp4 = _mm_shuffle_ps(m_col3, m_col4, 0x88); |
| 205 | c.m_col1 = _mm_shuffle_ps(tmp2, tmp4, 0x88); |
| 206 | c.m_col2 = _mm_shuffle_ps(tmp1, tmp3, 0x88); |
| 207 | c.m_col3 = _mm_shuffle_ps(tmp2, tmp4, 0xdd); |
| 208 | c.m_col4 = _mm_shuffle_ps(tmp1, tmp3, 0xdd); |
| 209 | |
| 210 | return c; |
| 211 | } |
| 212 | |
| 213 | Q_ALWAYS_INLINE Matrix4x4_SSE inverted() const |
| 214 | { |
| 215 | // TO DO: Optimize |
| 216 | const QMatrix4x4 mat = toQMatrix4x4(); |
| 217 | return Matrix4x4_SSE(mat.inverted()); |
| 218 | } |
| 219 | |
| 220 | Q_ALWAYS_INLINE bool operator==(const Matrix4x4_SSE &other) const |
| 221 | { |
| 222 | // 0b1111 == 0xf |
| 223 | return (_mm_movemask_ps(a: _mm_cmpeq_ps(a: m_col1, b: other.m_col1)) == 0xf && |
| 224 | _mm_movemask_ps(a: _mm_cmpeq_ps(a: m_col2, b: other.m_col2)) == 0xf && |
| 225 | _mm_movemask_ps(a: _mm_cmpeq_ps(a: m_col3, b: other.m_col3)) == 0xf && |
| 226 | _mm_movemask_ps(a: _mm_cmpeq_ps(a: m_col4, b: other.m_col4)) == 0xf); |
| 227 | } |
| 228 | |
| 229 | Q_ALWAYS_INLINE bool operator!=(const Matrix4x4_SSE &other) const |
| 230 | { |
| 231 | return !(*this == other); |
| 232 | } |
| 233 | |
| 234 | Q_ALWAYS_INLINE float m11() const { return _mm_cvtss_f32(a: m_col1); } |
| 235 | Q_ALWAYS_INLINE float m12() const { return _mm_cvtss_f32(a: m_col2); } |
| 236 | Q_ALWAYS_INLINE float m13() const { return _mm_cvtss_f32(a: m_col3); } |
| 237 | Q_ALWAYS_INLINE float m14() const { return _mm_cvtss_f32(a: m_col4); } |
| 238 | |
| 239 | Q_ALWAYS_INLINE float m21() const |
| 240 | { |
| 241 | // 0b01010101 = 0x55 |
| 242 | return _mm_cvtss_f32(_mm_shuffle_ps(m_col1, m_col1, 0x55)); |
| 243 | } |
| 244 | Q_ALWAYS_INLINE float m22() const |
| 245 | { |
| 246 | // 0b01010101 = 0x55 |
| 247 | return _mm_cvtss_f32(_mm_shuffle_ps(m_col2, m_col2, 0x55)); |
| 248 | } |
| 249 | Q_ALWAYS_INLINE float m23() const |
| 250 | { |
| 251 | // 0b01010101 = 0x55 |
| 252 | return _mm_cvtss_f32(_mm_shuffle_ps(m_col3, m_col3, 0x55)); |
| 253 | } |
| 254 | Q_ALWAYS_INLINE float m24() const |
| 255 | { |
| 256 | // 0b01010101 = 0x55 |
| 257 | return _mm_cvtss_f32(_mm_shuffle_ps(m_col4, m_col4, 0x55)); |
| 258 | } |
| 259 | |
| 260 | Q_ALWAYS_INLINE float m31() const |
| 261 | { |
| 262 | // 0b10101010 = 0xaa |
| 263 | return _mm_cvtss_f32(_mm_shuffle_ps(m_col1, m_col1, 0xaa)); |
| 264 | } |
| 265 | Q_ALWAYS_INLINE float m32() const |
| 266 | { |
| 267 | // 0b10101010 = 0xaa |
| 268 | return _mm_cvtss_f32(_mm_shuffle_ps(m_col2, m_col2, 0xaa)); |
| 269 | } |
| 270 | Q_ALWAYS_INLINE float m33() const |
| 271 | { |
| 272 | // 0b10101010 = 0xaa |
| 273 | return _mm_cvtss_f32(_mm_shuffle_ps(m_col3, m_col3, 0xaa)); |
| 274 | } |
| 275 | Q_ALWAYS_INLINE float m34() const |
| 276 | { |
| 277 | // 0b10101010 = 0xaa |
| 278 | return _mm_cvtss_f32(_mm_shuffle_ps(m_col4, m_col4, 0xaa)); |
| 279 | } |
| 280 | |
| 281 | Q_ALWAYS_INLINE float m41() const |
| 282 | { |
| 283 | // 0b11111111 = 0xff |
| 284 | return _mm_cvtss_f32(_mm_shuffle_ps(m_col1, m_col1, 0xff)); |
| 285 | } |
| 286 | Q_ALWAYS_INLINE float m42() const |
| 287 | { |
| 288 | // 0b11111111 = 0xff |
| 289 | return _mm_cvtss_f32(_mm_shuffle_ps(m_col2, m_col2, 0xff)); |
| 290 | } |
| 291 | Q_ALWAYS_INLINE float m43() const |
| 292 | { |
| 293 | // 0b11111111 = 0xff |
| 294 | return _mm_cvtss_f32(_mm_shuffle_ps(m_col3, m_col3, 0xff)); |
| 295 | } |
| 296 | Q_ALWAYS_INLINE float m44() const |
| 297 | { |
| 298 | // 0b11111111 = 0xff |
| 299 | return _mm_cvtss_f32(_mm_shuffle_ps(m_col4, m_col4, 0xff)); |
| 300 | } |
| 301 | |
| 302 | Q_ALWAYS_INLINE Vector4D row(int index) const |
| 303 | { |
| 304 | switch (index) { |
| 305 | case 0: |
| 306 | return Vector4D(m11(), m12(), m13(), m14()); |
| 307 | case 1: |
| 308 | return Vector4D(m21(), m22(), m23(), m24()); |
| 309 | case 2: |
| 310 | return Vector4D(m31(), m32(), m33(), m34()); |
| 311 | case 3: |
| 312 | return Vector4D(m41(), m42(), m43(), m44()); |
| 313 | default: |
| 314 | Q_UNREACHABLE(); |
| 315 | return Vector4D(); |
| 316 | } |
| 317 | } |
| 318 | |
| 319 | Q_ALWAYS_INLINE Vector4D column(int index) const |
| 320 | { |
| 321 | Vector4D c(Qt::Uninitialized); |
| 322 | switch (index) { |
| 323 | case 0: |
| 324 | c.m_xyzw = m_col1; |
| 325 | break; |
| 326 | case 1: |
| 327 | c.m_xyzw = m_col2; |
| 328 | break; |
| 329 | case 2: |
| 330 | c.m_xyzw = m_col3; |
| 331 | break; |
| 332 | case 3: |
| 333 | c.m_xyzw = m_col4; |
| 334 | break; |
| 335 | default: |
| 336 | Q_UNREACHABLE(); |
| 337 | return Vector4D(); |
| 338 | } |
| 339 | return c; |
| 340 | } |
| 341 | |
| 342 | Q_ALWAYS_INLINE QMatrix4x4 toQMatrix4x4() const { return QMatrix4x4(m11(), m12(), m13(), m14(), |
| 343 | m21(), m22(), m23(), m24(), |
| 344 | m31(), m32(), m33(), m34(), |
| 345 | m41(), m42(), m43(), m44()); } |
| 346 | |
| 347 | Q_ALWAYS_INLINE Vector3D_SSE map(const Vector3D_SSE &point) const |
| 348 | { |
| 349 | return *this * point; |
| 350 | } |
| 351 | |
| 352 | Q_ALWAYS_INLINE Vector4D_SSE map(const Vector4D_SSE &point) const |
| 353 | { |
| 354 | return *this * point; |
| 355 | } |
| 356 | |
| 357 | Q_ALWAYS_INLINE Vector3D_SSE mapVector(const Vector3D_SSE &vector) const |
| 358 | { |
| 359 | const Vector3D_SSE row1(m11(), m12(), m13()); |
| 360 | const Vector3D_SSE row2(m21(), m22(), m23()); |
| 361 | const Vector3D_SSE row3(m31(), m32(), m33()); |
| 362 | |
| 363 | return Vector3D(Vector3D_SSE::dotProduct(a: row1, b: vector), |
| 364 | Vector3D_SSE::dotProduct(a: row2, b: vector), |
| 365 | Vector3D_SSE::dotProduct(a: row3, b: vector)); |
| 366 | } |
| 367 | |
| 368 | friend Q_ALWAYS_INLINE Vector4D operator*(const Vector4D &vector, const Matrix4x4_SSE &matrix); |
| 369 | friend Q_ALWAYS_INLINE Vector4D operator*(const Matrix4x4_SSE &matrix, const Vector4D &vector); |
| 370 | |
| 371 | friend Q_ALWAYS_INLINE Vector3D operator*(const Vector3D &vector, const Matrix4x4_SSE &matrix); |
| 372 | friend Q_ALWAYS_INLINE Vector3D operator*(const Matrix4x4_SSE &matrix, const Vector3D &vector); |
| 373 | |
| 374 | friend Q_3DCORE_PRIVATE_EXPORT QDebug operator<<(QDebug dbg, const Matrix4x4_SSE &m); |
| 375 | |
| 376 | private: |
| 377 | // Internally we will store the matrix as indicated below |
| 378 | // Q_DECL_ALIGN(16) // aligned on 16 bytes boundary for SSE (column major) |
| 379 | // struct |
| 380 | // { |
| 381 | // float m_m11, m_m21, m_m31, m_m41; |
| 382 | // float m_m12, m_m22, m_m32, m_m42; |
| 383 | // float m_m13, m_m23, m_m33, m_m43; |
| 384 | // float m_m14, m_m24, m_m34, m_m44; |
| 385 | // }; |
| 386 | // struct |
| 387 | // { |
| 388 | // float m[16]; |
| 389 | // }; |
| 390 | __m128 m_col1; |
| 391 | __m128 m_col2; |
| 392 | __m128 m_col3; |
| 393 | __m128 m_col4; |
| 394 | }; |
| 395 | |
| 396 | Q_ALWAYS_INLINE Vector4D operator*(const Vector4D &vector, const Matrix4x4_SSE &matrix) |
| 397 | { |
| 398 | const __m128 vCol1 = _mm_mul_ps(a: matrix.m_col1, b: vector.m_xyzw); |
| 399 | const __m128 vCol2 = _mm_mul_ps(a: matrix.m_col2, b: vector.m_xyzw); |
| 400 | const __m128 vCol3 = _mm_mul_ps(a: matrix.m_col3, b: vector.m_xyzw); |
| 401 | const __m128 vCol4 = _mm_mul_ps(a: matrix.m_col4, b: vector.m_xyzw); |
| 402 | |
| 403 | |
| 404 | // 0b01000100 == 0x44 |
| 405 | // 0b11101110 == 0xee |
| 406 | |
| 407 | // vCol1.x, vCol1.y, vCol2.x, vCol2.y |
| 408 | __m128 tmp1 = _mm_shuffle_ps(vCol1, vCol2, 0x44); |
| 409 | // vCol1.z, vCol1.w, vCol2.z, vCol2.w |
| 410 | __m128 tmp2 = _mm_shuffle_ps(vCol1, vCol2, 0xee); |
| 411 | |
| 412 | // vCol1.x + vCol1.z, vCol1.y + vCol1.w, vCol2.x + vCol2.z, vCol2.y + vCol2.w, |
| 413 | const __m128 tmpSum01 = _mm_add_ps(a: tmp1, b: tmp2); |
| 414 | |
| 415 | // vCol3.x, vCol3.y, vCol4.x, vCol4.y |
| 416 | tmp1 = _mm_shuffle_ps(vCol3, vCol4, 0x44); |
| 417 | // vCol3.z, vCol3.w, vCol4.z, vCol4.w |
| 418 | tmp2 = _mm_shuffle_ps(vCol3, vCol4, 0xee); |
| 419 | |
| 420 | // vCol3.x + vCol3.z, vCol3.y + vCol3.w, vCol4.x + vCol4.z, vCol4.y + vCol4.w, |
| 421 | const __m128 tmpSum02 = _mm_add_ps(a: tmp1, b: tmp2); |
| 422 | |
| 423 | // 0b10001000 == 0x88 |
| 424 | // 0b11011101 == 0xdd |
| 425 | |
| 426 | // vCol1.x + vCol1.z, vCol2.x + vCol2.z, vCol3.x + vCol3.z, vCol4.x + vCol4.z, |
| 427 | tmp1 = _mm_shuffle_ps(tmpSum01, tmpSum02, 0x88); |
| 428 | // vCol1.y + vCol1.w, vCol2.y + vCol2.w, vCol3.y + vCol3.w, vCol4.y + vCol4.w, |
| 429 | tmp2 = _mm_shuffle_ps(tmpSum01, tmpSum02, 0xdd); |
| 430 | |
| 431 | Vector4D v(Qt::Uninitialized); |
| 432 | v.m_xyzw = _mm_add_ps(a: tmp1, b: tmp2); |
| 433 | return v; |
| 434 | } |
| 435 | |
| 436 | Q_ALWAYS_INLINE Vector4D operator*(const Matrix4x4_SSE &matrix, const Vector4D &vector) |
| 437 | { |
| 438 | const Matrix4x4_SSE transposed = matrix.transposed(); |
| 439 | return vector * transposed; |
| 440 | } |
| 441 | |
| 442 | Q_ALWAYS_INLINE Vector3D operator*(const Vector3D &vector, const Matrix4x4_SSE &matrix) |
| 443 | { |
| 444 | const __m128 vec4 = _mm_set_ps(z: 1.0f, y: vector.z(), x: vector.y(), w: vector.x()); |
| 445 | |
| 446 | const __m128 vCol1 = _mm_mul_ps(a: matrix.m_col1, b: vec4); |
| 447 | const __m128 vCol2 = _mm_mul_ps(a: matrix.m_col2, b: vec4); |
| 448 | const __m128 vCol3 = _mm_mul_ps(a: matrix.m_col3, b: vec4); |
| 449 | const __m128 vCol4 = _mm_mul_ps(a: matrix.m_col4, b: vec4); |
| 450 | |
| 451 | // 0b01000100 == 0x44 |
| 452 | // 0b11101110 == 0xee |
| 453 | |
| 454 | // vCol1.x, vCol1.y, vCol2.x, vCol2.y |
| 455 | __m128 tmp1 = _mm_shuffle_ps(vCol1, vCol2, 0x44); |
| 456 | // vCol1.z, vCol1.w, vCol2.z, vCol2.w |
| 457 | __m128 tmp2 = _mm_shuffle_ps(vCol1, vCol2, 0xee); |
| 458 | |
| 459 | // vCol1.x + vCol1.z, vCol1.y + vCol1.w, vCol2.x + vCol2.z, vCol2.y + vCol2.w, |
| 460 | const __m128 tmpSum01 = _mm_add_ps(a: tmp1, b: tmp2); |
| 461 | |
| 462 | // vCol3.x, vCol3.y, vCol4.x, vCol4.y |
| 463 | tmp1 = _mm_shuffle_ps(vCol3, vCol4, 0x44); |
| 464 | // vCol3.z, vCol3.w, vCol4.z, vCol4.w |
| 465 | tmp2 = _mm_shuffle_ps(vCol3, vCol4, 0xee); |
| 466 | |
| 467 | // vCol3.x + vCol3.z, vCol3.y + vCol3.w, vCol4.x + vCol4.z, vCol4.y + vCol4.w, |
| 468 | const __m128 tmpSum02 = _mm_add_ps(a: tmp1, b: tmp2); |
| 469 | |
| 470 | // 0b10001000 == 0x88 |
| 471 | // 0b11011101 == 0xdd |
| 472 | |
| 473 | // vCol1.x + vCol1.z, vCol2.x + vCol2.z, vCol3.x + vCol3.z, vCol4.x + vCol4.z, |
| 474 | tmp1 = _mm_shuffle_ps(tmpSum01, tmpSum02, 0x88); |
| 475 | // vCol1.y + vCol1.w, vCol2.y + vCol2.w, vCol3.y + vCol3.w, vCol4.y + vCol4.w, |
| 476 | tmp2 = _mm_shuffle_ps(tmpSum01, tmpSum02, 0xdd); |
| 477 | |
| 478 | const __m128 result = _mm_add_ps(a: tmp1, b: tmp2); |
| 479 | // 0b11111111 = 0xff |
| 480 | const __m128 divisor = _mm_shuffle_ps(result, result, 0xff); |
| 481 | Vector3D v(Qt::Uninitialized); |
| 482 | v.m_xyzw = _mm_div_ps(a: result, b: divisor); |
| 483 | return v; |
| 484 | } |
| 485 | |
| 486 | Q_ALWAYS_INLINE Vector3D operator*(const Matrix4x4_SSE &matrix, const Vector3D &vector) |
| 487 | { |
| 488 | const Matrix4x4_SSE transposed = matrix.transposed(); |
| 489 | return vector * transposed; |
| 490 | } |
| 491 | |
| 492 | } // Qt3DCore |
| 493 | |
| 494 | |
| 495 | Q_DECLARE_TYPEINFO(Qt3DCore::Matrix4x4_SSE, Q_PRIMITIVE_TYPE); |
| 496 | |
| 497 | QT_END_NAMESPACE |
| 498 | |
| 499 | Q_DECLARE_METATYPE(Qt3DCore::Matrix4x4_SSE) |
| 500 | |
| 501 | #endif // QT_COMPILER_SUPPORTS_SSE2 |
| 502 | |
| 503 | #endif // QT3DCORE_MATRIX4X4_SSE_P_H |
| 504 | |