/****************************************************************************
**
** Copyright (C) 2016 Paul Lemire <paul.lemire350@gmail.com>
** Contact: https://www.qt.io/licensing/
**
** This file is part of the Qt3D module of the Qt Toolkit.
**
** $QT_BEGIN_LICENSE:LGPL$
** Commercial License Usage
** Licensees holding valid commercial Qt licenses may use this file in
** accordance with the commercial license agreement provided with the
** Software or, alternatively, in accordance with the terms contained in
** a written agreement between you and The Qt Company. For licensing terms
** and conditions see https://www.qt.io/terms-conditions. For further
** information use the contact form at https://www.qt.io/contact-us.
**
** GNU Lesser General Public License Usage
** Alternatively, this file may be used under the terms of the GNU Lesser
** General Public License version 3 as published by the Free Software
** Foundation and appearing in the file LICENSE.LGPL3 included in the
** packaging of this file. Please review the following information to
** ensure the GNU Lesser General Public License version 3 requirements
** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
**
** GNU General Public License Usage
** Alternatively, this file may be used under the terms of the GNU
** General Public License version 2.0 or (at your option) the GNU General
** Public license version 3 or any later version approved by the KDE Free
** Qt Foundation. The licenses are as published by the Free Software
** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
** included in the packaging of this file. Please review the following
** information to ensure the GNU General Public License requirements will
** be met: https://www.gnu.org/licenses/gpl-2.0.html and
** https://www.gnu.org/licenses/gpl-3.0.html.
**
** $QT_END_LICENSE$
**
****************************************************************************/

39 | |

40 | #ifndef QT3DCORE_MATRIX4X4_SSE_P_H |

41 | #define QT3DCORE_MATRIX4X4_SSE_P_H |

42 | |

43 | // |

44 | // W A R N I N G |

45 | // ------------- |

46 | // |

47 | // This file is not part of the Qt3D API. It exists purely as an |

48 | // implementation detail. This header file may change from version to |

49 | // version without notice, or even be removed. |

50 | // |

51 | // We mean it. |

52 | // |

53 | |

54 | #include <Qt3DCore/private/vector4d_p.h> |

55 | #include <Qt3DCore/private/vector3d_p.h> |

56 | #include <private/qsimd_p.h> |

57 | #include <QMatrix4x4> |

58 | |

59 | #ifdef QT_COMPILER_SUPPORTS_SSE2 |

60 | |

61 | QT_BEGIN_NAMESPACE |

62 | |

63 | namespace Qt3DCore { |

64 | |

// SSE2-accelerated 4x4 float matrix. The sixteen floats are held in four
// __m128 registers, one per *column* — the same column-major layout that
// QMatrix4x4::constData() exposes — so conversion in both directions is a
// straight load/store.
class Matrix4x4_SSE
{
public:

    // Default construction yields the identity matrix.
    Q_ALWAYS_INLINE Matrix4x4_SSE() { setToIdentity(); }
    // Leaves all four columns uninitialized; the caller must fill them in.
    explicit Q_ALWAYS_INLINE Matrix4x4_SSE(Qt::Initialization) {}

    // QMatrix4x4::constData returns in column major order
    explicit Q_ALWAYS_INLINE Matrix4x4_SSE(const QMatrix4x4 &mat)
    {
        // data may not be properly aligned, using unaligned loads
        const float *data = mat.constData();
        m_col1 = _mm_loadu_ps(data);
        m_col2 = _mm_loadu_ps(data + 4);
        m_col3 = _mm_loadu_ps(data + 8);
        m_col4 = _mm_loadu_ps(data + 12);
    }

    // Assumes data is 16 bytes aligned (and in column major order);
    // _mm_load_ps faults on unaligned input.
    explicit Q_ALWAYS_INLINE Matrix4x4_SSE(float *data)
    {
        m_col1 = _mm_load_ps(data);
        m_col2 = _mm_load_ps(data + 4);
        m_col3 = _mm_load_ps(data + 8);
        m_col4 = _mm_load_ps(data + 12);
    }

    // In (row major) but we store in column major order.
    // _mm_set_ps takes its arguments highest-lane first, hence the reversed
    // argument order: each register ends up holding one column (mij = row i,
    // column j).
    explicit Q_ALWAYS_INLINE Matrix4x4_SSE(float m11, float m12, float m13, float m14,
                                           float m21, float m22, float m23, float m24,
                                           float m31, float m32, float m33, float m34,
                                           float m41, float m42, float m43, float m44)
    {
        m_col1 = _mm_set_ps(m41, m31, m21, m11);
        m_col2 = _mm_set_ps(m42, m32, m22, m12);
        m_col3 = _mm_set_ps(m43, m33, m23, m13);
        m_col4 = _mm_set_ps(m44, m34, m24, m14);
    }

    // Resets to the identity matrix.
    Q_ALWAYS_INLINE void setToIdentity()
    {
        // _mm_set_ss yields (1, 0, 0, 0): lowest lane set, the rest zeroed.
        m_col1 = _mm_set_ss(1.0f);
        m_col2 = _mm_set_ps(0.0f, 0.0f, 1.0f, 0.0f);
        m_col3 = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f);
        m_col4 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
    }

    // Matrix product (*this) * other. Each result column j is the linear
    // combination of this matrix's columns weighted by the entries of
    // other's column j: col_j(result) = sum_k col_k(this) * other.m(k,j).
    Q_ALWAYS_INLINE Matrix4x4_SSE operator*(const Matrix4x4_SSE &other) const
    {
        Matrix4x4_SSE c(Qt::Uninitialized);

        const __m128 c1 = m_col1;
        const __m128 c2 = m_col2;
        const __m128 c3 = m_col3;
        const __m128 c4 = m_col4;

        // Result column 1: (c11, c21, c31, c41)
        // Built up step by step:
        // 1) c1 * n11
        // 2) c1 * n11 + c2 * n21
        // 3) c1 * n11 + c2 * n21 + c3 * n31
        // 4) c1 * n11 + c2 * n21 + c3 * n31 + c4 * n41
        // where nij = other.m(i,j) broadcast across all four lanes.
        __m128 tmp = _mm_mul_ps(_mm_set1_ps(other.m11()), c1);
        tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m21()), c2), tmp);
        tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m31()), c3), tmp);
        c.m_col1 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m41()), c4), tmp);

        // Result column 2: (c12, c22, c32, c42)
        tmp = _mm_mul_ps(_mm_set1_ps(other.m12()), c1);
        tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m22()), c2), tmp);
        tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m32()), c3), tmp);
        c.m_col2 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m42()), c4), tmp);

        // Result column 3: (c13, c23, c33, c43)
        tmp = _mm_mul_ps(_mm_set1_ps(other.m13()), c1);
        tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m23()), c2), tmp);
        tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m33()), c3), tmp);
        c.m_col3 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m43()), c4), tmp);

        // Result column 4: (c14, c24, c34, c44)
        tmp = _mm_mul_ps(_mm_set1_ps(other.m14()), c1);
        tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m24()), c2), tmp);
        tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m34()), c3), tmp);
        c.m_col4 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m44()), c4), tmp);

        return c;
    }

    // Component-wise matrix subtraction.
    Q_ALWAYS_INLINE Matrix4x4_SSE operator-(const Matrix4x4_SSE &other) const
    {
        Matrix4x4_SSE c(Qt::Uninitialized);

        c.m_col1 = _mm_sub_ps(m_col1, other.m_col1);
        c.m_col2 = _mm_sub_ps(m_col2, other.m_col2);
        c.m_col3 = _mm_sub_ps(m_col3, other.m_col3);
        c.m_col4 = _mm_sub_ps(m_col4, other.m_col4);

        return c;
    }

    // Component-wise matrix addition.
    Q_ALWAYS_INLINE Matrix4x4_SSE operator+(const Matrix4x4_SSE &other) const
    {
        Matrix4x4_SSE c(Qt::Uninitialized);

        c.m_col1 = _mm_add_ps(m_col1, other.m_col1);
        c.m_col2 = _mm_add_ps(m_col2, other.m_col2);
        c.m_col3 = _mm_add_ps(m_col3, other.m_col3);
        c.m_col4 = _mm_add_ps(m_col4, other.m_col4);

        return c;
    }

    Q_ALWAYS_INLINE Matrix4x4_SSE &operator*=(const Matrix4x4_SSE &other)
    {
        *this = *this * other;
        return *this;
    }

    Q_ALWAYS_INLINE Matrix4x4_SSE &operator-=(const Matrix4x4_SSE &other)
    {
        *this = *this - other;
        return *this;
    }

    Q_ALWAYS_INLINE Matrix4x4_SSE &operator+=(const Matrix4x4_SSE &other)
    {
        *this = *this + other;
        return *this;
    }

    // Returns the transpose, computed with the classic 8-shuffle 4x4
    // transpose (pair up halves of adjacent columns, then interleave).
    Q_ALWAYS_INLINE Matrix4x4_SSE transposed() const
    {
        Matrix4x4_SSE c(Qt::Uninitialized);

        // ~113 instructions
        // 0b11011101 == 0xdd -> (a.y, a.w, b.y, b.w)
        // 0b10001000 == 0x88 -> (a.x, a.z, b.x, b.z)
        const __m128 tmp1 = _mm_shuffle_ps(m_col1, m_col2, 0xdd);
        const __m128 tmp2 = _mm_shuffle_ps(m_col1, m_col2, 0x88);
        const __m128 tmp3 = _mm_shuffle_ps(m_col3, m_col4, 0xdd);
        const __m128 tmp4 = _mm_shuffle_ps(m_col3, m_col4, 0x88);
        c.m_col1 = _mm_shuffle_ps(tmp2, tmp4, 0x88);
        c.m_col2 = _mm_shuffle_ps(tmp1, tmp3, 0x88);
        c.m_col3 = _mm_shuffle_ps(tmp2, tmp4, 0xdd);
        c.m_col4 = _mm_shuffle_ps(tmp1, tmp3, 0xdd);

        return c;
    }

    // Returns the inverse by round-tripping through QMatrix4x4 (scalar path).
    Q_ALWAYS_INLINE Matrix4x4_SSE inverted() const
    {
        // TO DO: Optimize
        const QMatrix4x4 mat = toQMatrix4x4();
        return Matrix4x4_SSE(mat.inverted());
    }

    // Exact (bitwise float) equality of all 16 elements. Note: any NaN lane
    // compares unequal, as _mm_cmpeq_ps is false for NaN.
    Q_ALWAYS_INLINE bool operator==(const Matrix4x4_SSE &other) const
    {
        // 0b1111 == 0xf: all four lanes of a column must match
        return (_mm_movemask_ps(_mm_cmpeq_ps(m_col1, other.m_col1)) == 0xf &&
                _mm_movemask_ps(_mm_cmpeq_ps(m_col2, other.m_col2)) == 0xf &&
                _mm_movemask_ps(_mm_cmpeq_ps(m_col3, other.m_col3)) == 0xf &&
                _mm_movemask_ps(_mm_cmpeq_ps(m_col4, other.m_col4)) == 0xf);
    }

    Q_ALWAYS_INLINE bool operator!=(const Matrix4x4_SSE &other) const
    {
        return !(*this == other);
    }

    // Element accessors, mij = row i of column j. Row 1 is the lowest lane
    // of each column register, extracted directly by _mm_cvtss_f32.
    Q_ALWAYS_INLINE float m11() const { return _mm_cvtss_f32(m_col1); }
    Q_ALWAYS_INLINE float m12() const { return _mm_cvtss_f32(m_col2); }
    Q_ALWAYS_INLINE float m13() const { return _mm_cvtss_f32(m_col3); }
    Q_ALWAYS_INLINE float m14() const { return _mm_cvtss_f32(m_col4); }

    // Rows 2-4 first broadcast the wanted lane into the lowest position.
    Q_ALWAYS_INLINE float m21() const
    {
        // 0b01010101 = 0x55: select lane 1 (second row)
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col1, m_col1, 0x55));
    }
    Q_ALWAYS_INLINE float m22() const
    {
        // 0b01010101 = 0x55
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col2, m_col2, 0x55));
    }
    Q_ALWAYS_INLINE float m23() const
    {
        // 0b01010101 = 0x55
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col3, m_col3, 0x55));
    }
    Q_ALWAYS_INLINE float m24() const
    {
        // 0b01010101 = 0x55
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col4, m_col4, 0x55));
    }

    Q_ALWAYS_INLINE float m31() const
    {
        // 0b10101010 = 0xaa: select lane 2 (third row)
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col1, m_col1, 0xaa));
    }
    Q_ALWAYS_INLINE float m32() const
    {
        // 0b10101010 = 0xaa
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col2, m_col2, 0xaa));
    }
    Q_ALWAYS_INLINE float m33() const
    {
        // 0b10101010 = 0xaa
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col3, m_col3, 0xaa));
    }
    Q_ALWAYS_INLINE float m34() const
    {
        // 0b10101010 = 0xaa
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col4, m_col4, 0xaa));
    }

    Q_ALWAYS_INLINE float m41() const
    {
        // 0b11111111 = 0xff: select lane 3 (fourth row)
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col1, m_col1, 0xff));
    }
    Q_ALWAYS_INLINE float m42() const
    {
        // 0b11111111 = 0xff
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col2, m_col2, 0xff));
    }
    Q_ALWAYS_INLINE float m43() const
    {
        // 0b11111111 = 0xff
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col3, m_col3, 0xff));
    }
    Q_ALWAYS_INLINE float m44() const
    {
        // 0b11111111 = 0xff
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col4, m_col4, 0xff));
    }

    // Returns row 'index' (0-3) by gathering one element from each column.
    Q_ALWAYS_INLINE Vector4D row(int index) const
    {
        switch (index) {
        case 0:
            return Vector4D(m11(), m12(), m13(), m14());
        case 1:
            return Vector4D(m21(), m22(), m23(), m24());
        case 2:
            return Vector4D(m31(), m32(), m33(), m34());
        case 3:
            return Vector4D(m41(), m42(), m43(), m44());
        default:
            Q_UNREACHABLE();
            return Vector4D();
        }
    }

    // Returns column 'index' (0-3) as a direct register copy (no shuffles).
    Q_ALWAYS_INLINE Vector4D column(int index) const
    {
        Vector4D c(Qt::Uninitialized);
        switch (index) {
        case 0:
            c.m_xyzw = m_col1;
            break;
        case 1:
            c.m_xyzw = m_col2;
            break;
        case 2:
            c.m_xyzw = m_col3;
            break;
        case 3:
            c.m_xyzw = m_col4;
            break;
        default:
            Q_UNREACHABLE();
            return Vector4D();
        }
        return c;
    }

    // Converts back to QMatrix4x4 (whose constructor takes row-major input).
    Q_ALWAYS_INLINE QMatrix4x4 toQMatrix4x4() const { return QMatrix4x4(m11(), m12(), m13(), m14(),
                                                                        m21(), m22(), m23(), m24(),
                                                                        m31(), m32(), m33(), m34(),
                                                                        m41(), m42(), m43(), m44()); }

    // Transforms a point: full 4x4 transform (translation included), with
    // the division by w performed by the Vector3D operator* below.
    Q_ALWAYS_INLINE Vector3D_SSE map(const Vector3D_SSE &point) const
    {
        return *this * point;
    }

    Q_ALWAYS_INLINE Vector4D_SSE map(const Vector4D_SSE &point) const
    {
        return *this * point;
    }

    // Transforms a direction: applies only the upper-left 3x3 part, so
    // translation (and the fourth row/column) is ignored.
    Q_ALWAYS_INLINE Vector3D_SSE mapVector(const Vector3D_SSE &vector) const
    {
        const Vector3D_SSE row1(m11(), m12(), m13());
        const Vector3D_SSE row2(m21(), m22(), m23());
        const Vector3D_SSE row3(m31(), m32(), m33());

        return Vector3D(Vector3D_SSE::dotProduct(row1, vector),
                        Vector3D_SSE::dotProduct(row2, vector),
                        Vector3D_SSE::dotProduct(row3, vector));
    }

    friend Q_ALWAYS_INLINE Vector4D operator*(const Vector4D &vector, const Matrix4x4_SSE &matrix);
    friend Q_ALWAYS_INLINE Vector4D operator*(const Matrix4x4_SSE &matrix, const Vector4D &vector);

    friend Q_ALWAYS_INLINE Vector3D operator*(const Vector3D &vector, const Matrix4x4_SSE &matrix);
    friend Q_ALWAYS_INLINE Vector3D operator*(const Matrix4x4_SSE &matrix, const Vector3D &vector);

    friend Q_3DCORE_PRIVATE_EXPORT QDebug operator<<(QDebug dbg, const Matrix4x4_SSE &m);

private:
    // Internally we will store the matrix as indicated below
    // Q_DECL_ALIGN(16) // aligned on 16 bytes boundary for SSE (column major)
    // struct
    // {
    //     float m_m11, m_m21, m_m31, m_m41;
    //     float m_m12, m_m22, m_m32, m_m42;
    //     float m_m13, m_m23, m_m33, m_m43;
    //     float m_m14, m_m24, m_m34, m_m44;
    // };
    // struct
    // {
    //     float m[16];
    // };
    __m128 m_col1;
    __m128 m_col2;
    __m128 m_col3;
    __m128 m_col4;
};

395 | |

396 | Q_ALWAYS_INLINE Vector4D operator*(const Vector4D &vector, const Matrix4x4_SSE &matrix) |

397 | { |

398 | const __m128 vCol1 = _mm_mul_ps(matrix.m_col1, vector.m_xyzw); |

399 | const __m128 vCol2 = _mm_mul_ps(matrix.m_col2, vector.m_xyzw); |

400 | const __m128 vCol3 = _mm_mul_ps(matrix.m_col3, vector.m_xyzw); |

401 | const __m128 vCol4 = _mm_mul_ps(matrix.m_col4, vector.m_xyzw); |

402 | |

403 | |

404 | // 0b01000100 == 0x44 |

405 | // 0b11101110 == 0xee |

406 | |

407 | // vCol1.x, vCol1.y, vCol2.x, vCol2.y |

408 | __m128 tmp1 = _mm_shuffle_ps(vCol1, vCol2, 0x44); |

409 | // vCol1.z, vCol1.w, vCol2.z, vCol2.w |

410 | __m128 tmp2 = _mm_shuffle_ps(vCol1, vCol2, 0xee); |

411 | |

412 | // vCol1.x + vCol1.z, vCol1.y + vCol1.w, vCol2.x + vCol2.z, vCol2.y + vCol2.w, |

413 | const __m128 tmpSum01 = _mm_add_ps(tmp1, tmp2); |

414 | |

415 | // vCol3.x, vCol3.y, vCol4.x, vCol4.y |

416 | tmp1 = _mm_shuffle_ps(vCol3, vCol4, 0x44); |

417 | // vCol3.z, vCol3.w, vCol4.z, vCol4.w |

418 | tmp2 = _mm_shuffle_ps(vCol3, vCol4, 0xee); |

419 | |

420 | // vCol3.x + vCol3.z, vCol3.y + vCol3.w, vCol4.x + vCol4.z, vCol4.y + vCol4.w, |

421 | const __m128 tmpSum02 = _mm_add_ps(tmp1, tmp2); |

422 | |

423 | // 0b10001000 == 0x88 |

424 | // 0b11011101 == 0xdd |

425 | |

426 | // vCol1.x + vCol1.z, vCol2.x + vCol2.z, vCol3.x + vCol3.z, vCol4.x + vCol4.z, |

427 | tmp1 = _mm_shuffle_ps(tmpSum01, tmpSum02, 0x88); |

428 | // vCol1.y + vCol1.w, vCol2.y + vCol2.w, vCol3.y + vCol3.w, vCol4.y + vCol4.w, |

429 | tmp2 = _mm_shuffle_ps(tmpSum01, tmpSum02, 0xdd); |

430 | |

431 | Vector4D v(Qt::Uninitialized); |

432 | v.m_xyzw = _mm_add_ps(tmp1, tmp2); |

433 | return v; |

434 | } |

435 | |

436 | Q_ALWAYS_INLINE Vector4D operator*(const Matrix4x4_SSE &matrix, const Vector4D &vector) |

437 | { |

438 | const Matrix4x4_SSE transposed = matrix.transposed(); |

439 | return vector * transposed; |

440 | } |

441 | |

442 | Q_ALWAYS_INLINE Vector3D operator*(const Vector3D &vector, const Matrix4x4_SSE &matrix) |

443 | { |

444 | const __m128 vec4 = _mm_set_ps(1.0f, vector.z(), vector.y(), vector.x()); |

445 | |

446 | const __m128 vCol1 = _mm_mul_ps(matrix.m_col1, vec4); |

447 | const __m128 vCol2 = _mm_mul_ps(matrix.m_col2, vec4); |

448 | const __m128 vCol3 = _mm_mul_ps(matrix.m_col3, vec4); |

449 | const __m128 vCol4 = _mm_mul_ps(matrix.m_col4, vec4); |

450 | |

451 | // 0b01000100 == 0x44 |

452 | // 0b11101110 == 0xee |

453 | |

454 | // vCol1.x, vCol1.y, vCol2.x, vCol2.y |

455 | __m128 tmp1 = _mm_shuffle_ps(vCol1, vCol2, 0x44); |

456 | // vCol1.z, vCol1.w, vCol2.z, vCol2.w |

457 | __m128 tmp2 = _mm_shuffle_ps(vCol1, vCol2, 0xee); |

458 | |

459 | // vCol1.x + vCol1.z, vCol1.y + vCol1.w, vCol2.x + vCol2.z, vCol2.y + vCol2.w, |

460 | const __m128 tmpSum01 = _mm_add_ps(tmp1, tmp2); |

461 | |

462 | // vCol3.x, vCol3.y, vCol4.x, vCol4.y |

463 | tmp1 = _mm_shuffle_ps(vCol3, vCol4, 0x44); |

464 | // vCol3.z, vCol3.w, vCol4.z, vCol4.w |

465 | tmp2 = _mm_shuffle_ps(vCol3, vCol4, 0xee); |

466 | |

467 | // vCol3.x + vCol3.z, vCol3.y + vCol3.w, vCol4.x + vCol4.z, vCol4.y + vCol4.w, |

468 | const __m128 tmpSum02 = _mm_add_ps(tmp1, tmp2); |

469 | |

470 | // 0b10001000 == 0x88 |

471 | // 0b11011101 == 0xdd |

472 | |

473 | // vCol1.x + vCol1.z, vCol2.x + vCol2.z, vCol3.x + vCol3.z, vCol4.x + vCol4.z, |

474 | tmp1 = _mm_shuffle_ps(tmpSum01, tmpSum02, 0x88); |

475 | // vCol1.y + vCol1.w, vCol2.y + vCol2.w, vCol3.y + vCol3.w, vCol4.y + vCol4.w, |

476 | tmp2 = _mm_shuffle_ps(tmpSum01, tmpSum02, 0xdd); |

477 | |

478 | const __m128 result = _mm_add_ps(tmp1, tmp2); |

479 | // 0b11111111 = 0xff |

480 | const __m128 divisor = _mm_shuffle_ps(result, result, 0xff); |

481 | Vector3D v(Qt::Uninitialized); |

482 | v.m_xyzw = _mm_div_ps(result, divisor); |

483 | return v; |

484 | } |

485 | |

486 | Q_ALWAYS_INLINE Vector3D operator*(const Matrix4x4_SSE &matrix, const Vector3D &vector) |

487 | { |

488 | const Matrix4x4_SSE transposed = matrix.transposed(); |

489 | return vector * transposed; |

490 | } |

491 | |

492 | } // Qt3DCore |

493 | |

494 | |

495 | Q_DECLARE_TYPEINFO(Qt3DCore::Matrix4x4_SSE, Q_PRIMITIVE_TYPE); |

496 | |

497 | QT_END_NAMESPACE |

498 | |

499 | Q_DECLARE_METATYPE(Qt3DCore::Matrix4x4_SSE) |

500 | |

501 | #endif // QT_COMPILER_SUPPORTS_SSE2 |

502 | |

503 | #endif // QT3DCORE_MATRIX4X4_SSE_P_H |

504 |