1 | // Copyright (C) 2016 The Qt Company Ltd. |
2 | // SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only |
3 | |
4 | #include "qvideoframeconversionhelper_p.h" |
5 | |
6 | #ifdef QT_COMPILER_SUPPORTS_AVX2 |
7 | |
8 | QT_BEGIN_NAMESPACE |
9 | |
10 | namespace { |
11 | |
12 | template<int a, int r, int g, int b> |
13 | void convert_to_ARGB32_avx2(const QVideoFrame &frame, uchar *output) |
14 | { |
15 | FETCH_INFO_PACKED(frame) |
16 | MERGE_LOOPS(width, height, stride, 4) |
17 | quint32 *argb = reinterpret_cast<quint32*>(output); |
18 | |
19 | #if Q_BYTE_ORDER == Q_LITTLE_ENDIAN |
20 | __m256i shuffleMask = _mm256_set_epi8(b31: 12 + a, b30: 12 + r, b29: 12 + g, b28: 12 + b, |
21 | b27: 8 + a, b26: 8 + r, b25: 8 + g, b24: 8 + b, |
22 | b23: 4 + a, b22: 4 + r, b21: 4 + g, b20: 4 + b, |
23 | b19: 0 + a, b18: 0 + r, b17: 0 + g, b16: 0 + b, |
24 | b15: 12 + a, b14: 12 + r, b13: 12 + g, b12: 12 + b, |
25 | b11: 8 + a, b10: 8 + r, b09: 8 + g, b08: 8 + b, |
26 | b07: 4 + a, b06: 4 + r, b05: 4 + g, b04: 4 + b, |
27 | b03: 0 + a, b02: 0 + r, b01: 0 + g, b00: 0 + b); |
28 | #else |
29 | __m256i shuffleMask = _mm256_set_epi8(15 - a, 15 - r, 15 - g, 15 - b, |
30 | 11 - a, 11 - r, 11 - g, 11 - b, |
31 | 7 - a, 7 - r, 7 - g, 7 - b, |
32 | 3 - a, 3 - r, 3 - g, 3 - b, |
33 | 15 - a, 15 - r, 15 - g, 15 - b, |
34 | 11 - a, 11 - r, 11 - g, 11 - b, |
35 | 7 - a, 7 - r, 7 - g, 7 - b, |
36 | 3 - a, 3 - r, 3 - g, 3 - b); |
37 | #endif |
38 | |
39 | using Pixel = const ArgbPixel<a, r, g, b>; |
40 | |
41 | for (int y = 0; y < height; ++y) { |
42 | auto *pixel = reinterpret_cast<const Pixel *>(src); |
43 | |
44 | int x = 0; |
45 | ALIGN(32, argb, x, width) { |
46 | *argb = pixel->convert(); |
47 | ++pixel; |
48 | ++argb; |
49 | } |
50 | |
51 | for (; x < width - 15; x += 16) { |
52 | __m256i pixelData = _mm256_loadu_si256(p: reinterpret_cast<const __m256i*>(pixel)); |
53 | __m256i pixelData2 = _mm256_loadu_si256(p: reinterpret_cast<const __m256i*>(pixel + 8)); |
54 | pixel += 16; |
55 | pixelData = _mm256_shuffle_epi8(a: pixelData, b: shuffleMask); |
56 | pixelData2 = _mm256_shuffle_epi8(a: pixelData2, b: shuffleMask); |
57 | _mm256_store_si256(p: reinterpret_cast<__m256i*>(argb), a: pixelData); |
58 | _mm256_store_si256(p: reinterpret_cast<__m256i*>(argb + 8), a: pixelData2); |
59 | argb += 16; |
60 | } |
61 | |
62 | // leftovers |
63 | for (; x < width; ++x) { |
64 | *argb = pixel->convert(); |
65 | ++pixel; |
66 | ++argb; |
67 | } |
68 | |
69 | src += stride; |
70 | } |
71 | } |
72 | |
73 | } |
74 | |
75 | |
// AVX2 conversion of ARGB8888 frames to ARGB32.
// Template arguments are the source-pixel byte offsets <a, r, g, b>
// expected by convert_to_ARGB32_avx2: A=0, R=1, G=2, B=3.
void QT_FASTCALL qt_convert_ARGB8888_to_ARGB32_avx2(const QVideoFrame &frame, uchar *output)
{
    convert_to_ARGB32_avx2<0, 1, 2, 3>(frame, output);
}
80 | |
// AVX2 conversion of ABGR8888 frames to ARGB32.
// Source-pixel byte offsets <a, r, g, b>: A=0, R=3, G=2, B=1.
void QT_FASTCALL qt_convert_ABGR8888_to_ARGB32_avx2(const QVideoFrame &frame, uchar *output)
{
    convert_to_ARGB32_avx2<0, 3, 2, 1>(frame, output);
}
85 | |
// AVX2 conversion of RGBA8888 frames to ARGB32.
// Source-pixel byte offsets <a, r, g, b>: A=3, R=0, G=1, B=2.
void QT_FASTCALL qt_convert_RGBA8888_to_ARGB32_avx2(const QVideoFrame &frame, uchar *output)
{
    convert_to_ARGB32_avx2<3, 0, 1, 2>(frame, output);
}
90 | |
// AVX2 conversion of BGRA8888 frames to ARGB32.
// Source-pixel byte offsets <a, r, g, b>: A=3, R=2, G=1, B=0.
void QT_FASTCALL qt_convert_BGRA8888_to_ARGB32_avx2(const QVideoFrame &frame, uchar *output)
{
    convert_to_ARGB32_avx2<3, 2, 1, 0>(frame, output);
}
95 | |
96 | void QT_FASTCALL qt_copy_pixels_with_mask_avx2(uint32_t *dst, const uint32_t *src, size_t size, uint32_t mask) |
97 | { |
98 | const auto mask256 = _mm256_set_epi32(i0: mask, i1: mask, i2: mask, i3: mask, i4: mask, i5: mask, i6: mask, i7: mask); |
99 | |
100 | size_t x = 0; |
101 | |
102 | ALIGN(32, dst, x, size) |
103 | *(dst++) = *(src++) | mask; |
104 | |
105 | for (; x < size - (8 * 4 + 1); x += 8 * 4) { |
106 | const auto srcData1 = _mm256_loadu_si256(p: reinterpret_cast<const __m256i *>(src)); |
107 | const auto srcData2 = _mm256_loadu_si256(p: reinterpret_cast<const __m256i *>(src += 8)); |
108 | const auto srcData3 = _mm256_loadu_si256(p: reinterpret_cast<const __m256i *>(src += 8)); |
109 | const auto srcData4 = _mm256_loadu_si256(p: reinterpret_cast<const __m256i *>(src += 8)); |
110 | |
111 | _mm256_store_si256(p: reinterpret_cast<__m256i *>(dst), a: _mm256_or_si256(a: srcData1, b: mask256)); |
112 | _mm256_store_si256(p: reinterpret_cast<__m256i *>(dst += 8), a: _mm256_or_si256(a: srcData2, b: mask256)); |
113 | _mm256_store_si256(p: reinterpret_cast<__m256i *>(dst += 8), a: _mm256_or_si256(a: srcData3, b: mask256)); |
114 | _mm256_store_si256(p: reinterpret_cast<__m256i *>(dst += 8), a: _mm256_or_si256(a: srcData4, b: mask256)); |
115 | |
116 | src += 8; |
117 | dst += 8; |
118 | } |
119 | |
120 | // leftovers |
121 | for (; x < size - 7; x += 8) { |
122 | const auto srcData = _mm256_loadu_si256(p: reinterpret_cast<const __m256i *>(src)); |
123 | _mm256_store_si256(p: reinterpret_cast<__m256i *>(dst), a: _mm256_or_si256(a: srcData, b: mask256)); |
124 | |
125 | src += 8; |
126 | dst += 8; |
127 | } |
128 | |
129 | for (; x < size; ++x) |
130 | *(dst++) = *(src++) | mask; |
131 | } |
132 | |
133 | QT_END_NAMESPACE |
134 | |
135 | #endif |
136 | |