1 | // Copyright (C) 2016 The Qt Company Ltd. |
2 | // SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only |
3 | |
4 | #include "qvideoframeconversionhelper_p.h" |
5 | |
6 | #ifdef QT_COMPILER_SUPPORTS_SSE2 |
7 | |
8 | QT_BEGIN_NAMESPACE |
9 | |
10 | namespace { |
11 | |
12 | template<int a, int r, int b, int g> |
13 | void convert_to_ARGB32_sse2(const QVideoFrame &frame, uchar *output) |
14 | { |
15 | FETCH_INFO_PACKED(frame) |
16 | MERGE_LOOPS(width, height, stride, 4) |
17 | quint32 *argb = reinterpret_cast<quint32*>(output); |
18 | |
19 | const __m128i zero = _mm_setzero_si128(); |
20 | #if Q_BYTE_ORDER == Q_LITTLE_ENDIAN |
21 | const uchar shuffle = _MM_SHUFFLE(a, r, b, g); |
22 | #else |
23 | const uchar shuffle = _MM_SHUFFLE(3-a, 3-r, 3-b, 3-g); |
24 | #endif |
25 | |
26 | using Pixel = const ArgbPixel<a, r, g, b>; |
27 | |
28 | for (int y = 0; y < height; ++y) { |
29 | auto *pixel = reinterpret_cast<const Pixel *>(src); |
30 | |
31 | int x = 0; |
32 | ALIGN(16, argb, x, width) { |
33 | *argb = pixel->convert(); |
34 | ++pixel; |
35 | ++argb; |
36 | } |
37 | |
38 | for (; x < width - 3; x += 4) { |
39 | __m128i pixelData = _mm_loadu_si128(p: reinterpret_cast<const __m128i*>(pixel)); |
40 | pixel += 4; |
41 | __m128i lowPixels = _mm_unpacklo_epi8(a: pixelData, b: zero); |
42 | __m128i highPixels = _mm_unpackhi_epi8(a: pixelData, b: zero); |
43 | lowPixels = _mm_shufflelo_epi16(_mm_shufflehi_epi16(lowPixels, shuffle), shuffle); |
44 | highPixels = _mm_shufflelo_epi16(_mm_shufflehi_epi16(highPixels, shuffle), shuffle); |
45 | pixelData = _mm_packus_epi16(a: lowPixels, b: highPixels); |
46 | _mm_store_si128(p: reinterpret_cast<__m128i*>(argb), b: pixelData); |
47 | argb += 4; |
48 | } |
49 | |
50 | // leftovers |
51 | for (; x < width; ++x) { |
52 | *argb = pixel->convert(); |
53 | ++pixel; |
54 | ++argb; |
55 | } |
56 | |
57 | src += stride; |
58 | } |
59 | } |
60 | |
61 | } |
62 | |
63 | void QT_FASTCALL qt_convert_ARGB8888_to_ARGB32_sse2(const QVideoFrame &frame, uchar *output) |
64 | { |
65 | convert_to_ARGB32_sse2<0, 1, 2, 3>(frame, output); |
66 | } |
67 | |
68 | void QT_FASTCALL qt_convert_ABGR8888_to_ARGB32_sse2(const QVideoFrame &frame, uchar *output) |
69 | { |
70 | convert_to_ARGB32_sse2<0, 3, 2, 1>(frame, output); |
71 | } |
72 | |
73 | void QT_FASTCALL qt_convert_RGBA8888_to_ARGB32_sse2(const QVideoFrame &frame, uchar *output) |
74 | { |
75 | convert_to_ARGB32_sse2<3, 0, 1, 2>(frame, output); |
76 | } |
77 | |
78 | void QT_FASTCALL qt_convert_BGRA8888_to_ARGB32_sse2(const QVideoFrame &frame, uchar *output) |
79 | { |
80 | convert_to_ARGB32_sse2<3, 2, 1, 0>(frame, output); |
81 | } |
82 | |
83 | void QT_FASTCALL qt_copy_pixels_with_mask_sse2(uint32_t *dst, const uint32_t *src, size_t size, uint32_t mask) |
84 | { |
85 | const auto mask128 = _mm_set_epi32(i3: mask, i2: mask, i1: mask, i0: mask); |
86 | |
87 | size_t x = 0; |
88 | |
89 | ALIGN(16, dst, x, size) |
90 | *(dst++) = *(src++) | mask; |
91 | |
92 | for (; x < size - (4 * 4 - 1); x += 4 * 4) { |
93 | const auto srcData0 = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(src)); |
94 | const auto srcData1 = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(src += 4)); |
95 | const auto srcData2 = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(src += 4)); |
96 | const auto srcData3 = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(src += 4)); |
97 | |
98 | _mm_store_si128(p: reinterpret_cast<__m128i *>(dst), b: _mm_or_si128(a: srcData0, b: mask128)); |
99 | _mm_store_si128(p: reinterpret_cast<__m128i *>(dst += 4), b: _mm_or_si128(a: srcData1, b: mask128)); |
100 | _mm_store_si128(p: reinterpret_cast<__m128i *>(dst += 4), b: _mm_or_si128(a: srcData2, b: mask128)); |
101 | _mm_store_si128(p: reinterpret_cast<__m128i *>(dst += 4), b: _mm_or_si128(a: srcData3, b: mask128)); |
102 | |
103 | src += 4; |
104 | dst += 4; |
105 | } |
106 | |
107 | for (; x < size - 3; x += 4) { |
108 | const auto srcData = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(src)); |
109 | |
110 | _mm_store_si128(p: reinterpret_cast<__m128i *>(dst), b: _mm_or_si128(a: srcData, b: mask128)); |
111 | |
112 | src += 4; |
113 | dst += 4; |
114 | } |
115 | |
116 | // leftovers |
117 | for (; x < size; ++x) |
118 | *(dst++) = *(src++) | mask; |
119 | } |
120 | |
121 | QT_END_NAMESPACE |
122 | |
123 | #endif |
124 | |