| 1 | // Copyright (C) 2016 The Qt Company Ltd. |
| 2 | // SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only |
| 3 | |
| 4 | #include <qimage.h> |
| 5 | #include <private/qimage_p.h> |
| 6 | #include <private/qsimd_p.h> |
| 7 | |
| 8 | #ifdef QT_COMPILER_SUPPORTS_SSSE3 |
| 9 | |
| 10 | QT_BEGIN_NAMESPACE |
| 11 | |
| 12 | // Convert a scanline of RGB888 (src) to RGB32 (dst) |
| 13 | // src must be at least len * 3 bytes |
| 14 | // dst must be at least len * 4 bytes |
| 15 | Q_GUI_EXPORT void QT_FASTCALL qt_convert_rgb888_to_rgb32_ssse3(quint32 *dst, const uchar *src, int len) |
| 16 | { |
| 17 | int i = 0; |
| 18 | |
| 19 | // Prologue, align dst to 16 bytes. |
| 20 | ALIGNMENT_PROLOGUE_16BYTES(dst, i, len) { |
| 21 | dst[i] = qRgb(r: src[0], g: src[1], b: src[2]); |
| 22 | src += 3; |
| 23 | } |
| 24 | |
| 25 | // Mask the 4 first colors of the RGB888 vector |
| 26 | const __m128i shuffleMask = _mm_set_epi8(b15: char(0xff), b14: 9, b13: 10, b12: 11, b11: char(0xff), b10: 6, b9: 7, b8: 8, b7: char(0xff), b6: 3, b5: 4, b4: 5, b3: char(0xff), b2: 0, b1: 1, b0: 2); |
| 27 | |
| 28 | // Mask the 4 last colors of a RGB888 vector with an offset of 1 (so the last 3 bytes are RGB) |
| 29 | const __m128i shuffleMaskEnd = _mm_set_epi8(b15: char(0xff), b14: 13, b13: 14, b12: 15, b11: char(0xff), b10: 10, b9: 11, b8: 12, b7: char(0xff), b6: 7, b5: 8, b4: 9, b3: char(0xff), b2: 4, b1: 5, b0: 6); |
| 30 | |
| 31 | // Mask to have alpha = 0xff |
| 32 | const __m128i alphaMask = _mm_set1_epi32(i: 0xff000000); |
| 33 | |
| 34 | const __m128i *inVectorPtr = (const __m128i *)src; |
| 35 | __m128i *dstVectorPtr = (__m128i *)(dst + i); |
| 36 | |
| 37 | for (; i < (len - 15); i += 16) { // one iteration in the loop converts 16 pixels |
| 38 | /* |
| 39 | RGB888 has 5 pixels per vector, + 1 byte from the next pixel. The idea here is |
| 40 | to load vectors of RGB888 and use palignr to select a vector out of two vectors. |
| 41 | |
| 42 | After 3 loads of RGB888 and 3 stores of RGB32, we have 4 pixels left in the last |
| 43 | vector of RGB888, we can mask it directly to get a last store or RGB32. After that, |
| 44 | the first next byte is a R, and we can loop for the next 16 pixels. |
| 45 | |
| 46 | The conversion itself is done with a byte permutation (pshufb). |
| 47 | */ |
| 48 | __m128i firstSrcVector = _mm_lddqu_si128(p: inVectorPtr); |
| 49 | __m128i outputVector = _mm_shuffle_epi8(a: firstSrcVector, b: shuffleMask); |
| 50 | _mm_store_si128(p: dstVectorPtr, b: _mm_or_si128(a: outputVector, b: alphaMask)); |
| 51 | ++inVectorPtr; |
| 52 | ++dstVectorPtr; |
| 53 | |
| 54 | // There are 4 unused bytes left in srcVector, we need to load the next 16 bytes |
| 55 | // and load the next input with palignr |
| 56 | __m128i secondSrcVector = _mm_lddqu_si128(p: inVectorPtr); |
| 57 | __m128i srcVector = _mm_alignr_epi8(secondSrcVector, firstSrcVector, 12); |
| 58 | outputVector = _mm_shuffle_epi8(a: srcVector, b: shuffleMask); |
| 59 | _mm_store_si128(p: dstVectorPtr, b: _mm_or_si128(a: outputVector, b: alphaMask)); |
| 60 | ++inVectorPtr; |
| 61 | ++dstVectorPtr; |
| 62 | firstSrcVector = secondSrcVector; |
| 63 | |
| 64 | // We now have 8 unused bytes left in firstSrcVector |
| 65 | secondSrcVector = _mm_lddqu_si128(p: inVectorPtr); |
| 66 | srcVector = _mm_alignr_epi8(secondSrcVector, firstSrcVector, 8); |
| 67 | outputVector = _mm_shuffle_epi8(a: srcVector, b: shuffleMask); |
| 68 | _mm_store_si128(p: dstVectorPtr, b: _mm_or_si128(a: outputVector, b: alphaMask)); |
| 69 | ++inVectorPtr; |
| 70 | ++dstVectorPtr; |
| 71 | |
| 72 | // There are now 12 unused bytes in firstSrcVector. |
| 73 | // We can mask them directly, almost there. |
| 74 | outputVector = _mm_shuffle_epi8(a: secondSrcVector, b: shuffleMaskEnd); |
| 75 | _mm_store_si128(p: dstVectorPtr, b: _mm_or_si128(a: outputVector, b: alphaMask)); |
| 76 | ++dstVectorPtr; |
| 77 | } |
| 78 | src = (const uchar *)inVectorPtr; |
| 79 | |
| 80 | SIMD_EPILOGUE(i, len, 15) { |
| 81 | dst[i] = qRgb(r: src[0], g: src[1], b: src[2]); |
| 82 | src += 3; |
| 83 | } |
| 84 | } |
| 85 | |
| 86 | void convert_RGB888_to_RGB32_ssse3(QImageData *dest, const QImageData *src, Qt::ImageConversionFlags) |
| 87 | { |
| 88 | Q_ASSERT(src->format == QImage::Format_RGB888 || src->format == QImage::Format_BGR888); |
| 89 | if (src->format == QImage::Format_BGR888) |
| 90 | Q_ASSERT(dest->format == QImage::Format_RGBX8888 || dest->format == QImage::Format_RGBA8888 || dest->format == QImage::Format_RGBA8888_Premultiplied); |
| 91 | else |
| 92 | Q_ASSERT(dest->format == QImage::Format_RGB32 || dest->format == QImage::Format_ARGB32 || dest->format == QImage::Format_ARGB32_Premultiplied); |
| 93 | Q_ASSERT(src->width == dest->width); |
| 94 | Q_ASSERT(src->height == dest->height); |
| 95 | |
| 96 | const uchar *src_data = (uchar *) src->data; |
| 97 | quint32 *dest_data = (quint32 *) dest->data; |
| 98 | |
| 99 | for (int i = 0; i < src->height; ++i) { |
| 100 | qt_convert_rgb888_to_rgb32_ssse3(dst: dest_data, src: src_data, len: src->width); |
| 101 | src_data += src->bytes_per_line; |
| 102 | dest_data = (quint32 *)((uchar*)dest_data + dest->bytes_per_line); |
| 103 | } |
| 104 | } |
| 105 | |
| 106 | QT_END_NAMESPACE |
| 107 | |
| 108 | #endif // QT_COMPILER_SUPPORTS_SSSE3 |
| 109 | |