// Copyright (C) 2018 The Qt Company Ltd.
// Copyright (C) 2018 Intel Corporation.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only

#include <private/qdrawhelper_x86_p.h>

#if defined(QT_COMPILER_SUPPORTS_SSSE3)

#include <private/qdrawingprimitive_sse2_p.h>

QT_BEGIN_NAMESPACE

/* The palignr instruction takes its shift count as an immediate operand, so we
   have to generate the code for each of the different shifts (4, 8, 12).
   Checking the alignment inside the loop is unfortunately way too slow.
*/
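/* For example, with an offset of 4, _mm_alignr_epi8(next, prev, 4) yields
   bytes 4..15 of prev followed by bytes 0..3 of next, i.e. the four source
   pixels starting one pixel past the start of the previous aligned load. */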
#define BLENDING_LOOP(palignrOffset, length)\
    for (; x-minusOffsetToAlignSrcOn16Bytes < length-7; x += 4) { \
        const __m128i srcVectorLastLoaded = _mm_load_si128((const __m128i *)&src[x - minusOffsetToAlignSrcOn16Bytes + 4]);\
        const __m128i srcVector = _mm_alignr_epi8(srcVectorLastLoaded, srcVectorPrevLoaded, palignrOffset); \
        const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask); \
        if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) { \
            _mm_store_si128((__m128i *)&dst[x], srcVector); \
        } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) != 0xffff) { \
            __m128i alphaChannel = _mm_shuffle_epi8(srcVector, alphaShuffleMask); \
            alphaChannel = _mm_sub_epi16(one, alphaChannel); \
            const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]); \
            __m128i destMultipliedByOneMinusAlpha; \
            BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half); \
            const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha); \
            _mm_store_si128((__m128i *)&dst[x], result); \
        } \
        srcVectorPrevLoaded = srcVectorLastLoaded;\
    }

// Basically blend src over dst (source-over composition at full constant alpha).
// nullVector, half, one, colorMask, alphaMask are constant across the whole image/texture, and should be defined as:
//const __m128i nullVector = _mm_set1_epi32(0);
//const __m128i half = _mm_set1_epi16(0x80);
//const __m128i one = _mm_set1_epi16(0xff);
//const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
//const __m128i alphaMask = _mm_set1_epi32(0xff000000);
//
// The computation being done is:
// result = s + d * (1-alpha)
// with shortcuts if fully opaque or fully transparent.
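// Per lane, this is the same source-over step as the scalar blend_pixel()
// used in the prologue/epilogue below. Roughly (sketch, premultiplied ARGB32):
//     if (qAlpha(s) == 255)     d = s;                                // opaque
//     else if (qAlpha(s) != 0)  d = s + BYTE_MUL(d, 255 - qAlpha(s)); // blend
//     // fully transparent source: d is left untouched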
static inline void Q_DECL_VECTORCALL
BLEND_SOURCE_OVER_ARGB32_SSSE3(quint32 *dst, const quint32 *src, int length,
                               __m128i nullVector, __m128i half, __m128i one, __m128i colorMask, __m128i alphaMask)
{
    int x = 0;

    /* First, get dst aligned. */
    ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) {
        blend_pixel(dst[x], src[x]);
    }

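    // src is at least 4-byte aligned, so bits 2 and 3 of its address tell how
    // many whole pixels &src[x] sits past the preceding 16-byte boundary (0..3).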
    const int minusOffsetToAlignSrcOn16Bytes = (reinterpret_cast<quintptr>(&(src[x])) >> 2) & 0x3;

    if (!minusOffsetToAlignSrcOn16Bytes) {
        /* src is aligned, usual algorithm but with aligned operations.
           See the SSE2 version for more documentation on the algorithm itself. */
        const __m128i alphaShuffleMask = _mm_set_epi8(char(0xff),15,char(0xff),15,char(0xff),11,char(0xff),11,char(0xff),7,char(0xff),7,char(0xff),3,char(0xff),3);
        for (; x < length-3; x += 4) {
            const __m128i srcVector = _mm_load_si128((const __m128i *)&src[x]);
            const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask);
            if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) {
                _mm_store_si128((__m128i *)&dst[x], srcVector);
            } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) != 0xffff) {
                __m128i alphaChannel = _mm_shuffle_epi8(srcVector, alphaShuffleMask);
                alphaChannel = _mm_sub_epi16(one, alphaChannel);
                const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]);
                __m128i destMultipliedByOneMinusAlpha;
                BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half);
                const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha);
                _mm_store_si128((__m128i *)&dst[x], result);
            }
        } /* end for() */
    } else if ((length - x) >= 8) {
        /* We use two vectors to extract the src: prevLoaded for the first pixels, lastLoaded for the current pixels. */
        __m128i srcVectorPrevLoaded = _mm_load_si128((const __m128i *)&src[x - minusOffsetToAlignSrcOn16Bytes]);
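        // Convert the misalignment from pixels (1..3) to bytes (4, 8 or 12),
        // as required by the palignr immediate in BLENDING_LOOP.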
        const int palignrOffset = minusOffsetToAlignSrcOn16Bytes << 2;

        const __m128i alphaShuffleMask = _mm_set_epi8(char(0xff),15,char(0xff),15,char(0xff),11,char(0xff),11,char(0xff),7,char(0xff),7,char(0xff),3,char(0xff),3);
        switch (palignrOffset) {
        case 4:
            BLENDING_LOOP(4, length)
            break;
        case 8:
            BLENDING_LOOP(8, length)
            break;
        case 12:
            BLENDING_LOOP(12, length)
            break;
        }
    }
    for (; x < length; ++x)
        blend_pixel(dst[x], src[x]);
}

void qt_blend_argb32_on_argb32_ssse3(uchar *destPixels, int dbpl,
                                     const uchar *srcPixels, int sbpl,
                                     int w, int h,
                                     int const_alpha)
{
    const quint32 *src = (const quint32 *) srcPixels;
    quint32 *dst = (quint32 *) destPixels;
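    // const_alpha is passed in the 0..256 range, where 256 means fully opaque.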
    if (const_alpha == 256) {
        const __m128i alphaMask = _mm_set1_epi32(0xff000000);
        const __m128i nullVector = _mm_setzero_si128();
        const __m128i half = _mm_set1_epi16(0x80);
        const __m128i one = _mm_set1_epi16(0xff);
        const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);

        for (int y = 0; y < h; ++y) {
            BLEND_SOURCE_OVER_ARGB32_SSSE3(dst, src, w, nullVector, half, one, colorMask, alphaMask);
            dst = (quint32 *)(((uchar *) dst) + dbpl);
            src = (const quint32 *)(((const uchar *) src) + sbpl);
        }
    } else if (const_alpha != 0) {
        // dest = (s + d * sia) * ca + d * cia
        //      = s * ca + d * (sia * ca + cia)
        //      = s * ca + d * (1 - sa*ca)
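        // (with sia = 1 - sa and cia = 1 - ca: (1-sa)*ca + (1-ca) = 1 - sa*ca)
        // Rescale const_alpha from the 0..256 range to the 0..255 scale used
        // by the byte multiplication.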
        const_alpha = (const_alpha * 255) >> 8;
        const __m128i nullVector = _mm_setzero_si128();
        const __m128i half = _mm_set1_epi16(0x80);
        const __m128i one = _mm_set1_epi16(0xff);
        const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
        const __m128i constAlphaVector = _mm_set1_epi16(const_alpha);
        for (int y = 0; y < h; ++y) {
            BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2(dst, src, w, nullVector, half, one, colorMask, constAlphaVector)
            dst = (quint32 *)(((uchar *) dst) + dbpl);
            src = (const quint32 *)(((const uchar *) src) + sbpl);
        }
    }
}

const uint *QT_FASTCALL fetchPixelsBPP24_ssse3(uint *buffer, const uchar *src, int index, int count)
{
    const quint24 *s = reinterpret_cast<const quint24 *>(src);
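    // quint24's conversion to uint widens each big-endian 3-byte pixel to a
    // 32-bit value with a zero high byte; this is the plain per-pixel fetch
    // used for indexed access into 24bpp data.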
    for (int i = 0; i < count; ++i)
        buffer[i] = s[index + i];
    return buffer;
}

extern void QT_FASTCALL qt_convert_rgb888_to_rgb32_ssse3(quint32 *dst, const uchar *src, int len);

const uint * QT_FASTCALL qt_fetchUntransformed_888_ssse3(uint *buffer, const Operator *, const QSpanData *data,
                                                         int y, int x, int length)
{
    const uchar *line = data->texture.scanLine(y) + x * 3;
    qt_convert_rgb888_to_rgb32_ssse3(buffer, line, length);
    return buffer;
}

void qt_memfill24_ssse3(quint24 *dest, quint24 color, qsizetype count)
{
    // LCM of 12 and 16 bytes is 48 bytes (16 px)
    quint32 v = color;
    __m128i m = _mm_cvtsi32_si128(v);
    quint24 *end = dest + count;

    constexpr uchar x = 2, y = 1, z = 0;
    alignas(__m128i) static const uchar
    shuffleMask[16 + 1] = { x, y, z, x, y, z, x, y, z, x, y, z, x, y, z, x, y };
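    // The mask is 16 + 1 bytes long so the same array serves both loads below:
    // an aligned load at offset 0 (pattern phase 0) and an unaligned load at
    // offset 1 (phase 1).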

    __m128i mval1 = _mm_shuffle_epi8(m, _mm_load_si128(reinterpret_cast<const __m128i *>(shuffleMask)));
    __m128i mval2 = _mm_shuffle_epi8(m, _mm_loadu_si128(reinterpret_cast<const __m128i *>(shuffleMask + 1)));
    __m128i mval3 = _mm_alignr_epi8(mval2, mval1, 2);
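    // mval1, mval2 and mval3 hold the repeating 3-byte pattern at phases 0, 1
    // and 2 respectively, so three consecutive 16-byte stores write 16 whole
    // pixels.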

    for ( ; dest + 16 <= end; dest += 16) {
#ifdef __AVX__
        // Store using 32-byte AVX instruction
        __m256 mval12 = _mm256_castps128_ps256(_mm_castsi128_ps(mval1));
        mval12 = _mm256_insertf128_ps(mval12, _mm_castsi128_ps(mval2), 1);
        _mm256_storeu_ps(reinterpret_cast<float *>(dest), mval12);
#else
        _mm_storeu_si128(reinterpret_cast<__m128i *>(dest) + 0, mval1);
        _mm_storeu_si128(reinterpret_cast<__m128i *>(dest) + 1, mval2);
#endif
        _mm_storeu_si128(reinterpret_cast<__m128i *>(dest) + 2, mval3);
    }

    if (count < 3) {
        if (count > 1)
            end[-2] = v;
        if (count)
            end[-1] = v;
        return;
    }

    // less than 16px/48B left
    uchar *ptr = reinterpret_cast<uchar *>(dest);
    uchar *ptr_end = reinterpret_cast<uchar *>(end);
    qptrdiff left = ptr_end - ptr;
    if (left >= 24) {
        // 8px/24B or more left
        _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr) + 0, mval1);
        _mm_storel_epi64(reinterpret_cast<__m128i *>(ptr) + 1, mval2);
        ptr += 24;
        left -= 24;
    }

    // less than 8px/24B left

    if (left >= 16) {
        // but more than 5px/15B left
        _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr), mval1);
    } else if (left >= 8) {
        // but more than 2px/6B left
        _mm_storel_epi64(reinterpret_cast<__m128i *>(ptr), mval1);
    }

    if (left) {
        // 1 or 2px left
        // store 8 bytes ending with the right values (will overwrite a bit)
        _mm_storel_epi64(reinterpret_cast<__m128i *>(ptr_end - 8), mval2);
    }
}

void QT_FASTCALL rbSwap_888_ssse3(uchar *dst, const uchar *src, int count)
{
    int i = 0;

    const static __m128i shuffleMask1 = _mm_setr_epi8(2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, /*!!*/15);
    const static __m128i shuffleMask2 = _mm_setr_epi8(0, /*!!*/1, 4, 3, 2, 7, 6, 5, 10, 9, 8, 13, 12, 11, /*!!*/14, 15);
    const static __m128i shuffleMask3 = _mm_setr_epi8(/*!!*/0, 3, 2, 1, 6, 5, 4, 9, 8, 7, 12, 11, 10, 15, 14, 13);
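
    // The bytes marked /*!!*/ belong to pixels that straddle a 16-byte lane
    // boundary: pshufb cannot move bytes across lanes, so those are left in
    // place here and patched up by the std::swap calls after the stores.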

    for (; i + 15 < count; i += 16) {
        __m128i s1 = _mm_loadu_si128((const __m128i *)src);
        __m128i s2 = _mm_loadu_si128((const __m128i *)(src + 16));
        __m128i s3 = _mm_loadu_si128((const __m128i *)(src + 32));
        s1 = _mm_shuffle_epi8(s1, shuffleMask1);
        s2 = _mm_shuffle_epi8(s2, shuffleMask2);
        s3 = _mm_shuffle_epi8(s3, shuffleMask3);
        _mm_storeu_si128((__m128i *)dst, s1);
        _mm_storeu_si128((__m128i *)(dst + 16), s2);
        _mm_storeu_si128((__m128i *)(dst + 32), s3);

        // Now fix the last four misplaced values
        std::swap(dst[15], dst[17]);
        std::swap(dst[30], dst[32]);

        src += 48;
        dst += 48;
    }

    if (src != dst) {
        SIMD_EPILOGUE(i, count, 15) {
            dst[0] = src[2];
            dst[1] = src[1];
            dst[2] = src[0];
            dst += 3;
            src += 3;
        }
    } else {
        SIMD_EPILOGUE(i, count, 15) {
            std::swap(dst[0], dst[2]);
            dst += 3;
        }
    }
}

QT_END_NAMESPACE

#endif // QT_COMPILER_SUPPORTS_SSSE3