// Copyright (C) 2024 Loongson Technology Corporation Limited.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only

#ifndef QDRAWINGPRIMITIVE_LSX_P_H
#define QDRAWINGPRIMITIVE_LSX_P_H

#include <QtGui/private/qtguiglobal_p.h>
#include <private/qsimd_p.h>
#include "qdrawhelper_loongarch64_p.h"
#include "qrgba64_p.h"

#ifdef __loongarch_sx

//
// W A R N I N G
// -------------
//
// This file is not part of the Qt API. It exists purely as an
// implementation detail. This header file may change from version to
// version without notice, or even be removed.
//
// We mean it.
//

QT_BEGIN_NAMESPACE
/*
 * Multiply the components of pixelVector by alphaChannel
 * Each 32-bit component of alphaChannel must be in the form 0x00AA00AA
 * colorMask must have 0x00ff00ff in each 32-bit component
 * half must have the value 128 (0x80) in each 16-bit component
 */
inline static void Q_DECL_VECTORCALL
BYTE_MUL_LSX(__m128i &pixelVector, __m128i alphaChannel, __m128i colorMask, __m128i half)
{
    /* 1. split the pixel into 2 vectors so each color channel is on 16 bits
          (to make room for the multiplication by alpha);
          each 32-bit lane of pixelVectorAG has the form 0x00AA00GG,
          each 32-bit lane of pixelVectorRB has the form 0x00RR00BB */
    __m128i pixelVectorAG = __lsx_vsrli_h(pixelVector, 8);
    __m128i pixelVectorRB = __lsx_vand_v(pixelVector, colorMask);

    /* 2. multiply the vectors by the alpha channel */
    pixelVectorAG = __lsx_vmul_h(pixelVectorAG, alphaChannel);
    pixelVectorRB = __lsx_vmul_h(pixelVectorRB, alphaChannel);

    /* 3. divide by 255, which is the tricky part.
          We do it like BYTE_MUL() does, with bit shifts: X/255 ~= (X + X/256 + rounding)/256 */
    /* so first compute (X + X/256 + rounding) */
    pixelVectorRB = __lsx_vadd_h(pixelVectorRB, __lsx_vsrli_h(pixelVectorRB, 8));
    pixelVectorRB = __lsx_vadd_h(pixelVectorRB, half);
    pixelVectorAG = __lsx_vadd_h(pixelVectorAG, __lsx_vsrli_h(pixelVectorAG, 8));
    pixelVectorAG = __lsx_vadd_h(pixelVectorAG, half);

    /* then divide by 256 */
    pixelVectorRB = __lsx_vsrli_h(pixelVectorRB, 8);
    /* for AG, we could >> 8 to divide and then << 8 to put the bytes back in
       the right position; masking with ~colorMask instead takes only one
       instruction */
    pixelVectorAG = __lsx_vandn_v(colorMask, pixelVectorAG);

    /* 4. combine the 2 pairs of colors */
    pixelVector = __lsx_vor_v(pixelVectorAG, pixelVectorRB);
}
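
/* For reference, a scalar sketch of the same division trick for one 8-bit
   channel c multiplied by an 8-bit alpha a (the helper name is hypothetical;
   Qt's scalar BYTE_MUL() uses the same rounding):

       inline uint byteMulChannel(uint c, uint a)
       {
           uint t = c * a;
           return (t + (t >> 8) + 0x80) >> 8; // ~= t / 255, rounded
       }
*/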

/*
 * Each 32-bit component of alphaChannel must be in the form 0x00AA00AA
 * oneMinusAlphaChannel must hold 255 - alpha in the same 0x00AA00AA layout
 * colorMask must have 0x00ff00ff in each 32-bit component
 * half must have the value 128 (0x80) in each 16-bit component
 */
inline static void Q_DECL_VECTORCALL
INTERPOLATE_PIXEL_255_LSX(__m128i srcVector, __m128i &dstVector, __m128i alphaChannel,
                          __m128i oneMinusAlphaChannel, __m128i colorMask, __m128i half)
{
    /* interpolate AG */
    __m128i srcVectorAG = __lsx_vsrli_h(srcVector, 8);
    __m128i dstVectorAG = __lsx_vsrli_h(dstVector, 8);
    __m128i srcVectorAGalpha = __lsx_vmul_h(srcVectorAG, alphaChannel);
    __m128i dstVectorAGoneMinusAlpha = __lsx_vmul_h(dstVectorAG, oneMinusAlphaChannel);
    __m128i finalAG = __lsx_vadd_h(srcVectorAGalpha, dstVectorAGoneMinusAlpha);
    finalAG = __lsx_vadd_h(finalAG, __lsx_vsrli_h(finalAG, 8));
    finalAG = __lsx_vadd_h(finalAG, half);
    finalAG = __lsx_vandn_v(colorMask, finalAG);

    /* interpolate RB */
    __m128i srcVectorRB = __lsx_vand_v(srcVector, colorMask);
    __m128i dstVectorRB = __lsx_vand_v(dstVector, colorMask);
    __m128i srcVectorRBalpha = __lsx_vmul_h(srcVectorRB, alphaChannel);
    __m128i dstVectorRBoneMinusAlpha = __lsx_vmul_h(dstVectorRB, oneMinusAlphaChannel);
    __m128i finalRB = __lsx_vadd_h(srcVectorRBalpha, dstVectorRBoneMinusAlpha);
    finalRB = __lsx_vadd_h(finalRB, __lsx_vsrli_h(finalRB, 8));
    finalRB = __lsx_vadd_h(finalRB, half);
    finalRB = __lsx_vsrli_h(finalRB, 8);

    /* combine */
    dstVector = __lsx_vor_v(finalAG, finalRB);
}
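
/* For reference, the per-channel computation above is the classic 255-scale
   interpolation (a sketch; Qt's scalar INTERPOLATE_PIXEL_255() applies the
   same formula):

       result = (src * alpha + dst * (255 - alpha)) / 255

   with the division by 255 approximated exactly as in BYTE_MUL_LSX. */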

// Same as BLEND_SOURCE_OVER_ARGB32_LSX below, but for a single vector of
// four pixels, srcVector
inline static void Q_DECL_VECTORCALL
BLEND_SOURCE_OVER_ARGB32_LSX_helper(quint32 *dst, int x, __m128i srcVector,
                                    __m128i nullVector, __m128i half, __m128i one,
                                    __m128i colorMask, __m128i alphaMask)
{
    const __m128i srcVectorAlpha = __lsx_vand_v(srcVector, alphaMask);
    __m128i vseq = __lsx_vseq_w(srcVectorAlpha, alphaMask);
    v4i32 vseq_res = (v4i32)__lsx_vmsknz_b(vseq);
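    /* __lsx_vmsknz_b() sets one mask bit per non-zero byte of vseq, so a
       result of 0xffff means all 16 bytes compared equal, i.e. all four
       pixels are fully opaque */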
    if (vseq_res[0] == 0x0000ffff) {
        /* all opaque */
        __lsx_vst(srcVector, &dst[x], 0);
    } else {
        __m128i vseq_n = __lsx_vseq_w(srcVectorAlpha, nullVector);
        v4i32 vseq_n_res = (v4i32)__lsx_vmsknz_b(vseq_n);
        if (vseq_n_res[0] != 0x0000ffff) {
            /* not fully transparent */
            /* extract the alpha channel into 2 x 16 bits */
            /* so we have room for the multiplication */
            /* each 32 bits will be in the form 0x00AA00AA */
            /* with AA being 255 - alpha */
            __m128i alphaChannel = __lsx_vsrli_w(srcVector, 24);
            alphaChannel = __lsx_vor_v(alphaChannel, __lsx_vslli_w(alphaChannel, 16));
            alphaChannel = __lsx_vsub_h(one, alphaChannel);

            __m128i dstVector = __lsx_vld(&dst[x], 0);
            BYTE_MUL_LSX(dstVector, alphaChannel, colorMask, half);

            /* result = s + d * (1-alpha) */
            const __m128i result = __lsx_vadd_b(srcVector, dstVector);
            __lsx_vst(result, &dst[x], 0);
        }
    }
}

// Blend src over dst (the SourceOver composition mode).
// nullVector, half, one, colorMask and alphaMask are constant across the
// whole image/texture and are defined as:
//const __m128i nullVector = __lsx_vreplgr2vr_w(0);
//const __m128i half = __lsx_vreplgr2vr_h(0x80);
//const __m128i one = __lsx_vreplgr2vr_h(0xff);
//const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
//const __m128i alphaMask = __lsx_vreplgr2vr_w(0xff000000);
//
// The computation being done is:
// result = s + d * (1-alpha)
// with shortcuts if fully opaque or fully transparent.
inline static void Q_DECL_VECTORCALL
BLEND_SOURCE_OVER_ARGB32_LSX(quint32 *dst, const quint32 *src, int length)
{
    int x = 0;

    /* First, get dst aligned. */
    ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) {
        blend_pixel(dst[x], src[x]);
    }

    const __m128i alphaMask = __lsx_vreplgr2vr_w(0xff000000);
    const __m128i nullVector = __lsx_vreplgr2vr_w(0);
    const __m128i half = __lsx_vreplgr2vr_h(0x80);
    const __m128i one = __lsx_vreplgr2vr_h(0xff);
    const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);

    for (; x < length - 3; x += 4) {
        const __m128i srcVector = __lsx_vld((const __m128i *)&src[x], 0);
        BLEND_SOURCE_OVER_ARGB32_LSX_helper(dst, x, srcVector, nullVector, half, one, colorMask, alphaMask);
    }
    SIMD_EPILOGUE(x, length, 3) {
        blend_pixel(dst[x], src[x]);
    }
}

// Blend src over dst with a constant alpha (broadcast into constAlphaVector
// below). With sa = source alpha, sia = 1 - sa, ca = const alpha and
// cia = 1 - ca (alphas normalized to [0, 1]), the computation being done is:
// dest = (s + d * sia) * ca + d * cia
//      = s * ca + d * (sia * ca + cia)
//      = s * ca + d * (1 - sa*ca)
inline static void Q_DECL_VECTORCALL
BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_LSX(quint32 *dst, const quint32 *src, int length, uint const_alpha)
{
    int x = 0;

    ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) {
        blend_pixel(dst[x], src[x], const_alpha);
    }

    const __m128i nullVector = __lsx_vreplgr2vr_w(0);
    const __m128i half = __lsx_vreplgr2vr_h(0x80);
    const __m128i one = __lsx_vreplgr2vr_h(0xff);
    const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
    const __m128i constAlphaVector = __lsx_vreplgr2vr_h(const_alpha);

    for (; x < length - 3; x += 4) {
        __m128i srcVector = __lsx_vld((const __m128i *)&src[x], 0);
        __m128i vseq = __lsx_vseq_w(srcVector, nullVector);
        v4i32 vseq_res = (v4i32)__lsx_vmsknz_b(vseq);
        if (vseq_res[0] != 0x0000ffff) {
            BYTE_MUL_LSX(srcVector, constAlphaVector, colorMask, half);

            __m128i alphaChannel = __lsx_vsrli_w(srcVector, 24);
            alphaChannel = __lsx_vor_v(alphaChannel, __lsx_vslli_w(alphaChannel, 16));
            alphaChannel = __lsx_vsub_h(one, alphaChannel);

            __m128i dstVector = __lsx_vld((__m128i *)&dst[x], 0);
            BYTE_MUL_LSX(dstVector, alphaChannel, colorMask, half);

            const __m128i result = __lsx_vadd_b(srcVector, dstVector);
            __lsx_vst(result, &dst[x], 0);
        }
    }
    SIMD_EPILOGUE(x, length, 3) {
        blend_pixel(dst[x], src[x], const_alpha);
    }
}
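
/* Usage sketch: a hypothetical composition-function dispatcher (the real
   callers live in Qt's drawhelper code; this function name is made up):

       void compFuncSourceOverLsx(uint *dst, const uint *src,
                                  int length, uint const_alpha)
       {
           if (const_alpha == 255)
               BLEND_SOURCE_OVER_ARGB32_LSX(dst, src, length);
           else
               BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_LSX(dst, src, length,
                                                             const_alpha);
       }
*/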

typedef union
{
    int i;
    float f;
} FloatInt;

/* Broadcast a float value across all four lanes (LSX has no float replicate
   instruction, so the bits go through the integer replicate). */
static __m128 __lsx_vreplfr2vr_s(float val)
{
    FloatInt fi_tmpval = {.f = val};
    return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
}

Q_ALWAYS_INLINE __m128 Q_DECL_VECTORCALL reciprocal_mul_ps(const __m128 a, float mul)
{
    __m128 ia = __lsx_vfrecip_s(a); // hardware estimate of 1/a
    // Sharpen the estimate with one Newton-Raphson step:
    // ia' = ia * (2 - a*ia), written here as (ia + ia) - ia * (ia * a)
    ia = __lsx_vfsub_s(__lsx_vfadd_s(ia, ia), __lsx_vfmul_s(ia, __lsx_vfmul_s(ia, a)));
    ia = __lsx_vfmul_s(ia, __lsx_vreplfr2vr_s(mul));
    return ia;
}

inline QRgb qUnpremultiply_lsx(QRgb p)
{
    const uint alpha = qAlpha(p);
    if (alpha == 255)
        return p;
    if (alpha == 0)
        return 0;
    const __m128 va = __lsx_vffint_s_w(__lsx_vreplgr2vr_w(alpha));
    __m128 via = reciprocal_mul_ps(va, 255.0f); // approximately 255/a
    const __m128i shuffleMask = (__m128i)(v16i8){0,16,16,16,1,16,16,16,2,16,16,16,3,16,16,16};
    __m128i vl = __lsx_vshuf_b(__lsx_vldi(0), __lsx_vreplgr2vr_w(p), shuffleMask);
    vl = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(vl), via));
    vl = __lsx_vmaxi_w(vl, 0);
    vl = __lsx_vpickev_h(__lsx_vsat_wu(vl, 15), __lsx_vsat_wu(vl, 15));
    vl = __lsx_vinsgr2vr_h(vl, alpha, 3);
    vl = __lsx_vpickev_b(__lsx_vsat_hu(vl, 7), __lsx_vsat_hu(vl, 7));
    return __lsx_vpickve2gr_w(vl, 0);
}
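
/* For reference, a scalar sketch of what the vector path computes:

       r' = round(r * 255.0f / a);  // likewise for g and b
       a' = a;                      // alpha is preserved

   i.e. a straight unpremultiply, with the reciprocal computed once and
   shared by all three channels. */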

template<enum QtPixelOrder PixelOrder>
inline uint qConvertArgb32ToA2rgb30_lsx(QRgb p)
{
    const uint alpha = qAlpha(p);
    if (alpha == 255)
        return qConvertRgb32ToRgb30<PixelOrder>(p);
    if (alpha == 0)
        return 0;
    Q_CONSTEXPR float mult = 1023.0f / (255 >> 6);
    const uint newalpha = (alpha >> 6);
    const __m128 va = __lsx_vffint_s_w(__lsx_vreplgr2vr_w(alpha));
    __m128 via = reciprocal_mul_ps(va, mult * newalpha);
    const __m128i shuffleMask = (__m128i)(v16i8){0,16,16,16,1,16,16,16,2,16,16,16,3,16,16,16};
    __m128i vl = __lsx_vshuf_b(__lsx_vldi(0), __lsx_vreplgr2vr_w(p), shuffleMask);
    vl = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(vl), via));
    vl = __lsx_vmaxi_w(vl, 0);
    vl = __lsx_vpickev_h(__lsx_vsat_wu(vl, 15), __lsx_vsat_wu(vl, 15));
    uint rgb30 = (newalpha << 30);
    rgb30 |= ((uint)__lsx_vpickve2gr_h(vl, 1)) << 10;
    if (PixelOrder == PixelOrderRGB) {
        rgb30 |= ((uint)__lsx_vpickve2gr_h(vl, 2)) << 20;
        rgb30 |= ((uint)__lsx_vpickve2gr_h(vl, 0));
    } else {
        rgb30 |= ((uint)__lsx_vpickve2gr_h(vl, 0)) << 20;
        rgb30 |= ((uint)__lsx_vpickve2gr_h(vl, 2));
    }
    return rgb30;
}
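
/* For reference, the scaling performed above (a sketch): with a2 = alpha >> 6
   as the 2-bit alpha, each 8-bit channel c becomes the 10-bit premultiplied

       c' = round(c * (1023.0f / 3.0f) * a2 / alpha)

   i.e. unpremultiply by the 8-bit alpha, then repremultiply by the 2-bit
   alpha scaled into the 10-bit range. */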

template<enum QtPixelOrder PixelOrder>
inline uint qConvertRgba64ToRgb32_lsx(QRgba64 p)
{
    if (p.isTransparent())
        return 0;
    __m128i vl = __lsx_vilvl_d(__lsx_vldi(0), __lsx_vldrepl_d(&p, 0));
    if (!p.isOpaque()) {
        const __m128 va = __lsx_vffint_s_w(__lsx_vreplgr2vr_w(p.alpha()));
        __m128 via = reciprocal_mul_ps(va, 65535.0f);
        vl = __lsx_vilvl_h(__lsx_vldi(0), vl);
        vl = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(vl), via));
        vl = __lsx_vmaxi_w(vl, 0);
        vl = __lsx_vpickev_h(__lsx_vsat_wu(vl, 15), __lsx_vsat_wu(vl, 15));
        vl = __lsx_vinsgr2vr_h(vl, p.alpha(), 3);
    }
    if (PixelOrder == PixelOrderBGR) {
        const __m128i shuffleMask = (__m128i)(v8i16){2, 1, 0, 3, 4, 5, 6, 7};
        vl = __lsx_vshuf_h(shuffleMask, __lsx_vldi(0), vl);
    }
    vl = __lsx_vilvl_h(__lsx_vldi(0), vl);
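    /* 16-to-8-bit conversion with rounding: (x + 128 - ((x + 128) >> 8)) >> 8,
       i.e. approximately x / 257 */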
    vl = __lsx_vadd_w(vl, __lsx_vreplgr2vr_w(128));
    vl = __lsx_vsub_w(vl, __lsx_vsrli_w(vl, 8));
    vl = __lsx_vsrli_w(vl, 8);
    vl = __lsx_vpickev_h(__lsx_vsat_w(vl, 15), __lsx_vsat_w(vl, 15));
    __m128i tmp = __lsx_vmaxi_h(vl, 0);
    vl = __lsx_vpickev_b(__lsx_vsat_hu(tmp, 7), __lsx_vsat_hu(tmp, 7));
    return __lsx_vpickve2gr_w(vl, 0);
}

QT_END_NAMESPACE

#endif // __loongarch_sx

#endif // QDRAWINGPRIMITIVE_LSX_P_H