// Copyright (C) 2024 Loongson Technology Corporation Limited.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only

#ifndef QDRAWINGPRIMITIVE_LSX_P_H
#define QDRAWINGPRIMITIVE_LSX_P_H

#include <QtGui/private/qtguiglobal_p.h>
#include <private/qsimd_p.h>
#include "qdrawhelper_loongarch64_p.h"
#include "qrgba64_p.h"

#ifdef __loongarch_sx

//
// W A R N I N G
// -------------
//
// This file is not part of the Qt API. It exists purely as an
// implementation detail. This header file may change from version to
// version without notice, or even be removed.
//
// We mean it.
//

QT_BEGIN_NAMESPACE
/*
 * Multiply the components of pixelVector by alphaChannel
 * Each 32-bit component of alphaChannel must be in the form 0x00AA00AA
 * colorMask must have 0x00ff00ff in each 32-bit component
 * half must have the value 128 (0x80) in each 16-bit component
 */
inline static void Q_DECL_VECTORCALL
BYTE_MUL_LSX(__m128i &pixelVector, __m128i alphaChannel, __m128i colorMask, __m128i half)
{
    /* 1. split the pixel into 2 vectors so each color channel is on 16 bits
          (to make room for the multiplication by alpha);
          each 32-bit lane of pixelVectorAG has the form 0x00AA00GG,
          each 32-bit lane of pixelVectorRB has the form 0x00RR00BB */
    __m128i pixelVectorAG = __lsx_vsrli_h(pixelVector, 8);
    __m128i pixelVectorRB = __lsx_vand_v(pixelVector, colorMask);

    /* 2. multiply the vectors by the alpha channel */
    pixelVectorAG = __lsx_vmul_h(pixelVectorAG, alphaChannel);
    pixelVectorRB = __lsx_vmul_h(pixelVectorRB, alphaChannel);

    /* 3. divide by 255, which is the tricky part.
          We do it like BYTE_MUL() does, with bit shifts: X/255 ~= (X + X/256 + rounding)/256 */
    /* so first compute (X + X/256 + rounding) */
    pixelVectorRB = __lsx_vadd_h(pixelVectorRB, __lsx_vsrli_h(pixelVectorRB, 8));
    pixelVectorRB = __lsx_vadd_h(pixelVectorRB, half);
    pixelVectorAG = __lsx_vadd_h(pixelVectorAG, __lsx_vsrli_h(pixelVectorAG, 8));
    pixelVectorAG = __lsx_vadd_h(pixelVectorAG, half);

    /* then divide by 256 */
    pixelVectorRB = __lsx_vsrli_h(pixelVectorRB, 8);
    /* for AG, we could >> 8 to divide and then << 8 to put the bytes back in
       the right position; masking with ~colorMask instead takes only one
       instruction */
    pixelVectorAG = __lsx_vandn_v(colorMask, pixelVectorAG);

    /* 4. combine the 2 pairs of colors */
    pixelVector = __lsx_vor_v(pixelVectorAG, pixelVectorRB);
}
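
/* For reference, a scalar sketch of the same division trick for one 8-bit
   channel c multiplied by an 8-bit alpha a (the helper name is hypothetical;
   Qt's scalar BYTE_MUL() uses the same rounding):

       inline uint byteMulChannel(uint c, uint a)
       {
           uint t = c * a;
           return (t + (t >> 8) + 0x80) >> 8; // ~= t / 255, rounded
       }
*/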

/*
 * Each 32-bit component of alphaChannel must be in the form 0x00AA00AA
 * oneMinusAlphaChannel must hold 255 - alpha in the same 0x00AA00AA layout
 * colorMask must have 0x00ff00ff in each 32-bit component
 * half must have the value 128 (0x80) in each 16-bit component
 */
inline static void Q_DECL_VECTORCALL
INTERPOLATE_PIXEL_255_LSX(__m128i srcVector, __m128i &dstVector, __m128i alphaChannel,
                          __m128i oneMinusAlphaChannel, __m128i colorMask, __m128i half)
{
    /* interpolate AG */
    __m128i srcVectorAG = __lsx_vsrli_h(srcVector, 8);
    __m128i dstVectorAG = __lsx_vsrli_h(dstVector, 8);
    __m128i srcVectorAGalpha = __lsx_vmul_h(srcVectorAG, alphaChannel);
    __m128i dstVectorAGoneMinusAlpha = __lsx_vmul_h(dstVectorAG, oneMinusAlphaChannel);
    __m128i finalAG = __lsx_vadd_h(srcVectorAGalpha, dstVectorAGoneMinusAlpha);
    finalAG = __lsx_vadd_h(finalAG, __lsx_vsrli_h(finalAG, 8));
    finalAG = __lsx_vadd_h(finalAG, half);
    finalAG = __lsx_vandn_v(colorMask, finalAG);

    /* interpolate RB */
    __m128i srcVectorRB = __lsx_vand_v(srcVector, colorMask);
    __m128i dstVectorRB = __lsx_vand_v(dstVector, colorMask);
    __m128i srcVectorRBalpha = __lsx_vmul_h(srcVectorRB, alphaChannel);
    __m128i dstVectorRBoneMinusAlpha = __lsx_vmul_h(dstVectorRB, oneMinusAlphaChannel);
    __m128i finalRB = __lsx_vadd_h(srcVectorRBalpha, dstVectorRBoneMinusAlpha);
    finalRB = __lsx_vadd_h(finalRB, __lsx_vsrli_h(finalRB, 8));
    finalRB = __lsx_vadd_h(finalRB, half);
    finalRB = __lsx_vsrli_h(finalRB, 8);

    /* combine */
    dstVector = __lsx_vor_v(finalAG, finalRB);
}
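
/* For reference, the per-channel computation above is the classic 255-scale
   interpolation (a sketch; Qt's scalar INTERPOLATE_PIXEL_255() applies the
   same formula):

       result = (src * alpha + dst * (255 - alpha)) / 255

   with the division by 255 approximated exactly as in BYTE_MUL_LSX. */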

// Same as BLEND_SOURCE_OVER_ARGB32_LSX below, but for a single vector of
// four pixels, srcVector
inline static void Q_DECL_VECTORCALL
BLEND_SOURCE_OVER_ARGB32_LSX_helper(quint32 *dst, int x, __m128i srcVector,
                                    __m128i nullVector, __m128i half, __m128i one,
                                    __m128i colorMask, __m128i alphaMask)
{
    const __m128i srcVectorAlpha = __lsx_vand_v(srcVector, alphaMask);
    __m128i vseq = __lsx_vseq_w(srcVectorAlpha, alphaMask);
    v4i32 vseq_res = (v4i32)__lsx_vmsknz_b(vseq);
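    /* __lsx_vmsknz_b() sets one mask bit per non-zero byte of vseq, so a
       result of 0xffff means all 16 bytes compared equal, i.e. all four
       pixels are fully opaque */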
    if (vseq_res[0] == 0x0000ffff) {
        /* all opaque */
        __lsx_vst(srcVector, &dst[x], 0);
    } else {
        __m128i vseq_n = __lsx_vseq_w(srcVectorAlpha, nullVector);
        v4i32 vseq_n_res = (v4i32)__lsx_vmsknz_b(vseq_n);
        if (vseq_n_res[0] != 0x0000ffff) {
            /* not fully transparent */
            /* extract the alpha channel into 2 x 16 bits */
            /* so we have room for the multiplication */
            /* each 32 bits will be in the form 0x00AA00AA */
            /* with AA being 255 - alpha */
            __m128i alphaChannel = __lsx_vsrli_w(srcVector, 24);
            alphaChannel = __lsx_vor_v(alphaChannel, __lsx_vslli_w(alphaChannel, 16));
            alphaChannel = __lsx_vsub_h(one, alphaChannel);

            __m128i dstVector = __lsx_vld(&dst[x], 0);
            BYTE_MUL_LSX(dstVector, alphaChannel, colorMask, half);

            /* result = s + d * (1-alpha) */
            const __m128i result = __lsx_vadd_b(srcVector, dstVector);
            __lsx_vst(result, &dst[x], 0);
        }
    }
}

// Blend src over dst (the SourceOver composition mode).
// nullVector, half, one, colorMask and alphaMask are constant across the
// whole image/texture and are defined as:
//const __m128i nullVector = __lsx_vreplgr2vr_w(0);
//const __m128i half = __lsx_vreplgr2vr_h(0x80);
//const __m128i one = __lsx_vreplgr2vr_h(0xff);
//const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
//const __m128i alphaMask = __lsx_vreplgr2vr_w(0xff000000);
//
// The computation being done is:
// result = s + d * (1-alpha)
// with shortcuts if fully opaque or fully transparent.
inline static void Q_DECL_VECTORCALL
BLEND_SOURCE_OVER_ARGB32_LSX(quint32 *dst, const quint32 *src, int length)
{
    int x = 0;

    /* First, get dst aligned. */
    ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) {
        blend_pixel(dst[x], src[x]);
    }

    const __m128i alphaMask = __lsx_vreplgr2vr_w(0xff000000);
    const __m128i nullVector = __lsx_vreplgr2vr_w(0);
    const __m128i half = __lsx_vreplgr2vr_h(0x80);
    const __m128i one = __lsx_vreplgr2vr_h(0xff);
    const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);

    for (; x < length - 3; x += 4) {
        const __m128i srcVector = __lsx_vld((const __m128i *)&src[x], 0);
        BLEND_SOURCE_OVER_ARGB32_LSX_helper(dst, x, srcVector, nullVector, half, one, colorMask, alphaMask);
    }
    SIMD_EPILOGUE(x, length, 3) {
        blend_pixel(dst[x], src[x]);
    }
}

// Blend src over dst with a constant alpha (broadcast into constAlphaVector
// below). With sa = source alpha, sia = 1 - sa, ca = const alpha and
// cia = 1 - ca (alphas normalized to [0, 1]), the computation being done is:
// dest = (s + d * sia) * ca + d * cia
//      = s * ca + d * (sia * ca + cia)
//      = s * ca + d * (1 - sa*ca)
inline static void Q_DECL_VECTORCALL
BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_LSX(quint32 *dst, const quint32 *src, int length, uint const_alpha)
{
    int x = 0;

    ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) {
        blend_pixel(dst[x], src[x], const_alpha);
    }

    const __m128i nullVector = __lsx_vreplgr2vr_w(0);
    const __m128i half = __lsx_vreplgr2vr_h(0x80);
    const __m128i one = __lsx_vreplgr2vr_h(0xff);
    const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
    const __m128i constAlphaVector = __lsx_vreplgr2vr_h(const_alpha);

    for (; x < length - 3; x += 4) {
        __m128i srcVector = __lsx_vld((const __m128i *)&src[x], 0);
        __m128i vseq = __lsx_vseq_w(srcVector, nullVector);
        v4i32 vseq_res = (v4i32)__lsx_vmsknz_b(vseq);
        if (vseq_res[0] != 0x0000ffff) {
            BYTE_MUL_LSX(srcVector, constAlphaVector, colorMask, half);

            __m128i alphaChannel = __lsx_vsrli_w(srcVector, 24);
            alphaChannel = __lsx_vor_v(alphaChannel, __lsx_vslli_w(alphaChannel, 16));
            alphaChannel = __lsx_vsub_h(one, alphaChannel);

            __m128i dstVector = __lsx_vld((__m128i *)&dst[x], 0);
            BYTE_MUL_LSX(dstVector, alphaChannel, colorMask, half);

            const __m128i result = __lsx_vadd_b(srcVector, dstVector);
            __lsx_vst(result, &dst[x], 0);
        }
    }
    SIMD_EPILOGUE(x, length, 3) {
        blend_pixel(dst[x], src[x], const_alpha);
    }
}
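
/* Usage sketch: a hypothetical composition-function dispatcher (the real
   callers live in Qt's drawhelper code; this function name is made up):

       void compFuncSourceOverLsx(uint *dst, const uint *src,
                                  int length, uint const_alpha)
       {
           if (const_alpha == 255)
               BLEND_SOURCE_OVER_ARGB32_LSX(dst, src, length);
           else
               BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_LSX(dst, src, length,
                                                             const_alpha);
       }
*/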

typedef union
{
    int i;
    float f;
} FloatInt;

/* Broadcast a float value across all four lanes (LSX has no float replicate
   instruction, so the bits go through the integer replicate). */
static __m128 __lsx_vreplfr2vr_s(float val)
{
    FloatInt fi_tmpval = {.f = val};
    return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
}

Q_ALWAYS_INLINE __m128 Q_DECL_VECTORCALL reciprocal_mul_ps(const __m128 a, float mul)
{
    __m128 ia = __lsx_vfrecip_s(a); // hardware estimate of 1/a
    // Sharpen the estimate with one Newton-Raphson step:
    // ia' = ia * (2 - a*ia), written here as (ia + ia) - ia * (ia * a)
    ia = __lsx_vfsub_s(__lsx_vfadd_s(ia, ia), __lsx_vfmul_s(ia, __lsx_vfmul_s(ia, a)));
    ia = __lsx_vfmul_s(ia, __lsx_vreplfr2vr_s(mul));
    return ia;
}

inline QRgb qUnpremultiply_lsx(QRgb p)
{
    const uint alpha = qAlpha(p);
    if (alpha == 255)
        return p;
    if (alpha == 0)
        return 0;
    const __m128 va = __lsx_vffint_s_w(__lsx_vreplgr2vr_w(alpha));
    __m128 via = reciprocal_mul_ps(va, 255.0f); // approximately 255/a
    const __m128i shuffleMask = (__m128i)(v16i8){0,16,16,16,1,16,16,16,2,16,16,16,3,16,16,16};
    __m128i vl = __lsx_vshuf_b(__lsx_vldi(0), __lsx_vreplgr2vr_w(p), shuffleMask);
    vl = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(vl), via));
    vl = __lsx_vmaxi_w(vl, 0);
    vl = __lsx_vpickev_h(__lsx_vsat_wu(vl, 15), __lsx_vsat_wu(vl, 15));
    vl = __lsx_vinsgr2vr_h(vl, alpha, 3);
    vl = __lsx_vpickev_b(__lsx_vsat_hu(vl, 7), __lsx_vsat_hu(vl, 7));
    return __lsx_vpickve2gr_w(vl, 0);
}
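
/* For reference, a scalar sketch of what the vector path computes:

       r' = round(r * 255.0f / a);  // likewise for g and b
       a' = a;                      // alpha is preserved

   i.e. a straight unpremultiply, with the reciprocal computed once and
   shared by all three channels. */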

template<enum QtPixelOrder PixelOrder>
inline uint qConvertArgb32ToA2rgb30_lsx(QRgb p)
{
    const uint alpha = qAlpha(p);
    if (alpha == 255)
        return qConvertRgb32ToRgb30<PixelOrder>(p);
    if (alpha == 0)
        return 0;
    Q_CONSTEXPR float mult = 1023.0f / (255 >> 6);
    const uint newalpha = (alpha >> 6);
    const __m128 va = __lsx_vffint_s_w(__lsx_vreplgr2vr_w(alpha));
    __m128 via = reciprocal_mul_ps(va, mult * newalpha);
    const __m128i shuffleMask = (__m128i)(v16i8){0,16,16,16,1,16,16,16,2,16,16,16,3,16,16,16};
    __m128i vl = __lsx_vshuf_b(__lsx_vldi(0), __lsx_vreplgr2vr_w(p), shuffleMask);
    vl = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(vl), via));
    vl = __lsx_vmaxi_w(vl, 0);
    vl = __lsx_vpickev_h(__lsx_vsat_wu(vl, 15), __lsx_vsat_wu(vl, 15));
    uint rgb30 = (newalpha << 30);
    rgb30 |= ((uint)__lsx_vpickve2gr_h(vl, 1)) << 10;
    if (PixelOrder == PixelOrderRGB) {
        rgb30 |= ((uint)__lsx_vpickve2gr_h(vl, 2)) << 20;
        rgb30 |= ((uint)__lsx_vpickve2gr_h(vl, 0));
    } else {
        rgb30 |= ((uint)__lsx_vpickve2gr_h(vl, 0)) << 20;
        rgb30 |= ((uint)__lsx_vpickve2gr_h(vl, 2));
    }
    return rgb30;
}
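
/* For reference, the scaling performed above (a sketch): with a2 = alpha >> 6
   as the 2-bit alpha, each 8-bit channel c becomes the 10-bit premultiplied

       c' = round(c * (1023.0f / 3.0f) * a2 / alpha)

   i.e. unpremultiply by the 8-bit alpha, then repremultiply by the 2-bit
   alpha scaled into the 10-bit range. */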

template<enum QtPixelOrder PixelOrder>
inline uint qConvertRgba64ToRgb32_lsx(QRgba64 p)
{
    if (p.isTransparent())
        return 0;
    __m128i vl = __lsx_vilvl_d(__lsx_vldi(0), __lsx_vldrepl_d(&p, 0));
    if (!p.isOpaque()) {
        const __m128 va = __lsx_vffint_s_w(__lsx_vreplgr2vr_w(p.alpha()));
        __m128 via = reciprocal_mul_ps(va, 65535.0f);
        vl = __lsx_vilvl_h(__lsx_vldi(0), vl);
        vl = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(vl), via));
        vl = __lsx_vmaxi_w(vl, 0);
        vl = __lsx_vpickev_h(__lsx_vsat_wu(vl, 15), __lsx_vsat_wu(vl, 15));
        vl = __lsx_vinsgr2vr_h(vl, p.alpha(), 3);
    }
    if (PixelOrder == PixelOrderBGR) {
        const __m128i shuffleMask = (__m128i)(v8i16){2, 1, 0, 3, 4, 5, 6, 7};
        vl = __lsx_vshuf_h(shuffleMask, __lsx_vldi(0), vl);
    }
    vl = __lsx_vilvl_h(__lsx_vldi(0), vl);
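    /* 16-to-8-bit conversion with rounding: (x + 128 - ((x + 128) >> 8)) >> 8,
       i.e. approximately x / 257 */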
    vl = __lsx_vadd_w(vl, __lsx_vreplgr2vr_w(128));
    vl = __lsx_vsub_w(vl, __lsx_vsrli_w(vl, 8));
    vl = __lsx_vsrli_w(vl, 8);
    vl = __lsx_vpickev_h(__lsx_vsat_w(vl, 15), __lsx_vsat_w(vl, 15));
    __m128i tmp = __lsx_vmaxi_h(vl, 0);
    vl = __lsx_vpickev_b(__lsx_vsat_hu(tmp, 7), __lsx_vsat_hu(tmp, 7));
    return __lsx_vpickve2gr_w(vl, 0);
}

QT_END_NAMESPACE

#endif // __loongarch_sx

#endif // QDRAWINGPRIMITIVE_LSX_P_H