// Copyright (C) 2018 The Qt Company Ltd.
// Copyright (C) 2018 Intel Corporation.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only

#include <private/qdrawhelper_x86_p.h>

#if defined(QT_COMPILER_SUPPORTS_SSSE3)

#include <private/qdrawingprimitive_sse2_p.h>

QT_BEGIN_NAMESPACE

/* The instruction palignr uses immediate arguments, so we have to generate the code for the different
   shifts (4, 8, 12). Checking the alignment inside the loop is unfortunately way too slow.
*/
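// BLENDING_LOOP blends four pixels per iteration when src is not 16-byte aligned: it keeps the
// previously loaded aligned vector, loads the next aligned 16 bytes, and uses palignr to stitch
// the four unaligned source pixels out of the pair before the usual source-over blend. The bound
// (length - 7) keeps the aligned load of src[x - offset + 4 .. x - offset + 7] inside the buffer.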
#define BLENDING_LOOP(palignrOffset, length)\
    for (; x-minusOffsetToAlignSrcOn16Bytes < length-7; x += 4) { \
        const __m128i srcVectorLastLoaded = _mm_load_si128((const __m128i *)&src[x - minusOffsetToAlignSrcOn16Bytes + 4]);\
        const __m128i srcVector = _mm_alignr_epi8(srcVectorLastLoaded, srcVectorPrevLoaded, palignrOffset); \
        const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask); \
        if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) { \
            _mm_store_si128((__m128i *)&dst[x], srcVector); \
        } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) != 0xffff) { \
            __m128i alphaChannel = _mm_shuffle_epi8(srcVector, alphaShuffleMask); \
            alphaChannel = _mm_sub_epi16(one, alphaChannel); \
            const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]); \
            __m128i destMultipliedByOneMinusAlpha; \
            BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half); \
            const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha); \
            _mm_store_si128((__m128i *)&dst[x], result); \
        } \
        srcVectorPrevLoaded = srcVectorLastLoaded;\
    }


// Blend src over dst. nullVector, half, one, colorMask, alphaMask are constant across
// the whole image/texture, and should be defined as:
//const __m128i nullVector = _mm_set1_epi32(0);
//const __m128i half = _mm_set1_epi16(0x80);
//const __m128i one = _mm_set1_epi16(0xff);
//const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
//const __m128i alphaMask = _mm_set1_epi32(0xff000000);
//
// The computation being done is:
// result = s + d * (1-alpha)
// with shortcuts if the source is fully opaque or fully transparent.
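// (This source-over form assumes premultiplied ARGB32, which is the format
// qt_blend_argb32_on_argb32_ssse3 below operates on, so the source term needs no further
// multiplication by alpha.)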
static inline void Q_DECL_VECTORCALL
BLEND_SOURCE_OVER_ARGB32_SSSE3(quint32 *dst, const quint32 *src, int length,
                               __m128i nullVector, __m128i half, __m128i one, __m128i colorMask, __m128i alphaMask)
{
    int x = 0;

    /* First, get dst aligned. */
    ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) {
        blend_pixel(dst[x], src[x]);
    }

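    // Number of whole pixels (0-3) by which &src[x] sits past the previous 16-byte boundary;
    // 0 means src is 16-byte aligned now that dst is.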
    const int minusOffsetToAlignSrcOn16Bytes = (reinterpret_cast<quintptr>(&(src[x])) >> 2) & 0x3;

    if (!minusOffsetToAlignSrcOn16Bytes) {
        /* src is aligned, usual algorithm but with aligned operations.
           See the SSE2 version for more documentation on the algorithm itself. */
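        // alphaShuffleMask copies each pixel's alpha byte into the low byte of both 16-bit
        // lanes of that pixel (the 0xff entries zero the high bytes), ready for the 16-bit
        // multiplies done by BYTE_MUL_SSE2.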
        const __m128i alphaShuffleMask = _mm_set_epi8(char(0xff),15,char(0xff),15,char(0xff),11,char(0xff),11,char(0xff),7,char(0xff),7,char(0xff),3,char(0xff),3);
        for (; x < length-3; x += 4) {
            const __m128i srcVector = _mm_load_si128((const __m128i *)&src[x]);
            const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask);
            if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) {
                _mm_store_si128((__m128i *)&dst[x], srcVector);
            } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) != 0xffff) {
                __m128i alphaChannel = _mm_shuffle_epi8(srcVector, alphaShuffleMask);
                alphaChannel = _mm_sub_epi16(one, alphaChannel);
                const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]);
                __m128i destMultipliedByOneMinusAlpha;
                BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half);
                const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha);
                _mm_store_si128((__m128i *)&dst[x], result);
            }
        } /* end for() */
    } else if ((length - x) >= 8) {
        /* We use two vectors to extract the src: prevLoaded for the first pixels, lastLoaded for the current pixels. */
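        // The first load rounds &src[x] down to a 16-byte boundary: it may read up to 12 bytes
        // before src[x], but never crosses below that aligned block, and palignr discards the
        // leading bytes that do not belong to the span.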
        __m128i srcVectorPrevLoaded = _mm_load_si128((const __m128i *)&src[x - minusOffsetToAlignSrcOn16Bytes]);
        const int palignrOffset = minusOffsetToAlignSrcOn16Bytes << 2;
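        // palignr takes a byte offset, so the 1-3 pixel misalignment becomes 4, 8 or 12 bytes.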

        const __m128i alphaShuffleMask = _mm_set_epi8(char(0xff),15,char(0xff),15,char(0xff),11,char(0xff),11,char(0xff),7,char(0xff),7,char(0xff),3,char(0xff),3);
        switch (palignrOffset) {
        case 4:
            BLENDING_LOOP(4, length)
            break;
        case 8:
            BLENDING_LOOP(8, length)
            break;
        case 12:
            BLENDING_LOOP(12, length)
            break;
        }
    }
    for (; x < length; ++x)
        blend_pixel(dst[x], src[x]);
}

void qt_blend_argb32_on_argb32_ssse3(uchar *destPixels, int dbpl,
                                     const uchar *srcPixels, int sbpl,
                                     int w, int h,
                                     int const_alpha)
{
    const quint32 *src = (const quint32 *) srcPixels;
    quint32 *dst = (quint32 *) destPixels;
    if (const_alpha == 256) {
        const __m128i alphaMask = _mm_set1_epi32(0xff000000);
        const __m128i nullVector = _mm_setzero_si128();
        const __m128i half = _mm_set1_epi16(0x80);
        const __m128i one = _mm_set1_epi16(0xff);
        const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);

        for (int y = 0; y < h; ++y) {
            BLEND_SOURCE_OVER_ARGB32_SSSE3(dst, src, w, nullVector, half, one, colorMask, alphaMask);
            dst = (quint32 *)(((uchar *) dst) + dbpl);
            src = (const quint32 *)(((const uchar *) src) + sbpl);
        }
    } else if (const_alpha != 0) {
        // dest = (s + d * sia) * ca + d * cia
        //      = s * ca + d * (sia * ca + cia)
        //      = s * ca + d * (1 - sa*ca)
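        // const_alpha arrives in the 0..256 range (256 = fully opaque, handled above);
        // rescale it to 0..255 for the byte-wise multiplies in the SSE2 blending macro.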
        const_alpha = (const_alpha * 255) >> 8;
        const __m128i nullVector = _mm_setzero_si128();
        const __m128i half = _mm_set1_epi16(0x80);
        const __m128i one = _mm_set1_epi16(0xff);
        const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
        const __m128i constAlphaVector = _mm_set1_epi16(const_alpha);
        for (int y = 0; y < h; ++y) {
            BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2(dst, src, w, nullVector, half, one, colorMask, constAlphaVector)
            dst = (quint32 *)(((uchar *) dst) + dbpl);
            src = (const quint32 *)(((const uchar *) src) + sbpl);
        }
    }
}

const uint *QT_FASTCALL fetchPixelsBPP24_ssse3(uint *buffer, const uchar *src, int index, int count)
{
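    // Widen packed 24-bit pixels to 32-bit values; quint24's implicit conversion to uint
    // does the per-pixel byte assembly.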
    const quint24 *s = reinterpret_cast<const quint24 *>(src);
    for (int i = 0; i < count; ++i)
        buffer[i] = s[index + i];
    return buffer;
}

extern void QT_FASTCALL qt_convert_rgb888_to_rgb32_ssse3(quint32 *dst, const uchar *src, int len);

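// Fetch an untransformed span of RGB888 texels: point at the scanline and let the SSSE3
// converter expand each 3-byte texel to an opaque 32-bit RGB32 value.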
const uint * QT_FASTCALL qt_fetchUntransformed_888_ssse3(uint *buffer, const Operator *, const QSpanData *data,
                                                         int y, int x, int length)
{
    const uchar *line = data->texture.scanLine(y) + x * 3;
    qt_convert_rgb888_to_rgb32_ssse3(buffer, line, length);
    return buffer;
}

void qt_memfill24_ssse3(quint24 *dest, quint24 color, qsizetype count)
{
    // LCM of 12 and 16 bytes is 48 bytes (16 px)
    quint32 v = color;
    __m128i m = _mm_cvtsi32_si128(v);
    quint24 *end = dest + count;

    constexpr uchar x = 2, y = 1, z = 0;
    alignas(__m128i) static const uchar
    shuffleMask[16 + 1] = { x, y, z, x, y, z, x, y, z, x, y, z, x, y, z, x, y };

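    // mval1, mval2 and mval3 hold bytes 0-15, 16-31 and 32-47 of the repeating 3-byte pattern;
    // together they cover exactly 16 pixels (48 bytes). Since the pattern repeats every 3 bytes,
    // bytes 32-47 equal bytes 2-17, which is what the palignr by 2 over mval1/mval2 produces.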
    __m128i mval1 = _mm_shuffle_epi8(m, _mm_load_si128(reinterpret_cast<const __m128i *>(shuffleMask)));
    __m128i mval2 = _mm_shuffle_epi8(m, _mm_loadu_si128(reinterpret_cast<const __m128i *>(shuffleMask + 1)));
    __m128i mval3 = _mm_alignr_epi8(mval2, mval1, 2);

    for ( ; dest + 16 <= end; dest += 16) {
#ifdef __AVX__
        // Store using 32-byte AVX instruction
        __m256 mval12 = _mm256_castps128_ps256(_mm_castsi128_ps(mval1));
        mval12 = _mm256_insertf128_ps(mval12, _mm_castsi128_ps(mval2), 1);
        _mm256_storeu_ps(reinterpret_cast<float *>(dest), mval12);
#else
        _mm_storeu_si128(reinterpret_cast<__m128i *>(dest) + 0, mval1);
        _mm_storeu_si128(reinterpret_cast<__m128i *>(dest) + 1, mval2);
#endif
        _mm_storeu_si128(reinterpret_cast<__m128i *>(dest) + 2, mval3);
    }

    if (count < 3) {
        if (count > 1)
            end[-2] = v;
        if (count)
            end[-1] = v;
        return;
    }

    // less than 16px/48B left
    uchar *ptr = reinterpret_cast<uchar *>(dest);
    uchar *ptr_end = reinterpret_cast<uchar *>(end);
    qptrdiff left = ptr_end - ptr;
    if (left >= 24) {
        // 8px/24B or more left
        _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr) + 0, mval1);
        _mm_storel_epi64(reinterpret_cast<__m128i *>(ptr) + 1, mval2);
        ptr += 24;
        left -= 24;
    }

    // less than 8px/24B left

    if (left >= 16) {
        // but more than 5px/15B left
        _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr), mval1);
    } else if (left >= 8) {
        // but more than 2px/6B left
        _mm_storel_epi64(reinterpret_cast<__m128i *>(ptr), mval1);
    }

    if (left) {
        // 1 or 2px left
        // store 8 bytes ending with the right values (will overwrite a bit)
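        // Safe and correct: count >= 3 here, so ptr_end - 8 does not underrun dest, and since
        // 3 * count - 8 and 16 (the pattern byte mval2 starts at) are both congruent to 1 mod 3,
        // the stored qword lands in phase with the 3-byte pattern.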
        _mm_storel_epi64(reinterpret_cast<__m128i *>(ptr_end - 8), mval2);
    }
}

void QT_FASTCALL rbSwap_888_ssse3(uchar *dst, const uchar *src, int count)
{
    int i = 0;

    const static __m128i shuffleMask1 = _mm_setr_epi8(2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, /*!!*/15);
    const static __m128i shuffleMask2 = _mm_setr_epi8(0, /*!!*/1, 4, 3, 2, 7, 6, 5, 10, 9, 8, 13, 12, 11, /*!!*/14, 15);
    const static __m128i shuffleMask3 = _mm_setr_epi8(/*!!*/0, 3, 2, 1, 6, 5, 4, 9, 8, 7, 12, 11, 10, 15, 14, 13);
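    // The /*!!*/ bytes belong to the two RGB triplets that straddle the 16-byte loads
    // (bytes 15-17 and 30-32 of each 48-byte group); the shuffles leave them in place and the
    // scalar std::swap calls after the stores put their R and B bytes right.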

    for (; i + 15 < count; i += 16) {
        __m128i s1 = _mm_loadu_si128((const __m128i *)src);
        __m128i s2 = _mm_loadu_si128((const __m128i *)(src + 16));
        __m128i s3 = _mm_loadu_si128((const __m128i *)(src + 32));
        s1 = _mm_shuffle_epi8(s1, shuffleMask1);
        s2 = _mm_shuffle_epi8(s2, shuffleMask2);
        s3 = _mm_shuffle_epi8(s3, shuffleMask3);
        _mm_storeu_si128((__m128i *)dst, s1);
        _mm_storeu_si128((__m128i *)(dst + 16), s2);
        _mm_storeu_si128((__m128i *)(dst + 32), s3);

        // Now fix the four misplaced bytes left by the straddling triplets
        std::swap(dst[15], dst[17]);
        std::swap(dst[30], dst[32]);

        src += 48;
        dst += 48;
    }

    if (src != dst) {
        SIMD_EPILOGUE(i, count, 15) {
            dst[0] = src[2];
            dst[1] = src[1];
            dst[2] = src[0];
            dst += 3;
            src += 3;
        }
    } else {
        SIMD_EPILOGUE(i, count, 15) {
            std::swap(dst[0], dst[2]);
            dst += 3;
        }
    }
}

QT_END_NAMESPACE

#endif // QT_COMPILER_SUPPORTS_SSSE3