qdrawhelper_ssse3.cpp source code [qtbase/src/gui/painting/qdrawhelper_ssse3.cpp]

1	/****************************************************************************
2	**
3	** Copyright (C) 2018 The Qt Company Ltd.
4	** Copyright (C) 2018 Intel Corporation.
5	** Contact: https://www.qt.io/licensing/
6	**
7	** This file is part of the QtGui module of the Qt Toolkit.
8	**
9	** $QT_BEGIN_LICENSE:LGPL$
10	** Commercial License Usage
11	** Licensees holding valid commercial Qt licenses may use this file in
12	** accordance with the commercial license agreement provided with the
13	** Software or, alternatively, in accordance with the terms contained in
14	** a written agreement between you and The Qt Company. For licensing terms
15	** and conditions see https://www.qt.io/terms-conditions. For further
16	** information use the contact form at https://www.qt.io/contact-us.
17	**
18	** GNU Lesser General Public License Usage
19	** Alternatively, this file may be used under the terms of the GNU Lesser
20	** General Public License version 3 as published by the Free Software
21	** Foundation and appearing in the file LICENSE.LGPL3 included in the
22	** packaging of this file. Please review the following information to
23	** ensure the GNU Lesser General Public License version 3 requirements
24	** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
25	**
26	** GNU General Public License Usage
27	** Alternatively, this file may be used under the terms of the GNU
28	** General Public License version 2.0 or (at your option) the GNU General
29	** Public license version 3 or any later version approved by the KDE Free
30	** Qt Foundation. The licenses are as published by the Free Software
31	** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
32	** included in the packaging of this file. Please review the following
33	** information to ensure the GNU General Public License requirements will
34	** be met: https://www.gnu.org/licenses/gpl-2.0.html and
35	** https://www.gnu.org/licenses/gpl-3.0.html.
36	**
37	** $QT_END_LICENSE$
38	**
39	****************************************************************************/
40
41	#include <private/qdrawhelper_x86_p.h>
42
43	#if defined(QT_COMPILER_SUPPORTS_SSSE3)
44
45	#include <private/qdrawingprimitive_sse2_p.h>
46
47	QT_BEGIN_NAMESPACE
48
/* Inner loop of the SSSE3 source-over blend for the case where dst is 16-byte aligned but src
   is not. Each iteration handles four pixels: it performs one aligned load of the next 16 bytes
   of src and stitches the wanted four pixels together from the previously loaded vector and the
   new one with palignr.

   The macro expects the following names to be in scope at the expansion site:
   x, src, dst, minusOffsetToAlignSrcOn16Bytes, srcVectorPrevLoaded (mutable), alphaMask,
   nullVector, alphaShuffleMask, one, colorMask and half.

   The loop bound (length - 7) guarantees that the aligned load of
   src[x - minusOffsetToAlignSrcOn16Bytes + 4 .. + 7] never reads past src[length - 1].

   The instruction palignr uses direct arguments, so we have to generate the code for the different
   shift (4, 8, 12). Checking the alignment inside the loop is unfortunately way too slow.
 */
#define BLENDING_LOOP(palignrOffset, length)\
    for (; x-minusOffsetToAlignSrcOn16Bytes < length-7; x += 4) { \
        const __m128i srcVectorLastLoaded = _mm_load_si128((const __m128i *)&src[x - minusOffsetToAlignSrcOn16Bytes + 4]);\
        const __m128i srcVector = _mm_alignr_epi8(srcVectorLastLoaded, srcVectorPrevLoaded, palignrOffset); \
        const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask); \
        if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) { \
            _mm_store_si128((__m128i *)&dst[x], srcVector); \
        } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) != 0xffff) { \
            __m128i alphaChannel = _mm_shuffle_epi8(srcVector, alphaShuffleMask); \
            alphaChannel = _mm_sub_epi16(one, alphaChannel); \
            const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]); \
            __m128i destMultipliedByOneMinusAlpha; \
            BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half); \
            const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha); \
            _mm_store_si128((__m128i *)&dst[x], result); \
        } \
        srcVectorPrevLoaded = srcVectorLastLoaded;\
    }
70
71
72	// Basically blend src over dst with the const alpha defined as constAlphaVector.
73	// nullVector, half, one, colorMask are constant across the whole image/texture, and should be defined as:
74	//const __m128i nullVector = _mm_set1_epi32(0);
75	//const __m128i half = _mm_set1_epi16(0x80);
76	//const __m128i one = _mm_set1_epi16(0xff);
77	//const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
78	//const __m128i alphaMask = _mm_set1_epi32(0xff000000);
79	//
80	// The computation being done is:
81	// result = s + d (1-alpha)*
82	// with shortcuts if fully opaque or fully transparent.
83	static inline void Q_DECL_VECTORCALL
84	BLEND_SOURCE_OVER_ARGB32_SSSE3(quint32 dst, const* quint32 src, int* length,
85	__m128i nullVector, __m128i half, __m128i one, __m128i colorMask, __m128i alphaMask)
86	{
87	int x = `0`;
88
89	/ First, get dst aligned. /
90	ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) {
91	blend_pixel(dst&: dst[x], src: src[x]);
92	}
93
94	const int minusOffsetToAlignSrcOn16Bytes = (reinterpret_cast<quintptr>(&(src[x])) >> `2`) & `0x3`;
95
96	if (!minusOffsetToAlignSrcOn16Bytes) {
97	/ src is aligned, usual algorithm but with aligned operations.*
98	See the SSE2 version for more documentation on the algorithm itself. /*
99	const __m128i alphaShuffleMask = _mm_set_epi8(b15: char(`0xff`),b14: `15`,b13: char(`0xff`),b12: `15`,b11: char(`0xff`),b10: `11`,b9: char(`0xff`),b8: `11`,b7: char(`0xff`),b6: `7`,b5: char(`0xff`),b4: `7`,b3: char(`0xff`),b2: `3`,b1: char(`0xff`),b0: `3`);
100	for (; x < length-`3`; x += `4`) {
101	const __m128i srcVector = _mm_load_si128(p: (const __m128i *)&src[x]);
102	const __m128i srcVectorAlpha = _mm_and_si128(a: srcVector, b: alphaMask);
103	if (_mm_movemask_epi8(a: _mm_cmpeq_epi32(a: srcVectorAlpha, b: alphaMask)) == `0xffff`) {
104	_mm_store_si128(p: (__m128i *)&dst[x], b: srcVector);
105	} else if (_mm_movemask_epi8(a: _mm_cmpeq_epi32(a: srcVectorAlpha, b: nullVector)) != `0xffff`) {
106	__m128i alphaChannel = _mm_shuffle_epi8(a: srcVector, b: alphaShuffleMask);
107	alphaChannel = _mm_sub_epi16(a: one, b: alphaChannel);
108	const __m128i dstVector = _mm_load_si128(p: (__m128i *)&dst[x]);
109	__m128i destMultipliedByOneMinusAlpha;
110	BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half);
111	const __m128i result = _mm_add_epi8(a: srcVector, b: destMultipliedByOneMinusAlpha);
112	_mm_store_si128(p: (__m128i *)&dst[x], b: result);
113	}
114	} / end for() /
115	} else if ((length - x) >= `8`) {
116	/ We use two vectors to extract the src: prevLoaded for the first pixels, lastLoaded for the current pixels. /
117	__m128i srcVectorPrevLoaded = _mm_load_si128(p: (const __m128i *)&src[x - minusOffsetToAlignSrcOn16Bytes]);
118	const int palignrOffset = minusOffsetToAlignSrcOn16Bytes << `2`;
119
120	const __m128i alphaShuffleMask = _mm_set_epi8(b15: char(`0xff`),b14: `15`,b13: char(`0xff`),b12: `15`,b11: char(`0xff`),b10: `11`,b9: char(`0xff`),b8: `11`,b7: char(`0xff`),b6: `7`,b5: char(`0xff`),b4: `7`,b3: char(`0xff`),b2: `3`,b1: char(`0xff`),b0: `3`);
121	switch (palignrOffset) {
122	case `4`:
123	BLENDING_LOOP(`4`, length)
124	break;
125	case `8`:
126	BLENDING_LOOP(`8`, length)
127	break;
128	case `12`:
129	BLENDING_LOOP(`12`, length)
130	break;
131	}
132	}
133	for (; x < length; ++x)
134	blend_pixel(dst&: dst[x], src: src[x]);
135	}
136
137	void qt_blend_argb32_on_argb32_ssse3(uchar destPixels, int* dbpl,
138	const uchar srcPixels, int* sbpl,
139	int w, int h,
140	int const_alpha)
141	{
142	const quint32 src = (const* quint32 *) srcPixels;
143	quint32 dst = (quint32 ) destPixels;
144	if (const_alpha == `256`) {
145	const __m128i alphaMask = _mm_set1_epi32(i: `0xff000000`);
146	const __m128i nullVector = _mm_setzero_si128();
147	const __m128i half = _mm_set1_epi16(w: `0x80`);
148	const __m128i one = _mm_set1_epi16(w: `0xff`);
149	const __m128i colorMask = _mm_set1_epi32(i: `0x00ff00ff`);
150
151	for (int y = `0`; y < h; ++y) {
152	BLEND_SOURCE_OVER_ARGB32_SSSE3(dst, src, length: w, nullVector, half, one, colorMask, alphaMask);
153	dst = (quint32 )(((uchar ) dst) + dbpl);
154	src = (const quint32 )(((const* uchar *) src) + sbpl);
155	}
156	} else if (const_alpha != `0`) {
157	// dest = (s + d sia) * ca + d * cia*
158	// = s ca + d * (sia * ca + cia)*
159	// = s ca + d * (1 - saca)
160	const_alpha = (const_alpha * `255`) >> `8`;
161	const __m128i nullVector = _mm_setzero_si128();
162	const __m128i half = _mm_set1_epi16(w: `0x80`);
163	const __m128i one = _mm_set1_epi16(w: `0xff`);
164	const __m128i colorMask = _mm_set1_epi32(i: `0x00ff00ff`);
165	const __m128i constAlphaVector = _mm_set1_epi16(w: const_alpha);
166	for (int y = `0`; y < h; ++y) {
167	BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2(dst, src, w, nullVector, half, one, colorMask, constAlphaVector)
168	dst = (quint32 )(((uchar ) dst) + dbpl);
169	src = (const quint32 )(((const* uchar *) src) + sbpl);
170	}
171	}
172	}
173
174	const uint QT_FASTCALL fetchPixelsBPP24_ssse3(uint buffer, const uchar src, int* index, int count)
175	{
176	const quint24 s = reinterpret_cast<const* quint24 *>(src);
177	for (int i = `0`; i < count; ++i)
178	buffer[i] = s[index + i];
179	return buffer;
180	}
181
182	extern void QT_FASTCALL qt_convert_rgb888_to_rgb32_ssse3(quint32 dst, const* uchar src, int* len);
183
184	const uint * QT_FASTCALL qt_fetchUntransformed_888_ssse3(uint buffer, const* Operator , const* QSpanData *data,
185	int y, int x, int length)
186	{
187	const uchar line = data->texture.scanLine(y) + x `3`;
188	qt_convert_rgb888_to_rgb32_ssse3(dst: buffer, src: line, len: length);
189	return buffer;
190	}
191
192	void qt_memfill24_ssse3(quint24 *dest, quint24 color, qsizetype count)
193	{
194	// LCM of 12 and 16 bytes is 48 bytes (16 px)
195	quint32 v = color;
196	__m128i m = _mm_cvtsi32_si128(a: v);
197	quint24 *end = dest + count;
198
199	constexpr uchar x = `2`, y = `1`, z = `0`;
200	Q_DECL_ALIGN(__m128i) static const uchar
201	shuffleMask[`16` + `1`] = { x, y, z, x, y, z, x, y, z, x, y, z, x, y, z, x, y };
202
203	__m128i mval1 = _mm_shuffle_epi8(a: m, b: _mm_load_si128(p: reinterpret_cast<const __m128i *>(shuffleMask)));
204	__m128i mval2 = _mm_shuffle_epi8(a: m, b: _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(shuffleMask + `1`)));
205	__m128i mval3 = _mm_alignr_epi8(mval2, mval1, `2`);
206
207	for ( ; dest + `16` <= end; dest += `16`) {
208	#ifdef __AVX__
209	// Store using 32-byte AVX instruction
210	__m256 mval12 = _mm256_castps128_ps256(_mm_castsi128_ps(mval1));
211	mval12 = _mm256_insertf128_ps(mval12, _mm_castsi128_ps(mval2), `1`);
212	_mm256_storeu_ps(reinterpret_cast<float *>(dest), mval12);
213	#else
214	_mm_storeu_si128(p: reinterpret_cast<__m128i *>(dest) + `0`, b: mval1);
215	_mm_storeu_si128(p: reinterpret_cast<__m128i *>(dest) + `1`, b: mval2);
216	#endif
217	_mm_storeu_si128(p: reinterpret_cast<__m128i *>(dest) + `2`, b: mval3);
218	}
219
220	if (count < `3`) {
221	if (count > `1`)
222	end[-`2`] = v;
223	if (count)
224	end[-`1`] = v;
225	return;
226	}
227
228	// less than 16px/48B left
229	uchar ptr = reinterpret_cast<uchar >(dest);
230	uchar ptr_end = reinterpret_cast<uchar >(end);
231	qptrdiff left = ptr_end - ptr;
232	if (left >= `24`) {
233	// 8px/24B or more left
234	_mm_storeu_si128(p: reinterpret_cast<__m128i *>(ptr) + `0`, b: mval1);
235	_mm_storel_epi64(p: reinterpret_cast<__m128i *>(ptr) + `1`, a: mval2);
236	ptr += `24`;
237	left -= `24`;
238	}
239
240	// less than 8px/24B left
241
242	if (left >= `16`) {
243	// but more than 5px/15B left
244	_mm_storeu_si128(p: reinterpret_cast<__m128i *>(ptr) , b: mval1);
245	} else if (left >= `8`) {
246	// but more than 2px/6B left
247	_mm_storel_epi64(p: reinterpret_cast<__m128i *>(ptr), a: mval1);
248	}
249
250	if (left) {
251	// 1 or 2px left
252	// store 8 bytes ending with the right values (will overwrite a bit)
253	_mm_storel_epi64(p: reinterpret_cast<__m128i *>(ptr_end - `8`), a: mval2);
254	}
255	}
256
257	void QT_FASTCALL rbSwap_888_ssse3(uchar dst, const* uchar src, int* count)
258	{
259	int i = `0`;
260
261	const static __m128i shuffleMask1 = _mm_setr_epi8(b0: `2`, b1: `1`, b2: `0`, b3: `5`, b4: `4`, b5: `3`, b6: `8`, b7: `7`, b8: `6`, b9: `11`, b10: `10`, b11: `9`, b12: `14`, b13: `13`, b14: `12`, /!!/b15: `15`);
262	const static __m128i shuffleMask2 = _mm_setr_epi8(b0: `0`, /!!/b1: `1`, b2: `4`, b3: `3`, b4: `2`, b5: `7`, b6: `6`, b7: `5`, b8: `10`, b9: `9`, b10: `8`, b11: `13`, b12: `12`, b13: `11`, /!!/b14: `14`, b15: `15`);
263	const static __m128i shuffleMask3 = _mm_setr_epi8(/!!/b0: `0`, b1: `3`, b2: `2`, b3: `1`, b4: `6`, b5: `5`, b6: `4`, b7: `9`, b8: `8`, b9: `7`, b10: `12`, b11: `11`, b12: `10`, b13: `15`, b14: `14`, b15: `13`);
264
265	for (; i + `15` < count; i += `16`) {
266	__m128i s1 = _mm_loadu_si128(p: (const __m128i *)src);
267	__m128i s2 = _mm_loadu_si128(p: (const __m128i *)(src + `16`));
268	__m128i s3 = _mm_loadu_si128(p: (const __m128i *)(src + `32`));
269	s1 = _mm_shuffle_epi8(a: s1, b: shuffleMask1);
270	s2 = _mm_shuffle_epi8(a: s2, b: shuffleMask2);
271	s3 = _mm_shuffle_epi8(a: s3, b: shuffleMask3);
272	_mm_storeu_si128(p: (__m128i *)dst, b: s1);
273	_mm_storeu_si128(p: (__m128i *)(dst + `16`), b: s2);
274	_mm_storeu_si128(p: (__m128i *)(dst + `32`), b: s3);
275
276	// Now fix the last four misplaced values
277	std::swap(a&: dst[`15`], b&: dst[`17`]);
278	std::swap(a&: dst[`30`], b&: dst[`32`]);
279
280	src += `48`;
281	dst += `48`;
282	}
283
284	if (src != dst) {
285	SIMD_EPILOGUE(i, count, `15`) {
286	dst[`0`] = src[`2`];
287	dst[`1`] = src[`1`];
288	dst[`2`] = src[`0`];
289	dst += `3`;
290	src += `3`;
291	}
292	} else {
293	SIMD_EPILOGUE(i, count, `15`) {
294	std::swap(a&: dst[`0`], b&: dst[`2`]);
295	dst += `3`;
296	}
297	}
298	}
299
300	QT_END_NAMESPACE
301
302	#endif // QT_COMPILER_SUPPORTS_SSSE3
303

source code of qtbase/src/gui/painting/qdrawhelper_ssse3.cpp