qdrawhelper_avx2.cpp source code [qtbase/src/gui/painting/qdrawhelper_avx2.cpp]

1	/****************************************************************************
2	**
3	** Copyright (C) 2018 The Qt Company Ltd.
4	** Copyright (C) 2018 Intel Corporation.
5	** Contact: https://www.qt.io/licensing/
6	**
7	** This file is part of the QtGui module of the Qt Toolkit.
8	**
9	** $QT_BEGIN_LICENSE:LGPL$
10	** Commercial License Usage
11	** Licensees holding valid commercial Qt licenses may use this file in
12	** accordance with the commercial license agreement provided with the
13	** Software or, alternatively, in accordance with the terms contained in
14	** a written agreement between you and The Qt Company. For licensing terms
15	** and conditions see https://www.qt.io/terms-conditions. For further
16	** information use the contact form at https://www.qt.io/contact-us.
17	**
18	** GNU Lesser General Public License Usage
19	** Alternatively, this file may be used under the terms of the GNU Lesser
20	** General Public License version 3 as published by the Free Software
21	** Foundation and appearing in the file LICENSE.LGPL3 included in the
22	** packaging of this file. Please review the following information to
23	** ensure the GNU Lesser General Public License version 3 requirements
24	** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
25	**
26	** GNU General Public License Usage
27	** Alternatively, this file may be used under the terms of the GNU
28	** General Public License version 2.0 or (at your option) the GNU General
29	** Public license version 3 or any later version approved by the KDE Free
30	** Qt Foundation. The licenses are as published by the Free Software
31	** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
32	** included in the packaging of this file. Please review the following
33	** information to ensure the GNU General Public License requirements will
34	** be met: https://www.gnu.org/licenses/gpl-2.0.html and
35	** https://www.gnu.org/licenses/gpl-3.0.html.
36	**
37	** $QT_END_LICENSE$
38	**
39	****************************************************************************/
40
41	#include "qdrawhelper_p.h"
42	#include "qdrawhelper_x86_p.h"
43	#include "qdrawingprimitive_sse2_p.h"
44	#include "qrgba64_p.h"
45
46	#if defined(QT_COMPILER_SUPPORTS_AVX2)
47
48	QT_BEGIN_NAMESPACE
49
// Fixed-point constants. FixedScale is the divisor applied to products with
// the fixed-point step (fdx) further down in this file; HalfPoint is half
// of FixedScale.
enum {
    FixedScale = 1 << 16,
    HalfPoint = 1 << 15
};
54
55	// Vectorized blend functions:
56
57	// See BYTE_MUL_SSE2 for details.
58	inline static void Q_DECL_VECTORCALL
59	BYTE_MUL_AVX2(__m256i &pixelVector, __m256i alphaChannel, __m256i colorMask, __m256i half)
60	{
61	__m256i pixelVectorAG = _mm256_srli_epi16(a: pixelVector, count: `8`);
62	__m256i pixelVectorRB = _mm256_and_si256(a: pixelVector, b: colorMask);
63
64	pixelVectorAG = _mm256_mullo_epi16(a: pixelVectorAG, b: alphaChannel);
65	pixelVectorRB = _mm256_mullo_epi16(a: pixelVectorRB, b: alphaChannel);
66
67	pixelVectorRB = _mm256_add_epi16(a: pixelVectorRB, b: _mm256_srli_epi16(a: pixelVectorRB, count: `8`));
68	pixelVectorAG = _mm256_add_epi16(a: pixelVectorAG, b: _mm256_srli_epi16(a: pixelVectorAG, count: `8`));
69	pixelVectorRB = _mm256_add_epi16(a: pixelVectorRB, b: half);
70	pixelVectorAG = _mm256_add_epi16(a: pixelVectorAG, b: half);
71
72	pixelVectorRB = _mm256_srli_epi16(a: pixelVectorRB, count: `8`);
73	pixelVectorAG = _mm256_andnot_si256(a: colorMask, b: pixelVectorAG);
74
75	pixelVector = _mm256_or_si256(a: pixelVectorAG, b: pixelVectorRB);
76	}
77
78	inline static void Q_DECL_VECTORCALL
79	BYTE_MUL_RGB64_AVX2(__m256i &pixelVector, __m256i alphaChannel, __m256i colorMask, __m256i half)
80	{
81	__m256i pixelVectorAG = _mm256_srli_epi32(a: pixelVector, count: `16`);
82	__m256i pixelVectorRB = _mm256_and_si256(a: pixelVector, b: colorMask);
83
84	pixelVectorAG = _mm256_mullo_epi32(a: pixelVectorAG, b: alphaChannel);
85	pixelVectorRB = _mm256_mullo_epi32(a: pixelVectorRB, b: alphaChannel);
86
87	pixelVectorRB = _mm256_add_epi32(a: pixelVectorRB, b: _mm256_srli_epi32(a: pixelVectorRB, count: `16`));
88	pixelVectorAG = _mm256_add_epi32(a: pixelVectorAG, b: _mm256_srli_epi32(a: pixelVectorAG, count: `16`));
89	pixelVectorRB = _mm256_add_epi32(a: pixelVectorRB, b: half);
90	pixelVectorAG = _mm256_add_epi32(a: pixelVectorAG, b: half);
91
92	pixelVectorRB = _mm256_srli_epi32(a: pixelVectorRB, count: `16`);
93	pixelVectorAG = _mm256_andnot_si256(a: colorMask, b: pixelVectorAG);
94
95	pixelVector = _mm256_or_si256(a: pixelVectorAG, b: pixelVectorRB);
96	}
97
98	// See INTERPOLATE_PIXEL_255_SSE2 for details.
99	inline static void Q_DECL_VECTORCALL
100	INTERPOLATE_PIXEL_255_AVX2(__m256i srcVector, __m256i &dstVector, __m256i alphaChannel, __m256i oneMinusAlphaChannel, __m256i colorMask, __m256i half)
101	{
102	const __m256i srcVectorAG = _mm256_srli_epi16(a: srcVector, count: `8`);
103	const __m256i dstVectorAG = _mm256_srli_epi16(a: dstVector, count: `8`);
104	const __m256i srcVectorRB = _mm256_and_si256(a: srcVector, b: colorMask);
105	const __m256i dstVectorRB = _mm256_and_si256(a: dstVector, b: colorMask);
106	const __m256i srcVectorAGalpha = _mm256_mullo_epi16(a: srcVectorAG, b: alphaChannel);
107	const __m256i srcVectorRBalpha = _mm256_mullo_epi16(a: srcVectorRB, b: alphaChannel);
108	const __m256i dstVectorAGoneMinusAlpha = _mm256_mullo_epi16(a: dstVectorAG, b: oneMinusAlphaChannel);
109	const __m256i dstVectorRBoneMinusAlpha = _mm256_mullo_epi16(a: dstVectorRB, b: oneMinusAlphaChannel);
110	__m256i finalAG = _mm256_add_epi16(a: srcVectorAGalpha, b: dstVectorAGoneMinusAlpha);
111	__m256i finalRB = _mm256_add_epi16(a: srcVectorRBalpha, b: dstVectorRBoneMinusAlpha);
112	finalAG = _mm256_add_epi16(a: finalAG, b: _mm256_srli_epi16(a: finalAG, count: `8`));
113	finalRB = _mm256_add_epi16(a: finalRB, b: _mm256_srli_epi16(a: finalRB, count: `8`));
114	finalAG = _mm256_add_epi16(a: finalAG, b: half);
115	finalRB = _mm256_add_epi16(a: finalRB, b: half);
116	finalAG = _mm256_andnot_si256(a: colorMask, b: finalAG);
117	finalRB = _mm256_srli_epi16(a: finalRB, count: `8`);
118
119	dstVector = _mm256_or_si256(a: finalAG, b: finalRB);
120	}
121
122	inline static void Q_DECL_VECTORCALL
123	INTERPOLATE_PIXEL_RGB64_AVX2(__m256i srcVector, __m256i &dstVector, __m256i alphaChannel, __m256i oneMinusAlphaChannel, __m256i colorMask, __m256i half)
124	{
125	const __m256i srcVectorAG = _mm256_srli_epi32(a: srcVector, count: `16`);
126	const __m256i dstVectorAG = _mm256_srli_epi32(a: dstVector, count: `16`);
127	const __m256i srcVectorRB = _mm256_and_si256(a: srcVector, b: colorMask);
128	const __m256i dstVectorRB = _mm256_and_si256(a: dstVector, b: colorMask);
129	const __m256i srcVectorAGalpha = _mm256_mullo_epi32(a: srcVectorAG, b: alphaChannel);
130	const __m256i srcVectorRBalpha = _mm256_mullo_epi32(a: srcVectorRB, b: alphaChannel);
131	const __m256i dstVectorAGoneMinusAlpha = _mm256_mullo_epi32(a: dstVectorAG, b: oneMinusAlphaChannel);
132	const __m256i dstVectorRBoneMinusAlpha = _mm256_mullo_epi32(a: dstVectorRB, b: oneMinusAlphaChannel);
133	__m256i finalAG = _mm256_add_epi32(a: srcVectorAGalpha, b: dstVectorAGoneMinusAlpha);
134	__m256i finalRB = _mm256_add_epi32(a: srcVectorRBalpha, b: dstVectorRBoneMinusAlpha);
135	finalAG = _mm256_add_epi32(a: finalAG, b: _mm256_srli_epi32(a: finalAG, count: `16`));
136	finalRB = _mm256_add_epi32(a: finalRB, b: _mm256_srli_epi32(a: finalRB, count: `16`));
137	finalAG = _mm256_add_epi32(a: finalAG, b: half);
138	finalRB = _mm256_add_epi32(a: finalRB, b: half);
139	finalAG = _mm256_andnot_si256(a: colorMask, b: finalAG);
140	finalRB = _mm256_srli_epi32(a: finalRB, count: `16`);
141
142	dstVector = _mm256_or_si256(a: finalAG, b: finalRB);
143	}
144
145
146	// See BLEND_SOURCE_OVER_ARGB32_SSE2 for details.
147	inline static void Q_DECL_VECTORCALL BLEND_SOURCE_OVER_ARGB32_AVX2(quint32 dst, const* quint32 src, const* int length)
148	{
149	const __m256i half = _mm256_set1_epi16(w: `0x80`);
150	const __m256i one = _mm256_set1_epi16(w: `0xff`);
151	const __m256i colorMask = _mm256_set1_epi32(i: `0x00ff00ff`);
152	const __m256i alphaMask = _mm256_set1_epi32(i: `0xff000000`);
153	const __m256i offsetMask = _mm256_setr_epi32(i0: `0`, i1: `1`, i2: `2`, i3: `3`, i4: `4`, i5: `5`, i6: `6`, i7: `7`);
154	const __m256i alphaShuffleMask = _mm256_set_epi8(b31: char(`0xff`),b30: `15`,b29: char(`0xff`),b28: `15`,b27: char(`0xff`),b26: `11`,b25: char(`0xff`),b24: `11`,b23: char(`0xff`),b22: `7`,b21: char(`0xff`),b20: `7`,b19: char(`0xff`),b18: `3`,b17: char(`0xff`),b16: `3`,
155	b15: char(`0xff`),b14: `15`,b13: char(`0xff`),b12: `15`,b11: char(`0xff`),b10: `11`,b09: char(`0xff`),b08: `11`,b07: char(`0xff`),b06: `7`,b05: char(`0xff`),b04: `7`,b03: char(`0xff`),b02: `3`,b01: char(`0xff`),b00: `3`);
156
157	const int minusOffsetToAlignDstOn32Bytes = (reinterpret_cast<quintptr>(dst) >> `2`) & `0x7`;
158
159	int x = `0`;
160	// Prologue to handle all pixels until dst is 32-byte aligned in one step.
161	if (minusOffsetToAlignDstOn32Bytes != `0` && x < (length - `7`)) {
162	const __m256i prologueMask = _mm256_sub_epi32(a: _mm256_set1_epi32(i: minusOffsetToAlignDstOn32Bytes - `1`), b: offsetMask);
163	const __m256i srcVector = _mm256_maskload_epi32(X: (const int *)&src[x - minusOffsetToAlignDstOn32Bytes], M: prologueMask);
164	const __m256i prologueAlphaMask = _mm256_blendv_epi8(V1: _mm256_setzero_si256(), V2: alphaMask, M: prologueMask);
165	if (!_mm256_testz_si256(a: srcVector, b: prologueAlphaMask)) {
166	if (_mm256_testc_si256(a: srcVector, b: prologueAlphaMask)) {
167	_mm256_maskstore_epi32(X: (int *)&dst[x - minusOffsetToAlignDstOn32Bytes], M: prologueMask, Y: srcVector);
168	} else {
169	__m256i alphaChannel = _mm256_shuffle_epi8(a: srcVector, b: alphaShuffleMask);
170	alphaChannel = _mm256_sub_epi16(a: one, b: alphaChannel);
171	__m256i dstVector = _mm256_maskload_epi32(X: (int *)&dst[x - minusOffsetToAlignDstOn32Bytes], M: prologueMask);
172	BYTE_MUL_AVX2(pixelVector&: dstVector, alphaChannel, colorMask, half);
173	dstVector = _mm256_add_epi8(a: dstVector, b: srcVector);
174	_mm256_maskstore_epi32(X: (int *)&dst[x - minusOffsetToAlignDstOn32Bytes], M: prologueMask, Y: dstVector);
175	}
176	}
177	x += (`8` - minusOffsetToAlignDstOn32Bytes);
178	}
179
180	for (; x < (length - `7`); x += `8`) {
181	const __m256i srcVector = _mm256_lddqu_si256(p: (const __m256i *)&src[x]);
182	if (!_mm256_testz_si256(a: srcVector, b: alphaMask)) {
183	if (_mm256_testc_si256(a: srcVector, b: alphaMask)) {
184	_mm256_store_si256(p: (__m256i *)&dst[x], a: srcVector);
185	} else {
186	__m256i alphaChannel = _mm256_shuffle_epi8(a: srcVector, b: alphaShuffleMask);
187	alphaChannel = _mm256_sub_epi16(a: one, b: alphaChannel);
188	__m256i dstVector = _mm256_load_si256(p: (__m256i *)&dst[x]);
189	BYTE_MUL_AVX2(pixelVector&: dstVector, alphaChannel, colorMask, half);
190	dstVector = _mm256_add_epi8(a: dstVector, b: srcVector);
191	_mm256_store_si256(p: (__m256i *)&dst[x], a: dstVector);
192	}
193	}
194	}
195
196	// Epilogue to handle all remaining pixels in one step.
197	if (x < length) {
198	const __m256i epilogueMask = _mm256_add_epi32(a: offsetMask, b: _mm256_set1_epi32(i: x - length));
199	const __m256i srcVector = _mm256_maskload_epi32(X: (const int *)&src[x], M: epilogueMask);
200	const __m256i epilogueAlphaMask = _mm256_blendv_epi8(V1: _mm256_setzero_si256(), V2: alphaMask, M: epilogueMask);
201	if (!_mm256_testz_si256(a: srcVector, b: epilogueAlphaMask)) {
202	if (_mm256_testc_si256(a: srcVector, b: epilogueAlphaMask)) {
203	_mm256_maskstore_epi32(X: (int *)&dst[x], M: epilogueMask, Y: srcVector);
204	} else {
205	__m256i alphaChannel = _mm256_shuffle_epi8(a: srcVector, b: alphaShuffleMask);
206	alphaChannel = _mm256_sub_epi16(a: one, b: alphaChannel);
207	__m256i dstVector = _mm256_maskload_epi32(X: (int *)&dst[x], M: epilogueMask);
208	BYTE_MUL_AVX2(pixelVector&: dstVector, alphaChannel, colorMask, half);
209	dstVector = _mm256_add_epi8(a: dstVector, b: srcVector);
210	_mm256_maskstore_epi32(X: (int *)&dst[x], M: epilogueMask, Y: dstVector);
211	}
212	}
213	}
214	}
215
216
217	// See BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2 for details.
218	inline static void Q_DECL_VECTORCALL
219	BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_AVX2(quint32 dst, const* quint32 src, const* int length, const int const_alpha)
220	{
221	int x = `0`;
222
223	ALIGNMENT_PROLOGUE_32BYTES(dst, x, length)
224	blend_pixel(dst&: dst[x], src: src[x], const_alpha);
225
226	const __m256i half = _mm256_set1_epi16(w: `0x80`);
227	const __m256i one = _mm256_set1_epi16(w: `0xff`);
228	const __m256i colorMask = _mm256_set1_epi32(i: `0x00ff00ff`);
229	const __m256i alphaMask = _mm256_set1_epi32(i: `0xff000000`);
230	const __m256i alphaShuffleMask = _mm256_set_epi8(b31: char(`0xff`),b30: `15`,b29: char(`0xff`),b28: `15`,b27: char(`0xff`),b26: `11`,b25: char(`0xff`),b24: `11`,b23: char(`0xff`),b22: `7`,b21: char(`0xff`),b20: `7`,b19: char(`0xff`),b18: `3`,b17: char(`0xff`),b16: `3`,
231	b15: char(`0xff`),b14: `15`,b13: char(`0xff`),b12: `15`,b11: char(`0xff`),b10: `11`,b09: char(`0xff`),b08: `11`,b07: char(`0xff`),b06: `7`,b05: char(`0xff`),b04: `7`,b03: char(`0xff`),b02: `3`,b01: char(`0xff`),b00: `3`);
232	const __m256i constAlphaVector = _mm256_set1_epi16(w: const_alpha);
233	for (; x < (length - `7`); x += `8`) {
234	__m256i srcVector = _mm256_lddqu_si256(p: (const __m256i *)&src[x]);
235	if (!_mm256_testz_si256(a: srcVector, b: alphaMask)) {
236	BYTE_MUL_AVX2(pixelVector&: srcVector, alphaChannel: constAlphaVector, colorMask, half);
237
238	__m256i alphaChannel = _mm256_shuffle_epi8(a: srcVector, b: alphaShuffleMask);
239	alphaChannel = _mm256_sub_epi16(a: one, b: alphaChannel);
240	__m256i dstVector = _mm256_load_si256(p: (__m256i *)&dst[x]);
241	BYTE_MUL_AVX2(pixelVector&: dstVector, alphaChannel, colorMask, half);
242	dstVector = _mm256_add_epi8(a: dstVector, b: srcVector);
243	_mm256_store_si256(p: (__m256i *)&dst[x], a: dstVector);
244	}
245	}
246	SIMD_EPILOGUE(x, length, `7`)
247	blend_pixel(dst&: dst[x], src: src[x], const_alpha);
248	}
249
250	void qt_blend_argb32_on_argb32_avx2(uchar destPixels, int* dbpl,
251	const uchar srcPixels, int* sbpl,
252	int w, int h,
253	int const_alpha)
254	{
255	if (const_alpha == `256`) {
256	for (int y = `0`; y < h; ++y) {
257	const quint32 src = reinterpret_cast<const* quint32 *>(srcPixels);
258	quint32 dst = reinterpret_cast<quint32 >(destPixels);
259	BLEND_SOURCE_OVER_ARGB32_AVX2(dst, src, length: w);
260	destPixels += dbpl;
261	srcPixels += sbpl;
262	}
263	} else if (const_alpha != `0`) {
264	const_alpha = (const_alpha * `255`) >> `8`;
265	for (int y = `0`; y < h; ++y) {
266	const quint32 src = reinterpret_cast<const* quint32 *>(srcPixels);
267	quint32 dst = reinterpret_cast<quint32 >(destPixels);
268	BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_AVX2(dst, src, length: w, const_alpha);
269	destPixels += dbpl;
270	srcPixels += sbpl;
271	}
272	}
273	}
274
275	void qt_blend_rgb32_on_rgb32_avx2(uchar destPixels, int* dbpl,
276	const uchar srcPixels, int* sbpl,
277	int w, int h,
278	int const_alpha)
279	{
280	if (const_alpha == `256`) {
281	for (int y = `0`; y < h; ++y) {
282	const quint32 src = reinterpret_cast<const* quint32 *>(srcPixels);
283	quint32 dst = reinterpret_cast<quint32 >(destPixels);
284	::memcpy(dest: dst, src: src, n: w * sizeof(uint));
285	srcPixels += sbpl;
286	destPixels += dbpl;
287	}
288	return;
289	}
290	if (const_alpha == `0`)
291	return;
292
293	const __m256i half = _mm256_set1_epi16(w: `0x80`);
294	const __m256i colorMask = _mm256_set1_epi32(i: `0x00ff00ff`);
295
296	const_alpha = (const_alpha * `255`) >> `8`;
297	int one_minus_const_alpha = `255` - const_alpha;
298	const __m256i constAlphaVector = _mm256_set1_epi16(w: const_alpha);
299	const __m256i oneMinusConstAlpha = _mm256_set1_epi16(w: one_minus_const_alpha);
300	for (int y = `0`; y < h; ++y) {
301	const quint32 src = reinterpret_cast<const* quint32 *>(srcPixels);
302	quint32 dst = reinterpret_cast<quint32 >(destPixels);
303	int x = `0`;
304
305	// First, align dest to 32 bytes:
306	ALIGNMENT_PROLOGUE_32BYTES(dst, x, w)
307	dst[x] = INTERPOLATE_PIXEL_255(x: src[x], a: const_alpha, y: dst[x], b: one_minus_const_alpha);
308
309	// 2) interpolate pixels with AVX2
310	for (; x < (w - `7`); x += `8`) {
311	const __m256i srcVector = _mm256_lddqu_si256(p: (const __m256i *)&src[x]);
312	__m256i dstVector = _mm256_load_si256(p: (__m256i *)&dst[x]);
313	INTERPOLATE_PIXEL_255_AVX2(srcVector, dstVector, alphaChannel: constAlphaVector, oneMinusAlphaChannel: oneMinusConstAlpha, colorMask, half);
314	_mm256_store_si256(p: (__m256i *)&dst[x], a: dstVector);
315	}
316
317	// 3) Epilogue
318	SIMD_EPILOGUE(x, w, `7`)
319	dst[x] = INTERPOLATE_PIXEL_255(x: src[x], a: const_alpha, y: dst[x], b: one_minus_const_alpha);
320
321	srcPixels += sbpl;
322	destPixels += dbpl;
323	}
324	}
325
326	static Q_NEVER_INLINE
327	void Q_DECL_VECTORCALL qt_memfillXX_avx2(uchar *dest, __m256i value256, qsizetype bytes)
328	{
329	__m128i value128 = _mm256_castsi256_si128(a: value256);
330
331	// main body
332	__m256i dst256 = reinterpret_cast<__m256i >(dest);
333	uchar *end = dest + bytes;
334	while (reinterpret_cast<uchar *>(dst256 + `4`) <= end) {
335	_mm256_storeu_si256(p: dst256 + `0`, a: value256);
336	_mm256_storeu_si256(p: dst256 + `1`, a: value256);
337	_mm256_storeu_si256(p: dst256 + `2`, a: value256);
338	_mm256_storeu_si256(p: dst256 + `3`, a: value256);
339	dst256 += `4`;
340	}
341
342	// first epilogue: fewer than 128 bytes / 32 entries
343	bytes = end - reinterpret_cast<uchar *>(dst256);
344	switch (bytes / sizeof(value256)) {
345	case `3`: _mm256_storeu_si256(p: dst256++, a: value256); Q_FALLTHROUGH();
346	case `2`: _mm256_storeu_si256(p: dst256++, a: value256); Q_FALLTHROUGH();
347	case `1`: _mm256_storeu_si256(p: dst256++, a: value256);
348	}
349
350	// second epilogue: fewer than 32 bytes
351	__m128i dst128 = reinterpret_cast<__m128i >(dst256);
352	if (bytes & sizeof(value128))
353	_mm_storeu_si128(p: dst128++, b: value128);
354
355	// third epilogue: fewer than 16 bytes
356	if (bytes & `8`)
357	_mm_storel_epi64(p: reinterpret_cast<__m128i *>(end - `8`), a: value128);
358	}
359
360	void qt_memfill64_avx2(quint64 *dest, quint64 value, qsizetype count)
361	{
362	#if defined(Q_CC_GNU) && !defined(Q_CC_CLANG) && !defined(Q_CC_INTEL)
363	// work around https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80820
364	__m128i value64 = _mm_set_epi64x(`0`, value); // _mm_cvtsi64_si128(value);
365	# ifdef Q_PROCESSOR_X86_64
366	asm ("" : "+x" (value64));
367	# endif
368	__m256i value256 = _mm256_broadcastq_epi64(value64);
369	#else
370	__m256i value256 = _mm256_set1_epi64x(q: value);
371	#endif
372
373	qt_memfillXX_avx2(dest: reinterpret_cast<uchar >(dest), value256, bytes: count sizeof(quint64));
374	}
375
376	void qt_memfill32_avx2(quint32 *dest, quint32 value, qsizetype count)
377	{
378	if (count % `2`) {
379	// odd number of pixels, round to even
380	*dest++ = value;
381	--count;
382	}
383	qt_memfillXX_avx2(dest: reinterpret_cast<uchar >(dest), value256: _mm256_set1_epi32(i: value), bytes: count sizeof(quint32));
384	}
385
386	void QT_FASTCALL comp_func_SourceOver_avx2(uint destPixels, const* uint srcPixels, int* length, uint const_alpha)
387	{
388	Q_ASSERT(const_alpha < `256`);
389
390	const quint32 src = (const* quint32 *) srcPixels;
391	quint32 dst = (quint32 ) destPixels;
392
393	if (const_alpha == `255`)
394	BLEND_SOURCE_OVER_ARGB32_AVX2(dst, src, length);
395	else
396	BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_AVX2(dst, src, length, const_alpha);
397	}
398
399	#if QT_CONFIG(raster_64bit)
400	void QT_FASTCALL comp_func_SourceOver_rgb64_avx2(QRgba64 dst, const* QRgba64 src, int* length, uint const_alpha)
401	{
402	Q_ASSERT(const_alpha < `256`); // const_alpha is in [0-255]
403	const __m256i half = _mm256_set1_epi32(i: `0x8000`);
404	const __m256i one = _mm256_set1_epi32(i: `0xffff`);
405	const __m256i colorMask = _mm256_set1_epi32(i: `0x0000ffff`);
406	__m256i alphaMask = _mm256_set1_epi32(i: `0xff000000`);
407	alphaMask = _mm256_unpacklo_epi8(a: alphaMask, b: alphaMask);
408	const __m256i alphaShuffleMask = _mm256_set_epi8(b31: char(`0xff`),b30: char(`0xff`),b29: `15`,b28: `14`,b27: char(`0xff`),b26: char(`0xff`),b25: `15`,b24: `14`,b23: char(`0xff`),b22: char(`0xff`),b21: `7`,b20: `6`,b19: char(`0xff`),b18: char(`0xff`),b17: `7`,b16: `6`,
409	b15: char(`0xff`),b14: char(`0xff`),b13: `15`,b12: `14`,b11: char(`0xff`),b10: char(`0xff`),b09: `15`,b08: `14`,b07: char(`0xff`),b06: char(`0xff`),b05: `7`,b04: `6`,b03: char(`0xff`),b02: char(`0xff`),b01: `7`,b00: `6`);
410
411	if (const_alpha == `255`) {
412	int x = `0`;
413	for (; x < length && (quintptr(dst + x) & `31`); ++x)
414	blend_pixel(dst&: dst[x], src: src[x]);
415	for (; x < length - `3`; x += `4`) {
416	const __m256i srcVector = _mm256_lddqu_si256(p: (const __m256i *)&src[x]);
417	if (!_mm256_testz_si256(a: srcVector, b: alphaMask)) {
418	// Not all transparent
419	if (_mm256_testc_si256(a: srcVector, b: alphaMask)) {
420	// All opaque
421	_mm256_store_si256(p: (__m256i *)&dst[x], a: srcVector);
422	} else {
423	__m256i alphaChannel = _mm256_shuffle_epi8(a: srcVector, b: alphaShuffleMask);
424	alphaChannel = _mm256_sub_epi32(a: one, b: alphaChannel);
425	__m256i dstVector = _mm256_load_si256(p: (__m256i *)&dst[x]);
426	BYTE_MUL_RGB64_AVX2(pixelVector&: dstVector, alphaChannel, colorMask, half);
427	dstVector = _mm256_add_epi16(a: dstVector, b: srcVector);
428	_mm256_store_si256(p: (__m256i *)&dst[x], a: dstVector);
429	}
430	}
431	}
432	SIMD_EPILOGUE(x, length, `3`)
433	blend_pixel(dst&: dst[x], src: src[x]);
434	} else {
435	const __m256i constAlphaVector = _mm256_set1_epi32(i: const_alpha \| (const_alpha << `8`));
436	int x = `0`;
437	for (; x < length && (quintptr(dst + x) & `31`); ++x)
438	blend_pixel(dst&: dst[x], src: src[x], const_alpha);
439	for (; x < length - `3`; x += `4`) {
440	__m256i srcVector = _mm256_lddqu_si256(p: (const __m256i *)&src[x]);
441	if (!_mm256_testz_si256(a: srcVector, b: alphaMask)) {
442	// Not all transparent
443	BYTE_MUL_RGB64_AVX2(pixelVector&: srcVector, alphaChannel: constAlphaVector, colorMask, half);
444
445	__m256i alphaChannel = _mm256_shuffle_epi8(a: srcVector, b: alphaShuffleMask);
446	alphaChannel = _mm256_sub_epi32(a: one, b: alphaChannel);
447	__m256i dstVector = _mm256_load_si256(p: (__m256i *)&dst[x]);
448	BYTE_MUL_RGB64_AVX2(pixelVector&: dstVector, alphaChannel, colorMask, half);
449	dstVector = _mm256_add_epi16(a: dstVector, b: srcVector);
450	_mm256_store_si256(p: (__m256i *)&dst[x], a: dstVector);
451	}
452	}
453	SIMD_EPILOGUE(x, length, `3`)
454	blend_pixel(dst&: dst[x], src: src[x], const_alpha);
455	}
456	}
457	#endif
458
459	void QT_FASTCALL comp_func_Source_avx2(uint dst, const* uint src, int* length, uint const_alpha)
460	{
461	if (const_alpha == `255`) {
462	::memcpy(dest: dst, src: src, n: length * sizeof(uint));
463	} else {
464	const int ialpha = `255` - const_alpha;
465
466	int x = `0`;
467
468	// 1) prologue, align on 32 bytes
469	ALIGNMENT_PROLOGUE_32BYTES(dst, x, length)
470	dst[x] = INTERPOLATE_PIXEL_255(x: src[x], a: const_alpha, y: dst[x], b: ialpha);
471
472	// 2) interpolate pixels with AVX2
473	const __m256i half = _mm256_set1_epi16(w: `0x80`);
474	const __m256i colorMask = _mm256_set1_epi32(i: `0x00ff00ff`);
475	const __m256i constAlphaVector = _mm256_set1_epi16(w: const_alpha);
476	const __m256i oneMinusConstAlpha = _mm256_set1_epi16(w: ialpha);
477	for (; x < length - `7`; x += `8`) {
478	const __m256i srcVector = _mm256_lddqu_si256(p: (const __m256i *)&src[x]);
479	__m256i dstVector = _mm256_load_si256(p: (__m256i *)&dst[x]);
480	INTERPOLATE_PIXEL_255_AVX2(srcVector, dstVector, alphaChannel: constAlphaVector, oneMinusAlphaChannel: oneMinusConstAlpha, colorMask, half);
481	_mm256_store_si256(p: (__m256i *)&dst[x], a: dstVector);
482	}
483
484	// 3) Epilogue
485	SIMD_EPILOGUE(x, length, `7`)
486	dst[x] = INTERPOLATE_PIXEL_255(x: src[x], a: const_alpha, y: dst[x], b: ialpha);
487	}
488	}
489
490	#if QT_CONFIG(raster_64bit)
491	void QT_FASTCALL comp_func_Source_rgb64_avx2(QRgba64 dst, const* QRgba64 src, int* length, uint const_alpha)
492	{
493	Q_ASSERT(const_alpha < `256`); // const_alpha is in [0-255]
494	if (const_alpha == `255`) {
495	::memcpy(dest: dst, src: src, n: length * sizeof(QRgba64));
496	} else {
497	const uint ca = const_alpha \| (const_alpha << `8`); // adjust to [0-65535]
498	const uint cia = `65535` - ca;
499
500	int x = `0`;
501
502	// 1) prologue, align on 32 bytes
503	for (; x < length && (quintptr(dst + x) & `31`); ++x)
504	dst[x] = interpolate65535(x: src[x], alpha1: ca, y: dst[x], alpha2: cia);
505
506	// 2) interpolate pixels with AVX2
507	const __m256i half = _mm256_set1_epi32(i: `0x8000`);
508	const __m256i colorMask = _mm256_set1_epi32(i: `0x0000ffff`);
509	const __m256i constAlphaVector = _mm256_set1_epi32(i: ca);
510	const __m256i oneMinusConstAlpha = _mm256_set1_epi32(i: cia);
511	for (; x < length - `3`; x += `4`) {
512	const __m256i srcVector = _mm256_lddqu_si256(p: (const __m256i *)&src[x]);
513	__m256i dstVector = _mm256_load_si256(p: (__m256i *)&dst[x]);
514	INTERPOLATE_PIXEL_RGB64_AVX2(srcVector, dstVector, alphaChannel: constAlphaVector, oneMinusAlphaChannel: oneMinusConstAlpha, colorMask, half);
515	_mm256_store_si256(p: (__m256i *)&dst[x], a: dstVector);
516	}
517
518	// 3) Epilogue
519	SIMD_EPILOGUE(x, length, `3`)
520	dst[x] = interpolate65535(x: src[x], alpha1: ca, y: dst[x], alpha2: cia);
521	}
522	}
523	#endif
524
525	void QT_FASTCALL comp_func_solid_SourceOver_avx2(uint destPixels, int* length, uint color, uint const_alpha)
526	{
527	if ((const_alpha & qAlpha(rgb: color)) == `255`) {
528	qt_memfill32(destPixels, color, length);
529	} else {
530	if (const_alpha != `255`)
531	color = BYTE_MUL(x: color, a: const_alpha);
532
533	const quint32 minusAlphaOfColor = qAlpha(rgb: ~color);
534	int x = `0`;
535
536	quint32 dst = (quint32 ) destPixels;
537	const __m256i colorVector = _mm256_set1_epi32(i: color);
538	const __m256i colorMask = _mm256_set1_epi32(i: `0x00ff00ff`);
539	const __m256i half = _mm256_set1_epi16(w: `0x80`);
540	const __m256i minusAlphaOfColorVector = _mm256_set1_epi16(w: minusAlphaOfColor);
541
542	ALIGNMENT_PROLOGUE_32BYTES(dst, x, length)
543	destPixels[x] = color + BYTE_MUL(x: destPixels[x], a: minusAlphaOfColor);
544
545	for (; x < length - `7`; x += `8`) {
546	__m256i dstVector = _mm256_load_si256(p: (__m256i *)&dst[x]);
547	BYTE_MUL_AVX2(pixelVector&: dstVector, alphaChannel: minusAlphaOfColorVector, colorMask, half);
548	dstVector = _mm256_add_epi8(a: colorVector, b: dstVector);
549	_mm256_store_si256(p: (__m256i *)&dst[x], a: dstVector);
550	}
551	SIMD_EPILOGUE(x, length, `7`)
552	destPixels[x] = color + BYTE_MUL(x: destPixels[x], a: minusAlphaOfColor);
553	}
554	}
555
556	#if QT_CONFIG(raster_64bit)
557	void QT_FASTCALL comp_func_solid_SourceOver_rgb64_avx2(QRgba64 destPixels, int* length, QRgba64 color, uint const_alpha)
558	{
559	Q_ASSERT(const_alpha < `256`); // const_alpha is in [0-255]
560	if (const_alpha == `255` && color.isOpaque()) {
561	qt_memfill64((quint64*)destPixels, color, length);
562	} else {
563	if (const_alpha != `255`)
564	color = multiplyAlpha255(rgba64: color, alpha255: const_alpha);
565
566	const uint minusAlphaOfColor = `65535` - color.alpha();
567	int x = `0`;
568	quint64 dst = (quint64 ) destPixels;
569	const __m256i colorVector = _mm256_set1_epi64x(q: color);
570	const __m256i colorMask = _mm256_set1_epi32(i: `0x0000ffff`);
571	const __m256i half = _mm256_set1_epi32(i: `0x8000`);
572	const __m256i minusAlphaOfColorVector = _mm256_set1_epi32(i: minusAlphaOfColor);
573
574	for (; x < length && (quintptr(dst + x) & `31`); ++x)
575	destPixels[x] = color + multiplyAlpha65535(rgba64: destPixels[x], alpha65535: minusAlphaOfColor);
576
577	for (; x < length - `3`; x += `4`) {
578	__m256i dstVector = _mm256_load_si256(p: (__m256i *)&dst[x]);
579	BYTE_MUL_RGB64_AVX2(pixelVector&: dstVector, alphaChannel: minusAlphaOfColorVector, colorMask, half);
580	dstVector = _mm256_add_epi16(a: colorVector, b: dstVector);
581	_mm256_store_si256(p: (__m256i *)&dst[x], a: dstVector);
582	}
583	SIMD_EPILOGUE(x, length, `3`)
584	destPixels[x] = color + multiplyAlpha65535(rgba64: destPixels[x], alpha65535: minusAlphaOfColor);
585	}
586	}
587	#endif
588
// Weighted blend of top (tlr1, tlr2) and bottom (blr1, blr2) pixel vectors,
// stored unaligned as one 256-bit result at address b.
//
// distx / disty hold the per-16-bit-lane x and y distances. From them the
// macro derives four weights: dx*dy, 16*dx - dx*dy, 16*dy - dx*dy and
// v_256 - 16*dx - 16*dy + dx*dy (the last equals (16-dx)*(16-dy) when v_256
// holds 256 - NOTE(review): confirm the value callers pass for v_256).
// The weights are interleaved in 32-bit units to pair up with the pixels in
// tlr*/blr*, each pixel's AG and RB halves are multiplied by its weight, and
// horizontal adds sum adjacent pixel pairs. The permutes before and after
// compensate for the per-128-bit-lane behaviour of unpack and hadd.
//
// The body is a plain brace block (not do/while), and all arguments are
// evaluated as expressions inside it.
#define interpolate_4_pixels_16_avx2(tlr1, tlr2, blr1, blr2, distx, disty, colorMask, v_256, b) \
{ \
    /* Correct for later unpack */ \
    const __m256i vdistx = _mm256_permute4x64_epi64(distx, _MM_SHUFFLE(3, 1, 2, 0)); \
    const __m256i vdisty = _mm256_permute4x64_epi64(disty, _MM_SHUFFLE(3, 1, 2, 0)); \
    \
    __m256i dxdy = _mm256_mullo_epi16 (vdistx, vdisty); \
    const __m256i distx_ = _mm256_slli_epi16(vdistx, 4); \
    const __m256i disty_ = _mm256_slli_epi16(vdisty, 4); \
    __m256i idxidy =  _mm256_add_epi16(dxdy, _mm256_sub_epi16(v_256, _mm256_add_epi16(distx_, disty_))); \
    __m256i dxidy =  _mm256_sub_epi16(distx_, dxdy); \
    __m256i idxdy =  _mm256_sub_epi16(disty_, dxdy); \
    \
    __m256i tlr1AG = _mm256_srli_epi16(tlr1, 8); \
    __m256i tlr1RB = _mm256_and_si256(tlr1, colorMask); \
    __m256i tlr2AG = _mm256_srli_epi16(tlr2, 8); \
    __m256i tlr2RB = _mm256_and_si256(tlr2, colorMask); \
    __m256i blr1AG = _mm256_srli_epi16(blr1, 8); \
    __m256i blr1RB = _mm256_and_si256(blr1, colorMask); \
    __m256i blr2AG = _mm256_srli_epi16(blr2, 8); \
    __m256i blr2RB = _mm256_and_si256(blr2, colorMask); \
    \
    __m256i odxidy1 = _mm256_unpacklo_epi32(idxidy, dxidy); \
    __m256i odxidy2 = _mm256_unpackhi_epi32(idxidy, dxidy); \
    tlr1AG = _mm256_mullo_epi16(tlr1AG, odxidy1); \
    tlr1RB = _mm256_mullo_epi16(tlr1RB, odxidy1); \
    tlr2AG = _mm256_mullo_epi16(tlr2AG, odxidy2); \
    tlr2RB = _mm256_mullo_epi16(tlr2RB, odxidy2); \
    __m256i odxdy1 = _mm256_unpacklo_epi32(idxdy, dxdy); \
    __m256i odxdy2 = _mm256_unpackhi_epi32(idxdy, dxdy); \
    blr1AG = _mm256_mullo_epi16(blr1AG, odxdy1); \
    blr1RB = _mm256_mullo_epi16(blr1RB, odxdy1); \
    blr2AG = _mm256_mullo_epi16(blr2AG, odxdy2); \
    blr2RB = _mm256_mullo_epi16(blr2RB, odxdy2); \
    \
    /* Add the values, and shift to only keep 8 significant bits per colors */ \
    __m256i topAG = _mm256_hadd_epi32(tlr1AG, tlr2AG); \
    __m256i topRB = _mm256_hadd_epi32(tlr1RB, tlr2RB); \
    __m256i botAG = _mm256_hadd_epi32(blr1AG, blr2AG); \
    __m256i botRB = _mm256_hadd_epi32(blr1RB, blr2RB); \
    __m256i rAG = _mm256_add_epi16(topAG, botAG); \
    __m256i rRB = _mm256_add_epi16(topRB, botRB); \
    rRB = _mm256_srli_epi16(rRB, 8); \
    /* Correct for hadd */ \
    rAG = _mm256_permute4x64_epi64(rAG, _MM_SHUFFLE(3, 1, 2, 0)); \
    rRB = _mm256_permute4x64_epi64(rRB, _MM_SHUFFLE(3, 1, 2, 0)); \
    _mm256_storeu_si256((__m256i*)(b), _mm256_blendv_epi8(rAG, rRB, colorMask)); \
}
637
638	inline void fetchTransformedBilinear_pixelBounds(int, int l1, int l2, int &v1, int &v2)
639	{
640	if (v1 < l1)
641	v2 = v1 = l1;
642	else if (v1 >= l2)
643	v2 = v1 = l2;
644	else
645	v2 = v1 + `1`;
646	Q_ASSERT(v1 >= l1 && v1 <= l2);
647	Q_ASSERT(v2 >= l1 && v2 <= l2);
648	}
649
650	void QT_FASTCALL intermediate_adder_avx2(uint b, uint end, const IntermediateBuffer &intermediate, int offset, int &fx, int fdx);
651
652	void QT_FASTCALL fetchTransformedBilinearARGB32PM_simple_scale_helper_avx2(uint b, uint end, const QTextureData &image,
653	int &fx, int &fy, int fdx, int /fdy/)
654	{
655	int y1 = (fy >> `16`);
656	int y2;
657	fetchTransformedBilinear_pixelBounds(image.height, l1: image.y1, l2: image.y2 - `1`, v1&: y1, v2&: y2);
658	const uint s1 = (const* uint *)image.scanLine(y: y1);
659	const uint s2 = (const* uint *)image.scanLine(y: y2);
660
661	const int disty = (fy & `0x0000ffff`) >> `8`;
662	const int idisty = `256` - disty;
663	const int length = end - b;
664
665	// The intermediate buffer is generated in the positive direction
666	const int adjust = (fdx < `0`) ? fdx * length : `0`;
667	const int offset = (fx + adjust) >> `16`;
668	int x = offset;
669
670	IntermediateBuffer intermediate;
671	// count is the size used in the intermediate_buffer.
672	int count = (qint64(length) * qAbs(t: fdx) + FixedScale - `1`) / FixedScale + `2`;
673	// length is supposed to be <= BufferSize either because data->m11 < 1 or
674	// data->m11 < 2, and any larger buffers split
675	Q_ASSERT(count <= BufferSize + `2`);
676	int f = `0`;
677	int lim = qMin(a: count, b: image.x2 - x);
678	if (x < image.x1) {
679	Q_ASSERT(x < image.x2);
680	uint t = s1[image.x1];
681	uint b = s2[image.x1];
682	quint32 rb = (((t & `0xff00ff`) * idisty + (b & `0xff00ff`) * disty) >> `8`) & `0xff00ff`;
683	quint32 ag = ((((t>>`8`) & `0xff00ff`) * idisty + ((b>>`8`) & `0xff00ff`) * disty) >> `8`) & `0xff00ff`;
684	do {
685	intermediate.buffer_rb[f] = rb;
686	intermediate.buffer_ag[f] = ag;
687	f++;
688	x++;
689	} while (x < image.x1 && f < lim);
690	}
691
692	const __m256i disty_ = _mm256_set1_epi16(w: disty);
693	const __m256i idisty_ = _mm256_set1_epi16(w: idisty);
694	const __m256i colorMask = _mm256_set1_epi32(i: `0x00ff00ff`);
695
696	lim -= `7`;
697	for (; f < lim; x += `8`, f += `8`) {
698	// Load 8 pixels from s1, and split the alpha-green and red-blue component
699	__m256i top = _mm256_loadu_si256(p: (const __m256i)((const* uint *)(s1)+x));
700	__m256i topAG = _mm256_srli_epi16(a: top, count: `8`);
701	__m256i topRB = _mm256_and_si256(a: top, b: colorMask);
702	// Multiplies each color component by idisty
703	topAG = _mm256_mullo_epi16 (a: topAG, b: idisty_);
704	topRB = _mm256_mullo_epi16 (a: topRB, b: idisty_);
705
706	// Same for the s2 vector
707	__m256i bottom = _mm256_loadu_si256(p: (const __m256i)((const* uint *)(s2)+x));
708	__m256i bottomAG = _mm256_srli_epi16(a: bottom, count: `8`);
709	__m256i bottomRB = _mm256_and_si256(a: bottom, b: colorMask);
710	bottomAG = _mm256_mullo_epi16 (a: bottomAG, b: disty_);
711	bottomRB = _mm256_mullo_epi16 (a: bottomRB, b: disty_);
712
713	// Add the values, and shift to only keep 8 significant bits per colors
714	__m256i rAG =_mm256_add_epi16(a: topAG, b: bottomAG);
715	rAG = _mm256_srli_epi16(a: rAG, count: `8`);
716	_mm256_storeu_si256(p: (__m256i*)(&intermediate.buffer_ag[f]), a: rAG);
717	__m256i rRB =_mm256_add_epi16(a: topRB, b: bottomRB);
718	rRB = _mm256_srli_epi16(a: rRB, count: `8`);
719	_mm256_storeu_si256(p: (__m256i*)(&intermediate.buffer_rb[f]), a: rRB);
720	}
721
722	for (; f < count; f++) { // Same as above but without simd
723	x = qMin(a: x, b: image.x2 - `1`);
724
725	uint t = s1[x];
726	uint b = s2[x];
727
728	intermediate.buffer_rb[f] = (((t & `0xff00ff`) * idisty + (b & `0xff00ff`) * disty) >> `8`) & `0xff00ff`;
729	intermediate.buffer_ag[f] = ((((t>>`8`) & `0xff00ff`) * idisty + ((b>>`8`) & `0xff00ff`) * disty) >> `8`) & `0xff00ff`;
730	x++;
731	}
732
733	// Now interpolate the values from the intermediate_buffer to get the final result.
734	intermediate_adder_avx2(b, end, intermediate, offset, fx, fdx);
735	}
736
737	void QT_FASTCALL intermediate_adder_avx2(uint b, uint end, const IntermediateBuffer &intermediate, int offset, int &fx, int fdx)
738	{
739	fx -= offset * FixedScale;
740
741	const __m128i v_fdx = _mm_set1_epi32(i: fdx * `4`);
742	const __m128i v_blend = _mm_set1_epi32(i: `0x00800080`);
743	const __m128i vdx_shuffle = _mm_set_epi8(b15: char(`0x80`), b14: `13`, b13: char(`0x80`), b12: `13`, b11: char(`0x80`), b10: `9`, b9: char(`0x80`), b8: `9`,
744	b7: char(`0x80`), b6: `5`, b5: char(`0x80`), b4: `5`, b3: char(`0x80`), b2: `1`, b1: char(`0x80`), b0: `1`);
745	__m128i v_fx = _mm_setr_epi32(i0: fx, i1: fx + fdx, i2: fx + fdx + fdx, i3: fx + fdx + fdx + fdx);
746
747	while (b < end - `3`) {
748	const __m128i offset = _mm_srli_epi32(a: v_fx, count: `16`);
749	__m256i vrb = _mm256_i32gather_epi64((const long long *)intermediate.buffer_rb, offset, `4`);
750	__m256i vag = _mm256_i32gather_epi64((const long long *)intermediate.buffer_ag, offset, `4`);
751
752	__m128i vdx = _mm_shuffle_epi8(a: v_fx, b: vdx_shuffle);
753	__m128i vidx = _mm_sub_epi16(a: _mm_set1_epi16(w: `256`), b: vdx);
754	__m256i vmulx = _mm256_castsi128_si256(a: _mm_unpacklo_epi32(a: vidx, b: vdx));
755	vmulx = _mm256_inserti128_si256(vmulx, _mm_unpackhi_epi32(vidx, vdx), `1`);
756
757	vrb = _mm256_mullo_epi16(a: vrb, b: vmulx);
758	vag = _mm256_mullo_epi16(a: vag, b: vmulx);
759
760	__m256i vrbag = _mm256_hadd_epi32(a: vrb, b: vag);
761	vrbag = _mm256_permute4x64_epi64(vrbag, _MM_SHUFFLE(`3`, `1`, `2`, `0`));
762
763	__m128i rb = _mm256_castsi256_si128(a: vrbag);
764	__m128i ag = _mm256_extracti128_si256(vrbag, `1`);
765	rb = _mm_srli_epi16(a: rb, count: `8`);
766
767	_mm_storeu_si128(p: (__m128i*)b, b: _mm_blendv_epi8(V1: ag, V2: rb, M: v_blend));
768
769	b += `4`;
770	v_fx = _mm_add_epi32(a: v_fx, b: v_fdx);
771	}
772	fx = _mm_cvtsi128_si32(a: v_fx);
773	while (b < end) {
774	const int x = (fx >> `16`);
775
776	const uint distx = (fx & `0x0000ffff`) >> `8`;
777	const uint idistx = `256` - distx;
778	const uint rb = (intermediate.buffer_rb[x] * idistx + intermediate.buffer_rb[x + `1`] * distx) & `0xff00ff00`;
779	const uint ag = (intermediate.buffer_ag[x] * idistx + intermediate.buffer_ag[x + `1`] * distx) & `0xff00ff00`;
780	*b = (rb >> `8`) \| ag;
781	b++;
782	fx += fdx;
783	}
784	fx += offset * FixedScale;
785	}
786
787	void QT_FASTCALL fetchTransformedBilinearARGB32PM_downscale_helper_avx2(uint b, uint end, const QTextureData &image,
788	int &fx, int &fy, int fdx, int /fdy/)
789	{
790	int y1 = (fy >> `16`);
791	int y2;
792	fetchTransformedBilinear_pixelBounds(image.height, l1: image.y1, l2: image.y2 - `1`, v1&: y1, v2&: y2);
793	const uint s1 = (const* uint *)image.scanLine(y: y1);
794	const uint s2 = (const* uint *)image.scanLine(y: y2);
795	const int disty8 = (fy & `0x0000ffff`) >> `8`;
796	const int disty4 = (disty8 + `0x08`) >> `4`;
797
798	const qint64 min_fx = qint64(image.x1) * FixedScale;
799	const qint64 max_fx = qint64(image.x2 - `1`) * FixedScale;
800	while (b < end) {
801	int x1 = (fx >> `16`);
802	int x2;
803	fetchTransformedBilinear_pixelBounds(image.width, l1: image.x1, l2: image.x2 - `1`, v1&: x1, v2&: x2);
804	if (x1 != x2)
805	break;
806	uint top = s1[x1];
807	uint bot = s2[x1];
808	*b = INTERPOLATE_PIXEL_256(x: top, a: `256` - disty8, y: bot, b: disty8);
809	fx += fdx;
810	++b;
811	}
812	uint *boundedEnd = end;
813	if (fdx > `0`)
814	boundedEnd = qMin(a: boundedEnd, b: b + (max_fx - fx) / fdx);
815	else if (fdx < `0`)
816	boundedEnd = qMin(a: boundedEnd, b: b + (min_fx - fx) / fdx);
817
818	// A fast middle part without boundary checks
819	const __m256i vdistShuffle =
820	_mm256_setr_epi8(b31: `0`, b30: char(`0x80`), b29: `0`, b28: char(`0x80`), b27: `4`, b26: char(`0x80`), b25: `4`, b24: char(`0x80`), b23: `8`, b22: char(`0x80`), b21: `8`, b20: char(`0x80`), b19: `12`, b18: char(`0x80`), b17: `12`, b16: char(`0x80`),
821	b15: `0`, b14: char(`0x80`), b13: `0`, b12: char(`0x80`), b11: `4`, b10: char(`0x80`), b09: `4`, b08: char(`0x80`), b07: `8`, b06: char(`0x80`), b05: `8`, b04: char(`0x80`), b03: `12`, b02: char(`0x80`), b01: `12`, b00: char(`0x80`));
822	const __m256i colorMask = _mm256_set1_epi32(i: `0x00ff00ff`);
823	const __m256i v_256 = _mm256_set1_epi16(w: `256`);
824	const __m256i v_disty = _mm256_set1_epi16(w: disty4);
825	const __m256i v_fdx = _mm256_set1_epi32(i: fdx * `8`);
826	const __m256i v_fx_r = _mm256_set1_epi32(i: `0x08`);
827	const __m256i v_index = _mm256_setr_epi32(i0: `0`, i1: `1`, i2: `2`, i3: `3`, i4: `4`, i5: `5`, i6: `6`, i7: `7`);
828	__m256i v_fx = _mm256_set1_epi32(i: fx);
829	v_fx = _mm256_add_epi32(a: v_fx, b: _mm256_mullo_epi32(a: _mm256_set1_epi32(i: fdx), b: v_index));
830
831	while (b < boundedEnd - `7`) {
832	const __m256i offset = _mm256_srli_epi32(a: v_fx, count: `16`);
833	const __m128i offsetLo = _mm256_castsi256_si128(a: offset);
834	const __m128i offsetHi = _mm256_extracti128_si256(offset, `1`);
835	const __m256i toplo = _mm256_i32gather_epi64((const long long *)s1, offsetLo, `4`);
836	const __m256i tophi = _mm256_i32gather_epi64((const long long *)s1, offsetHi, `4`);
837	const __m256i botlo = _mm256_i32gather_epi64((const long long *)s2, offsetLo, `4`);
838	const __m256i bothi = _mm256_i32gather_epi64((const long long *)s2, offsetHi, `4`);
839
840	__m256i v_distx = _mm256_srli_epi16(a: v_fx, count: `8`);
841	v_distx = _mm256_srli_epi16(a: _mm256_add_epi32(a: v_distx, b: v_fx_r), count: `4`);
842	v_distx = _mm256_shuffle_epi8(a: v_distx, b: vdistShuffle);
843
844	interpolate_4_pixels_16_avx2(toplo, tophi, botlo, bothi, v_distx, v_disty, colorMask, v_256, b);
845	b += `8`;
846	v_fx = _mm256_add_epi32(a: v_fx, b: v_fdx);
847	}
848	fx = _mm_extract_epi32(_mm256_castsi256_si128(v_fx) , `0`);
849
850	while (b < boundedEnd) {
851	int x = (fx >> `16`);
852	int distx8 = (fx & `0x0000ffff`) >> `8`;
853	*b = interpolate_4_pixels(t: s1 + x, b: s2 + x, distx: distx8, disty: disty8);
854	fx += fdx;
855	++b;
856	}
857
858	while (b < end) {
859	int x1 = (fx >> `16`);
860	int x2;
861	fetchTransformedBilinear_pixelBounds(image.width, l1: image.x1, l2: image.x2 - `1`, v1&: x1, v2&: x2);
862	uint tl = s1[x1];
863	uint tr = s1[x2];
864	uint bl = s2[x1];
865	uint br = s2[x2];
866	int distx8 = (fx & `0x0000ffff`) >> `8`;
867	*b = interpolate_4_pixels(tl, tr, bl, br, distx: distx8, disty: disty8);
868	fx += fdx;
869	++b;
870	}
871	}
872
873	void QT_FASTCALL fetchTransformedBilinearARGB32PM_fast_rotate_helper_avx2(uint b, uint end, const QTextureData &image,
874	int &fx, int &fy, int fdx, int fdy)
875	{
876	const qint64 min_fx = qint64(image.x1) * FixedScale;
877	const qint64 max_fx = qint64(image.x2 - `1`) * FixedScale;
878	const qint64 min_fy = qint64(image.y1) * FixedScale;
879	const qint64 max_fy = qint64(image.y2 - `1`) * FixedScale;
880	// first handle the possibly bounded part in the beginning
881	while (b < end) {
882	int x1 = (fx >> `16`);
883	int x2;
884	int y1 = (fy >> `16`);
885	int y2;
886	fetchTransformedBilinear_pixelBounds(image.width, l1: image.x1, l2: image.x2 - `1`, v1&: x1, v2&: x2);
887	fetchTransformedBilinear_pixelBounds(image.height, l1: image.y1, l2: image.y2 - `1`, v1&: y1, v2&: y2);
888	if (x1 != x2 && y1 != y2)
889	break;
890	const uint s1 = (const* uint *)image.scanLine(y: y1);
891	const uint s2 = (const* uint *)image.scanLine(y: y2);
892	uint tl = s1[x1];
893	uint tr = s1[x2];
894	uint bl = s2[x1];
895	uint br = s2[x2];
896	int distx = (fx & `0x0000ffff`) >> `8`;
897	int disty = (fy & `0x0000ffff`) >> `8`;
898	*b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
899	fx += fdx;
900	fy += fdy;
901	++b;
902	}
903	uint *boundedEnd = end;
904	if (fdx > `0`)
905	boundedEnd = qMin(a: boundedEnd, b: b + (max_fx - fx) / fdx);
906	else if (fdx < `0`)
907	boundedEnd = qMin(a: boundedEnd, b: b + (min_fx - fx) / fdx);
908	if (fdy > `0`)
909	boundedEnd = qMin(a: boundedEnd, b: b + (max_fy - fy) / fdy);
910	else if (fdy < `0`)
911	boundedEnd = qMin(a: boundedEnd, b: b + (min_fy - fy) / fdy);
912
913	// until boundedEnd we can now have a fast middle part without boundary checks
914	const __m256i vdistShuffle =
915	_mm256_setr_epi8(b31: `0`, b30: char(`0x80`), b29: `0`, b28: char(`0x80`), b27: `4`, b26: char(`0x80`), b25: `4`, b24: char(`0x80`), b23: `8`, b22: char(`0x80`), b21: `8`, b20: char(`0x80`), b19: `12`, b18: char(`0x80`), b17: `12`, b16: char(`0x80`),
916	b15: `0`, b14: char(`0x80`), b13: `0`, b12: char(`0x80`), b11: `4`, b10: char(`0x80`), b09: `4`, b08: char(`0x80`), b07: `8`, b06: char(`0x80`), b05: `8`, b04: char(`0x80`), b03: `12`, b02: char(`0x80`), b01: `12`, b00: char(`0x80`));
917	const __m256i colorMask = _mm256_set1_epi32(i: `0x00ff00ff`);
918	const __m256i v_256 = _mm256_set1_epi16(w: `256`);
919	const __m256i v_fdx = _mm256_set1_epi32(i: fdx * `8`);
920	const __m256i v_fdy = _mm256_set1_epi32(i: fdy * `8`);
921	const __m256i v_fxy_r = _mm256_set1_epi32(i: `0x08`);
922	const __m256i v_index = _mm256_setr_epi32(i0: `0`, i1: `1`, i2: `2`, i3: `3`, i4: `4`, i5: `5`, i6: `6`, i7: `7`);
923	__m256i v_fx = _mm256_set1_epi32(i: fx);
924	__m256i v_fy = _mm256_set1_epi32(i: fy);
925	v_fx = _mm256_add_epi32(a: v_fx, b: _mm256_mullo_epi32(a: _mm256_set1_epi32(i: fdx), b: v_index));
926	v_fy = _mm256_add_epi32(a: v_fy, b: _mm256_mullo_epi32(a: _mm256_set1_epi32(i: fdy), b: v_index));
927
928	const uchar *textureData = image.imageData;
929	const qsizetype bytesPerLine = image.bytesPerLine;
930	const __m256i vbpl = _mm256_set1_epi16(w: bytesPerLine/`4`);
931
932	while (b < boundedEnd - `7`) {
933	const __m256i vy = _mm256_packs_epi32(a: _mm256_srli_epi32(a: v_fy, count: `16`), b: _mm256_setzero_si256());
934	// 8x16bit 8x16bit -> 8x32bit*
935	__m256i offset = _mm256_unpacklo_epi16(a: _mm256_mullo_epi16(a: vy, b: vbpl), b: _mm256_mulhi_epi16(a: vy, b: vbpl));
936	offset = _mm256_add_epi32(a: offset, b: _mm256_srli_epi32(a: v_fx, count: `16`));
937	const __m128i offsetLo = _mm256_castsi256_si128(a: offset);
938	const __m128i offsetHi = _mm256_extracti128_si256(offset, `1`);
939	const uint topData = (const* uint *)(textureData);
940	const uint botData = (const* uint *)(textureData + bytesPerLine);
941	const __m256i toplo = _mm256_i32gather_epi64((const long long *)topData, offsetLo, `4`);
942	const __m256i tophi = _mm256_i32gather_epi64((const long long *)topData, offsetHi, `4`);
943	const __m256i botlo = _mm256_i32gather_epi64((const long long *)botData, offsetLo, `4`);
944	const __m256i bothi = _mm256_i32gather_epi64((const long long *)botData, offsetHi, `4`);
945
946	__m256i v_distx = _mm256_srli_epi16(a: v_fx, count: `8`);
947	__m256i v_disty = _mm256_srli_epi16(a: v_fy, count: `8`);
948	v_distx = _mm256_srli_epi16(a: _mm256_add_epi32(a: v_distx, b: v_fxy_r), count: `4`);
949	v_disty = _mm256_srli_epi16(a: _mm256_add_epi32(a: v_disty, b: v_fxy_r), count: `4`);
950	v_distx = _mm256_shuffle_epi8(a: v_distx, b: vdistShuffle);
951	v_disty = _mm256_shuffle_epi8(a: v_disty, b: vdistShuffle);
952
953	interpolate_4_pixels_16_avx2(toplo, tophi, botlo, bothi, v_distx, v_disty, colorMask, v_256, b);
954	b += `8`;
955	v_fx = _mm256_add_epi32(a: v_fx, b: v_fdx);
956	v_fy = _mm256_add_epi32(a: v_fy, b: v_fdy);
957	}
958	fx = _mm_extract_epi32(_mm256_castsi256_si128(v_fx) , `0`);
959	fy = _mm_extract_epi32(_mm256_castsi256_si128(v_fy) , `0`);
960
961	while (b < boundedEnd) {
962	int x = (fx >> `16`);
963	int y = (fy >> `16`);
964
965	const uint s1 = (const* uint *)image.scanLine(y);
966	const uint s2 = (const* uint *)image.scanLine(y: y + `1`);
967
968	int distx = (fx & `0x0000ffff`) >> `8`;
969	int disty = (fy & `0x0000ffff`) >> `8`;
970	*b = interpolate_4_pixels(t: s1 + x, b: s2 + x, distx, disty);
971
972	fx += fdx;
973	fy += fdy;
974	++b;
975	}
976
977	while (b < end) {
978	int x1 = (fx >> `16`);
979	int x2;
980	int y1 = (fy >> `16`);
981	int y2;
982
983	fetchTransformedBilinear_pixelBounds(image.width, l1: image.x1, l2: image.x2 - `1`, v1&: x1, v2&: x2);
984	fetchTransformedBilinear_pixelBounds(image.height, l1: image.y1, l2: image.y2 - `1`, v1&: y1, v2&: y2);
985
986	const uint s1 = (const* uint *)image.scanLine(y: y1);
987	const uint s2 = (const* uint *)image.scanLine(y: y2);
988
989	uint tl = s1[x1];
990	uint tr = s1[x2];
991	uint bl = s2[x1];
992	uint br = s2[x2];
993
994	int distx = (fx & `0x0000ffff`) >> `8`;
995	int disty = (fy & `0x0000ffff`) >> `8`;
996	*b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
997
998	fx += fdx;
999	fy += fdy;
1000	++b;
1001	}
1002	}
1003
1004	static inline __m256i epilogueMaskFromCount(qsizetype count)
1005	{
1006	Q_ASSERT(count > `0`);
1007	static const __m256i offsetMask = _mm256_setr_epi32(i0: `0`, i1: `1`, i2: `2`, i3: `3`, i4: `4`, i5: `5`, i6: `6`, i7: `7`);
1008	return _mm256_add_epi32(a: offsetMask, b: _mm256_set1_epi32(i: -count));
1009	}
1010
1011	template<bool RGBA>
1012	static void convertARGBToARGB32PM_avx2(uint buffer, const* uint *src, qsizetype count)
1013	{
1014	qsizetype i = `0`;
1015	const __m256i alphaMask = _mm256_set1_epi32(i: `0xff000000`);
1016	const __m256i rgbaMask = _mm256_broadcastsi128_si256(X: _mm_setr_epi8(b0: `2`, b1: `1`, b2: `0`, b3: `3`, b4: `6`, b5: `5`, b6: `4`, b7: `7`, b8: `10`, b9: `9`, b10: `8`, b11: `11`, b12: `14`, b13: `13`, b14: `12`, b15: `15`));
1017	const __m256i shuffleMask = _mm256_broadcastsi128_si256(X: _mm_setr_epi8(b0: `6`, b1: `7`, b2: `6`, b3: `7`, b4: `6`, b5: `7`, b6: `6`, b7: `7`, b8: `14`, b9: `15`, b10: `14`, b11: `15`, b12: `14`, b13: `15`, b14: `14`, b15: `15`));
1018	const __m256i half = _mm256_set1_epi16(w: `0x0080`);
1019	const __m256i zero = _mm256_setzero_si256();
1020
1021	for (; i < count - `7`; i += `8`) {
1022	__m256i srcVector = _mm256_loadu_si256(p: reinterpret_cast<const __m256i *>(src + i));
1023	if (!_mm256_testz_si256(a: srcVector, b: alphaMask)) {
1024	// keep the two _mm_test[zc]_siXXX next to each other
1025	bool cf = _mm256_testc_si256(a: srcVector, b: alphaMask);
1026	if (RGBA)
1027	srcVector = _mm256_shuffle_epi8(a: srcVector, b: rgbaMask);
1028	if (!cf) {
1029	__m256i src1 = _mm256_unpacklo_epi8(a: srcVector, b: zero);
1030	__m256i src2 = _mm256_unpackhi_epi8(a: srcVector, b: zero);
1031	__m256i alpha1 = _mm256_shuffle_epi8(a: src1, b: shuffleMask);
1032	__m256i alpha2 = _mm256_shuffle_epi8(a: src2, b: shuffleMask);
1033	src1 = _mm256_mullo_epi16(a: src1, b: alpha1);
1034	src2 = _mm256_mullo_epi16(a: src2, b: alpha2);
1035	src1 = _mm256_add_epi16(a: src1, b: _mm256_srli_epi16(a: src1, count: `8`));
1036	src2 = _mm256_add_epi16(a: src2, b: _mm256_srli_epi16(a: src2, count: `8`));
1037	src1 = _mm256_add_epi16(a: src1, b: half);
1038	src2 = _mm256_add_epi16(a: src2, b: half);
1039	src1 = _mm256_srli_epi16(a: src1, count: `8`);
1040	src2 = _mm256_srli_epi16(a: src2, count: `8`);
1041	src1 = _mm256_blend_epi16(src1, alpha1, `0x88`);
1042	src2 = _mm256_blend_epi16(src2, alpha2, `0x88`);
1043	srcVector = _mm256_packus_epi16(a: src1, b: src2);
1044	_mm256_storeu_si256(p: reinterpret_cast<__m256i *>(buffer + i), a: srcVector);
1045	} else {
1046	if (buffer != src \|\| RGBA)
1047	_mm256_storeu_si256(p: reinterpret_cast<__m256i *>(buffer + i), a: srcVector);
1048	}
1049	} else {
1050	_mm256_storeu_si256(p: reinterpret_cast<__m256i *>(buffer + i), a: zero);
1051	}
1052	}
1053
1054	if (i < count) {
1055	const __m256i epilogueMask = epilogueMaskFromCount(count: count - i);
1056	__m256i srcVector = _mm256_maskload_epi32(X: reinterpret_cast<const int *>(src + i), M: epilogueMask);
1057	const __m256i epilogueAlphaMask = _mm256_blendv_epi8(V1: _mm256_setzero_si256(), V2: alphaMask, M: epilogueMask);
1058
1059	if (!_mm256_testz_si256(a: srcVector, b: epilogueAlphaMask)) {
1060	// keep the two _mm_test[zc]_siXXX next to each other
1061	bool cf = _mm256_testc_si256(a: srcVector, b: epilogueAlphaMask);
1062	if (RGBA)
1063	srcVector = _mm256_shuffle_epi8(a: srcVector, b: rgbaMask);
1064	if (!cf) {
1065	__m256i src1 = _mm256_unpacklo_epi8(a: srcVector, b: zero);
1066	__m256i src2 = _mm256_unpackhi_epi8(a: srcVector, b: zero);
1067	__m256i alpha1 = _mm256_shuffle_epi8(a: src1, b: shuffleMask);
1068	__m256i alpha2 = _mm256_shuffle_epi8(a: src2, b: shuffleMask);
1069	src1 = _mm256_mullo_epi16(a: src1, b: alpha1);
1070	src2 = _mm256_mullo_epi16(a: src2, b: alpha2);
1071	src1 = _mm256_add_epi16(a: src1, b: _mm256_srli_epi16(a: src1, count: `8`));
1072	src2 = _mm256_add_epi16(a: src2, b: _mm256_srli_epi16(a: src2, count: `8`));
1073	src1 = _mm256_add_epi16(a: src1, b: half);
1074	src2 = _mm256_add_epi16(a: src2, b: half);
1075	src1 = _mm256_srli_epi16(a: src1, count: `8`);
1076	src2 = _mm256_srli_epi16(a: src2, count: `8`);
1077	src1 = _mm256_blend_epi16(src1, alpha1, `0x88`);
1078	src2 = _mm256_blend_epi16(src2, alpha2, `0x88`);
1079	srcVector = _mm256_packus_epi16(a: src1, b: src2);
1080	_mm256_maskstore_epi32(X: reinterpret_cast<int *>(buffer + i), M: epilogueMask, Y: srcVector);
1081	} else {
1082	if (buffer != src \|\| RGBA)
1083	_mm256_maskstore_epi32(X: reinterpret_cast<int *>(buffer + i), M: epilogueMask, Y: srcVector);
1084	}
1085	} else {
1086	_mm256_maskstore_epi32(X: reinterpret_cast<int *>(buffer + i), M: epilogueMask, Y: zero);
1087	}
1088	}
1089	}
1090
1091	void QT_FASTCALL convertARGB32ToARGB32PM_avx2(uint buffer, int* count, const QVector<QRgb> *)
1092	{
1093	convertARGBToARGB32PM_avx2<false>(buffer, src: buffer, count);
1094	}
1095
1096	void QT_FASTCALL convertRGBA8888ToARGB32PM_avx2(uint buffer, int* count, const QVector<QRgb> *)
1097	{
1098	convertARGBToARGB32PM_avx2<true>(buffer, src: buffer, count);
1099	}
1100
1101	const uint QT_FASTCALL fetchARGB32ToARGB32PM_avx2(uint buffer, const uchar src, int* index, int count,
1102	const QVector<QRgb> , QDitherInfo )
1103	{
1104	convertARGBToARGB32PM_avx2<false>(buffer, src: reinterpret_cast<const uint *>(src) + index, count);
1105	return buffer;
1106	}
1107
1108	const uint QT_FASTCALL fetchRGBA8888ToARGB32PM_avx2(uint buffer, const uchar src, int* index, int count,
1109	const QVector<QRgb> , QDitherInfo )
1110	{
1111	convertARGBToARGB32PM_avx2<true>(buffer, src: reinterpret_cast<const uint *>(src) + index, count);
1112	return buffer;
1113	}
1114
1115	template<bool RGBA>
1116	static void convertARGBToRGBA64PM_avx2(QRgba64 buffer, const* uint *src, qsizetype count)
1117	{
1118	qsizetype i = `0`;
1119	const __m256i alphaMask = _mm256_set1_epi32(i: `0xff000000`);
1120	const __m256i rgbaMask = _mm256_broadcastsi128_si256(X: _mm_setr_epi8(b0: `2`, b1: `1`, b2: `0`, b3: `3`, b4: `6`, b5: `5`, b6: `4`, b7: `7`, b8: `10`, b9: `9`, b10: `8`, b11: `11`, b12: `14`, b13: `13`, b14: `12`, b15: `15`));
1121	const __m256i shuffleMask = _mm256_broadcastsi128_si256(X: _mm_setr_epi8(b0: `6`, b1: `7`, b2: `6`, b3: `7`, b4: `6`, b5: `7`, b6: `6`, b7: `7`, b8: `14`, b9: `15`, b10: `14`, b11: `15`, b12: `14`, b13: `15`, b14: `14`, b15: `15`));
1122	const __m256i zero = _mm256_setzero_si256();
1123
1124	for (; i < count - `7`; i += `8`) {
1125	__m256i dst1, dst2;
1126	__m256i srcVector = _mm256_loadu_si256(p: reinterpret_cast<const __m256i *>(src + i));
1127	if (!_mm256_testz_si256(a: srcVector, b: alphaMask)) {
1128	// keep the two _mm_test[zc]_siXXX next to each other
1129	bool cf = _mm256_testc_si256(a: srcVector, b: alphaMask);
1130	if (!RGBA)
1131	srcVector = _mm256_shuffle_epi8(a: srcVector, b: rgbaMask);
1132
1133	// The two unpack instructions unpack the low and upper halves of
1134	// each 128-bit half of the 256-bit register. Here's the tracking
1135	// of what's where: (p is 32-bit, P is 64-bit)
1136	// as loaded: [ p1, p2, p3, p4; p5, p6, p7, p8 ]
1137	// after permute4x64 [ p1, p2, p5, p6; p3, p4, p7, p8 ]
1138	// after unpacklo/hi [ P1, P2; P3, P4 ] [ P5, P6; P7, P8 ]
1139	srcVector = _mm256_permute4x64_epi64(srcVector, _MM_SHUFFLE(`3`, `1`, `2`, `0`));
1140
1141	const __m256i src1 = _mm256_unpacklo_epi8(a: srcVector, b: srcVector);
1142	const __m256i src2 = _mm256_unpackhi_epi8(a: srcVector, b: srcVector);
1143	if (!cf) {
1144	const __m256i alpha1 = _mm256_shuffle_epi8(a: src1, b: shuffleMask);
1145	const __m256i alpha2 = _mm256_shuffle_epi8(a: src2, b: shuffleMask);
1146	dst1 = _mm256_mulhi_epu16(a: src1, b: alpha1);
1147	dst2 = _mm256_mulhi_epu16(a: src2, b: alpha2);
1148	dst1 = _mm256_add_epi16(a: dst1, b: _mm256_srli_epi16(a: dst1, count: `15`));
1149	dst2 = _mm256_add_epi16(a: dst2, b: _mm256_srli_epi16(a: dst2, count: `15`));
1150	dst1 = _mm256_blend_epi16(dst1, src1, `0x88`);
1151	dst2 = _mm256_blend_epi16(dst2, src2, `0x88`);
1152	} else {
1153	dst1 = src1;
1154	dst2 = src2;
1155	}
1156	} else {
1157	dst1 = dst2 = zero;
1158	}
1159	_mm256_storeu_si256(p: reinterpret_cast<__m256i *>(buffer + i), a: dst1);
1160	_mm256_storeu_si256(p: reinterpret_cast<__m256i *>(buffer + i) + `1`, a: dst2);
1161	}
1162
1163	if (i < count) {
1164	__m256i epilogueMask = epilogueMaskFromCount(count: count - i);
1165	const __m256i epilogueAlphaMask = _mm256_blendv_epi8(V1: _mm256_setzero_si256(), V2: alphaMask, M: epilogueMask);
1166	__m256i dst1, dst2;
1167	__m256i srcVector = _mm256_maskload_epi32(X: reinterpret_cast<const int *>(src + i), M: epilogueMask);
1168
1169	if (!_mm256_testz_si256(a: srcVector, b: epilogueAlphaMask)) {
1170	// keep the two _mm_test[zc]_siXXX next to each other
1171	bool cf = _mm256_testc_si256(a: srcVector, b: epilogueAlphaMask);
1172	if (!RGBA)
1173	srcVector = _mm256_shuffle_epi8(a: srcVector, b: rgbaMask);
1174	srcVector = _mm256_permute4x64_epi64(srcVector, _MM_SHUFFLE(`3`, `1`, `2`, `0`));
1175	const __m256i src1 = _mm256_unpacklo_epi8(a: srcVector, b: srcVector);
1176	const __m256i src2 = _mm256_unpackhi_epi8(a: srcVector, b: srcVector);
1177	if (!cf) {
1178	const __m256i alpha1 = _mm256_shuffle_epi8(a: src1, b: shuffleMask);
1179	const __m256i alpha2 = _mm256_shuffle_epi8(a: src2, b: shuffleMask);
1180	dst1 = _mm256_mulhi_epu16(a: src1, b: alpha1);
1181	dst2 = _mm256_mulhi_epu16(a: src2, b: alpha2);
1182	dst1 = _mm256_add_epi16(a: dst1, b: _mm256_srli_epi16(a: dst1, count: `15`));
1183	dst2 = _mm256_add_epi16(a: dst2, b: _mm256_srli_epi16(a: dst2, count: `15`));
1184	dst1 = _mm256_blend_epi16(dst1, src1, `0x88`);
1185	dst2 = _mm256_blend_epi16(dst2, src2, `0x88`);
1186	} else {
1187	dst1 = src1;
1188	dst2 = src2;
1189	}
1190	} else {
1191	dst1 = dst2 = zero;
1192	}
1193	epilogueMask = _mm256_permute4x64_epi64(epilogueMask, _MM_SHUFFLE(`3`, `1`, `2`, `0`));
1194	_mm256_maskstore_epi64(X: reinterpret_cast<qint64 *>(buffer + i),
1195	M: _mm256_unpacklo_epi32(a: epilogueMask, b: epilogueMask),
1196	Y: dst1);
1197	_mm256_maskstore_epi64(X: reinterpret_cast<qint64 *>(buffer + i + `4`),
1198	M: _mm256_unpackhi_epi32(a: epilogueMask, b: epilogueMask),
1199	Y: dst2);
1200	}
1201	}
1202
1203	const QRgba64 * QT_FASTCALL convertARGB32ToRGBA64PM_avx2(QRgba64 buffer, const* uint src, int* count,
1204	const QVector<QRgb> , QDitherInfo )
1205	{
1206	convertARGBToRGBA64PM_avx2<false>(buffer, src, count);
1207	return buffer;
1208	}
1209
1210	const QRgba64 * QT_FASTCALL convertRGBA8888ToRGBA64PM_avx2(QRgba64 buffer, const* uint src, int* count,
1211	const QVector<QRgb> , QDitherInfo )
1212	{
1213	convertARGBToRGBA64PM_avx2<true>(buffer, src, count);
1214	return buffer;
1215	}
1216
1217	const QRgba64 QT_FASTCALL fetchARGB32ToRGBA64PM_avx2(QRgba64 buffer, const uchar src, int* index, int count,
1218	const QVector<QRgb> , QDitherInfo )
1219	{
1220	convertARGBToRGBA64PM_avx2<false>(buffer, src: reinterpret_cast<const uint *>(src) + index, count);
1221	return buffer;
1222	}
1223
1224	const QRgba64 QT_FASTCALL fetchRGBA8888ToRGBA64PM_avx2(QRgba64 buffer, const uchar src, int* index, int count,
1225	const QVector<QRgb> , QDitherInfo )
1226	{
1227	convertARGBToRGBA64PM_avx2<true>(buffer, src: reinterpret_cast<const uint *>(src) + index, count);
1228	return buffer;
1229	}
1230
1231	QT_END_NAMESPACE
1232
1233	#endif
1234

source code of qtbase/src/gui/painting/qdrawhelper_avx2.cpp