qdrawhelper_avx2.cpp source code [qtbase/src/gui/painting/qdrawhelper_avx2.cpp]

1	// Copyright (C) 2018 The Qt Company Ltd.
2	// Copyright (C) 2018 Intel Corporation.
3	// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
4
5	#include "qdrawhelper_p.h"
6	#include "qdrawhelper_x86_p.h"
7	#include "qdrawingprimitive_sse2_p.h"
8	#include "qpixellayout_p.h"
9	#include "qrgba64_p.h"
10
11	#if defined(QT_COMPILER_SUPPORTS_AVX2)
12
13	QT_BEGIN_NAMESPACE
14
// Scaling constants: FixedScale is 2^16 and HalfPoint is exactly half of it
// (presumably 1.0 and 0.5 in a 16.16 fixed-point format — confirm at use sites).
enum {
    FixedScale = 1 << 16,
    HalfPoint = 1 << 15
};
19
20	// Vectorized blend functions:
21
22	// See BYTE_MUL_SSE2 for details.
23	inline static void Q_DECL_VECTORCALL
24	BYTE_MUL_AVX2(__m256i &pixelVector, __m256i alphaChannel, __m256i colorMask, __m256i half)
25	{
26	__m256i pixelVectorAG = _mm256_srli_epi16(a: pixelVector, count: `8`);
27	__m256i pixelVectorRB = _mm256_and_si256(a: pixelVector, b: colorMask);
28
29	pixelVectorAG = _mm256_mullo_epi16(a: pixelVectorAG, b: alphaChannel);
30	pixelVectorRB = _mm256_mullo_epi16(a: pixelVectorRB, b: alphaChannel);
31
32	pixelVectorRB = _mm256_add_epi16(a: pixelVectorRB, b: _mm256_srli_epi16(a: pixelVectorRB, count: `8`));
33	pixelVectorAG = _mm256_add_epi16(a: pixelVectorAG, b: _mm256_srli_epi16(a: pixelVectorAG, count: `8`));
34	pixelVectorRB = _mm256_add_epi16(a: pixelVectorRB, b: half);
35	pixelVectorAG = _mm256_add_epi16(a: pixelVectorAG, b: half);
36
37	pixelVectorRB = _mm256_srli_epi16(a: pixelVectorRB, count: `8`);
38	pixelVectorAG = _mm256_andnot_si256(a: colorMask, b: pixelVectorAG);
39
40	pixelVector = _mm256_or_si256(a: pixelVectorAG, b: pixelVectorRB);
41	}
42
43	inline static void Q_DECL_VECTORCALL
44	BYTE_MUL_RGB64_AVX2(__m256i &pixelVector, __m256i alphaChannel, __m256i colorMask, __m256i half)
45	{
46	__m256i pixelVectorAG = _mm256_srli_epi32(a: pixelVector, count: `16`);
47	__m256i pixelVectorRB = _mm256_and_si256(a: pixelVector, b: colorMask);
48
49	pixelVectorAG = _mm256_mullo_epi32(a: pixelVectorAG, b: alphaChannel);
50	pixelVectorRB = _mm256_mullo_epi32(a: pixelVectorRB, b: alphaChannel);
51
52	pixelVectorRB = _mm256_add_epi32(a: pixelVectorRB, b: _mm256_srli_epi32(a: pixelVectorRB, count: `16`));
53	pixelVectorAG = _mm256_add_epi32(a: pixelVectorAG, b: _mm256_srli_epi32(a: pixelVectorAG, count: `16`));
54	pixelVectorRB = _mm256_add_epi32(a: pixelVectorRB, b: half);
55	pixelVectorAG = _mm256_add_epi32(a: pixelVectorAG, b: half);
56
57	pixelVectorRB = _mm256_srli_epi32(a: pixelVectorRB, count: `16`);
58	pixelVectorAG = _mm256_andnot_si256(a: colorMask, b: pixelVectorAG);
59
60	pixelVector = _mm256_or_si256(a: pixelVectorAG, b: pixelVectorRB);
61	}
62
63	// See INTERPOLATE_PIXEL_255_SSE2 for details.
64	inline static void Q_DECL_VECTORCALL
65	INTERPOLATE_PIXEL_255_AVX2(__m256i srcVector, __m256i &dstVector, __m256i alphaChannel, __m256i oneMinusAlphaChannel, __m256i colorMask, __m256i half)
66	{
67	const __m256i srcVectorAG = _mm256_srli_epi16(a: srcVector, count: `8`);
68	const __m256i dstVectorAG = _mm256_srli_epi16(a: dstVector, count: `8`);
69	const __m256i srcVectorRB = _mm256_and_si256(a: srcVector, b: colorMask);
70	const __m256i dstVectorRB = _mm256_and_si256(a: dstVector, b: colorMask);
71	const __m256i srcVectorAGalpha = _mm256_mullo_epi16(a: srcVectorAG, b: alphaChannel);
72	const __m256i srcVectorRBalpha = _mm256_mullo_epi16(a: srcVectorRB, b: alphaChannel);
73	const __m256i dstVectorAGoneMinusAlpha = _mm256_mullo_epi16(a: dstVectorAG, b: oneMinusAlphaChannel);
74	const __m256i dstVectorRBoneMinusAlpha = _mm256_mullo_epi16(a: dstVectorRB, b: oneMinusAlphaChannel);
75	__m256i finalAG = _mm256_add_epi16(a: srcVectorAGalpha, b: dstVectorAGoneMinusAlpha);
76	__m256i finalRB = _mm256_add_epi16(a: srcVectorRBalpha, b: dstVectorRBoneMinusAlpha);
77	finalAG = _mm256_add_epi16(a: finalAG, b: _mm256_srli_epi16(a: finalAG, count: `8`));
78	finalRB = _mm256_add_epi16(a: finalRB, b: _mm256_srli_epi16(a: finalRB, count: `8`));
79	finalAG = _mm256_add_epi16(a: finalAG, b: half);
80	finalRB = _mm256_add_epi16(a: finalRB, b: half);
81	finalAG = _mm256_andnot_si256(a: colorMask, b: finalAG);
82	finalRB = _mm256_srli_epi16(a: finalRB, count: `8`);
83
84	dstVector = _mm256_or_si256(a: finalAG, b: finalRB);
85	}
86
87	inline static void Q_DECL_VECTORCALL
88	INTERPOLATE_PIXEL_RGB64_AVX2(__m256i srcVector, __m256i &dstVector, __m256i alphaChannel, __m256i oneMinusAlphaChannel, __m256i colorMask, __m256i half)
89	{
90	const __m256i srcVectorAG = _mm256_srli_epi32(a: srcVector, count: `16`);
91	const __m256i dstVectorAG = _mm256_srli_epi32(a: dstVector, count: `16`);
92	const __m256i srcVectorRB = _mm256_and_si256(a: srcVector, b: colorMask);
93	const __m256i dstVectorRB = _mm256_and_si256(a: dstVector, b: colorMask);
94	const __m256i srcVectorAGalpha = _mm256_mullo_epi32(a: srcVectorAG, b: alphaChannel);
95	const __m256i srcVectorRBalpha = _mm256_mullo_epi32(a: srcVectorRB, b: alphaChannel);
96	const __m256i dstVectorAGoneMinusAlpha = _mm256_mullo_epi32(a: dstVectorAG, b: oneMinusAlphaChannel);
97	const __m256i dstVectorRBoneMinusAlpha = _mm256_mullo_epi32(a: dstVectorRB, b: oneMinusAlphaChannel);
98	__m256i finalAG = _mm256_add_epi32(a: srcVectorAGalpha, b: dstVectorAGoneMinusAlpha);
99	__m256i finalRB = _mm256_add_epi32(a: srcVectorRBalpha, b: dstVectorRBoneMinusAlpha);
100	finalAG = _mm256_add_epi32(a: finalAG, b: _mm256_srli_epi32(a: finalAG, count: `16`));
101	finalRB = _mm256_add_epi32(a: finalRB, b: _mm256_srli_epi32(a: finalRB, count: `16`));
102	finalAG = _mm256_add_epi32(a: finalAG, b: half);
103	finalRB = _mm256_add_epi32(a: finalRB, b: half);
104	finalAG = _mm256_andnot_si256(a: colorMask, b: finalAG);
105	finalRB = _mm256_srli_epi32(a: finalRB, count: `16`);
106
107	dstVector = _mm256_or_si256(a: finalAG, b: finalRB);
108	}
109
110	// See BLEND_SOURCE_OVER_ARGB32_SSE2 for details.
111	inline static void Q_DECL_VECTORCALL BLEND_SOURCE_OVER_ARGB32_AVX2(quint32 dst, const* quint32 src, const* int length)
112	{
113	const __m256i half = _mm256_set1_epi16(w: `0x80`);
114	const __m256i one = _mm256_set1_epi16(w: `0xff`);
115	const __m256i colorMask = _mm256_set1_epi32(i: `0x00ff00ff`);
116	const __m256i alphaMask = _mm256_set1_epi32(i: `0xff000000`);
117	const __m256i offsetMask = _mm256_setr_epi32(i0: `0`, i1: `1`, i2: `2`, i3: `3`, i4: `4`, i5: `5`, i6: `6`, i7: `7`);
118	const __m256i alphaShuffleMask = _mm256_set_epi8(b31: char(`0xff`),b30: `15`,b29: char(`0xff`),b28: `15`,b27: char(`0xff`),b26: `11`,b25: char(`0xff`),b24: `11`,b23: char(`0xff`),b22: `7`,b21: char(`0xff`),b20: `7`,b19: char(`0xff`),b18: `3`,b17: char(`0xff`),b16: `3`,
119	b15: char(`0xff`),b14: `15`,b13: char(`0xff`),b12: `15`,b11: char(`0xff`),b10: `11`,b09: char(`0xff`),b08: `11`,b07: char(`0xff`),b06: `7`,b05: char(`0xff`),b04: `7`,b03: char(`0xff`),b02: `3`,b01: char(`0xff`),b00: `3`);
120
121	const int minusOffsetToAlignDstOn32Bytes = (reinterpret_cast<quintptr>(dst) >> `2`) & `0x7`;
122
123	int x = `0`;
124	// Prologue to handle all pixels until dst is 32-byte aligned in one step.
125	if (minusOffsetToAlignDstOn32Bytes != `0` && x < (length - `7`)) {
126	const __m256i prologueMask = _mm256_sub_epi32(a: _mm256_set1_epi32(i: minusOffsetToAlignDstOn32Bytes - `1`), b: offsetMask);
127	const __m256i srcVector = _mm256_maskload_epi32(X: (const int *)&src[x - minusOffsetToAlignDstOn32Bytes], M: prologueMask);
128	const __m256i prologueAlphaMask = _mm256_blendv_epi8(V1: _mm256_setzero_si256(), V2: alphaMask, M: prologueMask);
129	if (!_mm256_testz_si256(a: srcVector, b: prologueAlphaMask)) {
130	if (_mm256_testc_si256(a: srcVector, b: prologueAlphaMask)) {
131	_mm256_maskstore_epi32(X: (int *)&dst[x - minusOffsetToAlignDstOn32Bytes], M: prologueMask, Y: srcVector);
132	} else {
133	__m256i alphaChannel = _mm256_shuffle_epi8(a: srcVector, b: alphaShuffleMask);
134	alphaChannel = _mm256_sub_epi16(a: one, b: alphaChannel);
135	__m256i dstVector = _mm256_maskload_epi32(X: (int *)&dst[x - minusOffsetToAlignDstOn32Bytes], M: prologueMask);
136	BYTE_MUL_AVX2(pixelVector&: dstVector, alphaChannel, colorMask, half);
137	dstVector = _mm256_add_epi8(a: dstVector, b: srcVector);
138	_mm256_maskstore_epi32(X: (int *)&dst[x - minusOffsetToAlignDstOn32Bytes], M: prologueMask, Y: dstVector);
139	}
140	}
141	x += (`8` - minusOffsetToAlignDstOn32Bytes);
142	}
143
144	for (; x < (length - `7`); x += `8`) {
145	const __m256i srcVector = _mm256_lddqu_si256(p: (const __m256i *)&src[x]);
146	if (!_mm256_testz_si256(a: srcVector, b: alphaMask)) {
147	if (_mm256_testc_si256(a: srcVector, b: alphaMask)) {
148	_mm256_store_si256(p: (__m256i *)&dst[x], a: srcVector);
149	} else {
150	__m256i alphaChannel = _mm256_shuffle_epi8(a: srcVector, b: alphaShuffleMask);
151	alphaChannel = _mm256_sub_epi16(a: one, b: alphaChannel);
152	__m256i dstVector = _mm256_load_si256(p: (__m256i *)&dst[x]);
153	BYTE_MUL_AVX2(pixelVector&: dstVector, alphaChannel, colorMask, half);
154	dstVector = _mm256_add_epi8(a: dstVector, b: srcVector);
155	_mm256_store_si256(p: (__m256i *)&dst[x], a: dstVector);
156	}
157	}
158	}
159
160	// Epilogue to handle all remaining pixels in one step.
161	if (x < length) {
162	const __m256i epilogueMask = _mm256_add_epi32(a: offsetMask, b: _mm256_set1_epi32(i: x - length));
163	const __m256i srcVector = _mm256_maskload_epi32(X: (const int *)&src[x], M: epilogueMask);
164	const __m256i epilogueAlphaMask = _mm256_blendv_epi8(V1: _mm256_setzero_si256(), V2: alphaMask, M: epilogueMask);
165	if (!_mm256_testz_si256(a: srcVector, b: epilogueAlphaMask)) {
166	if (_mm256_testc_si256(a: srcVector, b: epilogueAlphaMask)) {
167	_mm256_maskstore_epi32(X: (int *)&dst[x], M: epilogueMask, Y: srcVector);
168	} else {
169	__m256i alphaChannel = _mm256_shuffle_epi8(a: srcVector, b: alphaShuffleMask);
170	alphaChannel = _mm256_sub_epi16(a: one, b: alphaChannel);
171	__m256i dstVector = _mm256_maskload_epi32(X: (int *)&dst[x], M: epilogueMask);
172	BYTE_MUL_AVX2(pixelVector&: dstVector, alphaChannel, colorMask, half);
173	dstVector = _mm256_add_epi8(a: dstVector, b: srcVector);
174	_mm256_maskstore_epi32(X: (int *)&dst[x], M: epilogueMask, Y: dstVector);
175	}
176	}
177	}
178	}
179
180
181	// See BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2 for details.
182	inline static void Q_DECL_VECTORCALL
183	BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_AVX2(quint32 dst, const* quint32 src, const* int length, const int const_alpha)
184	{
185	int x = `0`;
186
187	ALIGNMENT_PROLOGUE_32BYTES(dst, x, length)
188	blend_pixel(dst&: dst[x], src: src[x], const_alpha);
189
190	const __m256i half = _mm256_set1_epi16(w: `0x80`);
191	const __m256i one = _mm256_set1_epi16(w: `0xff`);
192	const __m256i colorMask = _mm256_set1_epi32(i: `0x00ff00ff`);
193	const __m256i alphaMask = _mm256_set1_epi32(i: `0xff000000`);
194	const __m256i alphaShuffleMask = _mm256_set_epi8(b31: char(`0xff`),b30: `15`,b29: char(`0xff`),b28: `15`,b27: char(`0xff`),b26: `11`,b25: char(`0xff`),b24: `11`,b23: char(`0xff`),b22: `7`,b21: char(`0xff`),b20: `7`,b19: char(`0xff`),b18: `3`,b17: char(`0xff`),b16: `3`,
195	b15: char(`0xff`),b14: `15`,b13: char(`0xff`),b12: `15`,b11: char(`0xff`),b10: `11`,b09: char(`0xff`),b08: `11`,b07: char(`0xff`),b06: `7`,b05: char(`0xff`),b04: `7`,b03: char(`0xff`),b02: `3`,b01: char(`0xff`),b00: `3`);
196	const __m256i constAlphaVector = _mm256_set1_epi16(w: const_alpha);
197	for (; x < (length - `7`); x += `8`) {
198	__m256i srcVector = _mm256_lddqu_si256(p: (const __m256i *)&src[x]);
199	if (!_mm256_testz_si256(a: srcVector, b: alphaMask)) {
200	BYTE_MUL_AVX2(pixelVector&: srcVector, alphaChannel: constAlphaVector, colorMask, half);
201
202	__m256i alphaChannel = _mm256_shuffle_epi8(a: srcVector, b: alphaShuffleMask);
203	alphaChannel = _mm256_sub_epi16(a: one, b: alphaChannel);
204	__m256i dstVector = _mm256_load_si256(p: (__m256i *)&dst[x]);
205	BYTE_MUL_AVX2(pixelVector&: dstVector, alphaChannel, colorMask, half);
206	dstVector = _mm256_add_epi8(a: dstVector, b: srcVector);
207	_mm256_store_si256(p: (__m256i *)&dst[x], a: dstVector);
208	}
209	}
210	SIMD_EPILOGUE(x, length, `7`)
211	blend_pixel(dst&: dst[x], src: src[x], const_alpha);
212	}
213
214	void qt_blend_argb32_on_argb32_avx2(uchar destPixels, int* dbpl,
215	const uchar srcPixels, int* sbpl,
216	int w, int h,
217	int const_alpha)
218	{
219	if (const_alpha == `256`) {
220	for (int y = `0`; y < h; ++y) {
221	const quint32 src = reinterpret_cast<const* quint32 *>(srcPixels);
222	quint32 dst = reinterpret_cast<quint32 >(destPixels);
223	BLEND_SOURCE_OVER_ARGB32_AVX2(dst, src, length: w);
224	destPixels += dbpl;
225	srcPixels += sbpl;
226	}
227	} else if (const_alpha != `0`) {
228	const_alpha = (const_alpha * `255`) >> `8`;
229	for (int y = `0`; y < h; ++y) {
230	const quint32 src = reinterpret_cast<const* quint32 *>(srcPixels);
231	quint32 dst = reinterpret_cast<quint32 >(destPixels);
232	BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_AVX2(dst, src, length: w, const_alpha);
233	destPixels += dbpl;
234	srcPixels += sbpl;
235	}
236	}
237	}
238
239	void qt_blend_rgb32_on_rgb32_avx2(uchar destPixels, int* dbpl,
240	const uchar srcPixels, int* sbpl,
241	int w, int h,
242	int const_alpha)
243	{
244	if (const_alpha == `256`) {
245	for (int y = `0`; y < h; ++y) {
246	const quint32 src = reinterpret_cast<const* quint32 *>(srcPixels);
247	quint32 dst = reinterpret_cast<quint32 >(destPixels);
248	::memcpy(dest: dst, src: src, n: w * sizeof(uint));
249	srcPixels += sbpl;
250	destPixels += dbpl;
251	}
252	return;
253	}
254	if (const_alpha == `0`)
255	return;
256
257	const __m256i half = _mm256_set1_epi16(w: `0x80`);
258	const __m256i colorMask = _mm256_set1_epi32(i: `0x00ff00ff`);
259
260	const_alpha = (const_alpha * `255`) >> `8`;
261	int one_minus_const_alpha = `255` - const_alpha;
262	const __m256i constAlphaVector = _mm256_set1_epi16(w: const_alpha);
263	const __m256i oneMinusConstAlpha = _mm256_set1_epi16(w: one_minus_const_alpha);
264	for (int y = `0`; y < h; ++y) {
265	const quint32 src = reinterpret_cast<const* quint32 *>(srcPixels);
266	quint32 dst = reinterpret_cast<quint32 >(destPixels);
267	int x = `0`;
268
269	// First, align dest to 32 bytes:
270	ALIGNMENT_PROLOGUE_32BYTES(dst, x, w)
271	dst[x] = INTERPOLATE_PIXEL_255(x: src[x], a: const_alpha, y: dst[x], b: one_minus_const_alpha);
272
273	// 2) interpolate pixels with AVX2
274	for (; x < (w - `7`); x += `8`) {
275	const __m256i srcVector = _mm256_lddqu_si256(p: (const __m256i *)&src[x]);
276	__m256i dstVector = _mm256_load_si256(p: (__m256i *)&dst[x]);
277	INTERPOLATE_PIXEL_255_AVX2(srcVector, dstVector, alphaChannel: constAlphaVector, oneMinusAlphaChannel: oneMinusConstAlpha, colorMask, half);
278	_mm256_store_si256(p: (__m256i *)&dst[x], a: dstVector);
279	}
280
281	// 3) Epilogue
282	SIMD_EPILOGUE(x, w, `7`)
283	dst[x] = INTERPOLATE_PIXEL_255(x: src[x], a: const_alpha, y: dst[x], b: one_minus_const_alpha);
284
285	srcPixels += sbpl;
286	destPixels += dbpl;
287	}
288	}
289
290	static Q_NEVER_INLINE
291	void Q_DECL_VECTORCALL qt_memfillXX_avx2(uchar *dest, __m256i value256, qsizetype bytes)
292	{
293	__m128i value128 = _mm256_castsi256_si128(a: value256);
294
295	// main body
296	__m256i dst256 = reinterpret_cast<__m256i >(dest);
297	uchar *end = dest + bytes;
298	while (reinterpret_cast<uchar *>(dst256 + `4`) <= end) {
299	_mm256_storeu_si256(p: dst256 + `0`, a: value256);
300	_mm256_storeu_si256(p: dst256 + `1`, a: value256);
301	_mm256_storeu_si256(p: dst256 + `2`, a: value256);
302	_mm256_storeu_si256(p: dst256 + `3`, a: value256);
303	dst256 += `4`;
304	}
305
306	// first epilogue: fewer than 128 bytes / 32 entries
307	bytes = end - reinterpret_cast<uchar *>(dst256);
308	switch (bytes / sizeof(value256)) {
309	case `3`: _mm256_storeu_si256(p: dst256++, a: value256); Q_FALLTHROUGH();
310	case `2`: _mm256_storeu_si256(p: dst256++, a: value256); Q_FALLTHROUGH();
311	case `1`: _mm256_storeu_si256(p: dst256++, a: value256);
312	}
313
314	// second epilogue: fewer than 32 bytes
315	__m128i dst128 = reinterpret_cast<__m128i >(dst256);
316	if (bytes & sizeof(value128))
317	_mm_storeu_si128(p: dst128++, b: value128);
318
319	// third epilogue: fewer than 16 bytes
320	if (bytes & `8`)
321	_mm_storel_epi64(p: reinterpret_cast<__m128i *>(end - `8`), a: value128);
322	}
323
324	void qt_memfill64_avx2(quint64 *dest, quint64 value, qsizetype count)
325	{
326	#if defined(Q_CC_GNU) && !defined(Q_CC_CLANG)
327	// work around https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80820
328	__m128i value64 = _mm_set_epi64x(`0`, value); // _mm_cvtsi64_si128(value);
329	# ifdef Q_PROCESSOR_X86_64
330	asm ("" : "+x" (value64));
331	# endif
332	__m256i value256 = _mm256_broadcastq_epi64(value64);
333	#else
334	__m256i value256 = _mm256_set1_epi64x(q: value);
335	#endif
336
337	qt_memfillXX_avx2(dest: reinterpret_cast<uchar >(dest), value256, bytes: count sizeof(quint64));
338	}
339
340	void qt_memfill32_avx2(quint32 *dest, quint32 value, qsizetype count)
341	{
342	if (count % `2`) {
343	// odd number of pixels, round to even
344	*dest++ = value;
345	--count;
346	}
347	qt_memfillXX_avx2(dest: reinterpret_cast<uchar >(dest), value256: _mm256_set1_epi32(i: value), bytes: count sizeof(quint32));
348	}
349
350	void QT_FASTCALL comp_func_SourceOver_avx2(uint destPixels, const* uint srcPixels, int* length, uint const_alpha)
351	{
352	Q_ASSERT(const_alpha < `256`);
353
354	const quint32 src = (const* quint32 *) srcPixels;
355	quint32 dst = (quint32 ) destPixels;
356
357	if (const_alpha == `255`)
358	BLEND_SOURCE_OVER_ARGB32_AVX2(dst, src, length);
359	else
360	BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_AVX2(dst, src, length, const_alpha);
361	}
362
363	#if QT_CONFIG(raster_64bit)
364	void QT_FASTCALL comp_func_SourceOver_rgb64_avx2(QRgba64 dst, const* QRgba64 src, int* length, uint const_alpha)
365	{
366	Q_ASSERT(const_alpha < `256`); // const_alpha is in [0-255]
367	const __m256i half = _mm256_set1_epi32(i: `0x8000`);
368	const __m256i one = _mm256_set1_epi32(i: `0xffff`);
369	const __m256i colorMask = _mm256_set1_epi32(i: `0x0000ffff`);
370	__m256i alphaMask = _mm256_set1_epi32(i: `0xff000000`);
371	alphaMask = _mm256_unpacklo_epi8(a: alphaMask, b: alphaMask);
372	const __m256i alphaShuffleMask = _mm256_set_epi8(b31: char(`0xff`),b30: char(`0xff`),b29: `15`,b28: `14`,b27: char(`0xff`),b26: char(`0xff`),b25: `15`,b24: `14`,b23: char(`0xff`),b22: char(`0xff`),b21: `7`,b20: `6`,b19: char(`0xff`),b18: char(`0xff`),b17: `7`,b16: `6`,
373	b15: char(`0xff`),b14: char(`0xff`),b13: `15`,b12: `14`,b11: char(`0xff`),b10: char(`0xff`),b09: `15`,b08: `14`,b07: char(`0xff`),b06: char(`0xff`),b05: `7`,b04: `6`,b03: char(`0xff`),b02: char(`0xff`),b01: `7`,b00: `6`);
374
375	if (const_alpha == `255`) {
376	int x = `0`;
377	for (; x < length && (quintptr(dst + x) & `31`); ++x)
378	blend_pixel(dst&: dst[x], src: src[x]);
379	for (; x < length - `3`; x += `4`) {
380	const __m256i srcVector = _mm256_lddqu_si256(p: (const __m256i *)&src[x]);
381	if (!_mm256_testz_si256(a: srcVector, b: alphaMask)) {
382	// Not all transparent
383	if (_mm256_testc_si256(a: srcVector, b: alphaMask)) {
384	// All opaque
385	_mm256_store_si256(p: (__m256i *)&dst[x], a: srcVector);
386	} else {
387	__m256i alphaChannel = _mm256_shuffle_epi8(a: srcVector, b: alphaShuffleMask);
388	alphaChannel = _mm256_sub_epi32(a: one, b: alphaChannel);
389	__m256i dstVector = _mm256_load_si256(p: (__m256i *)&dst[x]);
390	BYTE_MUL_RGB64_AVX2(pixelVector&: dstVector, alphaChannel, colorMask, half);
391	dstVector = _mm256_add_epi16(a: dstVector, b: srcVector);
392	_mm256_store_si256(p: (__m256i *)&dst[x], a: dstVector);
393	}
394	}
395	}
396	SIMD_EPILOGUE(x, length, `3`)
397	blend_pixel(dst&: dst[x], src: src[x]);
398	} else {
399	const __m256i constAlphaVector = _mm256_set1_epi32(i: const_alpha \| (const_alpha << `8`));
400	int x = `0`;
401	for (; x < length && (quintptr(dst + x) & `31`); ++x)
402	blend_pixel(dst&: dst[x], src: src[x], const_alpha);
403	for (; x < length - `3`; x += `4`) {
404	__m256i srcVector = _mm256_lddqu_si256(p: (const __m256i *)&src[x]);
405	if (!_mm256_testz_si256(a: srcVector, b: alphaMask)) {
406	// Not all transparent
407	BYTE_MUL_RGB64_AVX2(pixelVector&: srcVector, alphaChannel: constAlphaVector, colorMask, half);
408
409	__m256i alphaChannel = _mm256_shuffle_epi8(a: srcVector, b: alphaShuffleMask);
410	alphaChannel = _mm256_sub_epi32(a: one, b: alphaChannel);
411	__m256i dstVector = _mm256_load_si256(p: (__m256i *)&dst[x]);
412	BYTE_MUL_RGB64_AVX2(pixelVector&: dstVector, alphaChannel, colorMask, half);
413	dstVector = _mm256_add_epi16(a: dstVector, b: srcVector);
414	_mm256_store_si256(p: (__m256i *)&dst[x], a: dstVector);
415	}
416	}
417	SIMD_EPILOGUE(x, length, `3`)
418	blend_pixel(dst&: dst[x], src: src[x], const_alpha);
419	}
420	}
421	#endif
422
423	#if QT_CONFIG(raster_fp)
424	void QT_FASTCALL comp_func_SourceOver_rgbafp_avx2(QRgbaFloat32 dst, const* QRgbaFloat32 src, int* length, uint const_alpha)
425	{
426	Q_ASSERT(const_alpha < `256`); // const_alpha is in [0-255]
427
428	const float a = const_alpha / `255.0f`;
429	const __m128 one = _mm_set1_ps(w: `1.0f`);
430	const __m128 constAlphaVector = _mm_set1_ps(w: a);
431	const __m256 one256 = _mm256_set1_ps(w: `1.0f`);
432	const __m256 constAlphaVector256 = _mm256_set1_ps(w: a);
433	int x = `0`;
434	for (; x < length - `1`; x += `2`) {
435	__m256 srcVector = _mm256_loadu_ps(p: (const float *)&src[x]);
436	__m256 dstVector = _mm256_loadu_ps(p: (const float *)&dst[x]);
437	srcVector = _mm256_mul_ps(a: srcVector, b: constAlphaVector256);
438	__m256 alphaChannel = _mm256_permute_ps(srcVector, _MM_SHUFFLE(`3`, `3`, `3`, `3`));
439	alphaChannel = _mm256_sub_ps(a: one256, b: alphaChannel);
440	dstVector = _mm256_mul_ps(a: dstVector, b: alphaChannel);
441	dstVector = _mm256_add_ps(a: dstVector, b: srcVector);
442	_mm256_storeu_ps(p: (float *)(dst + x), a: dstVector);
443	}
444	if (x < length) {
445	__m128 srcVector = _mm_loadu_ps(p: (const float *)&src[x]);
446	__m128 dstVector = _mm_loadu_ps(p: (const float *)&dst[x]);
447	srcVector = _mm_mul_ps(a: srcVector, b: constAlphaVector);
448	__m128 alphaChannel = _mm_permute_ps(srcVector, _MM_SHUFFLE(`3`, `3`, `3`, `3`));
449	alphaChannel = _mm_sub_ps(a: one, b: alphaChannel);
450	dstVector = _mm_mul_ps(a: dstVector, b: alphaChannel);
451	dstVector = _mm_add_ps(a: dstVector, b: srcVector);
452	_mm_storeu_ps(p: (float *)(dst + x), a: dstVector);
453	}
454	}
455	#endif
456
457	void QT_FASTCALL comp_func_Source_avx2(uint dst, const* uint src, int* length, uint const_alpha)
458	{
459	if (const_alpha == `255`) {
460	::memcpy(dest: dst, src: src, n: length * sizeof(uint));
461	} else {
462	const int ialpha = `255` - const_alpha;
463
464	int x = `0`;
465
466	// 1) prologue, align on 32 bytes
467	ALIGNMENT_PROLOGUE_32BYTES(dst, x, length)
468	dst[x] = INTERPOLATE_PIXEL_255(x: src[x], a: const_alpha, y: dst[x], b: ialpha);
469
470	// 2) interpolate pixels with AVX2
471	const __m256i half = _mm256_set1_epi16(w: `0x80`);
472	const __m256i colorMask = _mm256_set1_epi32(i: `0x00ff00ff`);
473	const __m256i constAlphaVector = _mm256_set1_epi16(w: const_alpha);
474	const __m256i oneMinusConstAlpha = _mm256_set1_epi16(w: ialpha);
475	for (; x < length - `7`; x += `8`) {
476	const __m256i srcVector = _mm256_lddqu_si256(p: (const __m256i *)&src[x]);
477	__m256i dstVector = _mm256_load_si256(p: (__m256i *)&dst[x]);
478	INTERPOLATE_PIXEL_255_AVX2(srcVector, dstVector, alphaChannel: constAlphaVector, oneMinusAlphaChannel: oneMinusConstAlpha, colorMask, half);
479	_mm256_store_si256(p: (__m256i *)&dst[x], a: dstVector);
480	}
481
482	// 3) Epilogue
483	SIMD_EPILOGUE(x, length, `7`)
484	dst[x] = INTERPOLATE_PIXEL_255(x: src[x], a: const_alpha, y: dst[x], b: ialpha);
485	}
486	}
487
488	#if QT_CONFIG(raster_64bit)
489	void QT_FASTCALL comp_func_Source_rgb64_avx2(QRgba64 dst, const* QRgba64 src, int* length, uint const_alpha)
490	{
491	Q_ASSERT(const_alpha < `256`); // const_alpha is in [0-255]
492	if (const_alpha == `255`) {
493	::memcpy(dest: dst, src: src, n: length * sizeof(QRgba64));
494	} else {
495	const uint ca = const_alpha \| (const_alpha << `8`); // adjust to [0-65535]
496	const uint cia = `65535` - ca;
497
498	int x = `0`;
499
500	// 1) prologue, align on 32 bytes
501	for (; x < length && (quintptr(dst + x) & `31`); ++x)
502	dst[x] = interpolate65535(x: src[x], alpha1: ca, y: dst[x], alpha2: cia);
503
504	// 2) interpolate pixels with AVX2
505	const __m256i half = _mm256_set1_epi32(i: `0x8000`);
506	const __m256i colorMask = _mm256_set1_epi32(i: `0x0000ffff`);
507	const __m256i constAlphaVector = _mm256_set1_epi32(i: ca);
508	const __m256i oneMinusConstAlpha = _mm256_set1_epi32(i: cia);
509	for (; x < length - `3`; x += `4`) {
510	const __m256i srcVector = _mm256_lddqu_si256(p: (const __m256i *)&src[x]);
511	__m256i dstVector = _mm256_load_si256(p: (__m256i *)&dst[x]);
512	INTERPOLATE_PIXEL_RGB64_AVX2(srcVector, dstVector, alphaChannel: constAlphaVector, oneMinusAlphaChannel: oneMinusConstAlpha, colorMask, half);
513	_mm256_store_si256(p: (__m256i *)&dst[x], a: dstVector);
514	}
515
516	// 3) Epilogue
517	SIMD_EPILOGUE(x, length, `3`)
518	dst[x] = interpolate65535(x: src[x], alpha1: ca, y: dst[x], alpha2: cia);
519	}
520	}
521	#endif
522
523	#if QT_CONFIG(raster_fp)
524	void QT_FASTCALL comp_func_Source_rgbafp_avx2(QRgbaFloat32 dst, const* QRgbaFloat32 src, int* length, uint const_alpha)
525	{
526	Q_ASSERT(const_alpha < `256`); // const_alpha is in [0-255]
527	if (const_alpha == `255`) {
528	::memcpy(dest: dst, src: src, n: length * sizeof(QRgbaFloat32));
529	} else {
530	const float ca = const_alpha / `255.f`;
531	const float cia = `1.0f` - ca;
532
533	const __m128 constAlphaVector = _mm_set1_ps(w: ca);
534	const __m128 oneMinusConstAlpha = _mm_set1_ps(w: cia);
535	const __m256 constAlphaVector256 = _mm256_set1_ps(w: ca);
536	const __m256 oneMinusConstAlpha256 = _mm256_set1_ps(w: cia);
537	int x = `0`;
538	for (; x < length - `1`; x += `2`) {
539	__m256 srcVector = _mm256_loadu_ps(p: (const float *)&src[x]);
540	__m256 dstVector = _mm256_loadu_ps(p: (const float *)&dst[x]);
541	srcVector = _mm256_mul_ps(a: srcVector, b: constAlphaVector256);
542	dstVector = _mm256_mul_ps(a: dstVector, b: oneMinusConstAlpha256);
543	dstVector = _mm256_add_ps(a: dstVector, b: srcVector);
544	_mm256_storeu_ps(p: (float *)&dst[x], a: dstVector);
545	}
546	if (x < length) {
547	__m128 srcVector = _mm_loadu_ps(p: (const float *)&src[x]);
548	__m128 dstVector = _mm_loadu_ps(p: (const float *)&dst[x]);
549	srcVector = _mm_mul_ps(a: srcVector, b: constAlphaVector);
550	dstVector = _mm_mul_ps(a: dstVector, b: oneMinusConstAlpha);
551	dstVector = _mm_add_ps(a: dstVector, b: srcVector);
552	_mm_storeu_ps(p: (float *)&dst[x], a: dstVector);
553	}
554	}
555	}
556	#endif
557
558	void QT_FASTCALL comp_func_solid_SourceOver_avx2(uint destPixels, int* length, uint color, uint const_alpha)
559	{
560	if ((const_alpha & qAlpha(rgb: color)) == `255`) {
561	qt_memfill32(destPixels, color, length);
562	} else {
563	if (const_alpha != `255`)
564	color = BYTE_MUL(x: color, a: const_alpha);
565
566	const quint32 minusAlphaOfColor = qAlpha(rgb: ~color);
567	int x = `0`;
568
569	quint32 dst = (quint32 ) destPixels;
570	const __m256i colorVector = _mm256_set1_epi32(i: color);
571	const __m256i colorMask = _mm256_set1_epi32(i: `0x00ff00ff`);
572	const __m256i half = _mm256_set1_epi16(w: `0x80`);
573	const __m256i minusAlphaOfColorVector = _mm256_set1_epi16(w: minusAlphaOfColor);
574
575	ALIGNMENT_PROLOGUE_32BYTES(dst, x, length)
576	destPixels[x] = color + BYTE_MUL(x: destPixels[x], a: minusAlphaOfColor);
577
578	for (; x < length - `7`; x += `8`) {
579	__m256i dstVector = _mm256_load_si256(p: (__m256i *)&dst[x]);
580	BYTE_MUL_AVX2(pixelVector&: dstVector, alphaChannel: minusAlphaOfColorVector, colorMask, half);
581	dstVector = _mm256_add_epi8(a: colorVector, b: dstVector);
582	_mm256_store_si256(p: (__m256i *)&dst[x], a: dstVector);
583	}
584	SIMD_EPILOGUE(x, length, `7`)
585	destPixels[x] = color + BYTE_MUL(x: destPixels[x], a: minusAlphaOfColor);
586	}
587	}
588
589	#if QT_CONFIG(raster_64bit)
590	void QT_FASTCALL comp_func_solid_SourceOver_rgb64_avx2(QRgba64 destPixels, int* length, QRgba64 color, uint const_alpha)
591	{
592	Q_ASSERT(const_alpha < `256`); // const_alpha is in [0-255]
593	if (const_alpha == `255` && color.isOpaque()) {
594	qt_memfill64((quint64*)destPixels, color, length);
595	} else {
596	if (const_alpha != `255`)
597	color = multiplyAlpha255(rgba64: color, alpha255: const_alpha);
598
599	const uint minusAlphaOfColor = `65535` - color.alpha();
600	int x = `0`;
601	quint64 dst = (quint64 ) destPixels;
602	const __m256i colorVector = _mm256_set1_epi64x(q: color);
603	const __m256i colorMask = _mm256_set1_epi32(i: `0x0000ffff`);
604	const __m256i half = _mm256_set1_epi32(i: `0x8000`);
605	const __m256i minusAlphaOfColorVector = _mm256_set1_epi32(i: minusAlphaOfColor);
606
607	for (; x < length && (quintptr(dst + x) & `31`); ++x)
608	destPixels[x] = color + multiplyAlpha65535(rgba64: destPixels[x], alpha65535: minusAlphaOfColor);
609
610	for (; x < length - `3`; x += `4`) {
611	__m256i dstVector = _mm256_load_si256(p: (__m256i *)&dst[x]);
612	BYTE_MUL_RGB64_AVX2(pixelVector&: dstVector, alphaChannel: minusAlphaOfColorVector, colorMask, half);
613	dstVector = _mm256_add_epi16(a: colorVector, b: dstVector);
614	_mm256_store_si256(p: (__m256i *)&dst[x], a: dstVector);
615	}
616	SIMD_EPILOGUE(x, length, `3`)
617	destPixels[x] = color + multiplyAlpha65535(rgba64: destPixels[x], alpha65535: minusAlphaOfColor);
618	}
619	}
620	#endif
621
622	#if QT_CONFIG(raster_fp)
623	void QT_FASTCALL comp_func_solid_Source_rgbafp_avx2(QRgbaFloat32 dst, int* length, QRgbaFloat32 color, uint const_alpha)
624	{
625	Q_ASSERT(const_alpha < `256`); // const_alpha is in [0-255]
626	if (const_alpha == `255`) {
627	for (int i = `0`; i < length; ++i)
628	dst[i] = color;
629	} else {
630	const float a = const_alpha / `255.0f`;
631	const __m128 alphaVector = _mm_set1_ps(w: a);
632	const __m128 minusAlphaVector = _mm_set1_ps(w: `1.0f` - a);
633	__m128 colorVector = _mm_loadu_ps(p: (const float *)&color);
634	colorVector = _mm_mul_ps(a: colorVector, b: alphaVector);
635	const __m256 colorVector256 = _mm256_insertf128_ps(_mm256_castps128_ps256(colorVector), colorVector, `1`);
636	const __m256 minusAlphaVector256 = _mm256_set1_ps(w: `1.0f` - a);
637	int x = `0`;
638	for (; x < length - `1`; x += `2`) {
639	__m256 dstVector = _mm256_loadu_ps(p: (const float *)&dst[x]);
640	dstVector = _mm256_mul_ps(a: dstVector, b: minusAlphaVector256);
641	dstVector = _mm256_add_ps(a: dstVector, b: colorVector256);
642	_mm256_storeu_ps(p: (float *)&dst[x], a: dstVector);
643	}
644	if (x < length) {
645	__m128 dstVector = _mm_loadu_ps(p: (const float *)&dst[x]);
646	dstVector = _mm_mul_ps(a: dstVector, b: minusAlphaVector);
647	dstVector = _mm_add_ps(a: dstVector, b: colorVector);
648	_mm_storeu_ps(p: (float *)&dst[x], a: dstVector);
649	}
650	}
651	}
652
// Solid-colour SourceOver composition onto a span of QRgbaFloat32 pixels:
//   dst = color' + dst * (1 - color'[3])
// where color' is color scaled by const_alpha / 255 (left unscaled when const_alpha == 255)
// and color'[3] is its fourth float lane (presumably the alpha component).
// Two pixels are processed per 256-bit iteration; an odd trailing pixel takes a 128-bit step.
653	void QT_FASTCALL comp_func_solid_SourceOver_rgbafp_avx2(QRgbaFloat32 dst, int* length, QRgbaFloat32 color, uint const_alpha)
654	{
655	Q_ASSERT(const_alpha < `256`); // const_alpha is in [0-255]
656	if (const_alpha == `255` && color.a >= `1.0f`) {
// Opaque colour at full coverage: the destination is simply overwritten.
657	for (int i = `0`; i < length; ++i)
658	dst[i] = color;
659	} else {
660	__m128 colorVector = _mm_loadu_ps(p: (const float *)&color);
661	if (const_alpha != `255`)
662	colorVector = _mm_mul_ps(a: colorVector, b: _mm_set1_ps(w: const_alpha / `255.f`));
// Broadcast lane 3 of the (scaled) colour and form 1 - lane3 for every component.
663	__m128 minusAlphaOfColorVector =
664	_mm_sub_ps(a: _mm_set1_ps(w: `1.0f`), _mm_permute_ps(colorVector, _MM_SHUFFLE(`3`, `3`, `3`, `3`)));
// Duplicate each 128-bit value into both halves of a 256-bit register (two pixels wide).
665	const __m256 colorVector256 = _mm256_insertf128_ps(_mm256_castps128_ps256(colorVector), colorVector, `1`);
666	const __m256 minusAlphaVector256 = _mm256_insertf128_ps(_mm256_castps128_ps256(minusAlphaOfColorVector),
667	minusAlphaOfColorVector, `1`);
668	int x = `0`;
// Main loop: two destination pixels per iteration.
669	for (; x < length - `1`; x += `2`) {
670	__m256 dstVector = _mm256_loadu_ps(p: (const float *)&dst[x]);
671	dstVector = _mm256_mul_ps(a: dstVector, b: minusAlphaVector256);
672	dstVector = _mm256_add_ps(a: dstVector, b: colorVector256);
673	_mm256_storeu_ps(p: (float *)&dst[x], a: dstVector);
674	}
// Odd length: finish the last pixel with the 128-bit form of the same formula.
675	if (x < length) {
676	__m128 dstVector = _mm_loadu_ps(p: (const float *)&dst[x]);
677	dstVector = _mm_mul_ps(a: dstVector, b: minusAlphaOfColorVector);
678	dstVector = _mm_add_ps(a: dstVector, b: colorVector);
679	_mm_storeu_ps(p: (float *)&dst[x], a: dstVector);
680	}
681	}
682	}
683	#endif
684
// Bilinear interpolation producing one 256-bit result (stored unaligned at b).
//  - tlr1/tlr2 hold the top source pixels and blr1/blr2 the bottom ones; the callers in this
//    file fill them with 64-bit gathers, i.e. a left/right pixel pair per element.
//  - distx/disty are the horizontal/vertical weights. The callers pass values on a 4-bit scale
//    (0..16) together with v_256 == 256 in every 16-bit lane, so the four weights computed here
//    are (16-dx)(16-dy), dx(16-dy), (16-dx)dy and dx*dy, which sum to 256.
//  - colorMask selects the red/blue bytes (callers pass 0x00ff00ff per pixel); alpha/green is
//    processed separately after a shift right by 8.
// The permute4x64 steps compensate for unpack/hadd operating per 128-bit lane.
685	#define interpolate_4_pixels_16_avx2(tlr1, tlr2, blr1, blr2, distx, disty, colorMask, v_256, b) \
686	{ \
687	/* Correct for later unpack */ \
688	const __m256i vdistx = _mm256_permute4x64_epi64(distx, _MM_SHUFFLE(3, 1, 2, 0)); \
689	const __m256i vdisty = _mm256_permute4x64_epi64(disty, _MM_SHUFFLE(3, 1, 2, 0)); \
690	\
691	__m256i dxdy = _mm256_mullo_epi16 (vdistx, vdisty); \
692	const __m256i distx_ = _mm256_slli_epi16(vdistx, 4); \
693	const __m256i disty_ = _mm256_slli_epi16(vdisty, 4); \
694	__m256i idxidy = _mm256_add_epi16(dxdy, _mm256_sub_epi16(v_256, _mm256_add_epi16(distx_, disty_))); \
695	__m256i dxidy = _mm256_sub_epi16(distx_, dxdy); \
696	__m256i idxdy = _mm256_sub_epi16(disty_, dxdy); \
697	\
698	__m256i tlr1AG = _mm256_srli_epi16(tlr1, 8); \
699	__m256i tlr1RB = _mm256_and_si256(tlr1, colorMask); \
700	__m256i tlr2AG = _mm256_srli_epi16(tlr2, 8); \
701	__m256i tlr2RB = _mm256_and_si256(tlr2, colorMask); \
702	__m256i blr1AG = _mm256_srli_epi16(blr1, 8); \
703	__m256i blr1RB = _mm256_and_si256(blr1, colorMask); \
704	__m256i blr2AG = _mm256_srli_epi16(blr2, 8); \
705	__m256i blr2RB = _mm256_and_si256(blr2, colorMask); \
706	\
707	__m256i odxidy1 = _mm256_unpacklo_epi32(idxidy, dxidy); \
708	__m256i odxidy2 = _mm256_unpackhi_epi32(idxidy, dxidy); \
709	tlr1AG = _mm256_mullo_epi16(tlr1AG, odxidy1); \
710	tlr1RB = _mm256_mullo_epi16(tlr1RB, odxidy1); \
711	tlr2AG = _mm256_mullo_epi16(tlr2AG, odxidy2); \
712	tlr2RB = _mm256_mullo_epi16(tlr2RB, odxidy2); \
713	__m256i odxdy1 = _mm256_unpacklo_epi32(idxdy, dxdy); \
714	__m256i odxdy2 = _mm256_unpackhi_epi32(idxdy, dxdy); \
715	blr1AG = _mm256_mullo_epi16(blr1AG, odxdy1); \
716	blr1RB = _mm256_mullo_epi16(blr1RB, odxdy1); \
717	blr2AG = _mm256_mullo_epi16(blr2AG, odxdy2); \
718	blr2RB = _mm256_mullo_epi16(blr2RB, odxdy2); \
719	\
720	/* Add the values, and shift to only keep 8 significant bits per colors */ \
721	__m256i topAG = _mm256_hadd_epi32(tlr1AG, tlr2AG); \
722	__m256i topRB = _mm256_hadd_epi32(tlr1RB, tlr2RB); \
723	__m256i botAG = _mm256_hadd_epi32(blr1AG, blr2AG); \
724	__m256i botRB = _mm256_hadd_epi32(blr1RB, blr2RB); \
725	__m256i rAG = _mm256_add_epi16(topAG, botAG); \
726	__m256i rRB = _mm256_add_epi16(topRB, botRB); \
727	rRB = _mm256_srli_epi16(rRB, 8); \
728	/* Correct for hadd */ \
729	rAG = _mm256_permute4x64_epi64(rAG, _MM_SHUFFLE(3, 1, 2, 0)); \
730	rRB = _mm256_permute4x64_epi64(rRB, _MM_SHUFFLE(3, 1, 2, 0)); \
731	_mm256_storeu_si256((__m256i*)(b), _mm256_blendv_epi8(rAG, rRB, colorMask)); \
732	}
733
734	inline void fetchTransformedBilinear_pixelBounds(int, int l1, int l2, int &v1, int &v2)
735	{
736	if (v1 < l1)
737	v2 = v1 = l1;
738	else if (v1 >= l2)
739	v2 = v1 = l2;
740	else
741	v2 = v1 + `1`;
742	Q_ASSERT(v1 >= l1 && v1 <= l2);
743	Q_ASSERT(v2 >= l1 && v2 <= l2);
744	}
745
// Forward declaration: the simple-scale helper below calls this before its definition.
746	void QT_FASTCALL intermediate_adder_avx2(uint b, uint end, const IntermediateBuffer &intermediate, int offset, int &fx, int fdx);
747
// Bilinear fetch for a span whose y coordinate is constant (fdy is ignored).
// Step 1: blend the two source scanlines vertically, once per source column, into an
//         IntermediateBuffer that keeps red/blue and alpha/green in separate arrays.
// Step 2: intermediate_adder_avx2() interpolates horizontally from that buffer into [b, end).
748	void QT_FASTCALL fetchTransformedBilinearARGB32PM_simple_scale_helper_avx2(uint b, uint end, const QTextureData &image,
749	int &fx, int &fy, int fdx, int /fdy/)
750	{
751	int y1 = (fy >> `16`);
752	int y2;
// Clamp the row pair to [image.y1, image.y2 - 1].
753	fetchTransformedBilinear_pixelBounds(image.height, l1: image.y1, l2: image.y2 - `1`, v1&: y1, v2&: y2);
754	const uint s1 = (const* uint *)image.scanLine(y: y1);
755	const uint s2 = (const* uint *)image.scanLine(y: y2);
756
// Vertical weight: bits 8..15 of fy, so disty + idisty == 256.
757	const int disty = (fy & `0x0000ffff`) >> `8`;
758	const int idisty = `256` - disty;
759	const int length = end - b;
760
761	// The intermediate buffer is generated in the positive direction
762	const int adjust = (fdx < `0`) ? fdx * length : `0`;
763	const int offset = (fx + adjust) >> `16`;
764	int x = offset;
765
766	IntermediateBuffer intermediate;
767	// count is the size used in the intermediate_buffer.
768	int count = (qint64(length) * qAbs(t: fdx) + FixedScale - `1`) / FixedScale + `2`;
769	// length is supposed to be <= BufferSize either because data->m11 < 1 or
770	// data->m11 < 2, and any larger buffers split
771	Q_ASSERT(count <= BufferSize + `2`);
772	int f = `0`;
773	int lim = qMin(a: count, b: image.x2 - x);
// Columns left of image.x1 all replicate the vertically blended pixel at image.x1.
774	if (x < image.x1) {
775	Q_ASSERT(x < image.x2);
776	uint t = s1[image.x1];
777	uint b = s2[image.x1];
778	quint32 rb = (((t & `0xff00ff`) * idisty + (b & `0xff00ff`) * disty) >> `8`) & `0xff00ff`;
779	quint32 ag = ((((t>>`8`) & `0xff00ff`) * idisty + ((b>>`8`) & `0xff00ff`) * disty) >> `8`) & `0xff00ff`;
780	do {
781	intermediate.buffer_rb[f] = rb;
782	intermediate.buffer_ag[f] = ag;
783	f++;
784	x++;
785	} while (x < image.x1 && f < lim);
786	}
787
788	const __m256i disty_ = _mm256_set1_epi16(w: disty);
789	const __m256i idisty_ = _mm256_set1_epi16(w: idisty);
790	const __m256i colorMask = _mm256_set1_epi32(i: `0x00ff00ff`);
791
// The vector loop consumes 8 columns per iteration, so it must stop 7 short of the limit.
792	lim -= `7`;
793	for (; f < lim; x += `8`, f += `8`) {
794	// Load 8 pixels from s1, and split the alpha-green and red-blue component
795	__m256i top = _mm256_loadu_si256(p: (const __m256i)((const* uint *)(s1)+x));
796	__m256i topAG = _mm256_srli_epi16(a: top, count: `8`);
797	__m256i topRB = _mm256_and_si256(a: top, b: colorMask);
798	// Multiplies each color component by idisty
799	topAG = _mm256_mullo_epi16 (a: topAG, b: idisty_);
800	topRB = _mm256_mullo_epi16 (a: topRB, b: idisty_);
801
802	// Same for the s2 vector
803	__m256i bottom = _mm256_loadu_si256(p: (const __m256i)((const* uint *)(s2)+x));
804	__m256i bottomAG = _mm256_srli_epi16(a: bottom, count: `8`);
805	__m256i bottomRB = _mm256_and_si256(a: bottom, b: colorMask);
806	bottomAG = _mm256_mullo_epi16 (a: bottomAG, b: disty_);
807	bottomRB = _mm256_mullo_epi16 (a: bottomRB, b: disty_);
808
809	// Add the values, and shift to only keep 8 significant bits per colors
810	__m256i rAG =_mm256_add_epi16(a: topAG, b: bottomAG);
811	rAG = _mm256_srli_epi16(a: rAG, count: `8`);
812	_mm256_storeu_si256(p: (__m256i*)(&intermediate.buffer_ag[f]), a: rAG);
813	__m256i rRB =_mm256_add_epi16(a: topRB, b: bottomRB);
814	rRB = _mm256_srli_epi16(a: rRB, count: `8`);
815	_mm256_storeu_si256(p: (__m256i*)(&intermediate.buffer_rb[f]), a: rRB);
816	}
817
// Remaining columns; x is clamped to image.x2 - 1 so columns past the right edge repeat it.
818	for (; f < count; f++) { // Same as above but without simd
819	x = qMin(a: x, b: image.x2 - `1`);
820
821	uint t = s1[x];
822	uint b = s2[x];
823
824	intermediate.buffer_rb[f] = (((t & `0xff00ff`) * idisty + (b & `0xff00ff`) * disty) >> `8`) & `0xff00ff`;
825	intermediate.buffer_ag[f] = ((((t>>`8`) & `0xff00ff`) * idisty + ((b>>`8`) & `0xff00ff`) * disty) >> `8`) & `0xff00ff`;
826	x++;
827	}
828
829	// Now interpolate the values from the intermediate_buffer to get the final result.
830	intermediate_adder_avx2(b, end, intermediate, offset, fx, fdx);
831	}
832
// Horizontal interpolation pass over an IntermediateBuffer.
// For every output pixel, entries x and x + 1 (x = fx >> 16) of buffer_rb / buffer_ag are
// blended with weights (256 - distx) and distx, where distx is bits 8..15 of fx.
// fx is rebased by offset * FixedScale on entry so that it indexes the buffer directly, and
// the rebase is undone before returning; fx advances by fdx per written pixel.
833	void QT_FASTCALL intermediate_adder_avx2(uint b, uint end, const IntermediateBuffer &intermediate, int offset, int &fx, int fdx)
834	{
835	fx -= offset * FixedScale;
836
837	const __m128i v_fdx = _mm_set1_epi32(i: fdx * `4`);
838	const __m128i v_blend = _mm_set1_epi32(i: `0x00800080`);
// Shuffle control: copies byte 1 of each 32-bit fx (its bits 8..15) into the low byte of
// both 16-bit halves of that lane and zeroes the high bytes.
839	const __m128i vdx_shuffle = _mm_set_epi8(b15: char(`0x80`), b14: `13`, b13: char(`0x80`), b12: `13`, b11: char(`0x80`), b10: `9`, b9: char(`0x80`), b8: `9`,
840	b7: char(`0x80`), b6: `5`, b5: char(`0x80`), b4: `5`, b3: char(`0x80`), b2: `1`, b1: char(`0x80`), b0: `1`);
841	__m128i v_fx = _mm_setr_epi32(i0: fx, i1: fx + fdx, i2: fx + fdx + fdx, i3: fx + fdx + fdx + fdx);
842
// Vector loop: four output pixels per iteration.
843	while (b < end - `3`) {
844	const __m128i offset = _mm_srli_epi32(a: v_fx, count: `16`);
// Each 64-bit gather (index scale 4) picks up buffer entries x and x + 1 together.
845	__m256i vrb = _mm256_i32gather_epi64((const long long *)intermediate.buffer_rb, offset, `4`);
846	__m256i vag = _mm256_i32gather_epi64((const long long *)intermediate.buffer_ag, offset, `4`);
847
848	__m128i vdx = _mm_shuffle_epi8(a: v_fx, b: vdx_shuffle);
849	__m128i vidx = _mm_sub_epi16(a: _mm_set1_epi16(w: `256`), b: vdx);
// Interleave so every gathered pair is multiplied by (256 - distx, distx).
850	__m256i vmulx = _mm256_castsi128_si256(a: _mm_unpacklo_epi32(a: vidx, b: vdx));
851	vmulx = _mm256_inserti128_si256(vmulx, _mm_unpackhi_epi32(vidx, vdx), `1`);
852
853	vrb = _mm256_mullo_epi16(a: vrb, b: vmulx);
854	vag = _mm256_mullo_epi16(a: vag, b: vmulx);
855
// hadd sums each weighted pair; the permute gathers the rb sums in the low half and the
// ag sums in the high half.
856	__m256i vrbag = _mm256_hadd_epi32(a: vrb, b: vag);
857	vrbag = _mm256_permute4x64_epi64(vrbag, _MM_SHUFFLE(`3`, `1`, `2`, `0`));
858
859	__m128i rb = _mm256_castsi256_si128(a: vrbag);
860	__m128i ag = _mm256_extracti128_si256(vrbag, `1`);
861	rb = _mm_srli_epi16(a: rb, count: `8`);
862
863	_mm_storeu_si128(p: (__m128i*)b, b: _mm_blendv_epi8(V1: ag, V2: rb, M: v_blend));
864
865	b += `4`;
866	v_fx = _mm_add_epi32(a: v_fx, b: v_fdx);
867	}
// Lane 0 holds fx for the next pixel that has not been written yet.
868	fx = _mm_cvtsi128_si32(a: v_fx);
// Scalar tail using the same weights.
869	while (b < end) {
870	const int x = (fx >> `16`);
871
872	const uint distx = (fx & `0x0000ffff`) >> `8`;
873	const uint idistx = `256` - distx;
874	const uint rb = (intermediate.buffer_rb[x] * idistx + intermediate.buffer_rb[x + `1`] * distx) & `0xff00ff00`;
875	const uint ag = (intermediate.buffer_ag[x] * idistx + intermediate.buffer_ag[x + `1`] * distx) & `0xff00ff00`;
876	*b = (rb >> `8`) \| ag;
877	b++;
878	fx += fdx;
879	}
880	fx += offset * FixedScale;
881	}
882
// Bilinear fetch for a span with constant y (fdy is ignored) that gathers source pixels
// directly instead of going through an intermediate buffer. Four phases:
//   1. leading pixels whose x is clamped to an image edge (vertical blend only),
//   2. a vector middle part, 8 pixels per iteration, without boundary checks,
//   3. a scalar continuation of the unchecked region up to boundedEnd,
//   4. trailing pixels with per-pixel clamping.
883	void QT_FASTCALL fetchTransformedBilinearARGB32PM_downscale_helper_avx2(uint b, uint end, const QTextureData &image,
884	int &fx, int &fy, int fdx, int /fdy/)
885	{
886	int y1 = (fy >> `16`);
887	int y2;
888	fetchTransformedBilinear_pixelBounds(image.height, l1: image.y1, l2: image.y2 - `1`, v1&: y1, v2&: y2);
889	const uint s1 = (const* uint *)image.scanLine(y: y1);
890	const uint s2 = (const* uint *)image.scanLine(y: y2);
// disty8 is the 8-bit vertical weight (0..255); disty4 is the same weight rounded to a
// 4-bit scale (0..16) for the vector path.
891	const int disty8 = (fy & `0x0000ffff`) >> `8`;
892	const int disty4 = (disty8 + `0x08`) >> `4`;
893
894	const qint64 min_fx = qint64(image.x1) * FixedScale;
895	const qint64 max_fx = qint64(image.x2 - `1`) * FixedScale;
// Phase 1: while x is clamped (x1 == x2) only the vertical blend is needed.
896	while (b < end) {
897	int x1 = (fx >> `16`);
898	int x2;
899	fetchTransformedBilinear_pixelBounds(image.width, l1: image.x1, l2: image.x2 - `1`, v1&: x1, v2&: x2);
900	if (x1 != x2)
901	break;
902	uint top = s1[x1];
903	uint bot = s2[x1];
904	*b = INTERPOLATE_PIXEL_256(x: top, a: `256` - disty8, y: bot, b: disty8);
905	fx += fdx;
906	++b;
907	}
// Number of further steps before fx reaches the edge it is moving towards; this bounds the
// region that can be sampled without clamping.
908	uint *boundedEnd = end;
909	if (fdx > `0`)
910	boundedEnd = qMin(a: boundedEnd, b: b + (max_fx - fx) / fdx);
911	else if (fdx < `0`)
912	boundedEnd = qMin(a: boundedEnd, b: b + (min_fx - fx) / fdx);
913
914	// A fast middle part without boundary checks
// Shuffle control: replicates the low byte of every 32-bit lane into the low byte of both
// of its 16-bit halves and zeroes the high bytes.
915	const __m256i vdistShuffle =
916	_mm256_setr_epi8(b31: `0`, b30: char(`0x80`), b29: `0`, b28: char(`0x80`), b27: `4`, b26: char(`0x80`), b25: `4`, b24: char(`0x80`), b23: `8`, b22: char(`0x80`), b21: `8`, b20: char(`0x80`), b19: `12`, b18: char(`0x80`), b17: `12`, b16: char(`0x80`),
917	b15: `0`, b14: char(`0x80`), b13: `0`, b12: char(`0x80`), b11: `4`, b10: char(`0x80`), b09: `4`, b08: char(`0x80`), b07: `8`, b06: char(`0x80`), b05: `8`, b04: char(`0x80`), b03: `12`, b02: char(`0x80`), b01: `12`, b00: char(`0x80`));
918	const __m256i colorMask = _mm256_set1_epi32(i: `0x00ff00ff`);
919	const __m256i v_256 = _mm256_set1_epi16(w: `256`);
920	const __m256i v_disty = _mm256_set1_epi16(w: disty4);
921	const __m256i v_fdx = _mm256_set1_epi32(i: fdx * `8`);
922	const __m256i v_fx_r = _mm256_set1_epi32(i: `0x08`);
923	const __m256i v_index = _mm256_setr_epi32(i0: `0`, i1: `1`, i2: `2`, i3: `3`, i4: `4`, i5: `5`, i6: `6`, i7: `7`);
// v_fx lane i holds fx + i * fdx.
924	__m256i v_fx = _mm256_set1_epi32(i: fx);
925	v_fx = _mm256_add_epi32(a: v_fx, b: _mm256_mullo_epi32(a: _mm256_set1_epi32(i: fdx), b: v_index));
926
// Phase 2: 8 pixels per iteration.
927	while (b < boundedEnd - `7`) {
928	const __m256i offset = _mm256_srli_epi32(a: v_fx, count: `16`);
929	const __m128i offsetLo = _mm256_castsi256_si128(a: offset);
930	const __m128i offsetHi = _mm256_extracti128_si256(offset, `1`);
// 64-bit gathers with scale 4 fetch the pixel at x together with its right neighbour.
931	const __m256i toplo = _mm256_i32gather_epi64((const long long *)s1, offsetLo, `4`);
932	const __m256i tophi = _mm256_i32gather_epi64((const long long *)s1, offsetHi, `4`);
933	const __m256i botlo = _mm256_i32gather_epi64((const long long *)s2, offsetLo, `4`);
934	const __m256i bothi = _mm256_i32gather_epi64((const long long *)s2, offsetHi, `4`);
935
// Horizontal weight: bits 8..15 of fx, rounded to the 4-bit scale, then replicated by the shuffle.
936	__m256i v_distx = _mm256_srli_epi16(a: v_fx, count: `8`);
937	v_distx = _mm256_srli_epi16(a: _mm256_add_epi32(a: v_distx, b: v_fx_r), count: `4`);
938	v_distx = _mm256_shuffle_epi8(a: v_distx, b: vdistShuffle);
939
940	interpolate_4_pixels_16_avx2(toplo, tophi, botlo, bothi, v_distx, v_disty, colorMask, v_256, b);
941	b += `8`;
942	v_fx = _mm256_add_epi32(a: v_fx, b: v_fdx);
943	}
// Lane 0 holds fx for the next pixel that has not been written yet.
944	fx = _mm_extract_epi32(_mm256_castsi256_si128(v_fx) , `0`);
945
// Phase 3: scalar continuation of the unchecked region (uses the 8-bit weights).
946	while (b < boundedEnd) {
947	int x = (fx >> `16`);
948	int distx8 = (fx & `0x0000ffff`) >> `8`;
949	*b = interpolate_4_pixels(t: s1 + x, b: s2 + x, distx: distx8, disty: disty8);
950	fx += fdx;
951	++b;
952	}
953
// Phase 4: remaining pixels, clamping x for each one.
954	while (b < end) {
955	int x1 = (fx >> `16`);
956	int x2;
957	fetchTransformedBilinear_pixelBounds(image.width, l1: image.x1, l2: image.x2 - `1`, v1&: x1, v2&: x2);
958	uint tl = s1[x1];
959	uint tr = s1[x2];
960	uint bl = s2[x1];
961	uint br = s2[x2];
962	int distx8 = (fx & `0x0000ffff`) >> `8`;
963	*b = interpolate_4_pixels(tl, tr, bl, br, distx: distx8, disty: disty8);
964	fx += fdx;
965	++b;
966	}
967	}
968
// Bilinear fetch for a span where both fx and fy advance per pixel (by fdx and fdy).
// Structure: a clamped prologue, an 8-pixel-wide vector middle part without boundary checks,
// a scalar continuation up to boundedEnd, and a clamped epilogue up to end.
969	void QT_FASTCALL fetchTransformedBilinearARGB32PM_fast_rotate_helper_avx2(uint b, uint end, const QTextureData &image,
970	int &fx, int &fy, int fdx, int fdy)
971	{
972	const qint64 min_fx = qint64(image.x1) * FixedScale;
973	const qint64 max_fx = qint64(image.x2 - `1`) * FixedScale;
974	const qint64 min_fy = qint64(image.y1) * FixedScale;
975	const qint64 max_fy = qint64(image.y2 - `1`) * FixedScale;
976	// first handle the possibly bounded part in the beginning
977	while (b < end) {
978	int x1 = (fx >> `16`);
979	int x2;
980	int y1 = (fy >> `16`);
981	int y2;
982	fetchTransformedBilinear_pixelBounds(image.width, l1: image.x1, l2: image.x2 - `1`, v1&: x1, v2&: x2);
983	fetchTransformedBilinear_pixelBounds(image.height, l1: image.y1, l2: image.y2 - `1`, v1&: y1, v2&: y2);
// Leave the prologue as soon as neither coordinate had to be clamped.
984	if (x1 != x2 && y1 != y2)
985	break;
986	const uint s1 = (const* uint *)image.scanLine(y: y1);
987	const uint s2 = (const* uint *)image.scanLine(y: y2);
988	uint tl = s1[x1];
989	uint tr = s1[x2];
990	uint bl = s2[x1];
991	uint br = s2[x2];
992	int distx = (fx & `0x0000ffff`) >> `8`;
993	int disty = (fy & `0x0000ffff`) >> `8`;
994	*b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
995	fx += fdx;
996	fy += fdy;
997	++b;
998	}
// boundedEnd: the nearest point at which fx or fy would reach the edge it is moving towards.
999	uint *boundedEnd = end;
1000	if (fdx > `0`)
1001	boundedEnd = qMin(a: boundedEnd, b: b + (max_fx - fx) / fdx);
1002	else if (fdx < `0`)
1003	boundedEnd = qMin(a: boundedEnd, b: b + (min_fx - fx) / fdx);
1004	if (fdy > `0`)
1005	boundedEnd = qMin(a: boundedEnd, b: b + (max_fy - fy) / fdy);
1006	else if (fdy < `0`)
1007	boundedEnd = qMin(a: boundedEnd, b: b + (min_fy - fy) / fdy);
1008
1009	// until boundedEnd we can now have a fast middle part without boundary checks
// Shuffle control: replicates the low byte of every 32-bit lane into the low byte of both
// of its 16-bit halves and zeroes the high bytes.
1010	const __m256i vdistShuffle =
1011	_mm256_setr_epi8(b31: `0`, b30: char(`0x80`), b29: `0`, b28: char(`0x80`), b27: `4`, b26: char(`0x80`), b25: `4`, b24: char(`0x80`), b23: `8`, b22: char(`0x80`), b21: `8`, b20: char(`0x80`), b19: `12`, b18: char(`0x80`), b17: `12`, b16: char(`0x80`),
1012	b15: `0`, b14: char(`0x80`), b13: `0`, b12: char(`0x80`), b11: `4`, b10: char(`0x80`), b09: `4`, b08: char(`0x80`), b07: `8`, b06: char(`0x80`), b05: `8`, b04: char(`0x80`), b03: `12`, b02: char(`0x80`), b01: `12`, b00: char(`0x80`));
1013	const __m256i colorMask = _mm256_set1_epi32(i: `0x00ff00ff`);
1014	const __m256i v_256 = _mm256_set1_epi16(w: `256`);
1015	const __m256i v_fdx = _mm256_set1_epi32(i: fdx * `8`);
1016	const __m256i v_fdy = _mm256_set1_epi32(i: fdy * `8`);
1017	const __m256i v_fxy_r = _mm256_set1_epi32(i: `0x08`);
1018	const __m256i v_index = _mm256_setr_epi32(i0: `0`, i1: `1`, i2: `2`, i3: `3`, i4: `4`, i5: `5`, i6: `6`, i7: `7`);
// Lane i of v_fx / v_fy holds the coordinate of the i-th upcoming pixel.
1019	__m256i v_fx = _mm256_set1_epi32(i: fx);
1020	__m256i v_fy = _mm256_set1_epi32(i: fy);
1021	v_fx = _mm256_add_epi32(a: v_fx, b: _mm256_mullo_epi32(a: _mm256_set1_epi32(i: fdx), b: v_index));
1022	v_fy = _mm256_add_epi32(a: v_fy, b: _mm256_mullo_epi32(a: _mm256_set1_epi32(i: fdy), b: v_index));
1023
1024	const uchar *textureData = image.imageData;
1025	const qsizetype bytesPerLine = image.bytesPerLine;
// NOTE(review): the row stride (in pixels) and the y values below are held in 16-bit lanes;
// this appears to assume both fit in a signed 16-bit value — confirm the callers guarantee it.
1026	const __m256i vbpl = _mm256_set1_epi16(w: bytesPerLine/`4`);
1027
1028	while (b < boundedEnd - `7`) {
1029	const __m256i vy = _mm256_packs_epi32(a: _mm256_srli_epi32(a: v_fy, count: `16`), b: _mm256_setzero_si256());
1030	// 8x16bit * 8x16bit -> 8x32bit
1031	__m256i offset = _mm256_unpacklo_epi16(a: _mm256_mullo_epi16(a: vy, b: vbpl), b: _mm256_mulhi_epi16(a: vy, b: vbpl));
// offset = y * (bytesPerLine / 4) + x, i.e. a pixel index relative to textureData.
1032	offset = _mm256_add_epi32(a: offset, b: _mm256_srli_epi32(a: v_fx, count: `16`));
1033	const __m128i offsetLo = _mm256_castsi256_si128(a: offset);
1034	const __m128i offsetHi = _mm256_extracti128_si256(offset, `1`);
// botData is one scanline further on, so the same offsets address row y + 1.
1035	const uint topData = (const* uint *)(textureData);
1036	const uint botData = (const* uint *)(textureData + bytesPerLine);
1037	const __m256i toplo = _mm256_i32gather_epi64((const long long *)topData, offsetLo, `4`);
1038	const __m256i tophi = _mm256_i32gather_epi64((const long long *)topData, offsetHi, `4`);
1039	const __m256i botlo = _mm256_i32gather_epi64((const long long *)botData, offsetLo, `4`);
1040	const __m256i bothi = _mm256_i32gather_epi64((const long long *)botData, offsetHi, `4`);
1041
// Weights: bits 8..15 of fx / fy, rounded to a 4-bit scale, then replicated by the shuffle.
1042	__m256i v_distx = _mm256_srli_epi16(a: v_fx, count: `8`);
1043	__m256i v_disty = _mm256_srli_epi16(a: v_fy, count: `8`);
1044	v_distx = _mm256_srli_epi16(a: _mm256_add_epi32(a: v_distx, b: v_fxy_r), count: `4`);
1045	v_disty = _mm256_srli_epi16(a: _mm256_add_epi32(a: v_disty, b: v_fxy_r), count: `4`);
1046	v_distx = _mm256_shuffle_epi8(a: v_distx, b: vdistShuffle);
1047	v_disty = _mm256_shuffle_epi8(a: v_disty, b: vdistShuffle);
1048
1049	interpolate_4_pixels_16_avx2(toplo, tophi, botlo, bothi, v_distx, v_disty, colorMask, v_256, b);
1050	b += `8`;
1051	v_fx = _mm256_add_epi32(a: v_fx, b: v_fdx);
1052	v_fy = _mm256_add_epi32(a: v_fy, b: v_fdy);
1053	}
// Lane 0 holds the coordinates of the next pixel that has not been written yet.
1054	fx = _mm_extract_epi32(_mm256_castsi256_si128(v_fx) , `0`);
1055	fy = _mm_extract_epi32(_mm256_castsi256_si128(v_fy) , `0`);
1056
// Scalar continuation of the unchecked region: rows y and y + 1 are read without clamping.
1057	while (b < boundedEnd) {
1058	int x = (fx >> `16`);
1059	int y = (fy >> `16`);
1060
1061	const uint s1 = (const* uint *)image.scanLine(y);
1062	const uint s2 = (const* uint *)image.scanLine(y: y + `1`);
1063
1064	int distx = (fx & `0x0000ffff`) >> `8`;
1065	int disty = (fy & `0x0000ffff`) >> `8`;
1066	*b = interpolate_4_pixels(t: s1 + x, b: s2 + x, distx, disty);
1067
1068	fx += fdx;
1069	fy += fdy;
1070	++b;
1071	}
1072
// Epilogue: remaining pixels with both coordinates clamped per pixel.
1073	while (b < end) {
1074	int x1 = (fx >> `16`);
1075	int x2;
1076	int y1 = (fy >> `16`);
1077	int y2;
1078
1079	fetchTransformedBilinear_pixelBounds(image.width, l1: image.x1, l2: image.x2 - `1`, v1&: x1, v2&: x2);
1080	fetchTransformedBilinear_pixelBounds(image.height, l1: image.y1, l2: image.y2 - `1`, v1&: y1, v2&: y2);
1081
1082	const uint s1 = (const* uint *)image.scanLine(y: y1);
1083	const uint s2 = (const* uint *)image.scanLine(y: y2);
1084
1085	uint tl = s1[x1];
1086	uint tr = s1[x2];
1087	uint bl = s2[x1];
1088	uint br = s2[x2];
1089
1090	int distx = (fx & `0x0000ffff`) >> `8`;
1091	int disty = (fy & `0x0000ffff`) >> `8`;
1092	*b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
1093
1094	fx += fdx;
1095	fy += fdy;
1096	++b;
1097	}
1098	}
1099
1100	static inline __m256i epilogueMaskFromCount(qsizetype count)
1101	{
1102	Q_ASSERT(count > `0`);
1103	static const __m256i offsetMask = _mm256_setr_epi32(i0: `0`, i1: `1`, i2: `2`, i3: `3`, i4: `4`, i5: `5`, i6: `6`, i7: `7`);
1104	return _mm256_add_epi32(a: offsetMask, b: _mm256_set1_epi32(i: -count));
1105	}
1106
// Converts count 32-bit pixels from src to premultiplied form in buffer, eight pixels at a time.
// When RGBA is true, bytes 0 and 2 of every pixel are swapped first (rgbaMask).
// Per group of eight pixels:
//   - all alpha bytes zero  -> zeros are stored;
//   - all alpha bytes 0xff  -> pixels are stored as-is (store skipped when converting in place
//                              without a swizzle);
//   - otherwise             -> each channel c becomes (t + (t >> 8) + 0x80) >> 8 with t = c * alpha,
//                              and the original alpha is put back.
// A final partial group is handled with masked loads/stores.
1107	template<bool RGBA>
1108	static void convertARGBToARGB32PM_avx2(uint buffer, const* uint *src, qsizetype count)
1109	{
1110	qsizetype i = `0`;
1111	const __m256i alphaMask = _mm256_set1_epi32(i: `0xff000000`);
1112	const __m256i rgbaMask = _mm256_broadcastsi128_si256(X: _mm_setr_epi8(b0: `2`, b1: `1`, b2: `0`, b3: `3`, b4: `6`, b5: `5`, b6: `4`, b7: `7`, b8: `10`, b9: `9`, b10: `8`, b11: `11`, b12: `14`, b13: `13`, b14: `12`, b15: `15`));
// After unpacking to 16 bits per channel, this replicates each pixel's alpha word (bytes 6-7
// and 14-15) across all four channel slots of that pixel.
1113	const __m256i shuffleMask = _mm256_broadcastsi128_si256(X: _mm_setr_epi8(b0: `6`, b1: `7`, b2: `6`, b3: `7`, b4: `6`, b5: `7`, b6: `6`, b7: `7`, b8: `14`, b9: `15`, b10: `14`, b11: `15`, b12: `14`, b13: `15`, b14: `14`, b15: `15`));
1114	const __m256i half = _mm256_set1_epi16(w: `0x0080`);
1115	const __m256i zero = _mm256_setzero_si256();
1116
1117	for (; i < count - `7`; i += `8`) {
1118	__m256i srcVector = _mm256_loadu_si256(p: reinterpret_cast<const __m256i *>(src + i));
// testz is true when no alpha bit is set in any of the eight pixels.
1119	if (!_mm256_testz_si256(a: srcVector, b: alphaMask)) {
1120	// keep the two _mm_test[zc]_siXXX next to each other
// cf is true when every alpha bit is set, i.e. all eight pixels are opaque.
1121	bool cf = _mm256_testc_si256(a: srcVector, b: alphaMask);
1122	if (RGBA)
1123	srcVector = _mm256_shuffle_epi8(a: srcVector, b: rgbaMask);
1124	if (!cf) {
1125	__m256i src1 = _mm256_unpacklo_epi8(a: srcVector, b: zero);
1126	__m256i src2 = _mm256_unpackhi_epi8(a: srcVector, b: zero);
1127	__m256i alpha1 = _mm256_shuffle_epi8(a: src1, b: shuffleMask);
1128	__m256i alpha2 = _mm256_shuffle_epi8(a: src2, b: shuffleMask);
1129	src1 = _mm256_mullo_epi16(a: src1, b: alpha1);
1130	src2 = _mm256_mullo_epi16(a: src2, b: alpha2);
1131	src1 = _mm256_add_epi16(a: src1, b: _mm256_srli_epi16(a: src1, count: `8`));
1132	src2 = _mm256_add_epi16(a: src2, b: _mm256_srli_epi16(a: src2, count: `8`));
1133	src1 = _mm256_add_epi16(a: src1, b: half);
1134	src2 = _mm256_add_epi16(a: src2, b: half);
1135	src1 = _mm256_srli_epi16(a: src1, count: `8`);
1136	src2 = _mm256_srli_epi16(a: src2, count: `8`);
// 0x88 selects words 3 and 7 of each 128-bit lane: the alpha slots keep the original alpha.
1137	src1 = _mm256_blend_epi16(src1, alpha1, `0x88`);
1138	src2 = _mm256_blend_epi16(src2, alpha2, `0x88`);
1139	srcVector = _mm256_packus_epi16(a: src1, b: src2);
1140	_mm256_storeu_si256(p: reinterpret_cast<__m256i *>(buffer + i), a: srcVector);
1141	} else {
// Opaque pixels need no multiplication; when converting in place without a swizzle the
// data is already correct and the store is skipped.
1142	if (buffer != src \|\| RGBA)
1143	_mm256_storeu_si256(p: reinterpret_cast<__m256i *>(buffer + i), a: srcVector);
1144	}
1145	} else {
1146	_mm256_storeu_si256(p: reinterpret_cast<__m256i *>(buffer + i), a: zero);
1147	}
1148	}
1149
// Epilogue: fewer than eight pixels remain; same logic with masked load/store.
1150	if (i < count) {
1151	const __m256i epilogueMask = epilogueMaskFromCount(count: count - i);
1152	__m256i srcVector = _mm256_maskload_epi32(X: reinterpret_cast<const int *>(src + i), M: epilogueMask);
// Restrict the alpha tests to the lanes selected by the epilogue mask.
1153	const __m256i epilogueAlphaMask = _mm256_blendv_epi8(V1: _mm256_setzero_si256(), V2: alphaMask, M: epilogueMask);
1154
1155	if (!_mm256_testz_si256(a: srcVector, b: epilogueAlphaMask)) {
1156	// keep the two _mm_test[zc]_siXXX next to each other
1157	bool cf = _mm256_testc_si256(a: srcVector, b: epilogueAlphaMask);
1158	if (RGBA)
1159	srcVector = _mm256_shuffle_epi8(a: srcVector, b: rgbaMask);
1160	if (!cf) {
1161	__m256i src1 = _mm256_unpacklo_epi8(a: srcVector, b: zero);
1162	__m256i src2 = _mm256_unpackhi_epi8(a: srcVector, b: zero);
1163	__m256i alpha1 = _mm256_shuffle_epi8(a: src1, b: shuffleMask);
1164	__m256i alpha2 = _mm256_shuffle_epi8(a: src2, b: shuffleMask);
1165	src1 = _mm256_mullo_epi16(a: src1, b: alpha1);
1166	src2 = _mm256_mullo_epi16(a: src2, b: alpha2);
1167	src1 = _mm256_add_epi16(a: src1, b: _mm256_srli_epi16(a: src1, count: `8`));
1168	src2 = _mm256_add_epi16(a: src2, b: _mm256_srli_epi16(a: src2, count: `8`));
1169	src1 = _mm256_add_epi16(a: src1, b: half);
1170	src2 = _mm256_add_epi16(a: src2, b: half);
1171	src1 = _mm256_srli_epi16(a: src1, count: `8`);
1172	src2 = _mm256_srli_epi16(a: src2, count: `8`);
1173	src1 = _mm256_blend_epi16(src1, alpha1, `0x88`);
1174	src2 = _mm256_blend_epi16(src2, alpha2, `0x88`);
1175	srcVector = _mm256_packus_epi16(a: src1, b: src2);
1176	_mm256_maskstore_epi32(X: reinterpret_cast<int *>(buffer + i), M: epilogueMask, Y: srcVector);
1177	} else {
1178	if (buffer != src \|\| RGBA)
1179	_mm256_maskstore_epi32(X: reinterpret_cast<int *>(buffer + i), M: epilogueMask, Y: srcVector);
1180	}
1181	} else {
1182	_mm256_maskstore_epi32(X: reinterpret_cast<int *>(buffer + i), M: epilogueMask, Y: zero);
1183	}
1184	}
1185	}
1186
// In-place premultiplication of count pixels: buffer is passed as both source and destination,
// with no byte swizzle (RGBA = false). The unnamed QList<QRgb> parameter is unused.
1187	void QT_FASTCALL convertARGB32ToARGB32PM_avx2(uint buffer, int* count, const QList<QRgb> *)
1188	{
1189	convertARGBToARGB32PM_avx2<false>(buffer, src: buffer, count);
1190	}
1191
// In-place conversion of count pixels with the byte 0 / byte 2 swizzle enabled (RGBA = true)
// followed by premultiplication. The unnamed QList<QRgb> parameter is unused.
1192	void QT_FASTCALL convertRGBA8888ToARGB32PM_avx2(uint buffer, int* count, const QList<QRgb> *)
1193	{
1194	convertARGBToARGB32PM_avx2<true>(buffer, src: buffer, count);
1195	}
1196
// Reads count pixels starting at pixel index `index` of src (viewed as 32-bit pixels), writes
// the premultiplied result to buffer and returns buffer. The trailing parameters are unused.
1197	const uint QT_FASTCALL fetchARGB32ToARGB32PM_avx2(uint buffer, const uchar src, int* index, int count,
1198	const QList<QRgb> , QDitherInfo )
1199	{
1200	convertARGBToARGB32PM_avx2<false>(buffer, src: reinterpret_cast<const uint *>(src) + index, count);
1201	return buffer;
1202	}
1203
// Same as the ARGB32 fetch, but with the byte 0 / byte 2 swizzle enabled (RGBA = true):
// reads count pixels from src starting at pixel index `index`, writes to buffer, returns buffer.
1204	const uint QT_FASTCALL fetchRGBA8888ToARGB32PM_avx2(uint buffer, const uchar src, int* index, int count,
1205	const QList<QRgb> , QDitherInfo )
1206	{
1207	convertARGBToARGB32PM_avx2<true>(buffer, src: reinterpret_cast<const uint *>(src) + index, count);
1208	return buffer;
1209	}
1210
// Converts count 32-bit pixels from src into premultiplied 64-bit pixels (16 bits per channel)
// in buffer, eight source pixels per iteration (two 256-bit stores).
// When RGBA is false, bytes 0 and 2 of every source pixel are swapped first (rgbaMask).
// Widening is done by unpacking the vector with itself, which duplicates every byte into
// both halves of a 16-bit word. Per group of eight pixels:
//   - all alpha bytes zero -> zeros are stored;
//   - all alpha bytes 0xff -> the widened pixels are stored unchanged;
//   - otherwise            -> each channel becomes mulhi(c, a) plus its own top bit as a
//                             rounding term, and the widened alpha is put back.
// A final partial group is handled with a masked load and masked 64-bit stores.
1211	template<bool RGBA>
1212	static void convertARGBToRGBA64PM_avx2(QRgba64 buffer, const* uint *src, qsizetype count)
1213	{
1214	qsizetype i = `0`;
1215	const __m256i alphaMask = _mm256_set1_epi32(i: `0xff000000`);
1216	const __m256i rgbaMask = _mm256_broadcastsi128_si256(X: _mm_setr_epi8(b0: `2`, b1: `1`, b2: `0`, b3: `3`, b4: `6`, b5: `5`, b6: `4`, b7: `7`, b8: `10`, b9: `9`, b10: `8`, b11: `11`, b12: `14`, b13: `13`, b14: `12`, b15: `15`));
// Replicates each widened pixel's alpha word across all four channel slots of that pixel.
1217	const __m256i shuffleMask = _mm256_broadcastsi128_si256(X: _mm_setr_epi8(b0: `6`, b1: `7`, b2: `6`, b3: `7`, b4: `6`, b5: `7`, b6: `6`, b7: `7`, b8: `14`, b9: `15`, b10: `14`, b11: `15`, b12: `14`, b13: `15`, b14: `14`, b15: `15`));
1218	const __m256i zero = _mm256_setzero_si256();
1219
1220	for (; i < count - `7`; i += `8`) {
1221	__m256i dst1, dst2;
1222	__m256i srcVector = _mm256_loadu_si256(p: reinterpret_cast<const __m256i *>(src + i));
1223	if (!_mm256_testz_si256(a: srcVector, b: alphaMask)) {
1224	// keep the two _mm_test[zc]_siXXX next to each other
// cf is true when every alpha bit is set, i.e. all eight pixels are opaque.
1225	bool cf = _mm256_testc_si256(a: srcVector, b: alphaMask);
1226	if (!RGBA)
1227	srcVector = _mm256_shuffle_epi8(a: srcVector, b: rgbaMask);
1228
1229	// The two unpack instructions unpack the low and upper halves of
1230	// each 128-bit half of the 256-bit register. Here's the tracking
1231	// of what's where: (p is 32-bit, P is 64-bit)
1232	// as loaded: [ p1, p2, p3, p4; p5, p6, p7, p8 ]
1233	// after permute4x64 [ p1, p2, p5, p6; p3, p4, p7, p8 ]
1234	// after unpacklo/hi [ P1, P2; P3, P4 ] [ P5, P6; P7, P8 ]
1235	srcVector = _mm256_permute4x64_epi64(srcVector, _MM_SHUFFLE(`3`, `1`, `2`, `0`));
1236
1237	const __m256i src1 = _mm256_unpacklo_epi8(a: srcVector, b: srcVector);
1238	const __m256i src2 = _mm256_unpackhi_epi8(a: srcVector, b: srcVector);
1239	if (!cf) {
1240	const __m256i alpha1 = _mm256_shuffle_epi8(a: src1, b: shuffleMask);
1241	const __m256i alpha2 = _mm256_shuffle_epi8(a: src2, b: shuffleMask);
1242	dst1 = _mm256_mulhi_epu16(a: src1, b: alpha1);
1243	dst2 = _mm256_mulhi_epu16(a: src2, b: alpha2);
// Rounding term: add the top bit of the high product.
1244	dst1 = _mm256_add_epi16(a: dst1, b: _mm256_srli_epi16(a: dst1, count: `15`));
1245	dst2 = _mm256_add_epi16(a: dst2, b: _mm256_srli_epi16(a: dst2, count: `15`));
// 0x88 selects words 3 and 7 of each 128-bit lane: the alpha slots keep the widened alpha.
1246	dst1 = _mm256_blend_epi16(dst1, src1, `0x88`);
1247	dst2 = _mm256_blend_epi16(dst2, src2, `0x88`);
1248	} else {
1249	dst1 = src1;
1250	dst2 = src2;
1251	}
1252	} else {
1253	dst1 = dst2 = zero;
1254	}
1255	_mm256_storeu_si256(p: reinterpret_cast<__m256i *>(buffer + i), a: dst1);
1256	_mm256_storeu_si256(p: reinterpret_cast<__m256i *>(buffer + i) + `1`, a: dst2);
1257	}
1258
// Epilogue: fewer than eight pixels remain.
1259	if (i < count) {
1260	__m256i epilogueMask = epilogueMaskFromCount(count: count - i);
1261	const __m256i epilogueAlphaMask = _mm256_blendv_epi8(V1: _mm256_setzero_si256(), V2: alphaMask, M: epilogueMask);
1262	__m256i dst1, dst2;
1263	__m256i srcVector = _mm256_maskload_epi32(X: reinterpret_cast<const int *>(src + i), M: epilogueMask);
1264
1265	if (!_mm256_testz_si256(a: srcVector, b: epilogueAlphaMask)) {
1266	// keep the two _mm_test[zc]_siXXX next to each other
1267	bool cf = _mm256_testc_si256(a: srcVector, b: epilogueAlphaMask);
1268	if (!RGBA)
1269	srcVector = _mm256_shuffle_epi8(a: srcVector, b: rgbaMask);
1270	srcVector = _mm256_permute4x64_epi64(srcVector, _MM_SHUFFLE(`3`, `1`, `2`, `0`));
1271	const __m256i src1 = _mm256_unpacklo_epi8(a: srcVector, b: srcVector);
1272	const __m256i src2 = _mm256_unpackhi_epi8(a: srcVector, b: srcVector);
1273	if (!cf) {
1274	const __m256i alpha1 = _mm256_shuffle_epi8(a: src1, b: shuffleMask);
1275	const __m256i alpha2 = _mm256_shuffle_epi8(a: src2, b: shuffleMask);
1276	dst1 = _mm256_mulhi_epu16(a: src1, b: alpha1);
1277	dst2 = _mm256_mulhi_epu16(a: src2, b: alpha2);
1278	dst1 = _mm256_add_epi16(a: dst1, b: _mm256_srli_epi16(a: dst1, count: `15`));
1279	dst2 = _mm256_add_epi16(a: dst2, b: _mm256_srli_epi16(a: dst2, count: `15`));
1280	dst1 = _mm256_blend_epi16(dst1, src1, `0x88`);
1281	dst2 = _mm256_blend_epi16(dst2, src2, `0x88`);
1282	} else {
1283	dst1 = src1;
1284	dst2 = src2;
1285	}
1286	} else {
1287	dst1 = dst2 = zero;
1288	}
// Widen the 32-bit-lane mask to 64-bit lanes, reordered the same way as the pixel data, so
// it can drive the two masked 64-bit stores.
1289	epilogueMask = _mm256_permute4x64_epi64(epilogueMask, _MM_SHUFFLE(`3`, `1`, `2`, `0`));
1290	_mm256_maskstore_epi64(X: reinterpret_cast<qint64 *>(buffer + i),
1291	M: _mm256_unpacklo_epi32(a: epilogueMask, b: epilogueMask),
1292	Y: dst1);
1293	_mm256_maskstore_epi64(X: reinterpret_cast<qint64 *>(buffer + i + `4`),
1294	M: _mm256_unpackhi_epi32(a: epilogueMask, b: epilogueMask),
1295	Y: dst2);
1296	}
1297	}
1298
// Converts count 32-bit pixels from src into premultiplied 64-bit pixels in buffer using the
// RGBA = false variant (bytes 0 and 2 swapped before widening), and returns buffer.
// The trailing parameters are unused.
1299	const QRgba64 * QT_FASTCALL convertARGB32ToRGBA64PM_avx2(QRgba64 buffer, const* uint src, int* count,
1300	const QList<QRgb> , QDitherInfo )
1301	{
1302	convertARGBToRGBA64PM_avx2<false>(buffer, src, count);
1303	return buffer;
1304	}
1305
// Converts count 32-bit pixels from src into premultiplied 64-bit pixels in buffer using the
// RGBA = true variant (no byte swap before widening), and returns buffer.
// The trailing parameters are unused.
1306	const QRgba64 * QT_FASTCALL convertRGBA8888ToRGBA64PM_avx2(QRgba64 buffer, const* uint src, int* count,
1307	const QList<QRgb> , QDitherInfo )
1308	{
1309	convertARGBToRGBA64PM_avx2<true>(buffer, src, count);
1310	return buffer;
1311	}
1312
// Reads count pixels starting at pixel index `index` of src (viewed as 32-bit pixels), writes
// premultiplied 64-bit pixels to buffer (RGBA = false variant) and returns buffer.
1313	const QRgba64 QT_FASTCALL fetchARGB32ToRGBA64PM_avx2(QRgba64 buffer, const uchar src, int* index, int count,
1314	const QList<QRgb> , QDitherInfo )
1315	{
1316	convertARGBToRGBA64PM_avx2<false>(buffer, src: reinterpret_cast<const uint *>(src) + index, count);
1317	return buffer;
1318	}
1319
// Reads count pixels starting at pixel index `index` of src (viewed as 32-bit pixels), writes
// premultiplied 64-bit pixels to buffer (RGBA = true variant) and returns buffer.
1320	const QRgba64 QT_FASTCALL fetchRGBA8888ToRGBA64PM_avx2(QRgba64 buffer, const uchar src, int* index, int count,
1321	const QList<QRgb> , QDitherInfo )
1322	{
1323	convertARGBToRGBA64PM_avx2<true>(buffer, src: reinterpret_cast<const uint *>(src) + index, count);
1324	return buffer;
1325	}
1326
// Premultiplies count 64-bit pixels (four 16-bit channels each) read from src starting at
// pixel index `index`, writes them to buffer and returns buffer.
// Vector loop: four pixels per iteration. The full 32-bit product p = channel * alpha is
// rebuilt from mullo/mulhi halves and reduced with (p + (p >> 16) + 0x8000) >> 16; the
// original alpha word is then put back.
// Tail: one pixel at a time via multiplyAlpha65535(), with alpha restored explicitly.
1327	const QRgba64 QT_FASTCALL fetchRGBA64ToRGBA64PM_avx2(QRgba64 buffer, const uchar src, int* index, int count,
1328	const QList<QRgb> , QDitherInfo )
1329	{
1330	const QRgba64 s = reinterpret_cast<const* QRgba64 *>(src) + index;
1331	int i = `0`;
1332	const __m256i vh = _mm256_set1_epi32(i: `0x8000`);
1333	for (; i < count - `3`; i += `4`) {
1334	__m256i vs256 = _mm256_loadu_si256(p: (const __m256i *)(s + i));
// Broadcast word 3 of every pixel (its alpha slot) across that pixel's four words.
1335	__m256i va256 = _mm256_shufflelo_epi16(vs256, _MM_SHUFFLE(`3`, `3`, `3`, `3`));
1336	va256 = _mm256_shufflehi_epi16(va256, _MM_SHUFFLE(`3`, `3`, `3`, `3`));
1337	const __m256i vmullo = _mm256_mullo_epi16(a: vs256, b: va256);
1338	const __m256i vmulhi = _mm256_mulhi_epu16(a: vs256, b: va256);
// Interleaving the low and high halves yields the 32-bit products.
1339	__m256i vslo = _mm256_unpacklo_epi16(a: vmullo, b: vmulhi);
1340	__m256i vshi = _mm256_unpackhi_epi16(a: vmullo, b: vmulhi);
1341	vslo = _mm256_add_epi32(a: vslo, b: _mm256_srli_epi32(a: vslo, count: `16`));
1342	vshi = _mm256_add_epi32(a: vshi, b: _mm256_srli_epi32(a: vshi, count: `16`));
1343	vslo = _mm256_add_epi32(a: vslo, b: vh);
1344	vshi = _mm256_add_epi32(a: vshi, b: vh);
1345	vslo = _mm256_srli_epi32(a: vslo, count: `16`);
1346	vshi = _mm256_srli_epi32(a: vshi, count: `16`);
1347	vs256 = _mm256_packus_epi32(V1: vslo, V2: vshi);
// 0x88 selects words 3 and 7 of each 128-bit lane: restore the unmultiplied alpha.
1348	vs256 = _mm256_blend_epi16(vs256, va256, `0x88`);
1349	_mm256_storeu_si256(p: (__m256i *)(buffer + i), a: vs256);
1350	}
1351	for (; i < count; ++i) {
// Remember alpha first: the multiply below also scales the alpha slot.
1352	const auto a = s[i].alpha();
1353	__m128i vs = _mm_loadl_epi64(p: (const __m128i *)(s + i));
1354	__m128i va = _mm_shufflelo_epi16(vs, _MM_SHUFFLE(`3`, `3`, `3`, `3`));
1355	vs = multiplyAlpha65535(rgba64: vs, va);
1356	_mm_storel_epi64(p: (__m128i *)(buffer + i), a: vs);
1357	buffer[i].setAlpha(a);
1358	}
1359	return buffer;
1360	}
1361
1362	const uint QT_FASTCALL fetchRGB16FToRGB32_avx2(uint buffer, const uchar src, int* index, int count,
1363	const QList<QRgb> , QDitherInfo )
1364	{
1365	const quint64 s = reinterpret_cast<const* quint64 *>(src) + index;
1366	const __m256 vf = _mm256_set1_ps(w: `255.0f`);
1367	const __m256 vh = _mm256_set1_ps(w: `0.5f`);
1368	int i = `0`;
1369	for (; i + `1` < count; i += `2`) {
1370	__m256 vsf = _mm256_cvtph_ps(a: _mm_loadu_si128(p: (const __m128i *)(s + i)));
1371	vsf = _mm256_mul_ps(a: vsf, b: vf);
1372	vsf = _mm256_add_ps(a: vsf, b: vh);
1373	__m256i vsi = _mm256_cvttps_epi32(a: vsf);
1374	vsi = _mm256_packs_epi32(a: vsi, b: vsi);
1375	vsi = _mm256_shufflelo_epi16(vsi, _MM_SHUFFLE(`3`, `0`, `1`, `2`));
1376	vsi = _mm256_permute4x64_epi64(vsi, _MM_SHUFFLE(`3`, `1`, `2`, `0`));
1377	__m128i vsi128 = _mm256_castsi256_si128(a: vsi);
1378	vsi128 = _mm_packus_epi16(a: vsi128, b: vsi128);
1379	_mm_storel_epi64(p: (__m128i *)(buffer + i), a: vsi128);
1380	}
1381	if (i < count) {
1382	__m128 vsf = _mm_cvtph_ps(a: _mm_loadl_epi64(p: (const __m128i *)(s + i)));
1383	vsf = _mm_mul_ps(a: vsf, b: _mm_set1_ps(w: `255.0f`));
1384	vsf = _mm_add_ps(a: vsf, b: _mm_set1_ps(w: `0.5f`));
1385	__m128i vsi = _mm_cvttps_epi32(a: vsf);
1386	vsi = _mm_packs_epi32(a: vsi, b: vsi);
1387	vsi = _mm_shufflelo_epi16(vsi, _MM_SHUFFLE(`3`, `0`, `1`, `2`));
1388	vsi = _mm_packus_epi16(a: vsi, b: vsi);
1389	buffer[i] = _mm_cvtsi128_si32(a: vsi);
1390	}
1391	return buffer;
1392	}
1393
1394	const uint QT_FASTCALL fetchRGBA16FToARGB32PM_avx2(uint buffer, const uchar src, int* index, int count,
1395	const QList<QRgb> , QDitherInfo )
1396	{
1397	const quint64 s = reinterpret_cast<const* quint64 *>(src) + index;
1398	const __m256 vf = _mm256_set1_ps(w: `255.0f`);
1399	const __m256 vh = _mm256_set1_ps(w: `0.5f`);
1400	int i = `0`;
1401	for (; i + `1` < count; i += `2`) {
1402	__m256 vsf = _mm256_cvtph_ps(a: _mm_loadu_si128(p: (const __m128i *)(s + i)));
1403	__m256 vsa = _mm256_permute_ps(vsf, _MM_SHUFFLE(`3`, `3`, `3`, `3`));
1404	vsf = _mm256_mul_ps(a: vsf, b: vsa);
1405	vsf = _mm256_blend_ps(vsf, vsa, `0x88`);
1406	vsf = _mm256_mul_ps(a: vsf, b: vf);
1407	vsf = _mm256_add_ps(a: vsf, b: vh);
1408	__m256i vsi = _mm256_cvttps_epi32(a: vsf);
1409	vsi = _mm256_packus_epi32(V1: vsi, V2: vsi);
1410	vsi = _mm256_shufflelo_epi16(vsi, _MM_SHUFFLE(`3`, `0`, `1`, `2`));
1411	vsi = _mm256_permute4x64_epi64(vsi, _MM_SHUFFLE(`3`, `1`, `2`, `0`));
1412	__m128i vsi128 = _mm256_castsi256_si128(a: vsi);
1413	vsi128 = _mm_packus_epi16(a: vsi128, b: vsi128);
1414	_mm_storel_epi64(p: (__m128i *)(buffer + i), a: vsi128);
1415	}
1416	if (i < count) {
1417	__m128 vsf = _mm_cvtph_ps(a: _mm_loadl_epi64(p: (const __m128i *)(s + i)));
1418	__m128 vsa = _mm_permute_ps(vsf, _MM_SHUFFLE(`3`, `3`, `3`, `3`));
1419	vsf = _mm_mul_ps(a: vsf, b: vsa);
1420	vsf = _mm_insert_ps(vsf, vsa, `0x30`);
1421	vsf = _mm_mul_ps(a: vsf, b: _mm_set1_ps(w: `255.0f`));
1422	vsf = _mm_add_ps(a: vsf, b: _mm_set1_ps(w: `0.5f`));
1423	__m128i vsi = _mm_cvttps_epi32(a: vsf);
1424	vsi = _mm_packus_epi32(V1: vsi, V2: vsi);
1425	vsi = _mm_shufflelo_epi16(vsi, _MM_SHUFFLE(`3`, `0`, `1`, `2`));
1426	vsi = _mm_packus_epi16(a: vsi, b: vsi);
1427	buffer[i] = _mm_cvtsi128_si32(a: vsi);
1428	}
1429	return buffer;
1430	}
1431
1432	const QRgba64 QT_FASTCALL fetchRGBA16FPMToRGBA64PM_avx2(QRgba64 buffer, const uchar src, int* index, int count,
1433	const QList<QRgb> , QDitherInfo )
1434	{
1435	const quint64 s = reinterpret_cast<const* quint64 *>(src) + index;
1436	const __m256 vf = _mm256_set1_ps(w: `65535.0f`);
1437	const __m256 vh = _mm256_set1_ps(w: `0.5f`);
1438	int i = `0`;
1439	for (; i + `1` < count; i += `2`) {
1440	__m256 vsf = _mm256_cvtph_ps(a: _mm_loadu_si128(p: (const __m128i *)(s + i)));
1441	vsf = _mm256_mul_ps(a: vsf, b: vf);
1442	vsf = _mm256_add_ps(a: vsf, b: vh);
1443	__m256i vsi = _mm256_cvttps_epi32(a: vsf);
1444	vsi = _mm256_packus_epi32(V1: vsi, V2: vsi);
1445	vsi = _mm256_permute4x64_epi64(vsi, _MM_SHUFFLE(`3`, `1`, `2`, `0`));
1446	_mm_storeu_si128(p: (__m128i *)(buffer + i), b: _mm256_castsi256_si128(a: vsi));
1447	}
1448	if (i < count) {
1449	__m128 vsf = _mm_cvtph_ps(a: _mm_loadl_epi64(p: (const __m128i *)(s + i)));
1450	vsf = _mm_mul_ps(a: vsf, b: _mm_set1_ps(w: `65535.0f`));
1451	vsf = _mm_add_ps(a: vsf, b: _mm_set1_ps(w: `0.5f`));
1452	__m128i vsi = _mm_cvttps_epi32(a: vsf);
1453	vsi = _mm_packus_epi32(V1: vsi, V2: vsi);
1454	_mm_storel_epi64(p: (__m128i *)(buffer + i), a: vsi);
1455	}
1456	return buffer;
1457	}
1458
1459	const QRgba64 QT_FASTCALL fetchRGBA16FToRGBA64PM_avx2(QRgba64 buffer, const uchar src, int* index, int count,
1460	const QList<QRgb> , QDitherInfo )
1461	{
1462	const quint64 s = reinterpret_cast<const* quint64 *>(src) + index;
1463	const __m256 vf = _mm256_set1_ps(w: `65535.0f`);
1464	const __m256 vh = _mm256_set1_ps(w: `0.5f`);
1465	int i = `0`;
1466	for (; i + `1` < count; i += `2`) {
1467	__m256 vsf = _mm256_cvtph_ps(a: _mm_loadu_si128(p: (const __m128i *)(s + i)));
1468	__m256 vsa = _mm256_shuffle_ps(vsf, vsf, _MM_SHUFFLE(`3`, `3`, `3`, `3`));
1469	vsf = _mm256_mul_ps(a: vsf, b: vsa);
1470	vsf = _mm256_blend_ps(vsf, vsa, `0x88`);
1471	vsf = _mm256_mul_ps(a: vsf, b: vf);
1472	vsf = _mm256_add_ps(a: vsf, b: vh);
1473	__m256i vsi = _mm256_cvttps_epi32(a: vsf);
1474	vsi = _mm256_packus_epi32(V1: vsi, V2: vsi);
1475	vsi = _mm256_permute4x64_epi64(vsi, _MM_SHUFFLE(`3`, `1`, `2`, `0`));
1476	_mm_storeu_si128(p: (__m128i *)(buffer + i), b: _mm256_castsi256_si128(a: vsi));
1477	}
1478	if (i < count) {
1479	__m128 vsf = _mm_cvtph_ps(a: _mm_loadl_epi64(p: (const __m128i *)(s + i)));
1480	__m128 vsa = _mm_shuffle_ps(vsf, vsf, _MM_SHUFFLE(`3`, `3`, `3`, `3`));
1481	vsf = _mm_mul_ps(a: vsf, b: vsa);
1482	vsf = _mm_insert_ps(vsf, vsa, `0x30`);
1483	vsf = _mm_mul_ps(a: vsf, b: _mm_set1_ps(w: `65535.0f`));
1484	vsf = _mm_add_ps(a: vsf, b: _mm_set1_ps(w: `0.5f`));
1485	__m128i vsi = _mm_cvttps_epi32(a: vsf);
1486	vsi = _mm_packus_epi32(V1: vsi, V2: vsi);
1487	_mm_storel_epi64(p: (__m128i *)(buffer + i), a: vsi);
1488	}
1489	return buffer;
1490	}
1491
1492	void QT_FASTCALL storeRGB16FFromRGB32_avx2(uchar dest, const* uint src, int* index, int count,
1493	const QList<QRgb> , QDitherInfo )
1494	{
1495	quint64 d = reinterpret_cast<quint64 >(dest) + index;
1496	const __m256 vf = _mm256_set1_ps(w: `1.0f` / `255.0f`);
1497	int i = `0`;
1498	for (; i + `1` < count; i += `2`) {
1499	__m256i vsi = _mm256_cvtepu8_epi32(V: _mm_loadl_epi64(p: (const __m128i *)(src + i)));
1500	vsi = _mm256_shuffle_epi32(vsi, _MM_SHUFFLE(`3`, `0`, `1`, `2`));
1501	__m256 vsf = _mm256_cvtepi32_ps(a: vsi);
1502	vsf = _mm256_mul_ps(a: vsf, b: vf);
1503	_mm_storeu_si128(p: (__m128i *)(d + i), _mm256_cvtps_ph(vsf, `0`));
1504	}
1505	if (i < count) {
1506	__m128i vsi = _mm_cvtsi32_si128(a: src[i]);
1507	vsi = _mm_cvtepu8_epi32(V: vsi);
1508	vsi = _mm_shuffle_epi32(vsi, _MM_SHUFFLE(`3`, `0`, `1`, `2`));
1509	__m128 vsf = _mm_cvtepi32_ps(a: vsi);
1510	vsf = _mm_mul_ps(a: vsf, b: _mm_set1_ps(w: `1.0f` / `255.0f`));
1511	_mm_storel_epi64(p: (__m128i *)(d + i), _mm_cvtps_ph(vsf, `0`));
1512	}
1513	}
1514
1515	void QT_FASTCALL storeRGBA16FFromARGB32PM_avx2(uchar dest, const* uint src, int* index, int count,
1516	const QList<QRgb> , QDitherInfo )
1517	{
1518	quint64 d = reinterpret_cast<quint64 >(dest) + index;
1519	const __m128 vf = _mm_set1_ps(w: `1.0f` / `255.0f`);
1520	for (int i = `0`; i < count; ++i) {
1521	const uint s = src[i];
1522	__m128i vsi = _mm_cvtsi32_si128(a: s);
1523	vsi = _mm_cvtepu8_epi32(V: vsi);
1524	vsi = _mm_shuffle_epi32(vsi, _MM_SHUFFLE(`3`, `0`, `1`, `2`));
1525	__m128 vsf = _mm_cvtepi32_ps(a: vsi);
1526	const uint8_t a = (s >> `24`);
1527	if (a == `255`)
1528	vsf = _mm_mul_ps(a: vsf, b: vf);
1529	else if (a == `0`)
1530	vsf = _mm_set1_ps(w: `0.0f`);
1531	else {
1532	const __m128 vsa = _mm_permute_ps(vsf, _MM_SHUFFLE(`3`, `3`, `3`, `3`));
1533	__m128 vsr = _mm_rcp_ps(a: vsa);
1534	vsr = _mm_sub_ps(a: _mm_add_ps(a: vsr, b: vsr), b: _mm_mul_ps(a: vsr, b: _mm_mul_ps(a: vsr, b: vsa)));
1535	vsr = _mm_insert_ps(vsr, _mm_set_ss(`1.0f`), `0x30`);
1536	vsf = _mm_mul_ps(a: vsf, b: vsr);
1537	}
1538	_mm_storel_epi64(p: (__m128i *)(d + i), _mm_cvtps_ph(vsf, `0`));
1539	}
1540	}
1541
1542	#if QT_CONFIG(raster_fp)
1543	const QRgbaFloat32 QT_FASTCALL fetchRGBA16FToRGBA32F_avx2(QRgbaFloat32 buffer, const uchar src, int* index, int count,
1544	const QList<QRgb> , QDitherInfo )
1545	{
1546	const quint64 s = reinterpret_cast<const* quint64 *>(src) + index;
1547	int i = `0`;
1548	for (; i + `1` < count; i += `2`) {
1549	__m256 vsf = _mm256_cvtph_ps(a: _mm_loadu_si128(p: (const __m128i *)(s + i)));
1550	__m256 vsa = _mm256_permute_ps(vsf, _MM_SHUFFLE(`3`, `3`, `3`, `3`));
1551	vsf = _mm256_mul_ps(a: vsf, b: vsa);
1552	vsf = _mm256_blend_ps(vsf, vsa, `0x88`);
1553	_mm256_storeu_ps(p: (float *)(buffer + i), a: vsf);
1554	}
1555	if (i < count) {
1556	__m128 vsf = _mm_cvtph_ps(a: _mm_loadl_epi64(p: (const __m128i *)(s + i)));
1557	__m128 vsa = _mm_permute_ps(vsf, _MM_SHUFFLE(`3`, `3`, `3`, `3`));
1558	vsf = _mm_mul_ps(a: vsf, b: vsa);
1559	vsf = _mm_insert_ps(vsf, vsa, `0x30`);
1560	_mm_storeu_ps(p: (float *)(buffer + i), a: vsf);
1561	}
1562	return buffer;
1563	}
1564
1565	void QT_FASTCALL storeRGBX16FFromRGBA32F_avx2(uchar dest, const* QRgbaFloat32 src, int* index, int count,
1566	const QList<QRgb> , QDitherInfo )
1567	{
1568	quint64 d = reinterpret_cast<quint64 >(dest) + index;
1569	const __m128 s = reinterpret_cast<const* __m128 *>(src);
1570	const __m128 zero = _mm_set_ps(z: `1.0f`, y: `0.0f`, x: `0.0f`, w: `0.0f`);
1571	for (int i = `0`; i < count; ++i) {
1572	__m128 vsf = _mm_loadu_ps(p: reinterpret_cast<const float *>(s + i));
1573	const __m128 vsa = _mm_permute_ps(vsf, _MM_SHUFFLE(`3`, `3`, `3`, `3`));
1574	const float a = _mm_cvtss_f32(a: vsa);
1575	if (a == `1.0f`)
1576	{ }
1577	else if (a == `0.0f`)
1578	vsf = zero;
1579	else {
1580	__m128 vsr = _mm_rcp_ps(a: vsa);
1581	vsr = _mm_sub_ps(a: _mm_add_ps(a: vsr, b: vsr), b: _mm_mul_ps(a: vsr, b: _mm_mul_ps(a: vsr, b: vsa)));
1582	vsf = _mm_mul_ps(a: vsf, b: vsr);
1583	vsf = _mm_insert_ps(vsf, _mm_set_ss(`1.0f`), `0x30`);
1584	}
1585	_mm_storel_epi64(p: (__m128i *)(d + i), _mm_cvtps_ph(vsf, `0`));
1586	}
1587	}
1588
1589	void QT_FASTCALL storeRGBA16FFromRGBA32F_avx2(uchar dest, const* QRgbaFloat32 src, int* index, int count,
1590	const QList<QRgb> , QDitherInfo )
1591	{
1592	quint64 d = reinterpret_cast<quint64 >(dest) + index;
1593	const __m128 s = reinterpret_cast<const* __m128 *>(src);
1594	const __m128 zero = _mm_set1_ps(w: `0.0f`);
1595	for (int i = `0`; i < count; ++i) {
1596	__m128 vsf = _mm_loadu_ps(p: reinterpret_cast<const float *>(s + i));
1597	const __m128 vsa = _mm_permute_ps(vsf, _MM_SHUFFLE(`3`, `3`, `3`, `3`));
1598	const float a = _mm_cvtss_f32(a: vsa);
1599	if (a == `1.0f`)
1600	{ }
1601	else if (a == `0.0f`)
1602	vsf = zero;
1603	else {
1604	__m128 vsr = _mm_rcp_ps(a: vsa);
1605	vsr = _mm_sub_ps(a: _mm_add_ps(a: vsr, b: vsr), b: _mm_mul_ps(a: vsr, b: _mm_mul_ps(a: vsr, b: vsa)));
1606	vsr = _mm_insert_ps(vsr, _mm_set_ss(`1.0f`), `0x30`);
1607	vsf = _mm_mul_ps(a: vsf, b: vsr);
1608	}
1609	_mm_storel_epi64(p: (__m128i *)(d + i), _mm_cvtps_ph(vsf, `0`));
1610	}
1611	}
1612	#endif
1613
1614	QT_END_NAMESPACE
1615
1616	#endif
1617

Provided by KDAB

Definitions

BYTE_MUL_AVX2
BYTE_MUL_RGB64_AVX2
INTERPOLATE_PIXEL_255_AVX2
INTERPOLATE_PIXEL_RGB64_AVX2
BLEND_SOURCE_OVER_ARGB32_AVX2
BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_AVX2
qt_blend_argb32_on_argb32_avx2
qt_blend_rgb32_on_rgb32_avx2
qt_memfillXX_avx2
qt_memfill64_avx2
qt_memfill32_avx2
comp_func_SourceOver_avx2
comp_func_SourceOver_rgb64_avx2
comp_func_SourceOver_rgbafp_avx2
comp_func_Source_avx2
comp_func_Source_rgb64_avx2
comp_func_Source_rgbafp_avx2
comp_func_solid_SourceOver_avx2
comp_func_solid_SourceOver_rgb64_avx2
comp_func_solid_Source_rgbafp_avx2
comp_func_solid_SourceOver_rgbafp_avx2
fetchTransformedBilinear_pixelBounds
fetchTransformedBilinearARGB32PM_simple_scale_helper_avx2
intermediate_adder_avx2
fetchTransformedBilinearARGB32PM_downscale_helper_avx2
fetchTransformedBilinearARGB32PM_fast_rotate_helper_avx2
epilogueMaskFromCount
convertARGBToARGB32PM_avx2
convertARGB32ToARGB32PM_avx2
convertRGBA8888ToARGB32PM_avx2
fetchARGB32ToARGB32PM_avx2
fetchRGBA8888ToARGB32PM_avx2
convertARGBToRGBA64PM_avx2
convertARGB32ToRGBA64PM_avx2
convertRGBA8888ToRGBA64PM_avx2
fetchARGB32ToRGBA64PM_avx2
fetchRGBA8888ToRGBA64PM_avx2
fetchRGBA64ToRGBA64PM_avx2
fetchRGB16FToRGB32_avx2
fetchRGBA16FToARGB32PM_avx2
fetchRGBA16FPMToRGBA64PM_avx2
fetchRGBA16FToRGBA64PM_avx2
storeRGB16FFromRGB32_avx2
storeRGBA16FFromARGB32PM_avx2
fetchRGBA16FToRGBA32F_avx2
storeRGBX16FFromRGBA32F_avx2

Start learning QML with our Intro Training

Find out more

Definitions

source code of qtbase/src/gui/painting/qdrawhelper_avx2.cpp