// Copyright (C) 2016 The Qt Company Ltd.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only

#include <private/qdrawhelper_p.h>
#include <private/qdrawingprimitive_sse2_p.h>
#include <private/qpaintengine_raster_p.h>
#include <private/qpixellayout_p.h>

#if defined(QT_COMPILER_SUPPORTS_SSE4_1)

QT_BEGIN_NAMESPACE

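// When the build itself targets Haswell or later, the AVX2 code path
// (qdrawhelper_avx2.cpp) provides wider versions of these conversions, so the
// SSE4 variants below are compiled out.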
#ifndef __haswell__
template<bool RGBA>
static void convertARGBToARGB32PM_sse4(uint *buffer, const uint *src, int count)
{
    int i = 0;
    const __m128i alphaMask = _mm_set1_epi32(0xff000000);
    const __m128i rgbaMask = _mm_setr_epi8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15);
    const __m128i shuffleMask = _mm_setr_epi8(6, 7, 6, 7, 6, 7, 6, 7, 14, 15, 14, 15, 14, 15, 14, 15);
    const __m128i half = _mm_set1_epi16(0x0080);
    const __m128i zero = _mm_setzero_si128();

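    // Four pixels per iteration. PTEST lets us skip the multiplications for
    // vectors that are fully transparent (all alpha bits zero) or fully opaque
    // (all alpha bits set); otherwise each channel is multiplied by alpha and
    // divided by 255 with the rounding trick (x + (x >> 8) + 0x80) >> 8.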
    for (; i < count - 3; i += 4) {
        __m128i srcVector = _mm_loadu_si128((const __m128i *)&src[i]);
        if (!_mm_testz_si128(srcVector, alphaMask)) {
            if (!_mm_testc_si128(srcVector, alphaMask)) {
                if (RGBA)
                    srcVector = _mm_shuffle_epi8(srcVector, rgbaMask);
                __m128i src1 = _mm_unpacklo_epi8(srcVector, zero);
                __m128i src2 = _mm_unpackhi_epi8(srcVector, zero);
                __m128i alpha1 = _mm_shuffle_epi8(src1, shuffleMask);
                __m128i alpha2 = _mm_shuffle_epi8(src2, shuffleMask);
                src1 = _mm_mullo_epi16(src1, alpha1);
                src2 = _mm_mullo_epi16(src2, alpha2);
                src1 = _mm_add_epi16(src1, _mm_srli_epi16(src1, 8));
                src2 = _mm_add_epi16(src2, _mm_srli_epi16(src2, 8));
                src1 = _mm_add_epi16(src1, half);
                src2 = _mm_add_epi16(src2, half);
                src1 = _mm_srli_epi16(src1, 8);
                src2 = _mm_srli_epi16(src2, 8);
                src1 = _mm_blend_epi16(src1, alpha1, 0x88);
                src2 = _mm_blend_epi16(src2, alpha2, 0x88);
                srcVector = _mm_packus_epi16(src1, src2);
                _mm_storeu_si128((__m128i *)&buffer[i], srcVector);
            } else {
                if (RGBA)
                    _mm_storeu_si128((__m128i *)&buffer[i], _mm_shuffle_epi8(srcVector, rgbaMask));
                else if (buffer != src)
                    _mm_storeu_si128((__m128i *)&buffer[i], srcVector);
            }
        } else {
            _mm_storeu_si128((__m128i *)&buffer[i], zero);
        }
    }

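    // Scalar tail: handle the up to three remaining pixels one at a time.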
    SIMD_EPILOGUE(i, count, 3) {
        uint v = qPremultiply(src[i]);
        buffer[i] = RGBA ? RGBA2ARGB(v) : v;
    }
}

template<bool RGBA>
static void convertARGBToRGBA64PM_sse4(QRgba64 *buffer, const uint *src, int count)
{
    int i = 0;
    const __m128i alphaMask = _mm_set1_epi32(0xff000000);
    const __m128i rgbaMask = _mm_setr_epi8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15);
    const __m128i shuffleMask = _mm_setr_epi8(6, 7, 6, 7, 6, 7, 6, 7, 14, 15, 14, 15, 14, 15, 14, 15);
    const __m128i zero = _mm_setzero_si128();

    for (; i < count - 3; i += 4) {
        __m128i srcVector = _mm_loadu_si128((const __m128i *)&src[i]);
        if (!_mm_testz_si128(srcVector, alphaMask)) {
            bool cf = _mm_testc_si128(srcVector, alphaMask);

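            // Interleaving the vector with itself widens each 8-bit channel c
            // to the 16-bit value c * 257 (0xAB -> 0xABAB), the exact 8- to
            // 16-bit color expansion.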
            if (!RGBA)
                srcVector = _mm_shuffle_epi8(srcVector, rgbaMask);
            const __m128i src1 = _mm_unpacklo_epi8(srcVector, srcVector);
            const __m128i src2 = _mm_unpackhi_epi8(srcVector, srcVector);
            if (!cf) {
                __m128i alpha1 = _mm_shuffle_epi8(src1, shuffleMask);
                __m128i alpha2 = _mm_shuffle_epi8(src2, shuffleMask);
                __m128i dst1 = _mm_mulhi_epu16(src1, alpha1);
                __m128i dst2 = _mm_mulhi_epu16(src2, alpha2);
                // Map 0->0xfffe to 0->0xffff
                dst1 = _mm_add_epi16(dst1, _mm_srli_epi16(dst1, 15));
                dst2 = _mm_add_epi16(dst2, _mm_srli_epi16(dst2, 15));
                // correct alpha value:
                dst1 = _mm_blend_epi16(dst1, src1, 0x88);
                dst2 = _mm_blend_epi16(dst2, src2, 0x88);
                _mm_storeu_si128((__m128i *)&buffer[i], dst1);
                _mm_storeu_si128((__m128i *)&buffer[i + 2], dst2);
            } else {
                _mm_storeu_si128((__m128i *)&buffer[i], src1);
                _mm_storeu_si128((__m128i *)&buffer[i + 2], src2);
            }
        } else {
            _mm_storeu_si128((__m128i *)&buffer[i], zero);
            _mm_storeu_si128((__m128i *)&buffer[i + 2], zero);
        }
    }

    SIMD_EPILOGUE(i, count, 3) {
        const uint s = RGBA ? RGBA2ARGB(src[i]) : src[i];
        buffer[i] = QRgba64::fromArgb32(s).premultiplied();
    }
}
#endif // __haswell__

static inline __m128 Q_DECL_VECTORCALL reciprocal_mul_ps(__m128 a, float mul)
{
    __m128 ia = _mm_rcp_ps(a); // Approximate 1/a
    // Improve precision of ia using Newton-Raphson
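    // One iteration, ia' = ia * (2 - a * ia), roughly doubles the ~12-bit
    // accuracy of RCPPS.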
    ia = _mm_sub_ps(_mm_add_ps(ia, ia), _mm_mul_ps(ia, _mm_mul_ps(ia, a)));
    ia = _mm_mul_ps(ia, _mm_set1_ps(mul));
    return ia;
}

template<bool RGBA, bool RGBx>
static inline void convertARGBFromARGB32PM_sse4(uint *buffer, const uint *src, int count)
{
    int i = 0;
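    // If invalid-operation FP exceptions are unmasked, the vector path below
    // could trap: it runs RCPPS over alpha lanes that may be zero, and the
    // ensuing 0 * inf produces a NaN. Use the scalar path in that case.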
    if ((_MM_GET_EXCEPTION_MASK() & _MM_MASK_INVALID) == 0) {
        for (; i < count; ++i) {
            uint v = qUnpremultiply(src[i]);
            if (RGBx)
                v = 0xff000000 | v;
            if (RGBA)
                v = ARGB2RGBA(v);
            buffer[i] = v;
        }
        return;
    }
    const __m128i alphaMask = _mm_set1_epi32(0xff000000);
    const __m128i rgbaMask = _mm_setr_epi8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15);
    const __m128i zero = _mm_setzero_si128();

    for (; i < count - 3; i += 4) {
        __m128i srcVector = _mm_loadu_si128((const __m128i *)&src[i]);
        if (!_mm_testz_si128(srcVector, alphaMask)) {
            if (!_mm_testc_si128(srcVector, alphaMask)) {
                __m128i srcVectorAlpha = _mm_srli_epi32(srcVector, 24);
                if (RGBA)
                    srcVector = _mm_shuffle_epi8(srcVector, rgbaMask);
                const __m128 a = _mm_cvtepi32_ps(srcVectorAlpha);
                const __m128 ia = reciprocal_mul_ps(a, 255.0f);
                __m128i src1 = _mm_unpacklo_epi8(srcVector, zero);
                __m128i src3 = _mm_unpackhi_epi8(srcVector, zero);
                __m128i src2 = _mm_unpackhi_epi16(src1, zero);
                __m128i src4 = _mm_unpackhi_epi16(src3, zero);
                src1 = _mm_unpacklo_epi16(src1, zero);
                src3 = _mm_unpacklo_epi16(src3, zero);
                __m128 ia1 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(0, 0, 0, 0));
                __m128 ia2 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(1, 1, 1, 1));
                __m128 ia3 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(2, 2, 2, 2));
                __m128 ia4 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(3, 3, 3, 3));
                src1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src1), ia1));
                src2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src2), ia2));
                src3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src3), ia3));
                src4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src4), ia4));
                src1 = _mm_packus_epi32(src1, src2);
                src3 = _mm_packus_epi32(src3, src4);
                src1 = _mm_packus_epi16(src1, src3);
                // Handle potential alpha == 0 values:
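                // (lanes with alpha == 0 divided by zero above, so their
                // computed values are indeterminate; force them to 0)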
                __m128i srcVectorAlphaMask = _mm_cmpeq_epi32(srcVectorAlpha, zero);
                src1 = _mm_andnot_si128(srcVectorAlphaMask, src1);
                // Fixup alpha values:
                if (RGBx)
                    srcVector = _mm_or_si128(src1, alphaMask);
                else
                    srcVector = _mm_blendv_epi8(src1, srcVector, alphaMask);
                _mm_storeu_si128((__m128i *)&buffer[i], srcVector);
            } else {
                if (RGBA)
                    _mm_storeu_si128((__m128i *)&buffer[i], _mm_shuffle_epi8(srcVector, rgbaMask));
                else if (buffer != src)
                    _mm_storeu_si128((__m128i *)&buffer[i], srcVector);
            }
        } else {
            if (RGBx)
                _mm_storeu_si128((__m128i *)&buffer[i], alphaMask);
            else
                _mm_storeu_si128((__m128i *)&buffer[i], zero);
        }
    }

    SIMD_EPILOGUE(i, count, 3) {
        uint v = qUnpremultiply_sse4(src[i]);
        if (RGBx)
            v = 0xff000000 | v;
        if (RGBA)
            v = ARGB2RGBA(v);
        buffer[i] = v;
    }
}

template<bool RGBA>
static inline void convertARGBFromRGBA64PM_sse4(uint *buffer, const QRgba64 *src, int count)
{
    int i = 0;
    if ((_MM_GET_EXCEPTION_MASK() & _MM_MASK_INVALID) == 0) {
        for (; i < count; ++i) {
            const QRgba64 v = src[i].unpremultiplied();
            buffer[i] = RGBA ? toRgba8888(v) : toArgb32(v);
        }
        return;
    }
    const __m128i alphaMask = _mm_set1_epi64x(qint64(Q_UINT64_C(0xffff) << 48));
    const __m128i alphaMask32 = _mm_set1_epi32(0xff000000);
    const __m128i rgbaMask = _mm_setr_epi8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15);
    const __m128i zero = _mm_setzero_si128();

    for (; i < count - 3; i += 4) {
        __m128i srcVector1 = _mm_loadu_si128((const __m128i *)&src[i]);
        __m128i srcVector2 = _mm_loadu_si128((const __m128i *)&src[i + 2]);
        bool transparent1 = _mm_testz_si128(srcVector1, alphaMask);
        bool opaque1 = _mm_testc_si128(srcVector1, alphaMask);
        bool transparent2 = _mm_testz_si128(srcVector2, alphaMask);
        bool opaque2 = _mm_testc_si128(srcVector2, alphaMask);

        if (!(transparent1 && transparent2)) {
            if (!(opaque1 && opaque2)) {
                __m128i srcVector1Alpha = _mm_srli_epi64(srcVector1, 48);
                __m128i srcVector2Alpha = _mm_srli_epi64(srcVector2, 48);
                __m128i srcVectorAlpha = _mm_packus_epi32(srcVector1Alpha, srcVector2Alpha);
                const __m128 a = _mm_cvtepi32_ps(srcVectorAlpha);
                // Convert srcVectorAlpha to final 8-bit alpha channel
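                // i.e. (a + 128 - ((a + 128) >> 8)) >> 8, a rounded a * 255 / 65535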
                srcVectorAlpha = _mm_add_epi32(srcVectorAlpha, _mm_set1_epi32(128));
                srcVectorAlpha = _mm_sub_epi32(srcVectorAlpha, _mm_srli_epi32(srcVectorAlpha, 8));
                srcVectorAlpha = _mm_srli_epi32(srcVectorAlpha, 8);
                srcVectorAlpha = _mm_slli_epi32(srcVectorAlpha, 24);
                const __m128 ia = reciprocal_mul_ps(a, 255.0f);
                __m128i src1 = _mm_unpacklo_epi16(srcVector1, zero);
                __m128i src2 = _mm_unpackhi_epi16(srcVector1, zero);
                __m128i src3 = _mm_unpacklo_epi16(srcVector2, zero);
                __m128i src4 = _mm_unpackhi_epi16(srcVector2, zero);
                __m128 ia1 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(0, 0, 0, 0));
                __m128 ia2 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(1, 1, 1, 1));
                __m128 ia3 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(2, 2, 2, 2));
                __m128 ia4 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(3, 3, 3, 3));
                src1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src1), ia1));
                src2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src2), ia2));
                src3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src3), ia3));
                src4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src4), ia4));
                src1 = _mm_packus_epi32(src1, src2);
                src3 = _mm_packus_epi32(src3, src4);
                // Handle potential alpha == 0 values:
                __m128i srcVector1AlphaMask = _mm_cmpeq_epi64(srcVector1Alpha, zero);
                __m128i srcVector2AlphaMask = _mm_cmpeq_epi64(srcVector2Alpha, zero);
                src1 = _mm_andnot_si128(srcVector1AlphaMask, src1);
                src3 = _mm_andnot_si128(srcVector2AlphaMask, src3);
                src1 = _mm_packus_epi16(src1, src3);
                // Fixup alpha values:
                src1 = _mm_blendv_epi8(src1, srcVectorAlpha, alphaMask32);
                // Fix RGB order
                if (!RGBA)
                    src1 = _mm_shuffle_epi8(src1, rgbaMask);
                _mm_storeu_si128((__m128i *)&buffer[i], src1);
            } else {
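                // All four pixels are opaque: only narrow each 16-bit channel
                // to 8 bits with rounding, (v + 128 - ((v + 128) >> 8)) >> 8.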
                __m128i src1 = _mm_unpacklo_epi16(srcVector1, zero);
                __m128i src2 = _mm_unpackhi_epi16(srcVector1, zero);
                __m128i src3 = _mm_unpacklo_epi16(srcVector2, zero);
                __m128i src4 = _mm_unpackhi_epi16(srcVector2, zero);
                src1 = _mm_add_epi32(src1, _mm_set1_epi32(128));
                src2 = _mm_add_epi32(src2, _mm_set1_epi32(128));
                src3 = _mm_add_epi32(src3, _mm_set1_epi32(128));
                src4 = _mm_add_epi32(src4, _mm_set1_epi32(128));
                src1 = _mm_sub_epi32(src1, _mm_srli_epi32(src1, 8));
                src2 = _mm_sub_epi32(src2, _mm_srli_epi32(src2, 8));
                src3 = _mm_sub_epi32(src3, _mm_srli_epi32(src3, 8));
                src4 = _mm_sub_epi32(src4, _mm_srli_epi32(src4, 8));
                src1 = _mm_srli_epi32(src1, 8);
                src2 = _mm_srli_epi32(src2, 8);
                src3 = _mm_srli_epi32(src3, 8);
                src4 = _mm_srli_epi32(src4, 8);
                src1 = _mm_packus_epi32(src1, src2);
                src3 = _mm_packus_epi32(src3, src4);
                src1 = _mm_packus_epi16(src1, src3);
                if (!RGBA)
                    src1 = _mm_shuffle_epi8(src1, rgbaMask);
                _mm_storeu_si128((__m128i *)&buffer[i], src1);
            }
        } else {
            _mm_storeu_si128((__m128i *)&buffer[i], zero);
        }
    }

    SIMD_EPILOGUE(i, count, 3) {
        buffer[i] = qConvertRgba64ToRgb32_sse4<RGBA ? PixelOrderRGB : PixelOrderBGR>(src[i]);
    }
}

template<bool mask>
static inline void convertRGBA64FromRGBA64PM_sse4(QRgba64 *buffer, const QRgba64 *src, int count)
{
    int i = 0;
    if ((_MM_GET_EXCEPTION_MASK() & _MM_MASK_INVALID) == 0) {
        for (; i < count; ++i) {
            QRgba64 v = src[i].unpremultiplied();
            if (mask)
                v.setAlpha(65535);
            buffer[i] = v;
        }
        return;
    }
    const __m128i alphaMask = _mm_set1_epi64x(qint64(Q_UINT64_C(0xffff) << 48));
    const __m128i zero = _mm_setzero_si128();

    for (; i < count - 3; i += 4) {
        __m128i srcVector1 = _mm_loadu_si128((const __m128i *)&src[i + 0]);
        __m128i srcVector2 = _mm_loadu_si128((const __m128i *)&src[i + 2]);
        bool transparent1 = _mm_testz_si128(srcVector1, alphaMask);
        bool opaque1 = _mm_testc_si128(srcVector1, alphaMask);
        bool transparent2 = _mm_testz_si128(srcVector2, alphaMask);
        bool opaque2 = _mm_testc_si128(srcVector2, alphaMask);

        if (!(transparent1 && transparent2)) {
            if (!(opaque1 && opaque2)) {
                __m128i srcVector1Alpha = _mm_srli_epi64(srcVector1, 48);
                __m128i srcVector2Alpha = _mm_srli_epi64(srcVector2, 48);
                __m128i srcVectorAlpha = _mm_packus_epi32(srcVector1Alpha, srcVector2Alpha);
                const __m128 a = _mm_cvtepi32_ps(srcVectorAlpha);
                const __m128 ia = reciprocal_mul_ps(a, 65535.0f);
                __m128i src1 = _mm_unpacklo_epi16(srcVector1, zero);
                __m128i src2 = _mm_unpackhi_epi16(srcVector1, zero);
                __m128i src3 = _mm_unpacklo_epi16(srcVector2, zero);
                __m128i src4 = _mm_unpackhi_epi16(srcVector2, zero);
                __m128 ia1 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(0, 0, 0, 0));
                __m128 ia2 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(1, 1, 1, 1));
                __m128 ia3 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(2, 2, 2, 2));
                __m128 ia4 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(3, 3, 3, 3));
                src1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src1), ia1));
                src2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src2), ia2));
                src3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src3), ia3));
                src4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src4), ia4));
                src1 = _mm_packus_epi32(src1, src2);
                src3 = _mm_packus_epi32(src3, src4);
                // Handle potential alpha == 0 values:
                __m128i srcVector1AlphaMask = _mm_cmpeq_epi64(srcVector1Alpha, zero);
                __m128i srcVector2AlphaMask = _mm_cmpeq_epi64(srcVector2Alpha, zero);
                src1 = _mm_andnot_si128(srcVector1AlphaMask, src1);
                src3 = _mm_andnot_si128(srcVector2AlphaMask, src3);
                // Fixup alpha values:
                if (mask) {
                    src1 = _mm_or_si128(src1, alphaMask);
                    src3 = _mm_or_si128(src3, alphaMask);
                } else {
                    src1 = _mm_blendv_epi8(src1, srcVector1, alphaMask);
                    src3 = _mm_blendv_epi8(src3, srcVector2, alphaMask);
                }
                _mm_storeu_si128((__m128i *)&buffer[i + 0], src1);
                _mm_storeu_si128((__m128i *)&buffer[i + 2], src3);
            } else {
                if (mask) {
                    srcVector1 = _mm_or_si128(srcVector1, alphaMask);
                    srcVector2 = _mm_or_si128(srcVector2, alphaMask);
                }
                if (mask || src != buffer) {
                    _mm_storeu_si128((__m128i *)&buffer[i + 0], srcVector1);
                    _mm_storeu_si128((__m128i *)&buffer[i + 2], srcVector2);
                }
            }
        } else {
            _mm_storeu_si128((__m128i *)&buffer[i + 0], zero);
            _mm_storeu_si128((__m128i *)&buffer[i + 2], zero);
        }
    }

    SIMD_EPILOGUE(i, count, 3) {
        QRgba64 v = src[i].unpremultiplied();
        if (mask)
            v.setAlpha(65535);
        buffer[i] = v;
    }
}

#ifndef __haswell__
void QT_FASTCALL convertARGB32ToARGB32PM_sse4(uint *buffer, int count, const QList<QRgb> *)
{
    convertARGBToARGB32PM_sse4<false>(buffer, buffer, count);
}

void QT_FASTCALL convertRGBA8888ToARGB32PM_sse4(uint *buffer, int count, const QList<QRgb> *)
{
    convertARGBToARGB32PM_sse4<true>(buffer, buffer, count);
}

const QRgba64 * QT_FASTCALL convertARGB32ToRGBA64PM_sse4(QRgba64 *buffer, const uint *src, int count,
                                                         const QList<QRgb> *, QDitherInfo *)
{
    convertARGBToRGBA64PM_sse4<false>(buffer, src, count);
    return buffer;
}

const QRgba64 * QT_FASTCALL convertRGBA8888ToRGBA64PM_sse4(QRgba64 *buffer, const uint *src, int count,
                                                           const QList<QRgb> *, QDitherInfo *)
{
    convertARGBToRGBA64PM_sse4<true>(buffer, src, count);
    return buffer;
}

const uint *QT_FASTCALL fetchARGB32ToARGB32PM_sse4(uint *buffer, const uchar *src, int index, int count,
                                                   const QList<QRgb> *, QDitherInfo *)
{
    convertARGBToARGB32PM_sse4<false>(buffer, reinterpret_cast<const uint *>(src) + index, count);
    return buffer;
}

const uint *QT_FASTCALL fetchRGBA8888ToARGB32PM_sse4(uint *buffer, const uchar *src, int index, int count,
                                                     const QList<QRgb> *, QDitherInfo *)
{
    convertARGBToARGB32PM_sse4<true>(buffer, reinterpret_cast<const uint *>(src) + index, count);
    return buffer;
}

const QRgba64 *QT_FASTCALL fetchARGB32ToRGBA64PM_sse4(QRgba64 *buffer, const uchar *src, int index, int count,
                                                      const QList<QRgb> *, QDitherInfo *)
{
    convertARGBToRGBA64PM_sse4<false>(buffer, reinterpret_cast<const uint *>(src) + index, count);
    return buffer;
}

const QRgba64 *QT_FASTCALL fetchRGBA8888ToRGBA64PM_sse4(QRgba64 *buffer, const uchar *src, int index, int count,
                                                        const QList<QRgb> *, QDitherInfo *)
{
    convertARGBToRGBA64PM_sse4<true>(buffer, reinterpret_cast<const uint *>(src) + index, count);
    return buffer;
}
#endif // __haswell__

void QT_FASTCALL storeRGB32FromARGB32PM_sse4(uchar *dest, const uint *src, int index, int count,
                                             const QList<QRgb> *, QDitherInfo *)
{
    uint *d = reinterpret_cast<uint *>(dest) + index;
    convertARGBFromARGB32PM_sse4<false,true>(d, src, count);
}

void QT_FASTCALL storeARGB32FromARGB32PM_sse4(uchar *dest, const uint *src, int index, int count,
                                              const QList<QRgb> *, QDitherInfo *)
{
    uint *d = reinterpret_cast<uint *>(dest) + index;
    convertARGBFromARGB32PM_sse4<false,false>(d, src, count);
}

void QT_FASTCALL storeRGBA8888FromARGB32PM_sse4(uchar *dest, const uint *src, int index, int count,
                                                const QList<QRgb> *, QDitherInfo *)
{
    uint *d = reinterpret_cast<uint *>(dest) + index;
    convertARGBFromARGB32PM_sse4<true,false>(d, src, count);
}

void QT_FASTCALL storeRGBXFromARGB32PM_sse4(uchar *dest, const uint *src, int index, int count,
                                            const QList<QRgb> *, QDitherInfo *)
{
    uint *d = reinterpret_cast<uint *>(dest) + index;
    convertARGBFromARGB32PM_sse4<true,true>(d, src, count);
}

template<QtPixelOrder PixelOrder>
void QT_FASTCALL storeA2RGB30PMFromARGB32PM_sse4(uchar *dest, const uint *src, int index, int count,
                                                 const QList<QRgb> *, QDitherInfo *)
{
    uint *d = reinterpret_cast<uint *>(dest) + index;
    for (int i = 0; i < count; ++i)
        d[i] = qConvertArgb32ToA2rgb30_sse4<PixelOrder>(src[i]);
}

template
void QT_FASTCALL storeA2RGB30PMFromARGB32PM_sse4<PixelOrderBGR>(uchar *dest, const uint *src, int index, int count,
                                                                const QList<QRgb> *, QDitherInfo *);
template
void QT_FASTCALL storeA2RGB30PMFromARGB32PM_sse4<PixelOrderRGB>(uchar *dest, const uint *src, int index, int count,
                                                                const QList<QRgb> *, QDitherInfo *);

#if QT_CONFIG(raster_64bit)
void QT_FASTCALL destStore64ARGB32_sse4(QRasterBuffer *rasterBuffer, int x, int y, const QRgba64 *buffer, int length)
{
    uint *dest = (uint*)rasterBuffer->scanLine(y) + x;
    convertARGBFromRGBA64PM_sse4<false>(dest, buffer, length);
}

void QT_FASTCALL destStore64RGBA8888_sse4(QRasterBuffer *rasterBuffer, int x, int y, const QRgba64 *buffer, int length)
{
    uint *dest = (uint*)rasterBuffer->scanLine(y) + x;
    convertARGBFromRGBA64PM_sse4<true>(dest, buffer, length);
}
#endif

void QT_FASTCALL storeARGB32FromRGBA64PM_sse4(uchar *dest, const QRgba64 *src, int index, int count,
                                              const QList<QRgb> *, QDitherInfo *)
{
    uint *d = (uint*)dest + index;
    convertARGBFromRGBA64PM_sse4<false>(d, src, count);
}

void QT_FASTCALL storeRGBA8888FromRGBA64PM_sse4(uchar *dest, const QRgba64 *src, int index, int count,
                                                const QList<QRgb> *, QDitherInfo *)
{
    uint *d = (uint*)dest + index;
    convertARGBFromRGBA64PM_sse4<true>(d, src, count);
}

void QT_FASTCALL storeRGBA64FromRGBA64PM_sse4(uchar *dest, const QRgba64 *src, int index, int count,
                                              const QList<QRgb> *, QDitherInfo *)
{
    QRgba64 *d = (QRgba64 *)dest + index;
    convertRGBA64FromRGBA64PM_sse4<false>(d, src, count);
}

void QT_FASTCALL storeRGBx64FromRGBA64PM_sse4(uchar *dest, const QRgba64 *src, int index, int count,
                                              const QList<QRgb> *, QDitherInfo *)
{
    QRgba64 *d = (QRgba64 *)dest + index;
    convertRGBA64FromRGBA64PM_sse4<true>(d, src, count);
}

#if QT_CONFIG(raster_fp)
const QRgbaFloat32 *QT_FASTCALL fetchRGBA32FToRGBA32F_sse4(QRgbaFloat32 *buffer, const uchar *src, int index, int count,
                                                           const QList<QRgb> *, QDitherInfo *)
{
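    // Premultiply in float: scale r, g, b by alpha, then reinsert the original
    // alpha into lane 3.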
    const QRgbaFloat32 *s = reinterpret_cast<const QRgbaFloat32 *>(src) + index;
    for (int i = 0; i < count; ++i) {
        __m128 vsf = _mm_load_ps(reinterpret_cast<const float *>(s + i));
        __m128 vsa = _mm_shuffle_ps(vsf, vsf, _MM_SHUFFLE(3, 3, 3, 3));
        vsf = _mm_mul_ps(vsf, vsa);
        vsf = _mm_insert_ps(vsf, vsa, 0x30);
        _mm_store_ps(reinterpret_cast<float *>(buffer + i), vsf);
    }
    return buffer;
}

void QT_FASTCALL storeRGBX32FFromRGBA32F_sse4(uchar *dest, const QRgbaFloat32 *src, int index, int count,
                                              const QList<QRgb> *, QDitherInfo *)
{
    QRgbaFloat32 *d = reinterpret_cast<QRgbaFloat32 *>(dest) + index;
    const __m128 zero = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
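    // Despite the name, this is opaque black: for RGBX output the alpha lane
    // stays at 1.0f even when the source pixel is fully transparent.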
    for (int i = 0; i < count; ++i) {
        __m128 vsf = _mm_load_ps(reinterpret_cast<const float *>(src + i));
        const __m128 vsa = _mm_shuffle_ps(vsf, vsf, _MM_SHUFFLE(3, 3, 3, 3));
        const float a = _mm_cvtss_f32(vsa);
        if (a == 1.0f)
        { }
        else if (a == 0.0f)
            vsf = zero;
        else {
            __m128 vsr = _mm_rcp_ps(vsa);
            vsr = _mm_sub_ps(_mm_add_ps(vsr, vsr), _mm_mul_ps(vsr, _mm_mul_ps(vsr, vsa)));
            vsf = _mm_mul_ps(vsf, vsr);
            vsf = _mm_insert_ps(vsf, _mm_set_ss(1.0f), 0x30);
        }
        _mm_store_ps(reinterpret_cast<float *>(d + i), vsf);
    }
}

void QT_FASTCALL storeRGBA32FFromRGBA32F_sse4(uchar *dest, const QRgbaFloat32 *src, int index, int count,
                                              const QList<QRgb> *, QDitherInfo *)
{
    QRgbaFloat32 *d = reinterpret_cast<QRgbaFloat32 *>(dest) + index;
    const __m128 zero = _mm_set1_ps(0.0f);
    for (int i = 0; i < count; ++i) {
        __m128 vsf = _mm_load_ps(reinterpret_cast<const float *>(src + i));
        const __m128 vsa = _mm_shuffle_ps(vsf, vsf, _MM_SHUFFLE(3, 3, 3, 3));
        const float a = _mm_cvtss_f32(vsa);
        if (a == 1.0f)
        { }
        else if (a == 0.0f)
            vsf = zero;
        else {
            __m128 vsr = _mm_rcp_ps(vsa);
            vsr = _mm_sub_ps(_mm_add_ps(vsr, vsr), _mm_mul_ps(vsr, _mm_mul_ps(vsr, vsa)));
            vsr = _mm_insert_ps(vsr, _mm_set_ss(1.0f), 0x30);
            vsf = _mm_mul_ps(vsf, vsr);
        }
        _mm_store_ps(reinterpret_cast<float *>(d + i), vsf);
    }
}
#endif


QT_END_NAMESPACE

#endif