qdrawhelper_sse2.cpp source code [qtbase/src/gui/painting/qdrawhelper_sse2.cpp]

1	// Copyright (C) 2016 The Qt Company Ltd.
2	// Copyright (C) 2016 Intel Corporation.
3	// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
4
5	#include <private/qdrawhelper_x86_p.h>
6
7	#ifdef QT_COMPILER_SUPPORTS_SSE2
8
9	#include <private/qdrawingprimitive_sse2_p.h>
10	#include <private/qpaintengine_raster_p.h>
11
12	QT_BEGIN_NAMESPACE
13
14	#ifndef QDRAWHELPER_AVX
15	// in AVX mode, we'll use the SSSE3 code
16	void qt_blend_argb32_on_argb32_sse2(uchar destPixels, int* dbpl,
17	const uchar srcPixels, int* sbpl,
18	int w, int h,
19	int const_alpha)
20	{
21	const quint32 src = (const* quint32 *) srcPixels;
22	quint32 dst = (quint32 ) destPixels;
23	if (const_alpha == `256`) {
24	const __m128i alphaMask = _mm_set1_epi32(i: `0xff000000`);
25	const __m128i nullVector = _mm_set1_epi32(i: `0`);
26	const __m128i half = _mm_set1_epi16(w: `0x80`);
27	const __m128i one = _mm_set1_epi16(w: `0xff`);
28	const __m128i colorMask = _mm_set1_epi32(i: `0x00ff00ff`);
29	for (int y = `0`; y < h; ++y) {
30	BLEND_SOURCE_OVER_ARGB32_SSE2(dst, src, w, nullVector, half, one, colorMask, alphaMask);
31	dst = (quint32 )(((uchar ) dst) + dbpl);
32	src = (const quint32 )(((const* uchar *) src) + sbpl);
33	}
34	} else if (const_alpha != `0`) {
35	// dest = (s + d sia) * ca + d * cia*
36	// = s ca + d * (sia * ca + cia)*
37	// = s ca + d * (1 - saca)
38	const_alpha = (const_alpha * `255`) >> `8`;
39	const __m128i nullVector = _mm_set1_epi32(i: `0`);
40	const __m128i half = _mm_set1_epi16(w: `0x80`);
41	const __m128i one = _mm_set1_epi16(w: `0xff`);
42	const __m128i colorMask = _mm_set1_epi32(i: `0x00ff00ff`);
43	const __m128i constAlphaVector = _mm_set1_epi16(w: const_alpha);
44	for (int y = `0`; y < h; ++y) {
45	BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2(dst, src, w, nullVector, half, one, colorMask, constAlphaVector)
46	dst = (quint32 )(((uchar ) dst) + dbpl);
47	src = (const quint32 )(((const* uchar *) src) + sbpl);
48	}
49	}
50	}
51	#endif
52
53	// qblendfunctions.cpp
54	void qt_blend_rgb32_on_rgb32(uchar destPixels, int* dbpl,
55	const uchar srcPixels, int* sbpl,
56	int w, int h,
57	int const_alpha);
58
59	void qt_blend_rgb32_on_rgb32_sse2(uchar destPixels, int* dbpl,
60	const uchar srcPixels, int* sbpl,
61	int w, int h,
62	int const_alpha)
63	{
64	const quint32 src = (const* quint32 *) srcPixels;
65	quint32 dst = (quint32 ) destPixels;
66	if (const_alpha != `256`) {
67	if (const_alpha != `0`) {
68	const __m128i half = _mm_set1_epi16(w: `0x80`);
69	const __m128i colorMask = _mm_set1_epi32(i: `0x00ff00ff`);
70
71	const_alpha = (const_alpha * `255`) >> `8`;
72	int one_minus_const_alpha = `255` - const_alpha;
73	const __m128i constAlphaVector = _mm_set1_epi16(w: const_alpha);
74	const __m128i oneMinusConstAlpha = _mm_set1_epi16(w: one_minus_const_alpha);
75	for (int y = `0`; y < h; ++y) {
76	int x = `0`;
77
78	// First, align dest to 16 bytes:
79	ALIGNMENT_PROLOGUE_16BYTES(dst, x, w) {
80	dst[x] = INTERPOLATE_PIXEL_255(x: src[x], a: const_alpha, y: dst[x], b: one_minus_const_alpha);
81	}
82
83	for (; x < w-`3`; x += `4`) {
84	__m128i srcVector = _mm_loadu_si128(p: (const __m128i *)&src[x]);
85	const __m128i dstVector = _mm_load_si128(p: (__m128i *)&dst[x]);
86	__m128i result;
87	INTERPOLATE_PIXEL_255_SSE2(result, srcVector, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half);
88	_mm_store_si128(p: (__m128i *)&dst[x], b: result);
89	}
90	SIMD_EPILOGUE(x, w, `3`)
91	dst[x] = INTERPOLATE_PIXEL_255(x: src[x], a: const_alpha, y: dst[x], b: one_minus_const_alpha);
92	dst = (quint32 )(((uchar ) dst) + dbpl);
93	src = (const quint32 )(((const* uchar *) src) + sbpl);
94	}
95	}
96	} else {
97	qt_blend_rgb32_on_rgb32(destPixels, dbpl, srcPixels, sbpl, w, h, const_alpha);
98	}
99	}
100
101	void QT_FASTCALL comp_func_SourceOver_sse2(uint destPixels, const* uint srcPixels, int* length, uint const_alpha)
102	{
103	Q_ASSERT(const_alpha < `256`);
104
105	const quint32 src = (const* quint32 *) srcPixels;
106	quint32 dst = (quint32 ) destPixels;
107
108	const __m128i nullVector = _mm_set1_epi32(i: `0`);
109	const __m128i half = _mm_set1_epi16(w: `0x80`);
110	const __m128i one = _mm_set1_epi16(w: `0xff`);
111	const __m128i colorMask = _mm_set1_epi32(i: `0x00ff00ff`);
112	if (const_alpha == `255`) {
113	const __m128i alphaMask = _mm_set1_epi32(i: `0xff000000`);
114	BLEND_SOURCE_OVER_ARGB32_SSE2(dst, src, length, nullVector, half, one, colorMask, alphaMask);
115	} else {
116	const __m128i constAlphaVector = _mm_set1_epi16(w: const_alpha);
117	BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2(dst, src, length, nullVector, half, one, colorMask, constAlphaVector);
118	}
119	}
120
121	void QT_FASTCALL comp_func_Plus_sse2(uint dst, const* uint src, int* length, uint const_alpha)
122	{
123	int x = `0`;
124
125	if (const_alpha == `255`) {
126	// 1) Prologue: align destination on 16 bytes
127	ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
128	dst[x] = comp_func_Plus_one_pixel(d: dst[x], s: src[x]);
129
130	// 2) composition with SSE2
131	for (; x < length - `3`; x += `4`) {
132	const __m128i srcVector = _mm_loadu_si128(p: (const __m128i *)&src[x]);
133	const __m128i dstVector = _mm_load_si128(p: (__m128i *)&dst[x]);
134
135	const __m128i result = _mm_adds_epu8(a: srcVector, b: dstVector);
136	_mm_store_si128(p: (__m128i *)&dst[x], b: result);
137	}
138
139	// 3) Epilogue:
140	SIMD_EPILOGUE(x, length, `3`)
141	dst[x] = comp_func_Plus_one_pixel(d: dst[x], s: src[x]);
142	} else {
143	const int one_minus_const_alpha = `255` - const_alpha;
144	const __m128i constAlphaVector = _mm_set1_epi16(w: const_alpha);
145	const __m128i oneMinusConstAlpha = _mm_set1_epi16(w: one_minus_const_alpha);
146
147	// 1) Prologue: align destination on 16 bytes
148	ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
149	dst[x] = comp_func_Plus_one_pixel_const_alpha(d: dst[x], s: src[x], const_alpha, one_minus_const_alpha);
150
151	const __m128i half = _mm_set1_epi16(w: `0x80`);
152	const __m128i colorMask = _mm_set1_epi32(i: `0x00ff00ff`);
153	// 2) composition with SSE2
154	for (; x < length - `3`; x += `4`) {
155	const __m128i srcVector = _mm_loadu_si128(p: (const __m128i *)&src[x]);
156	const __m128i dstVector = _mm_load_si128(p: (__m128i *)&dst[x]);
157
158	__m128i result = _mm_adds_epu8(a: srcVector, b: dstVector);
159	INTERPOLATE_PIXEL_255_SSE2(result, result, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half)
160	_mm_store_si128(p: (__m128i *)&dst[x], b: result);
161	}
162
163	// 3) Epilogue:
164	SIMD_EPILOGUE(x, length, `3`)
165	dst[x] = comp_func_Plus_one_pixel_const_alpha(d: dst[x], s: src[x], const_alpha, one_minus_const_alpha);
166	}
167	}
168
169	void QT_FASTCALL comp_func_Source_sse2(uint dst, const* uint src, int* length, uint const_alpha)
170	{
171	if (const_alpha == `255`) {
172	::memcpy(dest: dst, src: src, n: length * sizeof(uint));
173	} else {
174	const int ialpha = `255` - const_alpha;
175
176	int x = `0`;
177
178	// 1) prologue, align on 16 bytes
179	ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
180	dst[x] = INTERPOLATE_PIXEL_255(x: src[x], a: const_alpha, y: dst[x], b: ialpha);
181
182	// 2) interpolate pixels with SSE2
183	const __m128i half = _mm_set1_epi16(w: `0x80`);
184	const __m128i colorMask = _mm_set1_epi32(i: `0x00ff00ff`);
185	const __m128i constAlphaVector = _mm_set1_epi16(w: const_alpha);
186	const __m128i oneMinusConstAlpha = _mm_set1_epi16(w: ialpha);
187	for (; x < length - `3`; x += `4`) {
188	const __m128i srcVector = _mm_loadu_si128(p: (const __m128i *)&src[x]);
189	__m128i dstVector = _mm_load_si128(p: (__m128i *)&dst[x]);
190	INTERPOLATE_PIXEL_255_SSE2(dstVector, srcVector, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half)
191	_mm_store_si128(p: (__m128i *)&dst[x], b: dstVector);
192	}
193
194	// 3) Epilogue
195	SIMD_EPILOGUE(x, length, `3`)
196	dst[x] = INTERPOLATE_PIXEL_255(x: src[x], a: const_alpha, y: dst[x], b: ialpha);
197	}
198	}
199
200	#ifndef __haswell__
201	static Q_NEVER_INLINE
202	void Q_DECL_VECTORCALL qt_memfillXX_aligned(void *dest, __m128i value128, quintptr bytecount)
203	{
204	__m128i dst128 = reinterpret_cast<__m128i >(dest);
205	__m128i end128 = reinterpret_cast<__m128i >(static_cast<uchar *>(dest) + bytecount);
206
207	while (dst128 + `4` <= end128) {
208	_mm_store_si128(p: dst128 + `0`, b: value128);
209	_mm_store_si128(p: dst128 + `1`, b: value128);
210	_mm_store_si128(p: dst128 + `2`, b: value128);
211	_mm_store_si128(p: dst128 + `3`, b: value128);
212	dst128 += `4`;
213	}
214
215	bytecount %= `4` * sizeof(__m128i);
216	switch (bytecount / sizeof(__m128i)) {
217	case `3`: _mm_store_si128(p: dst128++, b: value128); Q_FALLTHROUGH();
218	case `2`: _mm_store_si128(p: dst128++, b: value128); Q_FALLTHROUGH();
219	case `1`: _mm_store_si128(p: dst128++, b: value128);
220	}
221	}
222
223	void qt_memfill64_sse2(quint64 *dest, quint64 value, qsizetype count)
224	{
225	quintptr misaligned = quintptr(dest) % sizeof(__m128i);
226	if (misaligned && count) {
227	#if defined(Q_PROCESSOR_X86_32)
228	// Before SSE came out, the alignment of the stack used to be only 4
229	// bytes and some OS/ABIs (notably, code generated by MSVC) still only
230	// align to that. In any case, we cannot count on the alignment of
231	// quint64 to be 8 -- see QtPrivate::AlignOf_WorkaroundForI386Abi in
232	// qglobal.h.
233	//
234	// If the pointer is not aligned to at least 8 bytes, then we'll never
235	// in turn hit a multiple of 16 for the qt_memfillXX_aligned call
236	// below.
237	if (Q_UNLIKELY(misaligned % sizeof(quint64)))
238	return qt_memfill_template(dest, value, count);
239	#endif
240
241	*dest++ = value;
242	--count;
243	}
244
245	if (count % `2`) {
246	dest[count - `1`] = value;
247	--count;
248	}
249
250	qt_memfillXX_aligned(dest, value128: _mm_set1_epi64x(q: value), bytecount: count * sizeof(quint64));
251	}
252
253	void qt_memfill32_sse2(quint32 *dest, quint32 value, qsizetype count)
254	{
255	if (count < `4`) {
256	// this simplifies the code below: the first switch can fall through
257	// without checking the value of count
258	switch (count) {
259	case `3`: *dest++ = value; Q_FALLTHROUGH();
260	case `2`: *dest++ = value; Q_FALLTHROUGH();
261	case `1`: *dest = value;
262	}
263	return;
264	}
265
266	const int align = (quintptr)(dest) & `0xf`;
267	switch (align) {
268	case `4`: *dest++ = value; --count; Q_FALLTHROUGH();
269	case `8`: *dest++ = value; --count; Q_FALLTHROUGH();
270	case `12`: *dest++ = value; --count;
271	}
272
273	const int rest = count & `0x3`;
274	if (rest) {
275	switch (rest) {
276	case `3`: dest[count - `3`] = value; Q_FALLTHROUGH();
277	case `2`: dest[count - `2`] = value; Q_FALLTHROUGH();
278	case `1`: dest[count - `1`] = value;
279	}
280	}
281
282	qt_memfillXX_aligned(dest, value128: _mm_set1_epi32(i: value), bytecount: count * sizeof(quint32));
283	}
284	#endif // !__haswell__
285
286	void QT_FASTCALL comp_func_solid_Source_sse2(uint destPixels, int* length, uint color, uint const_alpha)
287	{
288	if (const_alpha == `255`) {
289	qt_memfill32(destPixels, color, length);
290	} else {
291	const quint32 ialpha = `255` - const_alpha;
292	color = BYTE_MUL(x: color, a: const_alpha);
293	int x = `0`;
294
295	quint32 dst = (quint32 ) destPixels;
296	const __m128i colorVector = _mm_set1_epi32(i: color);
297	const __m128i colorMask = _mm_set1_epi32(i: `0x00ff00ff`);
298	const __m128i half = _mm_set1_epi16(w: `0x80`);
299	const __m128i iAlphaVector = _mm_set1_epi16(w: ialpha);
300
301	ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
302	destPixels[x] = color + BYTE_MUL(x: destPixels[x], a: ialpha);
303
304	for (; x < length-`3`; x += `4`) {
305	__m128i dstVector = _mm_load_si128(p: (__m128i *)&dst[x]);
306	BYTE_MUL_SSE2(dstVector, dstVector, iAlphaVector, colorMask, half);
307	dstVector = _mm_add_epi8(a: colorVector, b: dstVector);
308	_mm_store_si128(p: (__m128i *)&dst[x], b: dstVector);
309	}
310	SIMD_EPILOGUE(x, length, `3`)
311	destPixels[x] = color + BYTE_MUL(x: destPixels[x], a: ialpha);
312	}
313	}
314
315	void QT_FASTCALL comp_func_solid_SourceOver_sse2(uint destPixels, int* length, uint color, uint const_alpha)
316	{
317	if ((const_alpha & qAlpha(rgb: color)) == `255`) {
318	qt_memfill32(destPixels, color, length);
319	} else {
320	if (const_alpha != `255`)
321	color = BYTE_MUL(x: color, a: const_alpha);
322
323	const quint32 minusAlphaOfColor = qAlpha(rgb: ~color);
324	int x = `0`;
325
326	quint32 dst = (quint32 ) destPixels;
327	const __m128i colorVector = _mm_set1_epi32(i: color);
328	const __m128i colorMask = _mm_set1_epi32(i: `0x00ff00ff`);
329	const __m128i half = _mm_set1_epi16(w: `0x80`);
330	const __m128i minusAlphaOfColorVector = _mm_set1_epi16(w: minusAlphaOfColor);
331
332	ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
333	destPixels[x] = color + BYTE_MUL(x: destPixels[x], a: minusAlphaOfColor);
334
335	for (; x < length-`3`; x += `4`) {
336	__m128i dstVector = _mm_load_si128(p: (__m128i *)&dst[x]);
337	BYTE_MUL_SSE2(dstVector, dstVector, minusAlphaOfColorVector, colorMask, half);
338	dstVector = _mm_add_epi8(a: colorVector, b: dstVector);
339	_mm_store_si128(p: (__m128i *)&dst[x], b: dstVector);
340	}
341	SIMD_EPILOGUE(x, length, `3`)
342	destPixels[x] = color + BYTE_MUL(x: destPixels[x], a: minusAlphaOfColor);
343	}
344	}
345
346	void qt_bitmapblit32_sse2_base(QRasterBuffer rasterBuffer, int* x, int y,
347	quint32 color,
348	const uchar src, int* width, int height, int stride)
349	{
350	quint32 dest = reinterpret_cast<quint32>(rasterBuffer->scanLine(y)) + x;
351	const int destStride = rasterBuffer->stride<quint32>();
352
353	const __m128i c128 = _mm_set1_epi32(i: color);
354	const __m128i maskmask1 = _mm_set_epi32(i3: `0x10101010`, i2: `0x20202020`,
355	i1: `0x40404040`, i0: `0x80808080`);
356	const __m128i maskadd1 = _mm_set_epi32(i3: `0x70707070`, i2: `0x60606060`,
357	i1: `0x40404040`, i0: `0x00000000`);
358
359	if (width > `4`) {
360	const __m128i maskmask2 = _mm_set_epi32(i3: `0x01010101`, i2: `0x02020202`,
361	i1: `0x04040404`, i0: `0x08080808`);
362	const __m128i maskadd2 = _mm_set_epi32(i3: `0x7f7f7f7f`, i2: `0x7e7e7e7e`,
363	i1: `0x7c7c7c7c`, i0: `0x78787878`);
364	while (--height >= `0`) {
365	for (int x = `0`; x < width; x += `8`) {
366	const quint8 s = src[x >> `3`];
367	if (!s)
368	continue;
369	__m128i mask1 = _mm_set1_epi8(b: s);
370	__m128i mask2 = mask1;
371
372	mask1 = _mm_and_si128(a: mask1, b: maskmask1);
373	mask1 = _mm_add_epi8(a: mask1, b: maskadd1);
374	_mm_maskmoveu_si128(d: c128, n: mask1, p: (char*)(dest + x));
375	mask2 = _mm_and_si128(a: mask2, b: maskmask2);
376	mask2 = _mm_add_epi8(a: mask2, b: maskadd2);
377	_mm_maskmoveu_si128(d: c128, n: mask2, p: (char*)(dest + x + `4`));
378	}
379	dest += destStride;
380	src += stride;
381	}
382	} else {
383	while (--height >= `0`) {
384	const quint8 s = *src;
385	if (s) {
386	__m128i mask1 = _mm_set1_epi8(b: s);
387	mask1 = _mm_and_si128(a: mask1, b: maskmask1);
388	mask1 = _mm_add_epi8(a: mask1, b: maskadd1);
389	_mm_maskmoveu_si128(d: c128, n: mask1, p: (char*)(dest));
390	}
391	dest += destStride;
392	src += stride;
393	}
394	}
395	}
396
397	void qt_bitmapblit32_sse2(QRasterBuffer rasterBuffer, int* x, int y,
398	const QRgba64 &color,
399	const uchar src, int* width, int height, int stride)
400	{
401	qt_bitmapblit32_sse2_base(rasterBuffer, x, y, color: color.toArgb32(), src, width, height, stride);
402	}
403
404	void qt_bitmapblit8888_sse2(QRasterBuffer rasterBuffer, int* x, int y,
405	const QRgba64 &color,
406	const uchar src, int* width, int height, int stride)
407	{
408	qt_bitmapblit32_sse2_base(rasterBuffer, x, y, color: ARGB2RGBA(x: color.toArgb32()), src, width, height, stride);
409	}
410
411	void qt_bitmapblit16_sse2(QRasterBuffer rasterBuffer, int* x, int y,
412	const QRgba64 &color,
413	const uchar src, int* width, int height, int stride)
414	{
415	const quint16 c = qConvertRgb32To16(c: color.toArgb32());
416	quint16 dest = reinterpret_cast<quint16>(rasterBuffer->scanLine(y)) + x;
417	const int destStride = rasterBuffer->stride<quint32>();
418
419	const __m128i c128 = _mm_set1_epi16(w: c);
420	QT_WARNING_DISABLE_MSVC(`4309`) // truncation of constant value
421	const __m128i maskmask = _mm_set_epi16(w7: `0x0101`, w6: `0x0202`, w5: `0x0404`, w4: `0x0808`,
422	w3: `0x1010`, w2: `0x2020`, w1: `0x4040`, w0: `0x8080`);
423	const __m128i maskadd = _mm_set_epi16(w7: `0x7f7f`, w6: `0x7e7e`, w5: `0x7c7c`, w4: `0x7878`,
424	w3: `0x7070`, w2: `0x6060`, w1: `0x4040`, w0: `0x0000`);
425
426	while (--height >= `0`) {
427	for (int x = `0`; x < width; x += `8`) {
428	const quint8 s = src[x >> `3`];
429	if (!s)
430	continue;
431	__m128i mask = _mm_set1_epi8(b: s);
432	mask = _mm_and_si128(a: mask, b: maskmask);
433	mask = _mm_add_epi8(a: mask, b: maskadd);
434	_mm_maskmoveu_si128(d: c128, n: mask, p: (char*)(dest + x));
435	}
436	dest += destStride;
437	src += stride;
438	}
439	}
440
441	class QSimdSse2
442	{
443	public:
444	typedef __m128i Int32x4;
445	typedef __m128 Float32x4;
446
447	union Vect_buffer_i { Int32x4 v; int i[`4`]; };
448	union Vect_buffer_f { Float32x4 v; float f[`4`]; };
449
450	static inline Float32x4 Q_DECL_VECTORCALL v_dup(float x) { return _mm_set1_ps(w: x); }
451	static inline Float32x4 Q_DECL_VECTORCALL v_dup(double x) { return _mm_set1_ps(w: x); }
452	static inline Int32x4 Q_DECL_VECTORCALL v_dup(int x) { return _mm_set1_epi32(i: x); }
453	static inline Int32x4 Q_DECL_VECTORCALL v_dup(uint x) { return _mm_set1_epi32(i: x); }
454
455	static inline Float32x4 Q_DECL_VECTORCALL v_add(Float32x4 a, Float32x4 b) { return _mm_add_ps(a: a, b: b); }
456	static inline Int32x4 Q_DECL_VECTORCALL v_add(Int32x4 a, Int32x4 b) { return _mm_add_epi32(a: a, b: b); }
457
458	static inline Float32x4 Q_DECL_VECTORCALL v_max(Float32x4 a, Float32x4 b) { return _mm_max_ps(a: a, b: b); }
459	static inline Float32x4 Q_DECL_VECTORCALL v_min(Float32x4 a, Float32x4 b) { return _mm_min_ps(a: a, b: b); }
460	static inline Int32x4 Q_DECL_VECTORCALL v_min_16(Int32x4 a, Int32x4 b) { return _mm_min_epi16(a: a, b: b); }
461
462	static inline Int32x4 Q_DECL_VECTORCALL v_and(Int32x4 a, Int32x4 b) { return _mm_and_si128(a: a, b: b); }
463
464	static inline Float32x4 Q_DECL_VECTORCALL v_sub(Float32x4 a, Float32x4 b) { return _mm_sub_ps(a: a, b: b); }
465	static inline Int32x4 Q_DECL_VECTORCALL v_sub(Int32x4 a, Int32x4 b) { return _mm_sub_epi32(a: a, b: b); }
466
467	static inline Float32x4 Q_DECL_VECTORCALL v_mul(Float32x4 a, Float32x4 b) { return _mm_mul_ps(a: a, b: b); }
468
469	static inline Float32x4 Q_DECL_VECTORCALL v_sqrt(Float32x4 x) { return _mm_sqrt_ps(a: x); }
470
471	static inline Int32x4 Q_DECL_VECTORCALL v_toInt(Float32x4 x) { return _mm_cvttps_epi32(a: x); }
472
473	static inline Int32x4 Q_DECL_VECTORCALL v_greaterOrEqual(Float32x4 a, Float32x4 b) { return _mm_castps_si128(a: _mm_cmpgt_ps(a: a, b: b)); }
474	};
475
476	const uint * QT_FASTCALL qt_fetch_radial_gradient_sse2(uint buffer, const* Operator op, const* QSpanData *data,
477	int y, int x, int length)
478	{
479	return qt_fetch_radial_gradient_template<QRadialFetchSimd<QSimdSse2>,uint>(buffer, op, data, y, x, length);
480	}
481
482	void qt_scale_image_argb32_on_argb32_sse2(uchar destPixels, int* dbpl,
483	const uchar srcPixels, int* sbpl, int srch,
484	const QRectF &targetRect,
485	const QRectF &sourceRect,
486	const QRect &clip,
487	int const_alpha)
488	{
489	if (const_alpha != `256`) {
490	// from qblendfunctions.cpp
491	extern void qt_scale_image_argb32_on_argb32(uchar destPixels, int* dbpl,
492	const uchar srcPixels, int* sbpl, int srch,
493	const QRectF &targetRect,
494	const QRectF &sourceRect,
495	const QRect &clip,
496	int const_alpha);
497	return qt_scale_image_argb32_on_argb32(destPixels, dbpl, srcPixels, sbpl, srch, targetRect, sourceRect, clip, const_alpha);
498	}
499
500	qreal sx = sourceRect.width() / (qreal)targetRect.width();
501	qreal sy = sourceRect.height() / (qreal)targetRect.height();
502
503	const int ix = `0x00010000` * sx;
504	const int iy = `0x00010000` * sy;
505
506	QRect tr = targetRect.normalized().toRect();
507	tr = tr.intersected(other: clip);
508	if (tr.isEmpty())
509	return;
510	const int tx1 = tr.left();
511	const int ty1 = tr.top();
512	int h = tr.height();
513	int w = tr.width();
514
515	quint32 basex;
516	quint32 srcy;
517
518	if (sx < `0`) {
519	int dstx = qFloor(v: (tx1 + qreal(`0.5`) - targetRect.right()) * sx * `65536`) + `1`;
520	basex = quint32(sourceRect.right() * `65536`) + dstx;
521	} else {
522	int dstx = qCeil(v: (tx1 + qreal(`0.5`) - targetRect.left()) * sx * `65536`) - `1`;
523	basex = quint32(sourceRect.left() * `65536`) + dstx;
524	}
525	if (sy < `0`) {
526	int dsty = qFloor(v: (ty1 + qreal(`0.5`) - targetRect.bottom()) * sy * `65536`) + `1`;
527	srcy = quint32(sourceRect.bottom() * `65536`) + dsty;
528	} else {
529	int dsty = qCeil(v: (ty1 + qreal(`0.5`) - targetRect.top()) * sy * `65536`) - `1`;
530	srcy = quint32(sourceRect.top() * `65536`) + dsty;
531	}
532
533	quint32 dst = ((quint32 ) (destPixels + ty1 * dbpl)) + tx1;
534
535	const __m128i nullVector = _mm_setzero_si128();
536	const __m128i half = _mm_set1_epi16(w: `0x80`);
537	const __m128i one = _mm_set1_epi16(w: `0xff`);
538	const __m128i colorMask = _mm_set1_epi32(i: `0x00ff00ff`);
539	const __m128i alphaMask = _mm_set1_epi32(i: `0xff000000`);
540	const __m128i ixVector = _mm_set1_epi32(i: `4`*ix);
541
542	// this bounds check here is required as floating point rounding above might in some cases lead to
543	// w/h values that are one pixel too large, falling outside of the valid image area.
544	const int ystart = srcy >> `16`;
545	if (ystart >= srch && iy < `0`) {
546	srcy += iy;
547	--h;
548	}
549	const int xstart = basex >> `16`;
550	if (xstart >= (int)(sbpl/sizeof(quint32)) && ix < `0`) {
551	basex += ix;
552	--w;
553	}
554	int yend = (srcy + iy * (h - `1`)) >> `16`;
555	if (yend < `0` \|\| yend >= srch)
556	--h;
557	int xend = (basex + ix * (w - `1`)) >> `16`;
558	if (xend < `0` \|\| xend >= (int)(sbpl/sizeof(quint32)))
559	--w;
560
561	while (--h >= `0`) {
562	const uint src = (const* quint32 ) (srcPixels + (srcy >> `16`) sbpl);
563	int srcx = basex;
564	int x = `0`;
565
566	ALIGNMENT_PROLOGUE_16BYTES(dst, x, w) {
567	uint s = src[srcx >> `16`];
568	dst[x] = s + BYTE_MUL(x: dst[x], a: qAlpha(rgb: ~s));
569	srcx += ix;
570	}
571
572	__m128i srcxVector = _mm_set_epi32(i3: srcx, i2: srcx + ix, i1: srcx + ix + ix, i0: srcx + ix + ix + ix);
573
574	for (; x < (w - `3`); x += `4`) {
575	const int idx0 = _mm_extract_epi16(srcxVector, `1`);
576	const int idx1 = _mm_extract_epi16(srcxVector, `3`);
577	const int idx2 = _mm_extract_epi16(srcxVector, `5`);
578	const int idx3 = _mm_extract_epi16(srcxVector, `7`);
579	srcxVector = _mm_add_epi32(a: srcxVector, b: ixVector);
580
581	const __m128i srcVector = _mm_set_epi32(i3: src[idx0], i2: src[idx1], i1: src[idx2], i0: src[idx3]);
582	BLEND_SOURCE_OVER_ARGB32_SSE2_helper(dst, srcVector, nullVector, half, one, colorMask, alphaMask);
583	}
584
585	SIMD_EPILOGUE(x, w, `3`) {
586	uint s = src[(basex + x*ix) >> `16`];
587	dst[x] = s + BYTE_MUL(x: dst[x], a: qAlpha(rgb: ~s));
588	}
589	dst = (quint32 )(((uchar ) dst) + dbpl);
590	srcy += iy;
591	}
592	}
593
594
595	QT_END_NAMESPACE
596
597	#endif // QT_COMPILER_SUPPORTS_SSE2
598

source code of qtbase/src/gui/painting/qdrawhelper_sse2.cpp