| 1 | /**************************************************************************** |
| 2 | ** |
| 3 | ** Copyright (C) 2016 The Qt Company Ltd. |
| 4 | ** Copyright (C) 2016 Intel Corporation. |
| 5 | ** Contact: https://www.qt.io/licensing/ |
| 6 | ** |
| 7 | ** This file is part of the QtGui module of the Qt Toolkit. |
| 8 | ** |
| 9 | ** $QT_BEGIN_LICENSE:LGPL$ |
| 10 | ** Commercial License Usage |
| 11 | ** Licensees holding valid commercial Qt licenses may use this file in |
| 12 | ** accordance with the commercial license agreement provided with the |
| 13 | ** Software or, alternatively, in accordance with the terms contained in |
| 14 | ** a written agreement between you and The Qt Company. For licensing terms |
| 15 | ** and conditions see https://www.qt.io/terms-conditions. For further |
| 16 | ** information use the contact form at https://www.qt.io/contact-us. |
| 17 | ** |
| 18 | ** GNU Lesser General Public License Usage |
| 19 | ** Alternatively, this file may be used under the terms of the GNU Lesser |
| 20 | ** General Public License version 3 as published by the Free Software |
| 21 | ** Foundation and appearing in the file LICENSE.LGPL3 included in the |
| 22 | ** packaging of this file. Please review the following information to |
| 23 | ** ensure the GNU Lesser General Public License version 3 requirements |
| 24 | ** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. |
| 25 | ** |
| 26 | ** GNU General Public License Usage |
| 27 | ** Alternatively, this file may be used under the terms of the GNU |
| 28 | ** General Public License version 2.0 or (at your option) the GNU General |
| 29 | ** Public license version 3 or any later version approved by the KDE Free |
| 30 | ** Qt Foundation. The licenses are as published by the Free Software |
| 31 | ** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 |
| 32 | ** included in the packaging of this file. Please review the following |
| 33 | ** information to ensure the GNU General Public License requirements will |
| 34 | ** be met: https://www.gnu.org/licenses/gpl-2.0.html and |
| 35 | ** https://www.gnu.org/licenses/gpl-3.0.html. |
| 36 | ** |
| 37 | ** $QT_END_LICENSE$ |
| 38 | ** |
| 39 | ****************************************************************************/ |
| 40 | |
| 41 | #include <private/qdrawhelper_x86_p.h> |
| 42 | |
| 43 | #ifdef QT_COMPILER_SUPPORTS_SSE2 |
| 44 | |
| 45 | #include <private/qdrawingprimitive_sse2_p.h> |
| 46 | #include <private/qpaintengine_raster_p.h> |
| 47 | |
| 48 | QT_BEGIN_NAMESPACE |
| 49 | |
| 50 | #ifndef QDRAWHELPER_AVX |
| 51 | // in AVX mode, we'll use the SSSE3 code |
| 52 | void qt_blend_argb32_on_argb32_sse2(uchar *destPixels, int dbpl, |
| 53 | const uchar *srcPixels, int sbpl, |
| 54 | int w, int h, |
| 55 | int const_alpha) |
| 56 | { |
| 57 | const quint32 *src = (const quint32 *) srcPixels; |
| 58 | quint32 *dst = (quint32 *) destPixels; |
| 59 | if (const_alpha == 256) { |
| 60 | const __m128i alphaMask = _mm_set1_epi32(i: 0xff000000); |
| 61 | const __m128i nullVector = _mm_set1_epi32(i: 0); |
| 62 | const __m128i half = _mm_set1_epi16(w: 0x80); |
| 63 | const __m128i one = _mm_set1_epi16(w: 0xff); |
| 64 | const __m128i colorMask = _mm_set1_epi32(i: 0x00ff00ff); |
| 65 | for (int y = 0; y < h; ++y) { |
| 66 | BLEND_SOURCE_OVER_ARGB32_SSE2(dst, src, w, nullVector, half, one, colorMask, alphaMask); |
| 67 | dst = (quint32 *)(((uchar *) dst) + dbpl); |
| 68 | src = (const quint32 *)(((const uchar *) src) + sbpl); |
| 69 | } |
| 70 | } else if (const_alpha != 0) { |
| 71 | // dest = (s + d * sia) * ca + d * cia |
| 72 | // = s * ca + d * (sia * ca + cia) |
| 73 | // = s * ca + d * (1 - sa*ca) |
| 74 | const_alpha = (const_alpha * 255) >> 8; |
| 75 | const __m128i nullVector = _mm_set1_epi32(i: 0); |
| 76 | const __m128i half = _mm_set1_epi16(w: 0x80); |
| 77 | const __m128i one = _mm_set1_epi16(w: 0xff); |
| 78 | const __m128i colorMask = _mm_set1_epi32(i: 0x00ff00ff); |
| 79 | const __m128i constAlphaVector = _mm_set1_epi16(w: const_alpha); |
| 80 | for (int y = 0; y < h; ++y) { |
| 81 | BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2(dst, src, w, nullVector, half, one, colorMask, constAlphaVector) |
| 82 | dst = (quint32 *)(((uchar *) dst) + dbpl); |
| 83 | src = (const quint32 *)(((const uchar *) src) + sbpl); |
| 84 | } |
| 85 | } |
| 86 | } |
| 87 | #endif |
| 88 | |
// Scalar implementation from qblendfunctions.cpp; used below as the
// const_alpha == 256 (fully opaque) path.
void qt_blend_rgb32_on_rgb32(uchar *destPixels, int dbpl,
                             const uchar *srcPixels, int sbpl,
                             int w, int h,
                             int const_alpha);
| 94 | |
| 95 | void qt_blend_rgb32_on_rgb32_sse2(uchar *destPixels, int dbpl, |
| 96 | const uchar *srcPixels, int sbpl, |
| 97 | int w, int h, |
| 98 | int const_alpha) |
| 99 | { |
| 100 | const quint32 *src = (const quint32 *) srcPixels; |
| 101 | quint32 *dst = (quint32 *) destPixels; |
| 102 | if (const_alpha != 256) { |
| 103 | if (const_alpha != 0) { |
| 104 | const __m128i half = _mm_set1_epi16(w: 0x80); |
| 105 | const __m128i colorMask = _mm_set1_epi32(i: 0x00ff00ff); |
| 106 | |
| 107 | const_alpha = (const_alpha * 255) >> 8; |
| 108 | int one_minus_const_alpha = 255 - const_alpha; |
| 109 | const __m128i constAlphaVector = _mm_set1_epi16(w: const_alpha); |
| 110 | const __m128i oneMinusConstAlpha = _mm_set1_epi16(w: one_minus_const_alpha); |
| 111 | for (int y = 0; y < h; ++y) { |
| 112 | int x = 0; |
| 113 | |
| 114 | // First, align dest to 16 bytes: |
| 115 | ALIGNMENT_PROLOGUE_16BYTES(dst, x, w) { |
| 116 | dst[x] = INTERPOLATE_PIXEL_255(x: src[x], a: const_alpha, y: dst[x], b: one_minus_const_alpha); |
| 117 | } |
| 118 | |
| 119 | for (; x < w-3; x += 4) { |
| 120 | __m128i srcVector = _mm_loadu_si128(p: (const __m128i *)&src[x]); |
| 121 | const __m128i dstVector = _mm_load_si128(p: (__m128i *)&dst[x]); |
| 122 | __m128i result; |
| 123 | INTERPOLATE_PIXEL_255_SSE2(result, srcVector, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half); |
| 124 | _mm_store_si128(p: (__m128i *)&dst[x], b: result); |
| 125 | } |
| 126 | SIMD_EPILOGUE(x, w, 3) |
| 127 | dst[x] = INTERPOLATE_PIXEL_255(x: src[x], a: const_alpha, y: dst[x], b: one_minus_const_alpha); |
| 128 | dst = (quint32 *)(((uchar *) dst) + dbpl); |
| 129 | src = (const quint32 *)(((const uchar *) src) + sbpl); |
| 130 | } |
| 131 | } |
| 132 | } else { |
| 133 | qt_blend_rgb32_on_rgb32(destPixels, dbpl, srcPixels, sbpl, w, h, const_alpha); |
| 134 | } |
| 135 | } |
| 136 | |
| 137 | void QT_FASTCALL comp_func_SourceOver_sse2(uint *destPixels, const uint *srcPixels, int length, uint const_alpha) |
| 138 | { |
| 139 | Q_ASSERT(const_alpha < 256); |
| 140 | |
| 141 | const quint32 *src = (const quint32 *) srcPixels; |
| 142 | quint32 *dst = (quint32 *) destPixels; |
| 143 | |
| 144 | const __m128i nullVector = _mm_set1_epi32(i: 0); |
| 145 | const __m128i half = _mm_set1_epi16(w: 0x80); |
| 146 | const __m128i one = _mm_set1_epi16(w: 0xff); |
| 147 | const __m128i colorMask = _mm_set1_epi32(i: 0x00ff00ff); |
| 148 | if (const_alpha == 255) { |
| 149 | const __m128i alphaMask = _mm_set1_epi32(i: 0xff000000); |
| 150 | BLEND_SOURCE_OVER_ARGB32_SSE2(dst, src, length, nullVector, half, one, colorMask, alphaMask); |
| 151 | } else { |
| 152 | const __m128i constAlphaVector = _mm_set1_epi16(w: const_alpha); |
| 153 | BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2(dst, src, length, nullVector, half, one, colorMask, constAlphaVector); |
| 154 | } |
| 155 | } |
| 156 | |
| 157 | void QT_FASTCALL comp_func_Plus_sse2(uint *dst, const uint *src, int length, uint const_alpha) |
| 158 | { |
| 159 | int x = 0; |
| 160 | |
| 161 | if (const_alpha == 255) { |
| 162 | // 1) Prologue: align destination on 16 bytes |
| 163 | ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) |
| 164 | dst[x] = comp_func_Plus_one_pixel(d: dst[x], s: src[x]); |
| 165 | |
| 166 | // 2) composition with SSE2 |
| 167 | for (; x < length - 3; x += 4) { |
| 168 | const __m128i srcVector = _mm_loadu_si128(p: (const __m128i *)&src[x]); |
| 169 | const __m128i dstVector = _mm_load_si128(p: (__m128i *)&dst[x]); |
| 170 | |
| 171 | const __m128i result = _mm_adds_epu8(a: srcVector, b: dstVector); |
| 172 | _mm_store_si128(p: (__m128i *)&dst[x], b: result); |
| 173 | } |
| 174 | |
| 175 | // 3) Epilogue: |
| 176 | SIMD_EPILOGUE(x, length, 3) |
| 177 | dst[x] = comp_func_Plus_one_pixel(d: dst[x], s: src[x]); |
| 178 | } else { |
| 179 | const int one_minus_const_alpha = 255 - const_alpha; |
| 180 | const __m128i constAlphaVector = _mm_set1_epi16(w: const_alpha); |
| 181 | const __m128i oneMinusConstAlpha = _mm_set1_epi16(w: one_minus_const_alpha); |
| 182 | |
| 183 | // 1) Prologue: align destination on 16 bytes |
| 184 | ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) |
| 185 | dst[x] = comp_func_Plus_one_pixel_const_alpha(d: dst[x], s: src[x], const_alpha, one_minus_const_alpha); |
| 186 | |
| 187 | const __m128i half = _mm_set1_epi16(w: 0x80); |
| 188 | const __m128i colorMask = _mm_set1_epi32(i: 0x00ff00ff); |
| 189 | // 2) composition with SSE2 |
| 190 | for (; x < length - 3; x += 4) { |
| 191 | const __m128i srcVector = _mm_loadu_si128(p: (const __m128i *)&src[x]); |
| 192 | const __m128i dstVector = _mm_load_si128(p: (__m128i *)&dst[x]); |
| 193 | |
| 194 | __m128i result = _mm_adds_epu8(a: srcVector, b: dstVector); |
| 195 | INTERPOLATE_PIXEL_255_SSE2(result, result, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half) |
| 196 | _mm_store_si128(p: (__m128i *)&dst[x], b: result); |
| 197 | } |
| 198 | |
| 199 | // 3) Epilogue: |
| 200 | SIMD_EPILOGUE(x, length, 3) |
| 201 | dst[x] = comp_func_Plus_one_pixel_const_alpha(d: dst[x], s: src[x], const_alpha, one_minus_const_alpha); |
| 202 | } |
| 203 | } |
| 204 | |
| 205 | void QT_FASTCALL comp_func_Source_sse2(uint *dst, const uint *src, int length, uint const_alpha) |
| 206 | { |
| 207 | if (const_alpha == 255) { |
| 208 | ::memcpy(dest: dst, src: src, n: length * sizeof(uint)); |
| 209 | } else { |
| 210 | const int ialpha = 255 - const_alpha; |
| 211 | |
| 212 | int x = 0; |
| 213 | |
| 214 | // 1) prologue, align on 16 bytes |
| 215 | ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) |
| 216 | dst[x] = INTERPOLATE_PIXEL_255(x: src[x], a: const_alpha, y: dst[x], b: ialpha); |
| 217 | |
| 218 | // 2) interpolate pixels with SSE2 |
| 219 | const __m128i half = _mm_set1_epi16(w: 0x80); |
| 220 | const __m128i colorMask = _mm_set1_epi32(i: 0x00ff00ff); |
| 221 | const __m128i constAlphaVector = _mm_set1_epi16(w: const_alpha); |
| 222 | const __m128i oneMinusConstAlpha = _mm_set1_epi16(w: ialpha); |
| 223 | for (; x < length - 3; x += 4) { |
| 224 | const __m128i srcVector = _mm_loadu_si128(p: (const __m128i *)&src[x]); |
| 225 | __m128i dstVector = _mm_load_si128(p: (__m128i *)&dst[x]); |
| 226 | INTERPOLATE_PIXEL_255_SSE2(dstVector, srcVector, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half) |
| 227 | _mm_store_si128(p: (__m128i *)&dst[x], b: dstVector); |
| 228 | } |
| 229 | |
| 230 | // 3) Epilogue |
| 231 | SIMD_EPILOGUE(x, length, 3) |
| 232 | dst[x] = INTERPOLATE_PIXEL_255(x: src[x], a: const_alpha, y: dst[x], b: ialpha); |
| 233 | } |
| 234 | } |
| 235 | |
| 236 | #ifndef __AVX2__ |
| 237 | static Q_NEVER_INLINE |
| 238 | void Q_DECL_VECTORCALL qt_memfillXX_aligned(void *dest, __m128i value128, quintptr bytecount) |
| 239 | { |
| 240 | __m128i *dst128 = reinterpret_cast<__m128i *>(dest); |
| 241 | __m128i *end128 = reinterpret_cast<__m128i *>(static_cast<uchar *>(dest) + bytecount); |
| 242 | |
| 243 | while (dst128 + 4 <= end128) { |
| 244 | _mm_store_si128(p: dst128 + 0, b: value128); |
| 245 | _mm_store_si128(p: dst128 + 1, b: value128); |
| 246 | _mm_store_si128(p: dst128 + 2, b: value128); |
| 247 | _mm_store_si128(p: dst128 + 3, b: value128); |
| 248 | dst128 += 4; |
| 249 | } |
| 250 | |
| 251 | bytecount %= 4 * sizeof(__m128i); |
| 252 | switch (bytecount / sizeof(__m128i)) { |
| 253 | case 3: _mm_store_si128(p: dst128++, b: value128); Q_FALLTHROUGH(); |
| 254 | case 2: _mm_store_si128(p: dst128++, b: value128); Q_FALLTHROUGH(); |
| 255 | case 1: _mm_store_si128(p: dst128++, b: value128); |
| 256 | } |
| 257 | } |
| 258 | |
| 259 | void qt_memfill64_sse2(quint64 *dest, quint64 value, qsizetype count) |
| 260 | { |
| 261 | quintptr misaligned = quintptr(dest) % sizeof(__m128i); |
| 262 | if (misaligned && count) { |
| 263 | #if defined(Q_PROCESSOR_X86_32) |
| 264 | // Before SSE came out, the alignment of the stack used to be only 4 |
| 265 | // bytes and some OS/ABIs (notably, code generated by MSVC) still only |
| 266 | // align to that. In any case, we cannot count on the alignment of |
| 267 | // quint64 to be 8 -- see QtPrivate::AlignOf_WorkaroundForI386Abi in |
| 268 | // qglobal.h. |
| 269 | // |
| 270 | // If the pointer is not aligned to at least 8 bytes, then we'll never |
| 271 | // in turn hit a multiple of 16 for the qt_memfillXX_aligned call |
| 272 | // below. |
| 273 | if (Q_UNLIKELY(misaligned % sizeof(quint64))) |
| 274 | return qt_memfill_template(dest, value, count); |
| 275 | #endif |
| 276 | |
| 277 | *dest++ = value; |
| 278 | --count; |
| 279 | } |
| 280 | |
| 281 | if (count % 2) { |
| 282 | dest[count - 1] = value; |
| 283 | --count; |
| 284 | } |
| 285 | |
| 286 | qt_memfillXX_aligned(dest, value128: _mm_set1_epi64x(q: value), bytecount: count * sizeof(quint64)); |
| 287 | } |
| 288 | |
| 289 | void qt_memfill32_sse2(quint32 *dest, quint32 value, qsizetype count) |
| 290 | { |
| 291 | if (count < 4) { |
| 292 | // this simplifies the code below: the first switch can fall through |
| 293 | // without checking the value of count |
| 294 | switch (count) { |
| 295 | case 3: *dest++ = value; Q_FALLTHROUGH(); |
| 296 | case 2: *dest++ = value; Q_FALLTHROUGH(); |
| 297 | case 1: *dest = value; |
| 298 | } |
| 299 | return; |
| 300 | } |
| 301 | |
| 302 | const int align = (quintptr)(dest) & 0xf; |
| 303 | switch (align) { |
| 304 | case 4: *dest++ = value; --count; Q_FALLTHROUGH(); |
| 305 | case 8: *dest++ = value; --count; Q_FALLTHROUGH(); |
| 306 | case 12: *dest++ = value; --count; |
| 307 | } |
| 308 | |
| 309 | const int rest = count & 0x3; |
| 310 | if (rest) { |
| 311 | switch (rest) { |
| 312 | case 3: dest[count - 3] = value; Q_FALLTHROUGH(); |
| 313 | case 2: dest[count - 2] = value; Q_FALLTHROUGH(); |
| 314 | case 1: dest[count - 1] = value; |
| 315 | } |
| 316 | } |
| 317 | |
| 318 | qt_memfillXX_aligned(dest, value128: _mm_set1_epi32(i: value), bytecount: count * sizeof(quint32)); |
| 319 | } |
| 320 | #endif // !__AVX2__ |
| 321 | |
| 322 | void QT_FASTCALL comp_func_solid_Source_sse2(uint *destPixels, int length, uint color, uint const_alpha) |
| 323 | { |
| 324 | if (const_alpha == 255) { |
| 325 | qt_memfill32(destPixels, color, length); |
| 326 | } else { |
| 327 | const quint32 ialpha = 255 - const_alpha; |
| 328 | color = BYTE_MUL(x: color, a: const_alpha); |
| 329 | int x = 0; |
| 330 | |
| 331 | quint32 *dst = (quint32 *) destPixels; |
| 332 | const __m128i colorVector = _mm_set1_epi32(i: color); |
| 333 | const __m128i colorMask = _mm_set1_epi32(i: 0x00ff00ff); |
| 334 | const __m128i half = _mm_set1_epi16(w: 0x80); |
| 335 | const __m128i iAlphaVector = _mm_set1_epi16(w: ialpha); |
| 336 | |
| 337 | ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) |
| 338 | destPixels[x] = color + BYTE_MUL(x: destPixels[x], a: ialpha); |
| 339 | |
| 340 | for (; x < length-3; x += 4) { |
| 341 | __m128i dstVector = _mm_load_si128(p: (__m128i *)&dst[x]); |
| 342 | BYTE_MUL_SSE2(dstVector, dstVector, iAlphaVector, colorMask, half); |
| 343 | dstVector = _mm_add_epi8(a: colorVector, b: dstVector); |
| 344 | _mm_store_si128(p: (__m128i *)&dst[x], b: dstVector); |
| 345 | } |
| 346 | SIMD_EPILOGUE(x, length, 3) |
| 347 | destPixels[x] = color + BYTE_MUL(x: destPixels[x], a: ialpha); |
| 348 | } |
| 349 | } |
| 350 | |
| 351 | void QT_FASTCALL comp_func_solid_SourceOver_sse2(uint *destPixels, int length, uint color, uint const_alpha) |
| 352 | { |
| 353 | if ((const_alpha & qAlpha(rgb: color)) == 255) { |
| 354 | qt_memfill32(destPixels, color, length); |
| 355 | } else { |
| 356 | if (const_alpha != 255) |
| 357 | color = BYTE_MUL(x: color, a: const_alpha); |
| 358 | |
| 359 | const quint32 minusAlphaOfColor = qAlpha(rgb: ~color); |
| 360 | int x = 0; |
| 361 | |
| 362 | quint32 *dst = (quint32 *) destPixels; |
| 363 | const __m128i colorVector = _mm_set1_epi32(i: color); |
| 364 | const __m128i colorMask = _mm_set1_epi32(i: 0x00ff00ff); |
| 365 | const __m128i half = _mm_set1_epi16(w: 0x80); |
| 366 | const __m128i minusAlphaOfColorVector = _mm_set1_epi16(w: minusAlphaOfColor); |
| 367 | |
| 368 | ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) |
| 369 | destPixels[x] = color + BYTE_MUL(x: destPixels[x], a: minusAlphaOfColor); |
| 370 | |
| 371 | for (; x < length-3; x += 4) { |
| 372 | __m128i dstVector = _mm_load_si128(p: (__m128i *)&dst[x]); |
| 373 | BYTE_MUL_SSE2(dstVector, dstVector, minusAlphaOfColorVector, colorMask, half); |
| 374 | dstVector = _mm_add_epi8(a: colorVector, b: dstVector); |
| 375 | _mm_store_si128(p: (__m128i *)&dst[x], b: dstVector); |
| 376 | } |
| 377 | SIMD_EPILOGUE(x, length, 3) |
| 378 | destPixels[x] = color + BYTE_MUL(x: destPixels[x], a: minusAlphaOfColor); |
| 379 | } |
| 380 | } |
| 381 | |
| 382 | void qt_bitmapblit32_sse2_base(QRasterBuffer *rasterBuffer, int x, int y, |
| 383 | quint32 color, |
| 384 | const uchar *src, int width, int height, int stride) |
| 385 | { |
| 386 | quint32 *dest = reinterpret_cast<quint32*>(rasterBuffer->scanLine(y)) + x; |
| 387 | const int destStride = rasterBuffer->stride<quint32>(); |
| 388 | |
| 389 | const __m128i c128 = _mm_set1_epi32(i: color); |
| 390 | const __m128i maskmask1 = _mm_set_epi32(i3: 0x10101010, i2: 0x20202020, |
| 391 | i1: 0x40404040, i0: 0x80808080); |
| 392 | const __m128i maskadd1 = _mm_set_epi32(i3: 0x70707070, i2: 0x60606060, |
| 393 | i1: 0x40404040, i0: 0x00000000); |
| 394 | |
| 395 | if (width > 4) { |
| 396 | const __m128i maskmask2 = _mm_set_epi32(i3: 0x01010101, i2: 0x02020202, |
| 397 | i1: 0x04040404, i0: 0x08080808); |
| 398 | const __m128i maskadd2 = _mm_set_epi32(i3: 0x7f7f7f7f, i2: 0x7e7e7e7e, |
| 399 | i1: 0x7c7c7c7c, i0: 0x78787878); |
| 400 | while (height--) { |
| 401 | for (int x = 0; x < width; x += 8) { |
| 402 | const quint8 s = src[x >> 3]; |
| 403 | if (!s) |
| 404 | continue; |
| 405 | __m128i mask1 = _mm_set1_epi8(b: s); |
| 406 | __m128i mask2 = mask1; |
| 407 | |
| 408 | mask1 = _mm_and_si128(a: mask1, b: maskmask1); |
| 409 | mask1 = _mm_add_epi8(a: mask1, b: maskadd1); |
| 410 | _mm_maskmoveu_si128(d: c128, n: mask1, p: (char*)(dest + x)); |
| 411 | mask2 = _mm_and_si128(a: mask2, b: maskmask2); |
| 412 | mask2 = _mm_add_epi8(a: mask2, b: maskadd2); |
| 413 | _mm_maskmoveu_si128(d: c128, n: mask2, p: (char*)(dest + x + 4)); |
| 414 | } |
| 415 | dest += destStride; |
| 416 | src += stride; |
| 417 | } |
| 418 | } else { |
| 419 | while (height--) { |
| 420 | const quint8 s = *src; |
| 421 | if (s) { |
| 422 | __m128i mask1 = _mm_set1_epi8(b: s); |
| 423 | mask1 = _mm_and_si128(a: mask1, b: maskmask1); |
| 424 | mask1 = _mm_add_epi8(a: mask1, b: maskadd1); |
| 425 | _mm_maskmoveu_si128(d: c128, n: mask1, p: (char*)(dest)); |
| 426 | } |
| 427 | dest += destStride; |
| 428 | src += stride; |
| 429 | } |
| 430 | } |
| 431 | } |
| 432 | |
| 433 | void qt_bitmapblit32_sse2(QRasterBuffer *rasterBuffer, int x, int y, |
| 434 | const QRgba64 &color, |
| 435 | const uchar *src, int width, int height, int stride) |
| 436 | { |
| 437 | qt_bitmapblit32_sse2_base(rasterBuffer, x, y, color: color.toArgb32(), src, width, height, stride); |
| 438 | } |
| 439 | |
| 440 | void qt_bitmapblit8888_sse2(QRasterBuffer *rasterBuffer, int x, int y, |
| 441 | const QRgba64 &color, |
| 442 | const uchar *src, int width, int height, int stride) |
| 443 | { |
| 444 | qt_bitmapblit32_sse2_base(rasterBuffer, x, y, color: ARGB2RGBA(x: color.toArgb32()), src, width, height, stride); |
| 445 | } |
| 446 | |
| 447 | void qt_bitmapblit16_sse2(QRasterBuffer *rasterBuffer, int x, int y, |
| 448 | const QRgba64 &color, |
| 449 | const uchar *src, int width, int height, int stride) |
| 450 | { |
| 451 | const quint16 c = qConvertRgb32To16(c: color.toArgb32()); |
| 452 | quint16 *dest = reinterpret_cast<quint16*>(rasterBuffer->scanLine(y)) + x; |
| 453 | const int destStride = rasterBuffer->stride<quint32>(); |
| 454 | |
| 455 | const __m128i c128 = _mm_set1_epi16(w: c); |
| 456 | QT_WARNING_DISABLE_MSVC(4309) // truncation of constant value |
| 457 | const __m128i maskmask = _mm_set_epi16(w7: 0x0101, w6: 0x0202, w5: 0x0404, w4: 0x0808, |
| 458 | w3: 0x1010, w2: 0x2020, w1: 0x4040, w0: 0x8080); |
| 459 | const __m128i maskadd = _mm_set_epi16(w7: 0x7f7f, w6: 0x7e7e, w5: 0x7c7c, w4: 0x7878, |
| 460 | w3: 0x7070, w2: 0x6060, w1: 0x4040, w0: 0x0000); |
| 461 | |
| 462 | while (height--) { |
| 463 | for (int x = 0; x < width; x += 8) { |
| 464 | const quint8 s = src[x >> 3]; |
| 465 | if (!s) |
| 466 | continue; |
| 467 | __m128i mask = _mm_set1_epi8(b: s); |
| 468 | mask = _mm_and_si128(a: mask, b: maskmask); |
| 469 | mask = _mm_add_epi8(a: mask, b: maskadd); |
| 470 | _mm_maskmoveu_si128(d: c128, n: mask, p: (char*)(dest + x)); |
| 471 | } |
| 472 | dest += destStride; |
| 473 | src += stride; |
| 474 | } |
| 475 | } |
| 476 | |
| 477 | class QSimdSse2 |
| 478 | { |
| 479 | public: |
| 480 | typedef __m128i Int32x4; |
| 481 | typedef __m128 Float32x4; |
| 482 | |
| 483 | union Vect_buffer_i { Int32x4 v; int i[4]; }; |
| 484 | union Vect_buffer_f { Float32x4 v; float f[4]; }; |
| 485 | |
| 486 | static inline Float32x4 Q_DECL_VECTORCALL v_dup(float x) { return _mm_set1_ps(w: x); } |
| 487 | static inline Float32x4 Q_DECL_VECTORCALL v_dup(double x) { return _mm_set1_ps(w: x); } |
| 488 | static inline Int32x4 Q_DECL_VECTORCALL v_dup(int x) { return _mm_set1_epi32(i: x); } |
| 489 | static inline Int32x4 Q_DECL_VECTORCALL v_dup(uint x) { return _mm_set1_epi32(i: x); } |
| 490 | |
| 491 | static inline Float32x4 Q_DECL_VECTORCALL v_add(Float32x4 a, Float32x4 b) { return _mm_add_ps(a: a, b: b); } |
| 492 | static inline Int32x4 Q_DECL_VECTORCALL v_add(Int32x4 a, Int32x4 b) { return _mm_add_epi32(a: a, b: b); } |
| 493 | |
| 494 | static inline Float32x4 Q_DECL_VECTORCALL v_max(Float32x4 a, Float32x4 b) { return _mm_max_ps(a: a, b: b); } |
| 495 | static inline Float32x4 Q_DECL_VECTORCALL v_min(Float32x4 a, Float32x4 b) { return _mm_min_ps(a: a, b: b); } |
| 496 | static inline Int32x4 Q_DECL_VECTORCALL v_min_16(Int32x4 a, Int32x4 b) { return _mm_min_epi16(a: a, b: b); } |
| 497 | |
| 498 | static inline Int32x4 Q_DECL_VECTORCALL v_and(Int32x4 a, Int32x4 b) { return _mm_and_si128(a: a, b: b); } |
| 499 | |
| 500 | static inline Float32x4 Q_DECL_VECTORCALL v_sub(Float32x4 a, Float32x4 b) { return _mm_sub_ps(a: a, b: b); } |
| 501 | static inline Int32x4 Q_DECL_VECTORCALL v_sub(Int32x4 a, Int32x4 b) { return _mm_sub_epi32(a: a, b: b); } |
| 502 | |
| 503 | static inline Float32x4 Q_DECL_VECTORCALL v_mul(Float32x4 a, Float32x4 b) { return _mm_mul_ps(a: a, b: b); } |
| 504 | |
| 505 | static inline Float32x4 Q_DECL_VECTORCALL v_sqrt(Float32x4 x) { return _mm_sqrt_ps(a: x); } |
| 506 | |
| 507 | static inline Int32x4 Q_DECL_VECTORCALL v_toInt(Float32x4 x) { return _mm_cvttps_epi32(a: x); } |
| 508 | |
| 509 | static inline Int32x4 Q_DECL_VECTORCALL v_greaterOrEqual(Float32x4 a, Float32x4 b) { return _mm_castps_si128(a: _mm_cmpgt_ps(a: a, b: b)); } |
| 510 | }; |
| 511 | |
// Fetch 'length' radial-gradient pixels into 'buffer' starting at (x, y),
// by instantiating the generic radial-gradient fetcher with the SSE2 SIMD
// backend defined above. Returns the filled buffer.
const uint * QT_FASTCALL qt_fetch_radial_gradient_sse2(uint *buffer, const Operator *op, const QSpanData *data,
                                                       int y, int x, int length)
{
    return qt_fetch_radial_gradient_template<QRadialFetchSimd<QSimdSse2>,uint>(buffer, op, data, y, x, length);
}
| 517 | |
| 518 | void qt_scale_image_argb32_on_argb32_sse2(uchar *destPixels, int dbpl, |
| 519 | const uchar *srcPixels, int sbpl, int srch, |
| 520 | const QRectF &targetRect, |
| 521 | const QRectF &sourceRect, |
| 522 | const QRect &clip, |
| 523 | int const_alpha) |
| 524 | { |
| 525 | if (const_alpha != 256) { |
| 526 | // from qblendfunctions.cpp |
| 527 | extern void qt_scale_image_argb32_on_argb32(uchar *destPixels, int dbpl, |
| 528 | const uchar *srcPixels, int sbpl, int srch, |
| 529 | const QRectF &targetRect, |
| 530 | const QRectF &sourceRect, |
| 531 | const QRect &clip, |
| 532 | int const_alpha); |
| 533 | return qt_scale_image_argb32_on_argb32(destPixels, dbpl, srcPixels, sbpl, srch, targetRect, sourceRect, clip, const_alpha); |
| 534 | } |
| 535 | |
| 536 | qreal sx = targetRect.width() / (qreal) sourceRect.width(); |
| 537 | qreal sy = targetRect.height() / (qreal) sourceRect.height(); |
| 538 | |
| 539 | int ix = 0x00010000 / sx; |
| 540 | int iy = 0x00010000 / sy; |
| 541 | |
| 542 | int cx1 = clip.x(); |
| 543 | int cx2 = clip.x() + clip.width(); |
| 544 | int cy1 = clip.top(); |
| 545 | int cy2 = clip.y() + clip.height(); |
| 546 | |
| 547 | int tx1 = qRound(d: targetRect.left()); |
| 548 | int tx2 = qRound(d: targetRect.right()); |
| 549 | int ty1 = qRound(d: targetRect.top()); |
| 550 | int ty2 = qRound(d: targetRect.bottom()); |
| 551 | |
| 552 | if (tx2 < tx1) |
| 553 | qSwap(value1&: tx2, value2&: tx1); |
| 554 | if (ty2 < ty1) |
| 555 | qSwap(value1&: ty2, value2&: ty1); |
| 556 | |
| 557 | if (tx1 < cx1) |
| 558 | tx1 = cx1; |
| 559 | if (tx2 >= cx2) |
| 560 | tx2 = cx2; |
| 561 | |
| 562 | if (tx1 >= tx2) |
| 563 | return; |
| 564 | |
| 565 | if (ty1 < cy1) |
| 566 | ty1 = cy1; |
| 567 | if (ty2 >= cy2) |
| 568 | ty2 = cy2; |
| 569 | if (ty1 >= ty2) |
| 570 | return; |
| 571 | |
| 572 | int h = ty2 - ty1; |
| 573 | int w = tx2 - tx1; |
| 574 | |
| 575 | quint32 basex; |
| 576 | quint32 srcy; |
| 577 | |
| 578 | if (sx < 0) { |
| 579 | int dstx = qFloor(v: (tx1 + qreal(0.5) - targetRect.right()) * ix) + 1; |
| 580 | basex = quint32(sourceRect.right() * 65536) + dstx; |
| 581 | } else { |
| 582 | int dstx = qCeil(v: (tx1 + qreal(0.5) - targetRect.left()) * ix) - 1; |
| 583 | basex = quint32(sourceRect.left() * 65536) + dstx; |
| 584 | } |
| 585 | if (sy < 0) { |
| 586 | int dsty = qFloor(v: (ty1 + qreal(0.5) - targetRect.bottom()) * iy) + 1; |
| 587 | srcy = quint32(sourceRect.bottom() * 65536) + dsty; |
| 588 | } else { |
| 589 | int dsty = qCeil(v: (ty1 + qreal(0.5) - targetRect.top()) * iy) - 1; |
| 590 | srcy = quint32(sourceRect.top() * 65536) + dsty; |
| 591 | } |
| 592 | |
| 593 | quint32 *dst = ((quint32 *) (destPixels + ty1 * dbpl)) + tx1; |
| 594 | |
| 595 | const __m128i nullVector = _mm_set1_epi32(i: 0); |
| 596 | const __m128i half = _mm_set1_epi16(w: 0x80); |
| 597 | const __m128i one = _mm_set1_epi16(w: 0xff); |
| 598 | const __m128i colorMask = _mm_set1_epi32(i: 0x00ff00ff); |
| 599 | const __m128i alphaMask = _mm_set1_epi32(i: 0xff000000); |
| 600 | const __m128i ixVector = _mm_set1_epi32(i: 4*ix); |
| 601 | |
| 602 | // this bounds check here is required as floating point rounding above might in some cases lead to |
| 603 | // w/h values that are one pixel too large, falling outside of the valid image area. |
| 604 | const int ystart = srcy >> 16; |
| 605 | if (ystart >= srch && iy < 0) { |
| 606 | srcy += iy; |
| 607 | --h; |
| 608 | } |
| 609 | const int xstart = basex >> 16; |
| 610 | if (xstart >= (int)(sbpl/sizeof(quint32)) && ix < 0) { |
| 611 | basex += ix; |
| 612 | --w; |
| 613 | } |
| 614 | int yend = (srcy + iy * (h - 1)) >> 16; |
| 615 | if (yend < 0 || yend >= srch) |
| 616 | --h; |
| 617 | int xend = (basex + ix * (w - 1)) >> 16; |
| 618 | if (xend < 0 || xend >= (int)(sbpl/sizeof(quint32))) |
| 619 | --w; |
| 620 | |
| 621 | while (h--) { |
| 622 | const uint *src = (const quint32 *) (srcPixels + (srcy >> 16) * sbpl); |
| 623 | int srcx = basex; |
| 624 | int x = 0; |
| 625 | |
| 626 | ALIGNMENT_PROLOGUE_16BYTES(dst, x, w) { |
| 627 | uint s = src[srcx >> 16]; |
| 628 | dst[x] = s + BYTE_MUL(x: dst[x], a: qAlpha(rgb: ~s)); |
| 629 | srcx += ix; |
| 630 | } |
| 631 | |
| 632 | __m128i srcxVector = _mm_set_epi32(i3: srcx, i2: srcx + ix, i1: srcx + ix + ix, i0: srcx + ix + ix + ix); |
| 633 | |
| 634 | for (; x<w - 3; x += 4) { |
| 635 | union Vect_buffer { __m128i vect; quint32 i[4]; }; |
| 636 | Vect_buffer addr; |
| 637 | addr.vect = _mm_srli_epi32(a: srcxVector, count: 16); |
| 638 | srcxVector = _mm_add_epi32(a: srcxVector, b: ixVector); |
| 639 | |
| 640 | const __m128i srcVector = _mm_set_epi32(i3: src[addr.i[0]], i2: src[addr.i[1]], i1: src[addr.i[2]], i0: src[addr.i[3]]); |
| 641 | BLEND_SOURCE_OVER_ARGB32_SSE2_helper(dst, srcVector, nullVector, half, one, colorMask, alphaMask); |
| 642 | } |
| 643 | |
| 644 | SIMD_EPILOGUE(x, w, 3) { |
| 645 | uint s = src[(basex + x*ix) >> 16]; |
| 646 | dst[x] = s + BYTE_MUL(x: dst[x], a: qAlpha(rgb: ~s)); |
| 647 | } |
| 648 | dst = (quint32 *)(((uchar *) dst) + dbpl); |
| 649 | srcy += iy; |
| 650 | } |
| 651 | } |
| 652 | |
| 653 | |
| 654 | QT_END_NAMESPACE |
| 655 | |
| 656 | #endif // QT_COMPILER_SUPPORTS_SSE2 |
| 657 | |