// qimagescale_sse4.cpp source code [qtbase/src/gui/painting/qimagescale_sse4.cpp]

1	// Copyright (C) 2016 The Qt Company Ltd.
2	// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
3
4	#include "qimagescale_p.h"
5	#include "qimage.h"
6	#include <private/qdrawhelper_x86_p.h>
7	#include <private/qsimd_p.h>
8
9	#if QT_CONFIG(qtgui_threadpool)
10	#include <qsemaphore.h>
11	#include <qthreadpool.h>
12	#include <private/qguiapplication_p.h>
13	#include <private/qthreadpool_p.h>
14	#endif
15
16	#if defined(QT_COMPILER_SUPPORTS_SSE4_1)
17
18	QT_BEGIN_NAMESPACE
19
20	using namespace QImageScale;
21
22	template<typename T>
23	static inline void multithread_pixels_function(QImageScaleInfo isi, int* dh, const T &scaleSection)
24	{
25	#if QT_CONFIG(qtgui_threadpool)
26	int segments = (qsizetype(isi->sh) * isi->sw) / (`1`<<`16`);
27	segments = std::min(a: segments, b: dh);
28	QThreadPool *threadPool = QGuiApplicationPrivate::qtGuiThreadPool();
29	if (segments > `1` && threadPool && !threadPool->contains(thread: QThread::currentThread())) {
30	QSemaphore semaphore;
31	int y = `0`;
32	for (int i = `0`; i < segments; ++i) {
33	int yn = (dh - y) / (segments - i);
34	threadPool->start([&, y, yn]() {
35	scaleSection(y, y + yn);
36	semaphore.release(n: `1`);
37	});
38	y += yn;
39	}
40	semaphore.acquire(n: segments);
41	return;
42	}
43	#endif
44	scaleSection(`0`, dh);
45	}
46
47	inline static __m128i Q_DECL_VECTORCALL
48	qt_qimageScaleAARGBA_helper(const unsigned int pix, int* xyap, int Cxy, int step, const __m128i vxyap, const __m128i vCxy)
49	{
50	__m128i vpix = _mm_cvtepu8_epi32(V: _mm_cvtsi32_si128(a: *pix));
51	__m128i vx = _mm_mullo_epi32(V1: vpix, V2: vxyap);
52	int i;
53	for (i = (`1` << `14`) - xyap; i > Cxy; i -= Cxy) {
54	pix += step;
55	vpix = _mm_cvtepu8_epi32(V: _mm_cvtsi32_si128(a: *pix));
56	vx = _mm_add_epi32(a: vx, b: _mm_mullo_epi32(V1: vpix, V2: vCxy));
57	}
58	pix += step;
59	vpix = _mm_cvtepu8_epi32(V: _mm_cvtsi32_si128(a: *pix));
60	vx = _mm_add_epi32(a: vx, b: _mm_mullo_epi32(V1: vpix, V2: _mm_set1_epi32(i: i)));
61	return vx;
62	}
63
64	template<bool RGB>
65	void qt_qimageScaleAARGBA_up_x_down_y_sse4(QImageScaleInfo isi, unsigned* int *dest,
66	int dw, int dh, int dow, int sow)
67	{
68	const unsigned int **ypoints = isi->ypoints;
69	const int *xpoints = isi->xpoints;
70	const int *xapoints = isi->xapoints;
71	const int *yapoints = isi->yapoints;
72
73	const __m128i v256 = _mm_set1_epi32(i: `256`);
74
75	/ go through every scanline in the output buffer /
76	auto scaleSection = [&] (int yStart, int yEnd) {
77	for (int y = yStart; y < yEnd; ++y) {
78	const int Cy = yapoints[y] >> `16`;
79	const int yap = yapoints[y] & `0xffff`;
80	const __m128i vCy = _mm_set1_epi32(i: Cy);
81	const __m128i vyap = _mm_set1_epi32(i: yap);
82
83	unsigned int dptr = dest + (y dow);
84	for (int x = `0`; x < dw; x++) {
85	const unsigned int *sptr = ypoints[y] + xpoints[x];
86	__m128i vx = qt_qimageScaleAARGBA_helper(pix: sptr, xyap: yap, Cxy: Cy, step: sow, vxyap: vyap, vCxy: vCy);
87
88	const int xap = xapoints[x];
89	if (xap > `0`) {
90	const __m128i vxap = _mm_set1_epi32(i: xap);
91	const __m128i vinvxap = _mm_sub_epi32(a: v256, b: vxap);
92	__m128i vr = qt_qimageScaleAARGBA_helper(pix: sptr + `1`, xyap: yap, Cxy: Cy, step: sow, vxyap: vyap, vCxy: vCy);
93
94	vx = _mm_mullo_epi32(V1: vx, V2: vinvxap);
95	vr = _mm_mullo_epi32(V1: vr, V2: vxap);
96	vx = _mm_add_epi32(a: vx, b: vr);
97	vx = _mm_srli_epi32(a: vx, count: `8`);
98	}
99	vx = _mm_srli_epi32(a: vx, count: `14`);
100	vx = _mm_packus_epi32(V1: vx, V2: vx);
101	vx = _mm_packus_epi16(a: vx, b: vx);
102	*dptr = _mm_cvtsi128_si32(a: vx);
103	if (RGB)
104	*dptr \|= `0xff000000`;
105	dptr++;
106	}
107	}
108	};
109	multithread_pixels_function(isi, dh, scaleSection);
110	}
111
112	template<bool RGB>
113	void qt_qimageScaleAARGBA_down_x_up_y_sse4(QImageScaleInfo isi, unsigned* int *dest,
114	int dw, int dh, int dow, int sow)
115	{
116	const unsigned int **ypoints = isi->ypoints;
117	int *xpoints = isi->xpoints;
118	int *xapoints = isi->xapoints;
119	int *yapoints = isi->yapoints;
120
121	const __m128i v256 = _mm_set1_epi32(i: `256`);
122
123	/ go through every scanline in the output buffer /
124	auto scaleSection = [&] (int yStart, int yEnd) {
125	for (int y = yStart; y < yEnd; ++y) {
126	unsigned int dptr = dest + (y dow);
127	for (int x = `0`; x < dw; x++) {
128	int Cx = xapoints[x] >> `16`;
129	int xap = xapoints[x] & `0xffff`;
130	const __m128i vCx = _mm_set1_epi32(i: Cx);
131	const __m128i vxap = _mm_set1_epi32(i: xap);
132
133	const unsigned int *sptr = ypoints[y] + xpoints[x];
134	__m128i vx = qt_qimageScaleAARGBA_helper(pix: sptr, xyap: xap, Cxy: Cx, step: `1`, vxyap: vxap, vCxy: vCx);
135
136	int yap = yapoints[y];
137	if (yap > `0`) {
138	const __m128i vyap = _mm_set1_epi32(i: yap);
139	const __m128i vinvyap = _mm_sub_epi32(a: v256, b: vyap);
140	__m128i vr = qt_qimageScaleAARGBA_helper(pix: sptr + sow, xyap: xap, Cxy: Cx, step: `1`, vxyap: vxap, vCxy: vCx);
141
142	vx = _mm_mullo_epi32(V1: vx, V2: vinvyap);
143	vr = _mm_mullo_epi32(V1: vr, V2: vyap);
144	vx = _mm_add_epi32(a: vx, b: vr);
145	vx = _mm_srli_epi32(a: vx, count: `8`);
146	}
147	vx = _mm_srli_epi32(a: vx, count: `14`);
148	vx = _mm_packus_epi32(V1: vx, V2: vx);
149	vx = _mm_packus_epi16(a: vx, b: vx);
150	*dptr = _mm_cvtsi128_si32(a: vx);
151	if (RGB)
152	*dptr \|= `0xff000000`;
153	dptr++;
154	}
155	}
156	};
157	multithread_pixels_function(isi, dh, scaleSection);
158	}
159
160	template<bool RGB>
161	void qt_qimageScaleAARGBA_down_xy_sse4(QImageScaleInfo isi, unsigned* int *dest,
162	int dw, int dh, int dow, int sow)
163	{
164	const unsigned int **ypoints = isi->ypoints;
165	int *xpoints = isi->xpoints;
166	int *xapoints = isi->xapoints;
167	int *yapoints = isi->yapoints;
168
169	auto scaleSection = [&] (int yStart, int yEnd) {
170	for (int y = yStart; y < yEnd; ++y) {
171	int Cy = yapoints[y] >> `16`;
172	int yap = yapoints[y] & `0xffff`;
173	const __m128i vCy = _mm_set1_epi32(i: Cy);
174	const __m128i vyap = _mm_set1_epi32(i: yap);
175
176	unsigned int dptr = dest + (y dow);
177	for (int x = `0`; x < dw; x++) {
178	const int Cx = xapoints[x] >> `16`;
179	const int xap = xapoints[x] & `0xffff`;
180	const __m128i vCx = _mm_set1_epi32(i: Cx);
181	const __m128i vxap = _mm_set1_epi32(i: xap);
182
183	const unsigned int *sptr = ypoints[y] + xpoints[x];
184	__m128i vx = qt_qimageScaleAARGBA_helper(pix: sptr, xyap: xap, Cxy: Cx, step: `1`, vxyap: vxap, vCxy: vCx);
185	__m128i vr = _mm_mullo_epi32(V1: _mm_srli_epi32(a: vx, count: `4`), V2: vyap);
186
187	int j;
188	for (j = (`1` << `14`) - yap; j > Cy; j -= Cy) {
189	sptr += sow;
190	vx = qt_qimageScaleAARGBA_helper(pix: sptr, xyap: xap, Cxy: Cx, step: `1`, vxyap: vxap, vCxy: vCx);
191	vr = _mm_add_epi32(a: vr, b: _mm_mullo_epi32(V1: _mm_srli_epi32(a: vx, count: `4`), V2: vCy));
192	}
193	sptr += sow;
194	vx = qt_qimageScaleAARGBA_helper(pix: sptr, xyap: xap, Cxy: Cx, step: `1`, vxyap: vxap, vCxy: vCx);
195	vr = _mm_add_epi32(a: vr, b: _mm_mullo_epi32(V1: _mm_srli_epi32(a: vx, count: `4`), V2: _mm_set1_epi32(i: j)));
196
197	vr = _mm_srli_epi32(a: vr, count: `24`);
198	vr = _mm_packus_epi32(V1: vr, V2: _mm_setzero_si128());
199	vr = _mm_packus_epi16(a: vr, b: _mm_setzero_si128());
200	*dptr = _mm_cvtsi128_si32(a: vr);
201	if (RGB)
202	*dptr \|= `0xff000000`;
203	dptr++;
204	}
205	}
206	};
207	multithread_pixels_function(isi, dh, scaleSection);
208	}
209
210	template void qt_qimageScaleAARGBA_up_x_down_y_sse4<false>(QImageScaleInfo isi, unsigned* int *dest,
211	int dw, int dh, int dow, int sow);
212
213	template void qt_qimageScaleAARGBA_up_x_down_y_sse4<true>(QImageScaleInfo isi, unsigned* int *dest,
214	int dw, int dh, int dow, int sow);
215
216	template void qt_qimageScaleAARGBA_down_x_up_y_sse4<false>(QImageScaleInfo isi, unsigned* int *dest,
217	int dw, int dh, int dow, int sow);
218
219	template void qt_qimageScaleAARGBA_down_x_up_y_sse4<true>(QImageScaleInfo isi, unsigned* int *dest,
220	int dw, int dh, int dow, int sow);
221
222	template void qt_qimageScaleAARGBA_down_xy_sse4<false>(QImageScaleInfo isi, unsigned* int *dest,
223	int dw, int dh, int dow, int sow);
224
225	template void qt_qimageScaleAARGBA_down_xy_sse4<true>(QImageScaleInfo isi, unsigned* int *dest,
226	int dw, int dh, int dow, int sow);
227
228	QT_END_NAMESPACE
229
230	#endif
231

// source code of qtbase/src/gui/painting/qimagescale_sse4.cpp