1 | // Copyright (C) 2016 The Qt Company Ltd. |
2 | // SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only |
3 | |
4 | #include "qimagescale_p.h" |
5 | #include "qimage.h" |
6 | #include <private/qdrawhelper_x86_p.h> |
7 | #include <private/qsimd_p.h> |
8 | |
9 | #if QT_CONFIG(thread) && !defined(Q_OS_WASM) |
10 | #include <qsemaphore.h> |
11 | #include <qthreadpool.h> |
12 | #include <private/qthreadpool_p.h> |
13 | #endif |
14 | |
15 | #if defined(QT_COMPILER_SUPPORTS_SSE4_1) |
16 | |
17 | QT_BEGIN_NAMESPACE |
18 | |
19 | using namespace QImageScale; |
20 | |
21 | template<typename T> |
22 | static inline void multithread_pixels_function(QImageScaleInfo *isi, int dh, const T &scaleSection) |
23 | { |
24 | #if QT_CONFIG(thread) && !defined(Q_OS_WASM) |
25 | int segments = (qsizetype(isi->sh) * isi->sw) / (1<<16); |
26 | segments = std::min(a: segments, b: dh); |
27 | QThreadPool *threadPool = QThreadPoolPrivate::qtGuiInstance(); |
28 | if (segments > 1 && threadPool && !threadPool->contains(thread: QThread::currentThread())) { |
29 | QSemaphore semaphore; |
30 | int y = 0; |
31 | for (int i = 0; i < segments; ++i) { |
32 | int yn = (dh - y) / (segments - i); |
33 | threadPool->start([&, y, yn]() { |
34 | scaleSection(y, y + yn); |
35 | semaphore.release(n: 1); |
36 | }); |
37 | y += yn; |
38 | } |
39 | semaphore.acquire(n: segments); |
40 | return; |
41 | } |
42 | #endif |
43 | scaleSection(0, dh); |
44 | } |
45 | |
46 | inline static __m128i Q_DECL_VECTORCALL |
47 | qt_qimageScaleAARGBA_helper(const unsigned int *pix, int xyap, int Cxy, int step, const __m128i vxyap, const __m128i vCxy) |
48 | { |
49 | __m128i vpix = _mm_cvtepu8_epi32(V: _mm_cvtsi32_si128(a: *pix)); |
50 | __m128i vx = _mm_mullo_epi32(V1: vpix, V2: vxyap); |
51 | int i; |
52 | for (i = (1 << 14) - xyap; i > Cxy; i -= Cxy) { |
53 | pix += step; |
54 | vpix = _mm_cvtepu8_epi32(V: _mm_cvtsi32_si128(a: *pix)); |
55 | vx = _mm_add_epi32(a: vx, b: _mm_mullo_epi32(V1: vpix, V2: vCxy)); |
56 | } |
57 | pix += step; |
58 | vpix = _mm_cvtepu8_epi32(V: _mm_cvtsi32_si128(a: *pix)); |
59 | vx = _mm_add_epi32(a: vx, b: _mm_mullo_epi32(V1: vpix, V2: _mm_set1_epi32(i: i))); |
60 | return vx; |
61 | } |
62 | |
63 | template<bool RGB> |
64 | void qt_qimageScaleAARGBA_up_x_down_y_sse4(QImageScaleInfo *isi, unsigned int *dest, |
65 | int dw, int dh, int dow, int sow) |
66 | { |
67 | const unsigned int **ypoints = isi->ypoints; |
68 | const int *xpoints = isi->xpoints; |
69 | const int *xapoints = isi->xapoints; |
70 | const int *yapoints = isi->yapoints; |
71 | |
72 | const __m128i v256 = _mm_set1_epi32(i: 256); |
73 | |
74 | /* go through every scanline in the output buffer */ |
75 | auto scaleSection = [&] (int yStart, int yEnd) { |
76 | for (int y = yStart; y < yEnd; ++y) { |
77 | const int Cy = yapoints[y] >> 16; |
78 | const int yap = yapoints[y] & 0xffff; |
79 | const __m128i vCy = _mm_set1_epi32(i: Cy); |
80 | const __m128i vyap = _mm_set1_epi32(i: yap); |
81 | |
82 | unsigned int *dptr = dest + (y * dow); |
83 | for (int x = 0; x < dw; x++) { |
84 | const unsigned int *sptr = ypoints[y] + xpoints[x]; |
85 | __m128i vx = qt_qimageScaleAARGBA_helper(pix: sptr, xyap: yap, Cxy: Cy, step: sow, vxyap: vyap, vCxy: vCy); |
86 | |
87 | const int xap = xapoints[x]; |
88 | if (xap > 0) { |
89 | const __m128i vxap = _mm_set1_epi32(i: xap); |
90 | const __m128i vinvxap = _mm_sub_epi32(a: v256, b: vxap); |
91 | __m128i vr = qt_qimageScaleAARGBA_helper(pix: sptr + 1, xyap: yap, Cxy: Cy, step: sow, vxyap: vyap, vCxy: vCy); |
92 | |
93 | vx = _mm_mullo_epi32(V1: vx, V2: vinvxap); |
94 | vr = _mm_mullo_epi32(V1: vr, V2: vxap); |
95 | vx = _mm_add_epi32(a: vx, b: vr); |
96 | vx = _mm_srli_epi32(a: vx, count: 8); |
97 | } |
98 | vx = _mm_srli_epi32(a: vx, count: 14); |
99 | vx = _mm_packus_epi32(V1: vx, V2: vx); |
100 | vx = _mm_packus_epi16(a: vx, b: vx); |
101 | *dptr = _mm_cvtsi128_si32(a: vx); |
102 | if (RGB) |
103 | *dptr |= 0xff000000; |
104 | dptr++; |
105 | } |
106 | } |
107 | }; |
108 | multithread_pixels_function(isi, dh, scaleSection); |
109 | } |
110 | |
111 | template<bool RGB> |
112 | void qt_qimageScaleAARGBA_down_x_up_y_sse4(QImageScaleInfo *isi, unsigned int *dest, |
113 | int dw, int dh, int dow, int sow) |
114 | { |
115 | const unsigned int **ypoints = isi->ypoints; |
116 | int *xpoints = isi->xpoints; |
117 | int *xapoints = isi->xapoints; |
118 | int *yapoints = isi->yapoints; |
119 | |
120 | const __m128i v256 = _mm_set1_epi32(i: 256); |
121 | |
122 | /* go through every scanline in the output buffer */ |
123 | auto scaleSection = [&] (int yStart, int yEnd) { |
124 | for (int y = yStart; y < yEnd; ++y) { |
125 | unsigned int *dptr = dest + (y * dow); |
126 | for (int x = 0; x < dw; x++) { |
127 | int Cx = xapoints[x] >> 16; |
128 | int xap = xapoints[x] & 0xffff; |
129 | const __m128i vCx = _mm_set1_epi32(i: Cx); |
130 | const __m128i vxap = _mm_set1_epi32(i: xap); |
131 | |
132 | const unsigned int *sptr = ypoints[y] + xpoints[x]; |
133 | __m128i vx = qt_qimageScaleAARGBA_helper(pix: sptr, xyap: xap, Cxy: Cx, step: 1, vxyap: vxap, vCxy: vCx); |
134 | |
135 | int yap = yapoints[y]; |
136 | if (yap > 0) { |
137 | const __m128i vyap = _mm_set1_epi32(i: yap); |
138 | const __m128i vinvyap = _mm_sub_epi32(a: v256, b: vyap); |
139 | __m128i vr = qt_qimageScaleAARGBA_helper(pix: sptr + sow, xyap: xap, Cxy: Cx, step: 1, vxyap: vxap, vCxy: vCx); |
140 | |
141 | vx = _mm_mullo_epi32(V1: vx, V2: vinvyap); |
142 | vr = _mm_mullo_epi32(V1: vr, V2: vyap); |
143 | vx = _mm_add_epi32(a: vx, b: vr); |
144 | vx = _mm_srli_epi32(a: vx, count: 8); |
145 | } |
146 | vx = _mm_srli_epi32(a: vx, count: 14); |
147 | vx = _mm_packus_epi32(V1: vx, V2: vx); |
148 | vx = _mm_packus_epi16(a: vx, b: vx); |
149 | *dptr = _mm_cvtsi128_si32(a: vx); |
150 | if (RGB) |
151 | *dptr |= 0xff000000; |
152 | dptr++; |
153 | } |
154 | } |
155 | }; |
156 | multithread_pixels_function(isi, dh, scaleSection); |
157 | } |
158 | |
159 | template<bool RGB> |
160 | void qt_qimageScaleAARGBA_down_xy_sse4(QImageScaleInfo *isi, unsigned int *dest, |
161 | int dw, int dh, int dow, int sow) |
162 | { |
163 | const unsigned int **ypoints = isi->ypoints; |
164 | int *xpoints = isi->xpoints; |
165 | int *xapoints = isi->xapoints; |
166 | int *yapoints = isi->yapoints; |
167 | |
168 | auto scaleSection = [&] (int yStart, int yEnd) { |
169 | for (int y = yStart; y < yEnd; ++y) { |
170 | int Cy = yapoints[y] >> 16; |
171 | int yap = yapoints[y] & 0xffff; |
172 | const __m128i vCy = _mm_set1_epi32(i: Cy); |
173 | const __m128i vyap = _mm_set1_epi32(i: yap); |
174 | |
175 | unsigned int *dptr = dest + (y * dow); |
176 | for (int x = 0; x < dw; x++) { |
177 | const int Cx = xapoints[x] >> 16; |
178 | const int xap = xapoints[x] & 0xffff; |
179 | const __m128i vCx = _mm_set1_epi32(i: Cx); |
180 | const __m128i vxap = _mm_set1_epi32(i: xap); |
181 | |
182 | const unsigned int *sptr = ypoints[y] + xpoints[x]; |
183 | __m128i vx = qt_qimageScaleAARGBA_helper(pix: sptr, xyap: xap, Cxy: Cx, step: 1, vxyap: vxap, vCxy: vCx); |
184 | __m128i vr = _mm_mullo_epi32(V1: _mm_srli_epi32(a: vx, count: 4), V2: vyap); |
185 | |
186 | int j; |
187 | for (j = (1 << 14) - yap; j > Cy; j -= Cy) { |
188 | sptr += sow; |
189 | vx = qt_qimageScaleAARGBA_helper(pix: sptr, xyap: xap, Cxy: Cx, step: 1, vxyap: vxap, vCxy: vCx); |
190 | vr = _mm_add_epi32(a: vr, b: _mm_mullo_epi32(V1: _mm_srli_epi32(a: vx, count: 4), V2: vCy)); |
191 | } |
192 | sptr += sow; |
193 | vx = qt_qimageScaleAARGBA_helper(pix: sptr, xyap: xap, Cxy: Cx, step: 1, vxyap: vxap, vCxy: vCx); |
194 | vr = _mm_add_epi32(a: vr, b: _mm_mullo_epi32(V1: _mm_srli_epi32(a: vx, count: 4), V2: _mm_set1_epi32(i: j))); |
195 | |
196 | vr = _mm_srli_epi32(a: vr, count: 24); |
197 | vr = _mm_packus_epi32(V1: vr, V2: _mm_setzero_si128()); |
198 | vr = _mm_packus_epi16(a: vr, b: _mm_setzero_si128()); |
199 | *dptr = _mm_cvtsi128_si32(a: vr); |
200 | if (RGB) |
201 | *dptr |= 0xff000000; |
202 | dptr++; |
203 | } |
204 | } |
205 | }; |
206 | multithread_pixels_function(isi, dh, scaleSection); |
207 | } |
208 | |
209 | template void qt_qimageScaleAARGBA_up_x_down_y_sse4<false>(QImageScaleInfo *isi, unsigned int *dest, |
210 | int dw, int dh, int dow, int sow); |
211 | |
212 | template void qt_qimageScaleAARGBA_up_x_down_y_sse4<true>(QImageScaleInfo *isi, unsigned int *dest, |
213 | int dw, int dh, int dow, int sow); |
214 | |
215 | template void qt_qimageScaleAARGBA_down_x_up_y_sse4<false>(QImageScaleInfo *isi, unsigned int *dest, |
216 | int dw, int dh, int dow, int sow); |
217 | |
218 | template void qt_qimageScaleAARGBA_down_x_up_y_sse4<true>(QImageScaleInfo *isi, unsigned int *dest, |
219 | int dw, int dh, int dow, int sow); |
220 | |
221 | template void qt_qimageScaleAARGBA_down_xy_sse4<false>(QImageScaleInfo *isi, unsigned int *dest, |
222 | int dw, int dh, int dow, int sow); |
223 | |
224 | template void qt_qimageScaleAARGBA_down_xy_sse4<true>(QImageScaleInfo *isi, unsigned int *dest, |
225 | int dw, int dh, int dow, int sow); |
226 | |
227 | QT_END_NAMESPACE |
228 | |
229 | #endif |
230 | |