// Copyright (C) 2016 The Qt Company Ltd.
// Copyright (C) 2016 Intel Corporation.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only

#include <private/qdrawhelper_x86_p.h>

#ifdef QT_COMPILER_SUPPORTS_SSE2

#include <private/qdrawingprimitive_sse2_p.h>
#include <private/qpaintengine_raster_p.h>

QT_BEGIN_NAMESPACE

#ifndef QDRAWHELPER_AVX
// in AVX mode, we'll use the SSSE3 code
void qt_blend_argb32_on_argb32_sse2(uchar *destPixels, int dbpl,
                                    const uchar *srcPixels, int sbpl,
                                    int w, int h,
                                    int const_alpha)
{
    const quint32 *src = (const quint32 *) srcPixels;
    quint32 *dst = (quint32 *) destPixels;
    if (const_alpha == 256) {
        const __m128i alphaMask = _mm_set1_epi32(0xff000000);
        const __m128i nullVector = _mm_set1_epi32(0);
        const __m128i half = _mm_set1_epi16(0x80);
        const __m128i one = _mm_set1_epi16(0xff);
        const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
        for (int y = 0; y < h; ++y) {
            BLEND_SOURCE_OVER_ARGB32_SSE2(dst, src, w, nullVector, half, one, colorMask, alphaMask);
            dst = (quint32 *)(((uchar *) dst) + dbpl);
            src = (const quint32 *)(((const uchar *) src) + sbpl);
        }
    } else if (const_alpha != 0) {
        // dest = (s + d * sia) * ca + d * cia
        //      = s * ca + d * (sia * ca + cia)
        //      = s * ca + d * (1 - sa*ca)
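        // The blend entry points receive const_alpha in the 0..256 range
        // (256 = fully opaque, as the check above shows); rescale it to
        // 0..255 for the byte-wise multiplies used below.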
        const_alpha = (const_alpha * 255) >> 8;
        const __m128i nullVector = _mm_set1_epi32(0);
        const __m128i half = _mm_set1_epi16(0x80);
        const __m128i one = _mm_set1_epi16(0xff);
        const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
        const __m128i constAlphaVector = _mm_set1_epi16(const_alpha);
        for (int y = 0; y < h; ++y) {
            BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2(dst, src, w, nullVector, half, one, colorMask, constAlphaVector)
            dst = (quint32 *)(((uchar *) dst) + dbpl);
            src = (const quint32 *)(((const uchar *) src) + sbpl);
        }
    }
}
#endif

// defined in qblendfunctions.cpp
void qt_blend_rgb32_on_rgb32(uchar *destPixels, int dbpl,
                             const uchar *srcPixels, int sbpl,
                             int w, int h,
                             int const_alpha);

void qt_blend_rgb32_on_rgb32_sse2(uchar *destPixels, int dbpl,
                                  const uchar *srcPixels, int sbpl,
                                  int w, int h,
                                  int const_alpha)
{
    const quint32 *src = (const quint32 *) srcPixels;
    quint32 *dst = (quint32 *) destPixels;
    if (const_alpha != 256) {
        if (const_alpha != 0) {
            const __m128i half = _mm_set1_epi16(0x80);
            const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);

            const_alpha = (const_alpha * 255) >> 8;
            int one_minus_const_alpha = 255 - const_alpha;
            const __m128i constAlphaVector = _mm_set1_epi16(const_alpha);
            const __m128i oneMinusConstAlpha = _mm_set1_epi16(one_minus_const_alpha);
            for (int y = 0; y < h; ++y) {
                int x = 0;

                // First, align dest to 16 bytes:
                ALIGNMENT_PROLOGUE_16BYTES(dst, x, w) {
                    dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha);
                }

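                // Main loop: interpolate four pixels per iteration. The
                // source may be unaligned (unaligned load); the destination
                // is now 16-byte aligned (aligned load/store).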
                for (; x < w-3; x += 4) {
                    __m128i srcVector = _mm_loadu_si128((const __m128i *)&src[x]);
                    const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]);
                    __m128i result;
                    INTERPOLATE_PIXEL_255_SSE2(result, srcVector, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half);
                    _mm_store_si128((__m128i *)&dst[x], result);
                }
                SIMD_EPILOGUE(x, w, 3)
                    dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha);
                dst = (quint32 *)(((uchar *) dst) + dbpl);
                src = (const quint32 *)(((const uchar *) src) + sbpl);
            }
        }
    } else {
        qt_blend_rgb32_on_rgb32(destPixels, dbpl, srcPixels, sbpl, w, h, const_alpha);
    }
}

void QT_FASTCALL comp_func_SourceOver_sse2(uint *destPixels, const uint *srcPixels, int length, uint const_alpha)
{
    Q_ASSERT(const_alpha < 256);

    const quint32 *src = (const quint32 *) srcPixels;
    quint32 *dst = (quint32 *) destPixels;

    const __m128i nullVector = _mm_set1_epi32(0);
    const __m128i half = _mm_set1_epi16(0x80);
    const __m128i one = _mm_set1_epi16(0xff);
    const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
    if (const_alpha == 255) {
        const __m128i alphaMask = _mm_set1_epi32(0xff000000);
        BLEND_SOURCE_OVER_ARGB32_SSE2(dst, src, length, nullVector, half, one, colorMask, alphaMask);
    } else {
        const __m128i constAlphaVector = _mm_set1_epi16(const_alpha);
        BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2(dst, src, length, nullVector, half, one, colorMask, constAlphaVector);
    }
}

void QT_FASTCALL comp_func_Plus_sse2(uint *dst, const uint *src, int length, uint const_alpha)
{
    int x = 0;

    if (const_alpha == 255) {
        // 1) Prologue: align destination on 16 bytes
        ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
            dst[x] = comp_func_Plus_one_pixel(dst[x], src[x]);

        // 2) composition with SSE2
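        // Plus composition is a per-channel saturating add;
        // _mm_adds_epu8 performs it on all 16 bytes (four pixels) at once.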
        for (; x < length - 3; x += 4) {
            const __m128i srcVector = _mm_loadu_si128((const __m128i *)&src[x]);
            const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]);

            const __m128i result = _mm_adds_epu8(srcVector, dstVector);
            _mm_store_si128((__m128i *)&dst[x], result);
        }

        // 3) Epilogue:
        SIMD_EPILOGUE(x, length, 3)
            dst[x] = comp_func_Plus_one_pixel(dst[x], src[x]);
    } else {
        const int one_minus_const_alpha = 255 - const_alpha;
        const __m128i constAlphaVector = _mm_set1_epi16(const_alpha);
        const __m128i oneMinusConstAlpha = _mm_set1_epi16(one_minus_const_alpha);

        // 1) Prologue: align destination on 16 bytes
        ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
            dst[x] = comp_func_Plus_one_pixel_const_alpha(dst[x], src[x], const_alpha, one_minus_const_alpha);

        const __m128i half = _mm_set1_epi16(0x80);
        const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
        // 2) composition with SSE2
        for (; x < length - 3; x += 4) {
            const __m128i srcVector = _mm_loadu_si128((const __m128i *)&src[x]);
            const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]);

            __m128i result = _mm_adds_epu8(srcVector, dstVector);
            INTERPOLATE_PIXEL_255_SSE2(result, result, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half)
            _mm_store_si128((__m128i *)&dst[x], result);
        }

        // 3) Epilogue:
        SIMD_EPILOGUE(x, length, 3)
            dst[x] = comp_func_Plus_one_pixel_const_alpha(dst[x], src[x], const_alpha, one_minus_const_alpha);
    }
}

void QT_FASTCALL comp_func_Source_sse2(uint *dst, const uint *src, int length, uint const_alpha)
{
    if (const_alpha == 255) {
        ::memcpy(dst, src, length * sizeof(uint));
    } else {
        const int ialpha = 255 - const_alpha;

        int x = 0;

        // 1) prologue, align on 16 bytes
        ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
            dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha);

        // 2) interpolate pixels with SSE2
        const __m128i half = _mm_set1_epi16(0x80);
        const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
        const __m128i constAlphaVector = _mm_set1_epi16(const_alpha);
        const __m128i oneMinusConstAlpha = _mm_set1_epi16(ialpha);
        for (; x < length - 3; x += 4) {
            const __m128i srcVector = _mm_loadu_si128((const __m128i *)&src[x]);
            __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]);
            INTERPOLATE_PIXEL_255_SSE2(dstVector, srcVector, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half)
            _mm_store_si128((__m128i *)&dst[x], dstVector);
        }

        // 3) Epilogue
        SIMD_EPILOGUE(x, length, 3)
            dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha);
    }
}

#ifndef __haswell__
static Q_NEVER_INLINE
void Q_DECL_VECTORCALL qt_memfillXX_aligned(void *dest, __m128i value128, quintptr bytecount)
{
    __m128i *dst128 = reinterpret_cast<__m128i *>(dest);
    __m128i *end128 = reinterpret_cast<__m128i *>(static_cast<uchar *>(dest) + bytecount);

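    // Fill 64 bytes (four 16-byte stores) per iteration, then finish the
    // remaining 0..3 vectors with a fall-through switch.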
    while (dst128 + 4 <= end128) {
        _mm_store_si128(dst128 + 0, value128);
        _mm_store_si128(dst128 + 1, value128);
        _mm_store_si128(dst128 + 2, value128);
        _mm_store_si128(dst128 + 3, value128);
        dst128 += 4;
    }

    bytecount %= 4 * sizeof(__m128i);
    switch (bytecount / sizeof(__m128i)) {
    case 3: _mm_store_si128(dst128++, value128); Q_FALLTHROUGH();
    case 2: _mm_store_si128(dst128++, value128); Q_FALLTHROUGH();
    case 1: _mm_store_si128(dst128++, value128);
    }
}

void qt_memfill64_sse2(quint64 *dest, quint64 value, qsizetype count)
{
    quintptr misaligned = quintptr(dest) % sizeof(__m128i);
    if (misaligned && count) {
#if defined(Q_PROCESSOR_X86_32)
        // Before SSE came out, the alignment of the stack used to be only 4
        // bytes and some OS/ABIs (notably, code generated by MSVC) still only
        // align to that. In any case, we cannot count on the alignment of
        // quint64 to be 8 -- see QtPrivate::AlignOf_WorkaroundForI386Abi in
        // qglobal.h.
        //
        // If the pointer is not aligned to at least 8 bytes, then we'll never
        // in turn hit a multiple of 16 for the qt_memfillXX_aligned call
        // below.
        if (Q_UNLIKELY(misaligned % sizeof(quint64)))
            return qt_memfill_template(dest, value, count);
#endif

        *dest++ = value;
        --count;
    }

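    // Write one trailing element if count is odd, so that the remaining
    // fill is a whole number of 16-byte vectors.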
    if (count % 2) {
        dest[count - 1] = value;
        --count;
    }

    qt_memfillXX_aligned(dest, _mm_set1_epi64x(value), count * sizeof(quint64));
}

void qt_memfill32_sse2(quint32 *dest, quint32 value, qsizetype count)
{
    if (count < 4) {
        // this simplifies the code below: the first switch can fall through
        // without checking the value of count
        switch (count) {
        case 3: *dest++ = value; Q_FALLTHROUGH();
        case 2: *dest++ = value; Q_FALLTHROUGH();
        case 1: *dest = value;
        }
        return;
    }

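    // dest is assumed to be at least 4-byte aligned, so its misalignment
    // within a 16-byte block is 0, 4, 8 or 12; store just enough leading
    // values to reach 16-byte alignment.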
    const int align = (quintptr)(dest) & 0xf;
    switch (align) {
    case 4: *dest++ = value; --count; Q_FALLTHROUGH();
    case 8: *dest++ = value; --count; Q_FALLTHROUGH();
    case 12: *dest++ = value; --count;
    }

    const int rest = count & 0x3;
    if (rest) {
        switch (rest) {
        case 3: dest[count - 3] = value; Q_FALLTHROUGH();
        case 2: dest[count - 2] = value; Q_FALLTHROUGH();
        case 1: dest[count - 1] = value;
        }
    }

    qt_memfillXX_aligned(dest, _mm_set1_epi32(value), count * sizeof(quint32));
}
#endif // !__haswell__

void QT_FASTCALL comp_func_solid_Source_sse2(uint *destPixels, int length, uint color, uint const_alpha)
{
    if (const_alpha == 255) {
        qt_memfill32(destPixels, color, length);
    } else {
        const quint32 ialpha = 255 - const_alpha;
        color = BYTE_MUL(color, const_alpha);
        int x = 0;

        quint32 *dst = (quint32 *) destPixels;
        const __m128i colorVector = _mm_set1_epi32(color);
        const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
        const __m128i half = _mm_set1_epi16(0x80);
        const __m128i iAlphaVector = _mm_set1_epi16(ialpha);

        ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
            destPixels[x] = color + BYTE_MUL(destPixels[x], ialpha);

        for (; x < length-3; x += 4) {
            __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]);
            BYTE_MUL_SSE2(dstVector, dstVector, iAlphaVector, colorMask, half);
            dstVector = _mm_add_epi8(colorVector, dstVector);
            _mm_store_si128((__m128i *)&dst[x], dstVector);
        }
        SIMD_EPILOGUE(x, length, 3)
            destPixels[x] = color + BYTE_MUL(destPixels[x], ialpha);
    }
}

void QT_FASTCALL comp_func_solid_SourceOver_sse2(uint *destPixels, int length, uint color, uint const_alpha)
{
    if ((const_alpha & qAlpha(color)) == 255) {
        qt_memfill32(destPixels, color, length);
    } else {
        if (const_alpha != 255)
            color = BYTE_MUL(color, const_alpha);

        const quint32 minusAlphaOfColor = qAlpha(~color);
        int x = 0;

        quint32 *dst = (quint32 *) destPixels;
        const __m128i colorVector = _mm_set1_epi32(color);
        const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
        const __m128i half = _mm_set1_epi16(0x80);
        const __m128i minusAlphaOfColorVector = _mm_set1_epi16(minusAlphaOfColor);

        ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
            destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);

        for (; x < length-3; x += 4) {
            __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]);
            BYTE_MUL_SSE2(dstVector, dstVector, minusAlphaOfColorVector, colorMask, half);
            dstVector = _mm_add_epi8(colorVector, dstVector);
            _mm_store_si128((__m128i *)&dst[x], dstVector);
        }
        SIMD_EPILOGUE(x, length, 3)
            destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);
    }
}

void qt_bitmapblit32_sse2_base(QRasterBuffer *rasterBuffer, int x, int y,
                               quint32 color,
                               const uchar *src, int width, int height, int stride)
{
    quint32 *dest = reinterpret_cast<quint32*>(rasterBuffer->scanLine(y)) + x;
    const int destStride = rasterBuffer->stride<quint32>();

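    // _mm_maskmoveu_si128 stores only the bytes whose mask byte has its MSB
    // set. maskmask* isolates one bit of the source bitmap byte per
    // destination pixel; adding maskadd* carries that bit into bit 7 of
    // every byte of the pixel, turning set bitmap bits into full store masks.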
    const __m128i c128 = _mm_set1_epi32(color);
    const __m128i maskmask1 = _mm_set_epi32(0x10101010, 0x20202020,
                                            0x40404040, 0x80808080);
    const __m128i maskadd1 = _mm_set_epi32(0x70707070, 0x60606060,
                                           0x40404040, 0x00000000);

    if (width > 4) {
        const __m128i maskmask2 = _mm_set_epi32(0x01010101, 0x02020202,
                                                0x04040404, 0x08080808);
        const __m128i maskadd2 = _mm_set_epi32(0x7f7f7f7f, 0x7e7e7e7e,
                                               0x7c7c7c7c, 0x78787878);
        while (--height >= 0) {
            for (int x = 0; x < width; x += 8) {
                const quint8 s = src[x >> 3];
                if (!s)
                    continue;
                __m128i mask1 = _mm_set1_epi8(s);
                __m128i mask2 = mask1;

                mask1 = _mm_and_si128(mask1, maskmask1);
                mask1 = _mm_add_epi8(mask1, maskadd1);
                _mm_maskmoveu_si128(c128, mask1, (char*)(dest + x));
                mask2 = _mm_and_si128(mask2, maskmask2);
                mask2 = _mm_add_epi8(mask2, maskadd2);
                _mm_maskmoveu_si128(c128, mask2, (char*)(dest + x + 4));
            }
            dest += destStride;
            src += stride;
        }
    } else {
        while (--height >= 0) {
            const quint8 s = *src;
            if (s) {
                __m128i mask1 = _mm_set1_epi8(s);
                mask1 = _mm_and_si128(mask1, maskmask1);
                mask1 = _mm_add_epi8(mask1, maskadd1);
                _mm_maskmoveu_si128(c128, mask1, (char*)(dest));
            }
            dest += destStride;
            src += stride;
        }
    }
}

void qt_bitmapblit32_sse2(QRasterBuffer *rasterBuffer, int x, int y,
                          const QRgba64 &color,
                          const uchar *src, int width, int height, int stride)
{
    qt_bitmapblit32_sse2_base(rasterBuffer, x, y, color.toArgb32(), src, width, height, stride);
}

void qt_bitmapblit8888_sse2(QRasterBuffer *rasterBuffer, int x, int y,
                            const QRgba64 &color,
                            const uchar *src, int width, int height, int stride)
{
    qt_bitmapblit32_sse2_base(rasterBuffer, x, y, ARGB2RGBA(color.toArgb32()), src, width, height, stride);
}

void qt_bitmapblit16_sse2(QRasterBuffer *rasterBuffer, int x, int y,
                          const QRgba64 &color,
                          const uchar *src, int width, int height, int stride)
{
    const quint16 c = qConvertRgb32To16(color.toArgb32());
    quint16 *dest = reinterpret_cast<quint16*>(rasterBuffer->scanLine(y)) + x;
    const int destStride = rasterBuffer->stride<quint16>();

    const __m128i c128 = _mm_set1_epi16(c);
    QT_WARNING_DISABLE_MSVC(4309) // truncation of constant value
    const __m128i maskmask = _mm_set_epi16(0x0101, 0x0202, 0x0404, 0x0808,
                                           0x1010, 0x2020, 0x4040, 0x8080);
    const __m128i maskadd = _mm_set_epi16(0x7f7f, 0x7e7e, 0x7c7c, 0x7878,
                                          0x7070, 0x6060, 0x4040, 0x0000);

    while (--height >= 0) {
        for (int x = 0; x < width; x += 8) {
            const quint8 s = src[x >> 3];
            if (!s)
                continue;
            __m128i mask = _mm_set1_epi8(s);
            mask = _mm_and_si128(mask, maskmask);
            mask = _mm_add_epi8(mask, maskadd);
            _mm_maskmoveu_si128(c128, mask, (char*)(dest + x));
        }
        dest += destStride;
        src += stride;
    }
}

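// Thin wrapper exposing SSE2 intrinsics under the generic vector interface
// expected by QRadialFetchSimd (see qt_fetch_radial_gradient_sse2 below).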
class QSimdSse2
{
public:
    typedef __m128i Int32x4;
    typedef __m128 Float32x4;

    union Vect_buffer_i { Int32x4 v; int i[4]; };
    union Vect_buffer_f { Float32x4 v; float f[4]; };

    static inline Float32x4 Q_DECL_VECTORCALL v_dup(float x) { return _mm_set1_ps(x); }
    static inline Float32x4 Q_DECL_VECTORCALL v_dup(double x) { return _mm_set1_ps(x); }
    static inline Int32x4 Q_DECL_VECTORCALL v_dup(int x) { return _mm_set1_epi32(x); }
    static inline Int32x4 Q_DECL_VECTORCALL v_dup(uint x) { return _mm_set1_epi32(x); }

    static inline Float32x4 Q_DECL_VECTORCALL v_add(Float32x4 a, Float32x4 b) { return _mm_add_ps(a, b); }
    static inline Int32x4 Q_DECL_VECTORCALL v_add(Int32x4 a, Int32x4 b) { return _mm_add_epi32(a, b); }

    static inline Float32x4 Q_DECL_VECTORCALL v_max(Float32x4 a, Float32x4 b) { return _mm_max_ps(a, b); }
    static inline Float32x4 Q_DECL_VECTORCALL v_min(Float32x4 a, Float32x4 b) { return _mm_min_ps(a, b); }
    static inline Int32x4 Q_DECL_VECTORCALL v_min_16(Int32x4 a, Int32x4 b) { return _mm_min_epi16(a, b); }

    static inline Int32x4 Q_DECL_VECTORCALL v_and(Int32x4 a, Int32x4 b) { return _mm_and_si128(a, b); }

    static inline Float32x4 Q_DECL_VECTORCALL v_sub(Float32x4 a, Float32x4 b) { return _mm_sub_ps(a, b); }
    static inline Int32x4 Q_DECL_VECTORCALL v_sub(Int32x4 a, Int32x4 b) { return _mm_sub_epi32(a, b); }

    static inline Float32x4 Q_DECL_VECTORCALL v_mul(Float32x4 a, Float32x4 b) { return _mm_mul_ps(a, b); }

    static inline Float32x4 Q_DECL_VECTORCALL v_sqrt(Float32x4 x) { return _mm_sqrt_ps(x); }

    static inline Int32x4 Q_DECL_VECTORCALL v_toInt(Float32x4 x) { return _mm_cvttps_epi32(x); }

    static inline Int32x4 Q_DECL_VECTORCALL v_greaterOrEqual(Float32x4 a, Float32x4 b) { return _mm_castps_si128(_mm_cmpgt_ps(a, b)); }
};

const uint * QT_FASTCALL qt_fetch_radial_gradient_sse2(uint *buffer, const Operator *op, const QSpanData *data,
                                                       int y, int x, int length)
{
    return qt_fetch_radial_gradient_template<QRadialFetchSimd<QSimdSse2>,uint>(buffer, op, data, y, x, length);
}

void qt_scale_image_argb32_on_argb32_sse2(uchar *destPixels, int dbpl,
                                          const uchar *srcPixels, int sbpl, int srch,
                                          const QRectF &targetRect,
                                          const QRectF &sourceRect,
                                          const QRect &clip,
                                          int const_alpha)
{
    if (const_alpha != 256) {
        // from qblendfunctions.cpp
        extern void qt_scale_image_argb32_on_argb32(uchar *destPixels, int dbpl,
                                                    const uchar *srcPixels, int sbpl, int srch,
                                                    const QRectF &targetRect,
                                                    const QRectF &sourceRect,
                                                    const QRect &clip,
                                                    int const_alpha);
        return qt_scale_image_argb32_on_argb32(destPixels, dbpl, srcPixels, sbpl, srch, targetRect, sourceRect, clip, const_alpha);
    }

    qreal sx = sourceRect.width() / (qreal)targetRect.width();
    qreal sy = sourceRect.height() / (qreal)targetRect.height();

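    // Walk the source image in 16.16 fixed point: ix/iy are the per-pixel
    // steps, basex/srcy the starting coordinates.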
    const int ix = 0x00010000 * sx;
    const int iy = 0x00010000 * sy;

    QRect tr = targetRect.normalized().toRect();
    tr = tr.intersected(clip);
    if (tr.isEmpty())
        return;
    const int tx1 = tr.left();
    const int ty1 = tr.top();
    int h = tr.height();
    int w = tr.width();

    quint32 basex;
    quint32 srcy;

    if (sx < 0) {
        int dstx = qFloor((tx1 + qreal(0.5) - targetRect.right()) * sx * 65536) + 1;
        basex = quint32(sourceRect.right() * 65536) + dstx;
    } else {
        int dstx = qCeil((tx1 + qreal(0.5) - targetRect.left()) * sx * 65536) - 1;
        basex = quint32(sourceRect.left() * 65536) + dstx;
    }
    if (sy < 0) {
        int dsty = qFloor((ty1 + qreal(0.5) - targetRect.bottom()) * sy * 65536) + 1;
        srcy = quint32(sourceRect.bottom() * 65536) + dsty;
    } else {
        int dsty = qCeil((ty1 + qreal(0.5) - targetRect.top()) * sy * 65536) - 1;
        srcy = quint32(sourceRect.top() * 65536) + dsty;
    }

    quint32 *dst = ((quint32 *) (destPixels + ty1 * dbpl)) + tx1;

    const __m128i nullVector = _mm_setzero_si128();
    const __m128i half = _mm_set1_epi16(0x80);
    const __m128i one = _mm_set1_epi16(0xff);
    const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
    const __m128i alphaMask = _mm_set1_epi32(0xff000000);
    const __m128i ixVector = _mm_set1_epi32(4*ix);

    // This bounds check is required, as the floating-point rounding above can
    // in some cases yield w/h values that are one pixel too large, falling
    // outside the valid image area.
    const int ystart = srcy >> 16;
    if (ystart >= srch && iy < 0) {
        srcy += iy;
        --h;
    }
    const int xstart = basex >> 16;
    if (xstart >= (int)(sbpl/sizeof(quint32)) && ix < 0) {
        basex += ix;
        --w;
    }
    int yend = (srcy + iy * (h - 1)) >> 16;
    if (yend < 0 || yend >= srch)
        --h;
    int xend = (basex + ix * (w - 1)) >> 16;
    if (xend < 0 || xend >= (int)(sbpl/sizeof(quint32)))
        --w;

    while (--h >= 0) {
        const uint *src = (const quint32 *) (srcPixels + (srcy >> 16) * sbpl);
        int srcx = basex;
        int x = 0;

        ALIGNMENT_PROLOGUE_16BYTES(dst, x, w) {
            uint s = src[srcx >> 16];
            dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
            srcx += ix;
        }

        __m128i srcxVector = _mm_set_epi32(srcx, srcx + ix, srcx + ix + ix, srcx + ix + ix + ix);

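        // The next four 16.16 source x coordinates are kept in one vector;
        // _mm_extract_epi16 with an odd index pulls out the high word of a
        // 32-bit lane, i.e. the integer source pixel index (srcx >> 16).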
        for (; x < (w - 3); x += 4) {
            const int idx0 = _mm_extract_epi16(srcxVector, 1);
            const int idx1 = _mm_extract_epi16(srcxVector, 3);
            const int idx2 = _mm_extract_epi16(srcxVector, 5);
            const int idx3 = _mm_extract_epi16(srcxVector, 7);
            srcxVector = _mm_add_epi32(srcxVector, ixVector);

            const __m128i srcVector = _mm_set_epi32(src[idx0], src[idx1], src[idx2], src[idx3]);
            BLEND_SOURCE_OVER_ARGB32_SSE2_helper(dst, srcVector, nullVector, half, one, colorMask, alphaMask);
        }

        SIMD_EPILOGUE(x, w, 3) {
            uint s = src[(basex + x*ix) >> 16];
            dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
        }
        dst = (quint32 *)(((uchar *) dst) + dbpl);
        srcy += iy;
    }
}


QT_END_NAMESPACE

#endif // QT_COMPILER_SUPPORTS_SSE2