1 | // Copyright (C) 2021 The Qt Company Ltd. |
2 | // Copyright (C) 2022 Intel Corporation. |
3 | // SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only |
4 | |
5 | #ifndef QSIMD_P_H |
6 | #define QSIMD_P_H |
7 | |
8 | // |
9 | // W A R N I N G |
10 | // ------------- |
11 | // |
12 | // This file is not part of the Qt API. It exists purely as an |
13 | // implementation detail. This header file may change from version to |
14 | // version without notice, or even be removed. |
15 | // |
16 | // We mean it. |
17 | // |
18 | |
19 | #include <QtCore/private/qglobal_p.h> |
20 | #include <QtCore/qsimd.h> |
21 | |
22 | QT_WARNING_PUSH |
23 | QT_WARNING_DISABLE_CLANG("-Wundef" ) |
24 | QT_WARNING_DISABLE_GCC("-Wundef" ) |
25 | QT_WARNING_DISABLE_INTEL(103) |
26 | |
27 | #define ALIGNMENT_PROLOGUE_16BYTES(ptr, i, length) \ |
28 | for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), ((4 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x3)) & 0x3))); ++i) |
29 | |
30 | #define ALIGNMENT_PROLOGUE_32BYTES(ptr, i, length) \ |
31 | for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), ((8 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x7)) & 0x7))); ++i) |
32 | |
33 | #define SIMD_EPILOGUE(i, length, max) \ |
34 | for (int _i = 0; _i < max && i < length; ++i, ++_i) |
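
// A minimal usage sketch (hypothetical; `src` and `dst` are assumed to point
// to 4-byte elements such as QRgb): process the unaligned head scalar-wise,
// vectorize the aligned middle, then mop up the remaining tail:
//
//     int i = 0;
//     ALIGNMENT_PROLOGUE_16BYTES(src, i, length)
//         dst[i] = src[i];
//     for (; i + 4 <= length; i += 4)
//         /* 16-byte SIMD load/store on src + i, dst + i */;
//     SIMD_EPILOGUE(i, length, 3)
//         dst[i] = src[i];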
35 | |
36 | /* |
37 | * Code can use the following constructs to determine compiler support & status: |
 * - #ifdef __XXX__ (e.g., #ifdef __AVX__ or #ifdef __ARM_NEON__)
39 | * If this test passes, then the compiler is already generating code for that |
40 | * given sub-architecture. The intrinsics for that sub-architecture are |
41 | * #included and can be used without restriction or runtime check. |
42 | * |
43 | * - #if QT_COMPILER_SUPPORTS(XXX) |
44 | * If this test passes, then the compiler is able to generate code for that |
45 | * given sub-architecture in another translation unit, given the right set of |
46 | * flags. Use of the intrinsics is not guaranteed. This is useful with |
47 | * runtime detection (see below). |
48 | * |
49 | * - #if QT_COMPILER_SUPPORTS_HERE(XXX) |
50 | * If this test passes, then the compiler is able to generate code for that |
51 | * given sub-architecture in this translation unit, even if it is not doing |
52 | * that now (it might be). Individual functions may be tagged with |
53 | * QT_FUNCTION_TARGET(XXX) to cause the compiler to generate code for that |
 * sub-arch. Only inside such functions is the use of the intrinsics
55 | * guaranteed to work. This is useful with runtime detection (see below). |
56 | * |
57 | * The distinction between QT_COMPILER_SUPPORTS and QT_COMPILER_SUPPORTS_HERE is |
58 | * historical: GCC 4.8 needed the distinction. |
59 | * |
60 | * Runtime detection of a CPU sub-architecture can be done with the |
61 | * qCpuHasFeature(XXX) function. There are two strategies for generating |
62 | * optimized code like that: |
63 | * |
64 | * 1) place the optimized code in a different translation unit (C or assembly |
65 | * sources) and pass the correct flags to the compiler to enable support. Those |
66 | * sources must not include qglobal.h, which means they cannot include this |
67 | * file either. The dispatcher function would look like this: |
68 | * |
69 | * void foo() |
70 | * { |
71 | * #if QT_COMPILER_SUPPORTS(XXX) |
72 | * if (qCpuHasFeature(XXX)) { |
73 | * foo_optimized_xxx(); |
74 | * return; |
75 | * } |
76 | * #endif |
77 | * foo_plain(); |
78 | * } |
79 | * |
80 | * 2) place the optimized code in a function tagged with QT_FUNCTION_TARGET and |
81 | * surrounded by #if QT_COMPILER_SUPPORTS_HERE(XXX). That code can freely use |
82 | * other Qt code. The dispatcher function would look like this: |
83 | * |
84 | * void foo() |
85 | * { |
86 | * #if QT_COMPILER_SUPPORTS_HERE(XXX) |
87 | * if (qCpuHasFeature(XXX)) { |
88 | * foo_optimized_xxx(); |
89 | * return; |
90 | * } |
91 | * #endif |
92 | * foo_plain(); |
93 | * } |
94 | */ |
95 | |
96 | #if defined(__MINGW64_VERSION_MAJOR) || defined(Q_CC_MSVC) |
97 | #include <intrin.h> |
98 | #endif |
99 | |
100 | #define QT_COMPILER_SUPPORTS(x) (QT_COMPILER_SUPPORTS_ ## x - 0) |
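// Note: the "- 0" makes this test usable in #if both when QT_COMPILER_SUPPORTS_x
// is entirely undefined (the preprocessor evaluates the unknown identifier as 0)
// and when it is defined to an empty expansion (leaving "- 0", which is still 0).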
101 | |
102 | #if defined(Q_PROCESSOR_ARM) |
103 | # define QT_COMPILER_SUPPORTS_HERE(x) ((__ARM_FEATURE_ ## x) || (__ ## x ## __) || QT_COMPILER_SUPPORTS(x)) |
104 | # if defined(Q_CC_GNU) |
105 | /* GCC requires attributes for a function */ |
106 | # define QT_FUNCTION_TARGET(x) __attribute__((__target__(QT_FUNCTION_TARGET_STRING_ ## x))) |
107 | # else |
108 | # define QT_FUNCTION_TARGET(x) |
109 | # endif |
110 | #elif defined(Q_PROCESSOR_MIPS) |
111 | # define QT_COMPILER_SUPPORTS_HERE(x) (__ ## x ## __) |
112 | # define QT_FUNCTION_TARGET(x) |
113 | # if !defined(__MIPS_DSP__) && defined(__mips_dsp) && defined(Q_PROCESSOR_MIPS_32) |
114 | # define __MIPS_DSP__ |
115 | # endif |
116 | # if !defined(__MIPS_DSPR2__) && defined(__mips_dspr2) && defined(Q_PROCESSOR_MIPS_32) |
117 | # define __MIPS_DSPR2__ |
118 | # endif |
119 | #elif defined(Q_PROCESSOR_X86) |
120 | # if defined(Q_CC_CLANG) && defined(Q_CC_MSVC) |
121 | # define QT_COMPILER_SUPPORTS_HERE(x) (__ ## x ## __) |
122 | # else |
123 | # define QT_COMPILER_SUPPORTS_HERE(x) ((__ ## x ## __) || QT_COMPILER_SUPPORTS(x)) |
124 | # endif |
125 | # if defined(Q_CC_GNU) |
126 | /* GCC requires attributes for a function */ |
127 | # define QT_FUNCTION_TARGET(x) __attribute__((__target__(QT_FUNCTION_TARGET_STRING_ ## x))) |
128 | # else |
129 | # define QT_FUNCTION_TARGET(x) |
130 | # endif |
131 | #else |
132 | # define QT_COMPILER_SUPPORTS_HERE(x) (__ ## x ## __) |
133 | # define QT_FUNCTION_TARGET(x) |
134 | #endif |
135 | |
136 | #if defined(__SSE2__) && !defined(QT_COMPILER_SUPPORTS_SSE2) && !defined(QT_BOOTSTRAPPED) |
// The compiler is generating SSE2 code, but the configure-time check for
// intrinsic support failed (e.g., in a cross-build), so pretend these
// features don't exist
138 | # undef __SSE__ |
139 | # undef __SSE2__ |
140 | # undef __SSE3__ |
141 | # undef __SSSE3__ |
142 | # undef __SSE4_1__ |
143 | # undef __SSE4_2__ |
144 | # undef __AES__ |
145 | # undef __POPCNT__ |
146 | # undef __AVX__ |
147 | # undef __F16C__ |
148 | # undef __RDRND__ |
149 | # undef __AVX2__ |
150 | # undef __BMI__ |
151 | # undef __BMI2__ |
152 | # undef __FMA__ |
153 | # undef __MOVBE__ |
154 | # undef __RDSEED__ |
155 | # undef __AVX512F__ |
156 | # undef __AVX512ER__ |
157 | # undef __AVX512CD__ |
158 | # undef __AVX512PF__ |
159 | # undef __AVX512DQ__ |
160 | # undef __AVX512BW__ |
161 | # undef __AVX512VL__ |
162 | # undef __AVX512IFMA__ |
163 | # undef __AVX512VBMI__ |
164 | # undef __SHA__ |
165 | # undef __AVX512VBMI2__ |
166 | # undef __AVX512BITALG__ |
167 | # undef __AVX512VNNI__ |
168 | # undef __AVX512VPOPCNTDQ__ |
169 | # undef __GFNI__ |
170 | # undef __VAES__ |
171 | #endif |
172 | |
173 | #ifdef Q_PROCESSOR_X86 |
174 | /* -- x86 intrinsic support -- */ |
175 | |
176 | # if defined(QT_COMPILER_SUPPORTS_RDSEED) && defined(Q_OS_QNX) |
177 | // The compiler for QNX is missing the intrinsic |
178 | # undef QT_COMPILER_SUPPORTS_RDSEED |
179 | # endif |
# if defined(Q_CC_MSVC) && (defined(_M_X64) || _M_IX86_FP >= 2)
// MSVC doesn't define __SSE__ or __SSE2__, so do it ourselves
# define __SSE__ 1
# define __SSE2__ 1
# endif
184 | |
185 | # if defined(Q_OS_WIN) && defined(Q_CC_GNU) && !defined(Q_CC_CLANG) |
186 | // 64-bit GCC on Windows does not support AVX, so we hack around it by forcing |
187 | // it to emit unaligned loads & stores |
188 | // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=49001 |
189 | asm( |
190 | ".macro vmovapd args:vararg\n" |
191 | " vmovupd \\args\n" |
192 | ".endm\n" |
193 | ".macro vmovaps args:vararg\n" |
194 | " vmovups \\args\n" |
195 | ".endm\n" |
196 | ".macro vmovdqa args:vararg\n" |
197 | " vmovdqu \\args\n" |
198 | ".endm\n" |
199 | ".macro vmovdqa32 args:vararg\n" |
200 | " vmovdqu32 \\args\n" |
201 | ".endm\n" |
202 | ".macro vmovdqa64 args:vararg\n" |
203 | " vmovdqu64 \\args\n" |
204 | ".endm\n" |
205 | ); |
206 | # endif |
207 | |
208 | # if defined(Q_CC_GNU) && !defined(Q_OS_WASM) |
// GCC 4.4 and Clang 2.8 added a few more intrinsics in <x86intrin.h>
210 | # include <x86intrin.h> |
211 | # endif |
# ifdef Q_OS_WASM
213 | # include <immintrin.h> |
214 | # endif |
215 | |
216 | # include <QtCore/private/qsimd_x86_p.h> |
217 | |
218 | // x86-64 sub-architecture version 3 |
219 | // |
220 | // The Intel Core 4th generation was codenamed "Haswell" and introduced AVX2, |
221 | // BMI1, BMI2, FMA, LZCNT, MOVBE, which makes it a good divider for a |
222 | // sub-target for us. The first AMD processor with AVX2 support (Zen) has the |
223 | // same features, but had already introduced BMI1 in the previous generation. |
224 | // This feature set was chosen as the version 3 of the x86-64 ISA (x86-64-v3) |
225 | // and is supported by GCC and Clang. |
226 | // |
// macOS's fat binaries support the "x86_64h" sub-architecture, and the GNU libc
228 | // ELF loader also supports a "haswell/" subdir (e.g., /usr/lib/haswell). |
229 | # define ARCH_HASWELL_MACROS (__AVX2__ + __FMA__) |
230 | # if ARCH_HASWELL_MACROS != 0 |
231 | # if ARCH_HASWELL_MACROS != 2 |
232 | # error "Please enable all x86-64-v3 extensions; you probably want to use -march=haswell or -march=x86-64-v3 instead of -mavx2" |
233 | # endif |
static_assert(ARCH_HASWELL_MACROS, "Undeclared identifiers indicate which features are missing.");
235 | # define __haswell__ 1 |
236 | # endif |
237 | # undef ARCH_HASWELL_MACROS |
238 | |
239 | // x86-64 sub-architecture version 4 |
240 | // |
241 | // Similar to the above, x86-64-v4 matches the AVX512 variant of the Intel Core |
242 | // 6th generation (codename "Skylake"). AMD Zen4 is the their first processor |
243 | // with AVX512 support and it includes all of these too. |
244 | // |
245 | # define ARCH_SKX_MACROS (__AVX512F__ + __AVX512BW__ + __AVX512CD__ + __AVX512DQ__ + __AVX512VL__) |
246 | # if ARCH_SKX_MACROS != 0 |
247 | # if ARCH_SKX_MACROS != 5 |
248 | # error "Please enable all x86-64-v4 extensions; you probably want to use -march=skylake-avx512 or -march=x86-64-v4 instead of -mavx512f" |
249 | # endif |
static_assert(ARCH_SKX_MACROS, "Undeclared identifiers indicate which features are missing.");
251 | # define __skylake_avx512__ 1 |
252 | # endif |
253 | # undef ARCH_SKX_MACROS |
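
// Combined with the CpuFeatureArchHaswell / CpuFeatureArchSkylakeAvx512 bits
// declared further down, a dispatcher can (hypothetically) select an entire
// sub-architecture at once instead of testing individual feature bits:
//
//     if (qCpuHasFeature(ArchHaswell))
//         return algo_haswell();  // placeholder for a TU built with -march=haswell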
254 | #endif /* Q_PROCESSOR_X86 */ |
255 | |
256 | // NEON intrinsics |
// note: as of GCC 4.9, function targets are not supported for ARM
258 | #if defined(__ARM_NEON) || defined(__ARM_NEON__) |
259 | #if defined(Q_CC_CLANG) |
260 | #define QT_FUNCTION_TARGET_STRING_NEON "neon" |
261 | #else |
262 | #define QT_FUNCTION_TARGET_STRING_NEON "+neon" // unused: gcc doesn't support function targets on non-aarch64, and on Aarch64 NEON is always available. |
263 | #endif |
264 | #ifndef __ARM_NEON__ |
265 | // __ARM_NEON__ is not defined on AArch64, but we need it in our NEON detection. |
266 | #define __ARM_NEON__ |
267 | #endif |
268 | |
#ifndef Q_PROCESSOR_ARM_64 // vaddv is only available on AArch64
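// Horizontal add of all eight u16 lanes, emulated with pairwise widening adds
// (vpaddl*) because 32-bit ARM lacks the AArch64 vaddv instructions.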
270 | inline uint16_t vaddvq_u16(uint16x8_t v8) |
271 | { |
272 | const uint64x2_t v2 = vpaddlq_u32(vpaddlq_u16(v8)); |
273 | const uint64x1_t v1 = vadd_u64(vget_low_u64(v2), vget_high_u64(v2)); |
274 | return vget_lane_u16(vreinterpret_u16_u64(v1), 0); |
275 | } |
276 | |
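// Likewise for eight u8 lanes: widen pairwise u8 -> u16 -> u32 -> u64 and
// extract the single accumulated lane.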
277 | inline uint8_t vaddv_u8(uint8x8_t v8) |
278 | { |
279 | const uint64x1_t v1 = vpaddl_u32(vpaddl_u16(vpaddl_u8(v8))); |
280 | return vget_lane_u8(vreinterpret_u8_u64(v1), 0); |
281 | } |
282 | #endif |
283 | |
284 | #endif |
285 | |
286 | #if defined(Q_PROCESSOR_ARM) && defined(__ARM_FEATURE_CRC32) |
287 | # include <arm_acle.h> |
288 | #endif |
289 | |
290 | #if defined(Q_PROCESSOR_ARM_64) |
291 | #if defined(Q_CC_CLANG) |
292 | #define QT_FUNCTION_TARGET_STRING_AES "crypto" |
293 | #define QT_FUNCTION_TARGET_STRING_CRC32 "crc" |
294 | #elif defined(Q_CC_GNU) |
295 | #define QT_FUNCTION_TARGET_STRING_AES "+crypto" |
296 | #define QT_FUNCTION_TARGET_STRING_CRC32 "+crc" |
297 | #endif |
298 | #elif defined(Q_PROCESSOR_ARM_32) |
299 | #if defined(Q_CC_CLANG) |
300 | #define QT_FUNCTION_TARGET_STRING_AES "armv8-a,crypto" |
301 | #define QT_FUNCTION_TARGET_STRING_CRC32 "armv8-a,crc" |
302 | #elif defined(Q_CC_GNU) |
303 | #define QT_FUNCTION_TARGET_STRING_AES "arch=armv8-a+crypto" |
304 | #define QT_FUNCTION_TARGET_STRING_CRC32 "arch=armv8-a+crc" |
305 | #endif |
306 | #endif |
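
// A CRC32 helper using these target strings could (as a sketch; crc32Step is
// a placeholder name) be declared as:
//
//     #if QT_COMPILER_SUPPORTS_HERE(CRC32)
//     QT_FUNCTION_TARGET(CRC32)
//     static uint32_t crc32Step(uint32_t crc, uint32_t data)
//     { return __crc32w(crc, data); }   // __crc32w is from <arm_acle.h>
//     #endif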
307 | |
308 | #ifndef Q_PROCESSOR_X86 |
309 | enum CPUFeatures { |
310 | #if defined(Q_PROCESSOR_ARM) |
311 | CpuFeatureNEON = 2, |
312 | CpuFeatureARM_NEON = CpuFeatureNEON, |
313 | CpuFeatureCRC32 = 4, |
314 | CpuFeatureAES = 8, |
315 | CpuFeatureARM_CRYPTO = CpuFeatureAES, |
316 | #elif defined(Q_PROCESSOR_MIPS) |
317 | CpuFeatureDSP = 2, |
318 | CpuFeatureDSPR2 = 4, |
319 | #endif |
320 | }; |
321 | |
322 | static const uint64_t qCompilerCpuFeatures = 0 |
323 | #if defined __ARM_NEON__ |
324 | | CpuFeatureNEON |
325 | #endif |
326 | #if defined __ARM_FEATURE_CRC32 |
327 | | CpuFeatureCRC32 |
328 | #endif |
329 | #if defined __ARM_FEATURE_CRYPTO |
330 | | CpuFeatureAES |
331 | #endif |
332 | #if defined __mips_dsp |
333 | | CpuFeatureDSP |
334 | #endif |
335 | #if defined __mips_dspr2 |
336 | | CpuFeatureDSPR2 |
337 | #endif |
338 | ; |
339 | #endif |
340 | |
341 | #ifdef __cplusplus |
342 | # include <atomic> |
343 | # define Q_ATOMIC(T) std::atomic<T> |
344 | QT_BEGIN_NAMESPACE |
345 | using std::atomic_load_explicit; |
346 | static constexpr auto memory_order_relaxed = std::memory_order_relaxed; |
347 | extern "C" { |
348 | #else |
349 | # include <stdatomic.h> |
350 | # define Q_ATOMIC(T) _Atomic(T) |
351 | #endif |
352 | |
353 | #ifdef Q_PROCESSOR_X86 |
354 | typedef uint64_t QCpuFeatureType; |
355 | static const QCpuFeatureType qCompilerCpuFeatures = _compilerCpuFeatures; |
356 | static const QCpuFeatureType CpuFeatureArchHaswell = cpu_haswell; |
357 | static const QCpuFeatureType CpuFeatureArchSkylakeAvx512 = cpu_skylake_avx512; |
358 | #else |
359 | typedef unsigned QCpuFeatureType; |
360 | #endif |
361 | extern Q_CORE_EXPORT Q_ATOMIC(QCpuFeatureType) QT_MANGLE_NAMESPACE(qt_cpu_features)[1]; |
362 | Q_CORE_EXPORT uint64_t QT_MANGLE_NAMESPACE(qDetectCpuFeatures)(); |
363 | |
364 | static inline uint64_t qCpuFeatures() |
365 | { |
366 | #ifdef QT_BOOTSTRAPPED |
367 | return qCompilerCpuFeatures; // no detection |
368 | #else |
    quint64 features = atomic_load_explicit(QT_MANGLE_NAMESPACE(qt_cpu_features), memory_order_relaxed);
370 | if (!QT_SUPPORTS_INIT_PRIORITY) { |
371 | if (Q_UNLIKELY(features == 0)) |
372 | features = QT_MANGLE_NAMESPACE(qDetectCpuFeatures)(); |
373 | } |
374 | return features; |
375 | #endif |
376 | } |
377 | |
378 | #define qCpuHasFeature(feature) (((qCompilerCpuFeatures & CpuFeature ## feature) == CpuFeature ## feature) \ |
379 | || ((qCpuFeatures() & CpuFeature ## feature) == CpuFeature ## feature)) |
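
// The first operand is a compile-time constant, so when the feature is part of
// the build's target the check folds to true and the branch disappears;
// otherwise the runtime-detected mask is consulted. Hypothetical caller:
//
//     if (qCpuHasFeature(AES))
//         return hashBlocksAccelerated(data, len);   // placeholder name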
380 | |
381 | /* |
382 | Small wrapper around x86's PAUSE and ARM's YIELD instructions. |
383 | |
384 | This is completely different from QThread::yieldCurrentThread(), which is |
385 | an OS-level operation that takes the whole thread off the CPU. |
386 | |
387 | This is just preventing one SMT thread from filling a core's pipeline with |
388 | speculated further loop iterations (which need to be expensively flushed on |
389 | final success) when it could just give those pipeline slots to a second SMT |
390 | thread that can do something useful with the core, such as unblocking this |
391 | SMT thread :) |
392 | |
393 | So, instead of |
394 | |
395 | while (!condition) |
396 | ; |
397 | |
398 | it's better to use |
399 | |
400 | while (!condition) |
401 | qYieldCpu(); |
402 | */ |
403 | static inline void qYieldCpu() |
404 | { |
405 | #if defined(Q_PROCESSOR_X86) |
406 | _mm_pause(); |
407 | #elif defined(Q_PROCESSOR_ARM) && Q_PROCESSOR_ARM >= 7 /* yield was added in ARMv7 */ |
408 | # if __has_builtin(__builtin_arm_yield) /* e.g. Clang */ |
409 | __builtin_arm_yield(); |
410 | # elif defined(Q_OS_INTEGRITY) || \ |
411 | (defined(Q_CC_GNU) && !defined(Q_CC_CLANG)) |
412 | /* |
413 | - Integrity is missing the arm_acle.h header |
414 | - GCC doesn't have __yield() in arm_acle.h |
415 | https://stackoverflow.com/a/70076751/134841 |
416 | https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105416 |
417 | */ |
418 | asm volatile("yield" ); /* this works everywhere */ |
419 | # else |
420 | __yield(); /* this is what should work everywhere */ |
421 | # endif |
422 | #endif |
423 | } |
424 | |
425 | #ifdef __cplusplus |
426 | } // extern "C" |
427 | |
428 | # if defined(Q_PROCESSOR_X86) && QT_COMPILER_SUPPORTS_HERE(RDRND) && !defined(QT_BOOTSTRAPPED) |
429 | Q_CORE_EXPORT qsizetype qRandomCpu(void *, qsizetype) noexcept; |
430 | |
431 | static inline bool qHasHwrng() |
432 | { |
433 | return qCpuHasFeature(RDRND); |
434 | } |
435 | # else |
436 | static inline qsizetype qRandomCpu(void *, qsizetype) noexcept |
437 | { |
438 | return 0; |
439 | } |
440 | static inline bool qHasHwrng() |
441 | { |
442 | return false; |
443 | } |
444 | # endif |
445 | |
446 | QT_END_NAMESPACE |
447 | |
448 | #endif // __cplusplus |
449 | |
450 | QT_WARNING_POP |
451 | |
452 | #endif // QSIMD_P_H |
453 | |