1// Copyright (C) 2021 The Qt Company Ltd.
2// Copyright (C) 2022 Intel Corporation.
3// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
4
5#ifndef QSIMD_P_H
6#define QSIMD_P_H
7
8//
9// W A R N I N G
10// -------------
11//
12// This file is not part of the Qt API. It exists purely as an
13// implementation detail. This header file may change from version to
14// version without notice, or even be removed.
15//
16// We mean it.
17//
18
19#include <QtCore/private/qglobal_p.h>
20#include <QtCore/qsimd.h>
21
22QT_WARNING_PUSH
23QT_WARNING_DISABLE_CLANG("-Wundef")
24QT_WARNING_DISABLE_GCC("-Wundef")
25QT_WARNING_DISABLE_INTEL(103)
26
// Scalar prologue for a loop that wants to switch to 16-byte-aligned SIMD
// accesses. Expands to a `for` header (the caller supplies the body) that
// advances `i` while it is below the smaller of:
//   - `length`, and
//   - the number of 4-byte steps from `ptr` up to the next 16-byte boundary
//     (0 when `ptr` is already 16-byte aligned).
// The `>> 2` counts the address in 4-byte units, so this presumably expects
// `ptr` to address 4-byte elements and `i` to start at 0 -- confirm at the
// call sites. `i` must be a modifiable integer lvalue.
#define ALIGNMENT_PROLOGUE_16BYTES(ptr, i, length) \
    for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), ((4 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x3)) & 0x3))); ++i)
29
// Scalar prologue for a loop that wants to switch to 32-byte-aligned SIMD
// accesses. Expands to a `for` header (the caller supplies the body) that
// advances `i` while it is below the smaller of:
//   - `length`, and
//   - the number of 4-byte steps from `ptr` up to the next 32-byte boundary
//     (0 when `ptr` is already 32-byte aligned).
// The `>> 2` counts the address in 4-byte units, so this presumably expects
// `ptr` to address 4-byte elements and `i` to start at 0 -- confirm at the
// call sites. `i` must be a modifiable integer lvalue.
#define ALIGNMENT_PROLOGUE_32BYTES(ptr, i, length) \
    for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), ((8 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x7)) & 0x7))); ++i)
32
// Scalar tail loop, typically placed after a SIMD main loop. Expands to a
// `for` header (the caller supplies the body) that keeps incrementing `i`
// until either `max` iterations have run or `i` has reached `length`.
//
// All arguments are parenthesized so that expressions such as `a | b` or
// `cond ? x : y` can be passed safely. `i` must be a modifiable lvalue; the
// internal counter is named `_i`, so do not pass a variable of that name.
#define SIMD_EPILOGUE(i, length, max) \
    for (int _i = 0; _i < (max) && (i) < (length); ++(i), ++_i)
35
/*
 * Code can use the following constructs to determine compiler support & status:
 * - #ifdef __XXX__      (e.g: #ifdef __AVX__  or #ifdef __ARM_NEON__)
 *   If this test passes, then the compiler is already generating code for that
 *   given sub-architecture. The intrinsics for that sub-architecture are
 *   #included and can be used without restriction or runtime check.
 *
 * - #if QT_COMPILER_SUPPORTS(XXX)
 *   If this test passes, then the compiler is able to generate code for that
 *   given sub-architecture in another translation unit, given the right set of
 *   flags. Use of the intrinsics is not guaranteed. This is useful with
 *   runtime detection (see below).
 *
 * - #if QT_COMPILER_SUPPORTS_HERE(XXX)
 *   If this test passes, then the compiler is able to generate code for that
 *   given sub-architecture in this translation unit, even if it is not doing
 *   that now (it might be). Individual functions may be tagged with
 *   QT_FUNCTION_TARGET(XXX) to cause the compiler to generate code for that
 *   sub-arch. Only inside such functions is the use of the intrinsics
 *   guaranteed to work. This is useful with runtime detection (see below).
 *
 * The distinction between QT_COMPILER_SUPPORTS and QT_COMPILER_SUPPORTS_HERE is
 * historical: GCC 4.8 needed the distinction.
 *
 * Runtime detection of a CPU sub-architecture can be done with the
 * qCpuHasFeature(XXX) function. There are two strategies for generating
 * optimized code like that:
 *
 * 1) place the optimized code in a different translation unit (C or assembly
 * sources) and pass the correct flags to the compiler to enable support. Those
 * sources must not include qglobal.h, which means they cannot include this
 * file either. The dispatcher function would look like this:
 *
 *      void foo()
 *      {
 *      #if QT_COMPILER_SUPPORTS(XXX)
 *          if (qCpuHasFeature(XXX)) {
 *              foo_optimized_xxx();
 *              return;
 *          }
 *      #endif
 *          foo_plain();
 *      }
 *
 * 2) place the optimized code in a function tagged with QT_FUNCTION_TARGET and
 * surrounded by #if QT_COMPILER_SUPPORTS_HERE(XXX). That code can freely use
 * other Qt code. The dispatcher function would look like this:
 *
 *      void foo()
 *      {
 *      #if QT_COMPILER_SUPPORTS_HERE(XXX)
 *          if (qCpuHasFeature(XXX)) {
 *              foo_optimized_xxx();
 *              return;
 *          }
 *      #endif
 *          foo_plain();
 *      }
 */
95
96#if defined(__MINGW64_VERSION_MAJOR) || defined(Q_CC_MSVC)
97#include <intrin.h>
98#endif
99
// QT_COMPILER_SUPPORTS(x) expands to the value of QT_COMPILER_SUPPORTS_x.
// The trailing "- 0" keeps the expression well-formed when that macro is
// defined but empty (the result is then 0); inside #if an undefined macro
// also evaluates to 0.
#define QT_COMPILER_SUPPORTS(x)     (QT_COMPILER_SUPPORTS_ ## x - 0)

// QT_COMPILER_SUPPORTS_HERE(x): non-zero when code for sub-architecture x can
// be generated in this translation unit. Intended for use in #if.
// QT_FUNCTION_TARGET(x): per-function target attribute built from
// QT_FUNCTION_TARGET_STRING_x where the compiler is GNU-compatible; expands
// to nothing otherwise.
#if defined(Q_PROCESSOR_ARM)
#  define QT_COMPILER_SUPPORTS_HERE(x)    ((__ARM_FEATURE_ ## x) || (__ ## x ## __) || QT_COMPILER_SUPPORTS(x))
#  if defined(Q_CC_GNU)
     /* GCC requires attributes for a function */
#    define QT_FUNCTION_TARGET(x)  __attribute__((__target__(QT_FUNCTION_TARGET_STRING_ ## x)))
#  else
#    define QT_FUNCTION_TARGET(x)
#  endif
#elif defined(Q_PROCESSOR_MIPS)
#  define QT_COMPILER_SUPPORTS_HERE(x)    (__ ## x ## __)
#  define QT_FUNCTION_TARGET(x)
   // Map the compiler's lower-case DSP macros onto the __XXX__ spelling that
   // QT_COMPILER_SUPPORTS_HERE() tests (32-bit MIPS only).
#  if !defined(__MIPS_DSP__) && defined(__mips_dsp) && defined(Q_PROCESSOR_MIPS_32)
#    define __MIPS_DSP__
#  endif
#  if !defined(__MIPS_DSPR2__) && defined(__mips_dspr2) && defined(Q_PROCESSOR_MIPS_32)
#    define __MIPS_DSPR2__
#  endif
#elif defined(Q_PROCESSOR_X86)
#  if defined(Q_CC_CLANG) && defined(Q_CC_MSVC)
     // clang-cl: only what is already enabled for this translation unit counts
#    define QT_COMPILER_SUPPORTS_HERE(x)    (__ ## x ## __)
#  else
#    define QT_COMPILER_SUPPORTS_HERE(x)    ((__ ## x ## __) || QT_COMPILER_SUPPORTS(x))
#  endif
#  if defined(Q_CC_GNU)
     /* GCC requires attributes for a function */
#    define QT_FUNCTION_TARGET(x)  __attribute__((__target__(QT_FUNCTION_TARGET_STRING_ ## x)))
#  else
#    define QT_FUNCTION_TARGET(x)
#  endif
#else
#  define QT_COMPILER_SUPPORTS_HERE(x)    (__ ## x ## __)
#  define QT_FUNCTION_TARGET(x)
#endif
135
// If the compiler claims SSE2 (__SSE2__) but the build configuration did not
// record intrinsic support for it (QT_COMPILER_SUPPORTS_SSE2 is not defined),
// hide every x86 feature macro so that no code path in a non-bootstrapped
// build tries to use the corresponding intrinsics.
#if defined(__SSE2__) && !defined(QT_COMPILER_SUPPORTS_SSE2) && !defined(QT_BOOTSTRAPPED)
// Intrinsic support appears to be missing, so pretend these features don't exist
#  undef __SSE__
#  undef __SSE2__
#  undef __SSE3__
#  undef __SSSE3__
#  undef __SSE4_1__
#  undef __SSE4_2__
#  undef __AES__
#  undef __POPCNT__
#  undef __AVX__
#  undef __F16C__
#  undef __RDRND__
#  undef __AVX2__
#  undef __BMI__
#  undef __BMI2__
#  undef __FMA__
#  undef __MOVBE__
#  undef __RDSEED__
#  undef __AVX512F__
#  undef __AVX512ER__
#  undef __AVX512CD__
#  undef __AVX512PF__
#  undef __AVX512DQ__
#  undef __AVX512BW__
#  undef __AVX512VL__
#  undef __AVX512IFMA__
#  undef __AVX512VBMI__
#  undef __SHA__
#  undef __AVX512VBMI2__
#  undef __AVX512BITALG__
#  undef __AVX512VNNI__
#  undef __AVX512VPOPCNTDQ__
#  undef __GFNI__
#  undef __VAES__
#endif
172
173#ifdef Q_PROCESSOR_X86
174/* -- x86 intrinsic support -- */
175
#  if defined(QT_COMPILER_SUPPORTS_RDSEED) && defined(Q_OS_QNX)
// The compiler for QNX is missing the intrinsic
#    undef QT_COMPILER_SUPPORTS_RDSEED
#  endif
#  if defined(Q_CC_MSVC) && (defined(_M_X64) || _M_IX86_FP >= 2)
// MSVC doesn't define the GCC-style SSE feature macros, so define __SSE__
// ourselves when targeting x86-64 or 32-bit x86 with /arch:SSE2 or better.
// NOTE(review): the original comment mentions __SSE2__, but only __SSE__ is
// defined here -- presumably __SSE2__ is provided elsewhere; confirm.
#    define __SSE__                         1
#  endif
184
#  if defined(Q_OS_WIN) && defined(Q_CC_GNU) && !defined(Q_CC_CLANG)
// 64-bit GCC on Windows does not support AVX, so we hack around it by forcing
// it to emit unaligned loads & stores
// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=49001
//
// Each assembler macro below shadows an aligned-move mnemonic and re-emits it
// as the corresponding unaligned one with the same operands.
asm(
    ".macro vmovapd args:vararg\n"
    " vmovupd \\args\n"
    ".endm\n"
    ".macro vmovaps args:vararg\n"
    " vmovups \\args\n"
    ".endm\n"
    ".macro vmovdqa args:vararg\n"
    " vmovdqu \\args\n"
    ".endm\n"
    ".macro vmovdqa32 args:vararg\n"
    " vmovdqu32 \\args\n"
    ".endm\n"
    ".macro vmovdqa64 args:vararg\n"
    " vmovdqu64 \\args\n"
    ".endm\n"
);
#  endif
207
208# if defined(Q_CC_GNU) && !defined(Q_OS_WASM)
209// GCC 4.4 and Clang 2.8 added a few more intrinsics there
210# include <x86intrin.h>
211# endif
212#ifdef Q_OS_WASM
213# include <immintrin.h>
214# endif
215
216# include <QtCore/private/qsimd_x86_p.h>
217
// x86-64 sub-architecture version 3
//
// The Intel Core 4th generation was codenamed "Haswell" and introduced AVX2,
// BMI1, BMI2, FMA, LZCNT, MOVBE, which makes it a good divider for a
// sub-target for us. The first AMD processor with AVX2 support (Zen) has the
// same features, but had already introduced BMI1 in the previous generation.
// This feature set was chosen as the version 3 of the x86-64 ISA (x86-64-v3)
// and is supported by GCC and Clang.
//
// macOS's fat binaries support the "x86_64h" sub-architecture and the GNU libc
// ELF loader also supports a "haswell/" subdir (e.g., /usr/lib/haswell).
//
// The check below is all-or-nothing: if either __AVX2__ or __FMA__ is enabled,
// both must be, and __haswell__ is then defined to 1. ARCH_HASWELL_MACROS is a
// temporary helper and is undefined again afterwards.
#  define ARCH_HASWELL_MACROS       (__AVX2__ + __FMA__)
#  if ARCH_HASWELL_MACROS != 0
#    if ARCH_HASWELL_MACROS != 2
#      error "Please enable all x86-64-v3 extensions; you probably want to use -march=haswell or -march=x86-64-v3 instead of -mavx2"
#    endif
static_assert(ARCH_HASWELL_MACROS, "Undeclared identifiers indicate which features are missing.");
#    define __haswell__       1
#  endif
#  undef ARCH_HASWELL_MACROS
238
// x86-64 sub-architecture version 4
//
// Similar to the above, x86-64-v4 matches the AVX512 variant of the Intel Core
// 6th generation (codename "Skylake"). AMD Zen4 is their first processor
// with AVX512 support and it includes all of these too.
//
// The check below is all-or-nothing: if any of the five AVX512 feature macros
// is enabled, all of them must be, and __skylake_avx512__ is then defined to
// 1. ARCH_SKX_MACROS is a temporary helper and is undefined again afterwards.
#  define ARCH_SKX_MACROS           (__AVX512F__ + __AVX512BW__ + __AVX512CD__ + __AVX512DQ__ + __AVX512VL__)
#  if ARCH_SKX_MACROS != 0
#    if ARCH_SKX_MACROS != 5
#      error "Please enable all x86-64-v4 extensions; you probably want to use -march=skylake-avx512 or -march=x86-64-v4 instead of -mavx512f"
#    endif
static_assert(ARCH_SKX_MACROS, "Undeclared identifiers indicate which features are missing.");
#    define __skylake_avx512__  1
#  endif
#  undef ARCH_SKX_MACROS
254#endif /* Q_PROCESSOR_X86 */
255
// NEON intrinsics
// note: as of GCC 4.9, does not support function targets for ARM
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#  if defined(Q_CC_CLANG)
#    define QT_FUNCTION_TARGET_STRING_NEON      "neon"
#  else
#    define QT_FUNCTION_TARGET_STRING_NEON      "+neon" // unused: gcc doesn't support function targets on non-aarch64, and on Aarch64 NEON is always available.
#  endif
#  ifndef __ARM_NEON__
// __ARM_NEON__ is not defined on AArch64, but we need it in our NEON detection.
#    define __ARM_NEON__
#  endif

#  ifndef Q_PROCESSOR_ARM_64 // vaddv is only available on Aarch64
// Fallback for the AArch64-only vaddvq_u16(): adds up the eight 16-bit lanes
// of v8 through two widening pairwise additions followed by an addition of
// the two 64-bit halves, then returns 16-bit lane 0 of that sum (i.e. the
// total is truncated to 16 bits).
inline uint16_t vaddvq_u16(uint16x8_t v8)
{
    const uint64x2_t v2 = vpaddlq_u32(vpaddlq_u16(v8));
    const uint64x1_t v1 = vadd_u64(vget_low_u64(v2), vget_high_u64(v2));
    return vget_lane_u16(vreinterpret_u16_u64(v1), 0);
}

// Fallback for the AArch64-only vaddv_u8(): adds up the eight 8-bit lanes of
// v8 through three widening pairwise additions, then returns 8-bit lane 0 of
// that sum (i.e. the total is truncated to 8 bits).
inline uint8_t vaddv_u8(uint8x8_t v8)
{
    const uint64x1_t v1 = vpaddl_u32(vpaddl_u16(vpaddl_u8(v8)));
    return vget_lane_u8(vreinterpret_u8_u64(v1), 0);
}
#  endif

#endif
285
286#if defined(Q_PROCESSOR_ARM) && defined(__ARM_FEATURE_CRC32)
287# include <arm_acle.h>
288#endif
289
// Target strings consumed by QT_FUNCTION_TARGET(AES) and
// QT_FUNCTION_TARGET(CRC32) on ARM. The spelling differs between Clang and
// GCC, and the 32-bit variants additionally name the ARMv8-A architecture.
#if defined(Q_PROCESSOR_ARM_64)
#  if defined(Q_CC_CLANG)
#    define QT_FUNCTION_TARGET_STRING_AES        "crypto"
#    define QT_FUNCTION_TARGET_STRING_CRC32      "crc"
#  elif defined(Q_CC_GNU)
#    define QT_FUNCTION_TARGET_STRING_AES        "+crypto"
#    define QT_FUNCTION_TARGET_STRING_CRC32      "+crc"
#  endif
#elif defined(Q_PROCESSOR_ARM_32)
#  if defined(Q_CC_CLANG)
#    define QT_FUNCTION_TARGET_STRING_AES        "armv8-a,crypto"
#    define QT_FUNCTION_TARGET_STRING_CRC32      "armv8-a,crc"
#  elif defined(Q_CC_GNU)
#    define QT_FUNCTION_TARGET_STRING_AES        "arch=armv8-a+crypto"
#    define QT_FUNCTION_TARGET_STRING_CRC32      "arch=armv8-a+crc"
#  endif
#endif
307
#ifndef Q_PROCESSOR_X86
// Feature bits for non-x86 processors. The enumerator names follow the
// CpuFeature<NAME> pattern that qCpuHasFeature(NAME) pastes together.
enum CPUFeatures {
#if defined(Q_PROCESSOR_ARM)
    CpuFeatureNEON          = 2,
    CpuFeatureARM_NEON      = CpuFeatureNEON,
    CpuFeatureCRC32         = 4,
    CpuFeatureAES           = 8,
    CpuFeatureARM_CRYPTO    = CpuFeatureAES,
#elif defined(Q_PROCESSOR_MIPS)
    CpuFeatureDSP           = 2,
    CpuFeatureDSPR2         = 4,
#endif
};

// Features the compiler is already generating code for in this translation
// unit, derived from the compiler's predefined feature macros. A bit set here
// lets qCpuHasFeature() succeed without a runtime check.
static const uint64_t qCompilerCpuFeatures = 0
#if defined __ARM_NEON__
        | CpuFeatureNEON
#endif
#if defined __ARM_FEATURE_CRC32
        | CpuFeatureCRC32
#endif
#if defined __ARM_FEATURE_CRYPTO
        | CpuFeatureAES
#endif
#if defined __mips_dsp
        | CpuFeatureDSP
#endif
#if defined __mips_dspr2
        | CpuFeatureDSPR2
#endif
        ;
#endif
340
341#ifdef __cplusplus
342# include <atomic>
343# define Q_ATOMIC(T) std::atomic<T>
344QT_BEGIN_NAMESPACE
345using std::atomic_load_explicit;
346static constexpr auto memory_order_relaxed = std::memory_order_relaxed;
347extern "C" {
348#else
349# include <stdatomic.h>
350# define Q_ATOMIC(T) _Atomic(T)
351#endif
352
353#ifdef Q_PROCESSOR_X86
354typedef uint64_t QCpuFeatureType;
355static const QCpuFeatureType qCompilerCpuFeatures = _compilerCpuFeatures;
356static const QCpuFeatureType CpuFeatureArchHaswell = cpu_haswell;
357static const QCpuFeatureType CpuFeatureArchSkylakeAvx512 = cpu_skylake_avx512;
358#else
359typedef unsigned QCpuFeatureType;
360#endif
361extern Q_CORE_EXPORT Q_ATOMIC(QCpuFeatureType) QT_MANGLE_NAMESPACE(qt_cpu_features)[1];
362Q_CORE_EXPORT uint64_t QT_MANGLE_NAMESPACE(qDetectCpuFeatures)();
363
364static inline uint64_t qCpuFeatures()
365{
366#ifdef QT_BOOTSTRAPPED
367 return qCompilerCpuFeatures; // no detection
368#else
369 quint64 features = atomic_load_explicit(QT_MANGLE_NAMESPACE(qt_cpu_features), m: memory_order_relaxed);
370 if (!QT_SUPPORTS_INIT_PRIORITY) {
371 if (Q_UNLIKELY(features == 0))
372 features = QT_MANGLE_NAMESPACE(qDetectCpuFeatures)();
373 }
374 return features;
375#endif
376}
377
// qCpuHasFeature(NAME) is true when every bit of CpuFeature<NAME> is set
// either in the compile-time mask qCompilerCpuFeatures or in the runtime mask
// returned by qCpuFeatures(). The compile-time test comes first, so
// qCpuFeatures() is not called for features the compiler already targets.
// For a multi-bit feature, all bits must be present in the same mask.
#define qCpuHasFeature(feature)     (((qCompilerCpuFeatures & CpuFeature ## feature) == CpuFeature ## feature) \
                                     || ((qCpuFeatures() & CpuFeature ## feature) == CpuFeature ## feature))
380
/*
    Small wrapper around x86's PAUSE and ARM's YIELD instructions.

    This is completely different from QThread::yieldCurrentThread(), which is
    an OS-level operation that takes the whole thread off the CPU.

    This is just preventing one SMT thread from filling a core's pipeline with
    speculated further loop iterations (which need to be expensively flushed on
    final success) when it could just give those pipeline slots to a second SMT
    thread that can do something useful with the core, such as unblocking this
    SMT thread :)

    So, instead of

        while (!condition)
            ;

    it's better to use

        while (!condition)
            qYieldCpu();

    On processors other than x86 and ARMv7 or later, this function does
    nothing.
*/
static inline void qYieldCpu()
{
#if defined(Q_PROCESSOR_X86)
    _mm_pause();
#elif defined(Q_PROCESSOR_ARM) && Q_PROCESSOR_ARM >= 7 /* yield was added in ARMv7 */
#  if __has_builtin(__builtin_arm_yield) /* e.g. Clang */
    __builtin_arm_yield();
#  elif defined(Q_OS_INTEGRITY) || \
        (defined(Q_CC_GNU) && !defined(Q_CC_CLANG))
    /*
       - Integrity is missing the arm_acle.h header
       - GCC doesn't have __yield() in arm_acle.h
         https://stackoverflow.com/a/70076751/134841
         https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105416
    */
    asm volatile("yield"); /* this works everywhere */
#  else
    __yield(); /* this is what should work everywhere */
#  endif
#endif
}
424
425#ifdef __cplusplus
426} // extern "C"
427
428# if defined(Q_PROCESSOR_X86) && QT_COMPILER_SUPPORTS_HERE(RDRND) && !defined(QT_BOOTSTRAPPED)
429Q_CORE_EXPORT qsizetype qRandomCpu(void *, qsizetype) noexcept;
430
431static inline bool qHasHwrng()
432{
433 return qCpuHasFeature(RDRND);
434}
435# else
436static inline qsizetype qRandomCpu(void *, qsizetype) noexcept
437{
438 return 0;
439}
440static inline bool qHasHwrng()
441{
442 return false;
443}
444# endif
445
446QT_END_NAMESPACE
447
448#endif // __cplusplus
449
450QT_WARNING_POP
451
452#endif // QSIMD_P_H
453

source code of qtbase/src/corelib/global/qsimd_p.h