1// Copyright (C) 2021 The Qt Company Ltd.
2// Copyright (C) 2022 Intel Corporation.
3// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
4
5#ifndef QSIMD_P_H
6#define QSIMD_P_H
7
8//
9// W A R N I N G
10// -------------
11//
12// This file is not part of the Qt API. It exists purely as an
13// implementation detail. This header file may change from version to
14// version without notice, or even be removed.
15//
16// We mean it.
17//
18
19#include <QtCore/private/qglobal_p.h>
20#include <QtCore/qsimd.h>
21
22QT_WARNING_PUSH
23QT_WARNING_DISABLE_CLANG("-Wundef")
24QT_WARNING_DISABLE_GCC("-Wundef")
25QT_WARNING_DISABLE_INTEL(103)
26
// Loop header for the scalar prologue run before a 16-byte-aligned SIMD loop.
// The loop continues from the caller's current value of i and advances it
// while i < min(length, k), where k = (4 - ((address of ptr >> 2) & 3)) & 3.
// For a 4-byte-aligned ptr, k is the number of 4-byte elements up to the next
// 16-byte boundary (0..3). The caller supplies the loop body.
#define ALIGNMENT_PROLOGUE_16BYTES(ptr, i, length) \
    for (; \
         i < static_cast<int>(qMin(static_cast<quintptr>(length), \
                                   ((4 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x3)) & 0x3))); \
         ++i)
29
// Loop header for the scalar prologue run before a 32-byte-aligned SIMD loop.
// The loop continues from the caller's current value of i and advances it
// while i < min(length, k), where k = (8 - ((address of ptr >> 2) & 7)) & 7.
// For a 4-byte-aligned ptr, k is the number of 4-byte elements up to the next
// 32-byte boundary (0..7). The caller supplies the loop body.
#define ALIGNMENT_PROLOGUE_32BYTES(ptr, i, length) \
    for (; \
         i < static_cast<int>(qMin(static_cast<quintptr>(length), \
                                   ((8 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x7)) & 0x7))); \
         ++i)
32
// Loop header for the scalar tail that follows a SIMD loop. Continues from
// the caller's current value of i and advances it while i < length, but for
// at most max iterations (counted by the loop-local _i). The caller supplies
// the loop body.
//
// The length and max arguments are parenthesized so that compound
// expressions (e.g. a conditional expression) are evaluated as a unit.
#define SIMD_EPILOGUE(i, length, max) \
    for (int _i = 0; _i < (max) && (i) < (length); ++i, ++_i)
35
36/*
37 * Code can use the following constructs to determine compiler support & status:
38 * - #ifdef __XXX__ (e.g: #ifdef __AVX__ or #ifdef __ARM_NEON__)
39 * If this test passes, then the compiler is already generating code for that
40 * given sub-architecture. The intrinsics for that sub-architecture are
41 * #included and can be used without restriction or runtime check.
42 *
43 * - #if QT_COMPILER_SUPPORTS(XXX)
44 * If this test passes, then the compiler is able to generate code for that
45 * given sub-architecture in another translation unit, given the right set of
46 * flags. Use of the intrinsics is not guaranteed. This is useful with
47 * runtime detection (see below).
48 *
49 * - #if QT_COMPILER_SUPPORTS_HERE(XXX)
50 * If this test passes, then the compiler is able to generate code for that
51 * given sub-architecture in this translation unit, even if it is not doing
52 * that now (it might be). Individual functions may be tagged with
53 * QT_FUNCTION_TARGET(XXX) to cause the compiler to generate code for that
 * sub-arch. Only inside such functions is the use of the intrinsics
55 * guaranteed to work. This is useful with runtime detection (see below).
56 *
57 * The distinction between QT_COMPILER_SUPPORTS and QT_COMPILER_SUPPORTS_HERE is
58 * historical: GCC 4.8 needed the distinction.
59 *
60 * Runtime detection of a CPU sub-architecture can be done with the
61 * qCpuHasFeature(XXX) function. There are two strategies for generating
62 * optimized code like that:
63 *
64 * 1) place the optimized code in a different translation unit (C or assembly
65 * sources) and pass the correct flags to the compiler to enable support. Those
66 * sources must not include qglobal.h, which means they cannot include this
67 * file either. The dispatcher function would look like this:
68 *
69 * void foo()
70 * {
71 * #if QT_COMPILER_SUPPORTS(XXX)
72 * if (qCpuHasFeature(XXX)) {
73 * foo_optimized_xxx();
74 * return;
75 * }
76 * #endif
77 * foo_plain();
78 * }
79 *
80 * 2) place the optimized code in a function tagged with QT_FUNCTION_TARGET and
81 * surrounded by #if QT_COMPILER_SUPPORTS_HERE(XXX). That code can freely use
82 * other Qt code. The dispatcher function would look like this:
83 *
84 * void foo()
85 * {
86 * #if QT_COMPILER_SUPPORTS_HERE(XXX)
87 * if (qCpuHasFeature(XXX)) {
88 * foo_optimized_xxx();
89 * return;
90 * }
91 * #endif
92 * foo_plain();
93 * }
94 */
95
96#if defined(__MINGW64_VERSION_MAJOR) || defined(Q_CC_MSVC)
97#include <intrin.h>
98#endif
99
// Expands to the value of the macro QT_COMPILER_SUPPORTS_<x>, which is defined
// outside this file. The trailing "- 0" keeps the expression valid inside #if
// even when that macro is defined to nothing.
#define QT_COMPILER_SUPPORTS(x) (QT_COMPILER_SUPPORTS_ ## x - 0)
101
// Per-architecture definitions of:
//  - QT_COMPILER_SUPPORTS_HERE(x): preprocessor test for feature x in this
//    translation unit;
//  - QT_FUNCTION_TARGET(x): function attribute selecting feature x, or nothing
//    when the compiler branch below provides no such attribute.
#if defined(Q_PROCESSOR_ARM)
// ARM: accept __ARM_FEATURE_<x>, __<x>__, or QT_COMPILER_SUPPORTS(x).
# define QT_COMPILER_SUPPORTS_HERE(x) ((__ARM_FEATURE_ ## x) || (__ ## x ## __) || QT_COMPILER_SUPPORTS(x))
# if defined(Q_CC_GNU)
 /* GCC requires attributes for a function */
# define QT_FUNCTION_TARGET(x) __attribute__((__target__(QT_FUNCTION_TARGET_STRING_ ## x)))
# else
# define QT_FUNCTION_TARGET(x)
# endif
#elif defined(Q_PROCESSOR_MIPS)
// MIPS: only the __<x>__ macro is tested and no function-target attribute is used.
# define QT_COMPILER_SUPPORTS_HERE(x) (__ ## x ## __)
# define QT_FUNCTION_TARGET(x)
// On 32-bit MIPS, map the compiler's __mips_dsp / __mips_dspr2 macros to the
// __MIPS_DSP__ / __MIPS_DSPR2__ names unless those are already defined.
# if !defined(__MIPS_DSP__) && defined(__mips_dsp) && defined(Q_PROCESSOR_MIPS_32)
# define __MIPS_DSP__
# endif
# if !defined(__MIPS_DSPR2__) && defined(__mips_dspr2) && defined(Q_PROCESSOR_MIPS_32)
# define __MIPS_DSPR2__
# endif
#elif defined(Q_PROCESSOR_X86)
// x86: when both Q_CC_CLANG and Q_CC_MSVC are defined (presumably clang-cl —
// confirm), only the __<x>__ macro counts; otherwise QT_COMPILER_SUPPORTS(x)
// is accepted too.
# if defined(Q_CC_CLANG) && defined(Q_CC_MSVC)
# define QT_COMPILER_SUPPORTS_HERE(x) (__ ## x ## __)
# else
# define QT_COMPILER_SUPPORTS_HERE(x) ((__ ## x ## __) || QT_COMPILER_SUPPORTS(x))
# endif
# if defined(Q_CC_GNU)
 /* GCC requires attributes for a function */
# define QT_FUNCTION_TARGET(x) __attribute__((__target__(QT_FUNCTION_TARGET_STRING_ ## x)))
# else
# define QT_FUNCTION_TARGET(x)
# endif
#else
// Any other architecture: only the __<x>__ macro is tested.
# define QT_COMPILER_SUPPORTS_HERE(x) (__ ## x ## __)
# define QT_FUNCTION_TARGET(x)
#endif
135
// If the compiler claims SSE2 code generation (__SSE2__) but
// QT_COMPILER_SUPPORTS_SSE2 is not defined (and this is not a bootstrapped
// build), remove every x86 feature macro listed below so that later
// "#ifdef __XXX__" tests in this translation unit fail.
#if defined(__SSE2__) && !defined(QT_COMPILER_SUPPORTS_SSE2) && !defined(QT_BOOTSTRAPPED)
// Intrinsic support appears to be missing, so pretend these features don't exist
# undef __SSE__
# undef __SSE2__
# undef __SSE3__
# undef __SSSE3__
# undef __SSE4_1__
# undef __SSE4_2__
# undef __AES__
# undef __POPCNT__
# undef __AVX__
# undef __F16C__
# undef __RDRND__
# undef __AVX2__
# undef __BMI__
# undef __BMI2__
# undef __FMA__
# undef __MOVBE__
# undef __RDSEED__
# undef __AVX512F__
# undef __AVX512ER__
# undef __AVX512CD__
# undef __AVX512PF__
# undef __AVX512DQ__
# undef __AVX512BW__
# undef __AVX512VL__
# undef __AVX512IFMA__
# undef __AVX512VBMI__
# undef __SHA__
# undef __AVX512VBMI2__
# undef __AVX512BITALG__
# undef __AVX512VNNI__
# undef __AVX512VPOPCNTDQ__
# undef __GFNI__
# undef __VAES__
#endif
172
173#ifdef Q_PROCESSOR_X86
174/* -- x86 intrinsic support -- */
175
// Withdraw the RDSEED capability on QNX even if it was announced.
# if defined(QT_COMPILER_SUPPORTS_RDSEED) && defined(Q_OS_QNX)
// The compiler for QNX is missing the intrinsic
# undef QT_COMPILER_SUPPORTS_RDSEED
# endif
// MSVC targeting x86-64, or 32-bit x86 with _M_IX86_FP >= 2.
# if defined(Q_CC_MSVC) && (defined(_M_X64) || _M_IX86_FP >= 2)
// MSVC doesn't define __SSE2__, so do it ourselves
// NOTE(review): only __SSE__ is defined here; confirm that __SSE2__ is
// provided elsewhere (e.g. by an included header) for MSVC builds.
# define __SSE__ 1
# endif
184
// Applies to GCC (not Clang) on Windows only. The file-scope asm block defines
// assembler macros named after the aligned AVX/AVX-512 move mnemonics
// (vmovapd, vmovaps, vmovdqa, vmovdqa32, vmovdqa64); each one forwards its
// operands to the corresponding unaligned instruction instead.
# if defined(Q_OS_WIN) && defined(Q_CC_GNU) && !defined(Q_CC_CLANG)
// 64-bit GCC on Windows does not support AVX, so we hack around it by forcing
// it to emit unaligned loads & stores
// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=49001
asm(
 ".macro vmovapd args:vararg\n"
 " vmovupd \\args\n"
 ".endm\n"
 ".macro vmovaps args:vararg\n"
 " vmovups \\args\n"
 ".endm\n"
 ".macro vmovdqa args:vararg\n"
 " vmovdqu \\args\n"
 ".endm\n"
 ".macro vmovdqa32 args:vararg\n"
 " vmovdqu32 \\args\n"
 ".endm\n"
 ".macro vmovdqa64 args:vararg\n"
 " vmovdqu64 \\args\n"
 ".endm\n"
);
# endif
207
208# if defined(Q_CC_GNU) && !defined(Q_OS_WASM)
209// GCC 4.4 and Clang 2.8 added a few more intrinsics there
210# include <x86intrin.h>
211# endif
212#ifdef Q_OS_WASM
213# include <immintrin.h>
214# endif
215
216# include <QtCore/private/qsimd_x86_p.h>
217
218// x86-64 sub-architecture version 3
219//
220// The Intel Core 4th generation was codenamed "Haswell" and introduced AVX2,
221// BMI1, BMI2, FMA, LZCNT, MOVBE. This feature set was chosen as the version 3
222// of the x86-64 ISA (x86-64-v3) and is supported by GCC and Clang. On systems
223// with the GNU libc, libraries with this feature can be installed on a
224// "glibc-hwcaps/x86-64-v3" subdir. macOS's fat binaries support the "x86_64h"
225// sub-architecture too.
226
// When the compiler targets AVX2, require the full set of seven feature
// macros summed below. Each is expected to expand to 1, and an undefined one
// evaluates to 0 inside #if, so any sum other than 7 triggers the #error.
// On success __haswell__ is defined; the helper macro is removed afterwards.
# if defined(__AVX2__)
// List of features present with -march=x86-64-v3 and not architecturally
// implied by __AVX2__
# define ARCH_HASWELL_MACROS \
 (__AVX2__ + __BMI__ + __BMI2__ + __F16C__ + __FMA__ + __LZCNT__ + __POPCNT__)
# if ARCH_HASWELL_MACROS != 7
# error "Please enable all x86-64-v3 extensions; you probably want to use -march=haswell or -march=x86-64-v3 instead of -mavx2"
# endif
// Outside of #if an undefined macro is an undeclared identifier, so the
// compiler diagnostic for this line names the missing feature(s).
static_assert(ARCH_HASWELL_MACROS, "Undeclared identifiers indicate which features are missing.");
# define __haswell__ 1
# undef ARCH_HASWELL_MACROS
# endif
239
240// x86-64 sub-architecture version 4
241//
242// Similar to the above, x86-64-v4 matches the AVX512 variant of the Intel Core
// 6th generation (codename "Skylake"). AMD Zen4 is their first processor
244// with AVX512 support and it includes all of these too. The GNU libc subdir for
245// this is "glibc-hwcaps/x86-64-v4".
246//
// Sum of the five AVX-512 feature macros checked for x86-64-v4. Inside #if an
// undefined macro evaluates to 0, so the sum is 0 when none is enabled, 5 when
// all are, and anything in between triggers the #error. On success
// __skylake_avx512__ is defined; the helper macro is always removed.
# define ARCH_SKX_MACROS (__AVX512F__ + __AVX512BW__ + __AVX512CD__ + __AVX512DQ__ + __AVX512VL__)
# if ARCH_SKX_MACROS != 0
# if ARCH_SKX_MACROS != 5
# error "Please enable all x86-64-v4 extensions; you probably want to use -march=skylake-avx512 or -march=x86-64-v4 instead of -mavx512f"
# endif
// Outside of #if an undefined macro is an undeclared identifier, so the
// compiler diagnostic for this line names the missing feature(s).
static_assert(ARCH_SKX_MACROS, "Undeclared identifiers indicate which features are missing.");
# define __skylake_avx512__ 1
# endif
# undef ARCH_SKX_MACROS
256#endif /* Q_PROCESSOR_X86 */
257
258// NEON intrinsics
259// note: as of GCC 4.9, does not support function targets for ARM
260#if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(_M_ARM64)
// Target string consumed by QT_FUNCTION_TARGET(NEON); the spelling differs
// between Clang ("neon") and the other compilers ("+neon").
#if defined(Q_CC_CLANG)
#define QT_FUNCTION_TARGET_STRING_NEON "neon"
#else
#define QT_FUNCTION_TARGET_STRING_NEON "+neon" // unused: gcc doesn't support function targets on non-aarch64, and on Aarch64 NEON is always available.
#endif
// Make sure __ARM_NEON__ exists (defined empty) whenever this NEON section is
// compiled, so "#if defined __ARM_NEON__" style tests succeed.
#ifndef __ARM_NEON__
// __ARM_NEON__ is not defined on AArch64, but we need it in our NEON detection.
#define __ARM_NEON__
#endif
270
271#ifndef Q_PROCESSOR_ARM_64 // vaddv is only available on Aarch64
272inline uint16_t vaddvq_u16(uint16x8_t v8)
273{
274 const uint64x2_t v2 = vpaddlq_u32(vpaddlq_u16(v8));
275 const uint64x1_t v1 = vadd_u64(vget_low_u64(v2), vget_high_u64(v2));
276 return vget_lane_u16(vreinterpret_u16_u64(v1), 0);
277}
278
279inline uint8_t vaddv_u8(uint8x8_t v8)
280{
281 const uint64x1_t v1 = vpaddl_u32(vpaddl_u16(vpaddl_u8(v8)));
282 return vget_lane_u8(vreinterpret_u8_u64(v1), 0);
283}
284#endif
285
// Missing NEON intrinsics, needed due to different type definitions:
287inline uint16x8_t qvsetq_n_u16(uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4,
288 uint16_t v5, uint16_t v6, uint16_t v7, uint16_t v8) {
289#if defined(Q_CC_MSVC) && !defined(Q_CC_CLANG)
290 using u64 = uint64_t;
291 const uint16x8_t vmask = {
292 v1 | (v2 << 16) | (u64(v3) << 32) | (u64(v4) << 48),
293 v5 | (v6 << 16) | (u64(v7) << 32) | (u64(v8) << 48)
294 };
295#else
296 const uint16x8_t vmask = { v1, v2, v3, v4, v5, v6, v7, v8 };
297#endif
298 return vmask;
299}
300inline uint8x8_t qvset_n_u8(uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4,
301 uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8) {
302#if defined(Q_CC_MSVC) && !defined(Q_CC_CLANG)
303 using u64 = uint64_t;
304 const uint8x8_t vmask = {
305 v1 | (v2 << 8) | (v3 << 16) | (v4 << 24) |
306 (u64(v5) << 32) | (u64(v6) << 40) | (u64(v7) << 48) | (u64(v8) << 56)
307 };
308#else
309 const uint8x8_t vmask = { v1, v2, v3, v4, v5, v6, v7, v8 };
310#endif
311 return vmask;
312}
313inline uint8x16_t qvsetq_n_u8(uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4,
314 uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8,
315 uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12,
316 uint8_t v13, uint8_t v14, uint8_t v15, uint8_t v16) {
317#if defined(Q_CC_MSVC) && !defined(Q_CC_CLANG)
318 using u64 = uint64_t;
319 const uint8x16_t vmask = {
320 v1 | (v2 << 8) | (v3 << 16) | (v4 << 24) |
321 (u64(v5) << 32) | (u64(v6) << 40) | (u64(v7) << 48) | (u64(v8) << 56),
322 v9 | (v10 << 8) | (v11 << 16) | (v12 << 24) |
323 (u64(v13) << 32) | (u64(v14) << 40) | (u64(v15) << 48) | (u64(v16) << 56)
324 };
325#else
326 const uint8x16_t vmask = { v1, v2, v3, v4, v5, v6, v7, v8,
327 v9, v10, v11, v12, v13, v14, v15, v16};
328#endif
329 return vmask;
330}
331inline uint32x4_t qvsetq_n_u32(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
332{
333#if defined(Q_CC_MSVC) && !defined(Q_CC_CLANG)
334 return uint32x4_t{ (uint64_t(b) << 32) | a, (uint64_t(d) << 32) | c };
335#else
336 return uint32x4_t{ a, b, c, d };
337#endif
338}
339#endif
340
// When _M_ARM64 is defined and __ARM_ARCH is at least 800, announce the
// Crypto and CRC32 extensions through the __ARM_FEATURE_* macros that the
// rest of this header tests.
// NOTE(review): presumably needed because MSVC does not set these itself — confirm.
#if defined(_M_ARM64) && __ARM_ARCH >= 800
#define __ARM_FEATURE_CRYPTO 1
#define __ARM_FEATURE_CRC32 1
#endif
345
// Target strings consumed by QT_FUNCTION_TARGET(AES) and
// QT_FUNCTION_TARGET(CRC32) on ARM. The spelling depends on the compiler and
// on whether the target is 64-bit or 32-bit ARM.
#if defined(Q_PROCESSOR_ARM_64)
#if defined(Q_CC_CLANG)
#define QT_FUNCTION_TARGET_STRING_AES "crypto"
#define QT_FUNCTION_TARGET_STRING_CRC32 "crc"
#elif defined(Q_CC_GNU)
#define QT_FUNCTION_TARGET_STRING_AES "+crypto"
#define QT_FUNCTION_TARGET_STRING_CRC32 "+crc"
#elif defined(Q_CC_MSVC)
// Defined empty for MSVC.
#define QT_FUNCTION_TARGET_STRING_AES
#define QT_FUNCTION_TARGET_STRING_CRC32
#endif
#elif defined(Q_PROCESSOR_ARM_32)
// 32-bit ARM: the strings also select the armv8-a architecture.
#if defined(Q_CC_CLANG)
#define QT_FUNCTION_TARGET_STRING_AES "armv8-a,crypto"
#define QT_FUNCTION_TARGET_STRING_CRC32 "armv8-a,crc"
#elif defined(Q_CC_GNU)
#define QT_FUNCTION_TARGET_STRING_AES "arch=armv8-a+crypto"
#define QT_FUNCTION_TARGET_STRING_CRC32 "arch=armv8-a+crc"
#endif
#endif
366
367#ifndef Q_PROCESSOR_X86
// Bit flags for the CPU features tracked on non-x86 targets. The names are
// combined with the "CpuFeature" prefix by qCpuHasFeature(). Bit 0 is not
// assigned to any feature.
enum CPUFeatures {
#if defined(Q_PROCESSOR_ARM)
    CpuFeatureNEON          = 1 << 1,
    CpuFeatureARM_NEON      = CpuFeatureNEON,   // alias
    CpuFeatureCRC32         = 1 << 2,
    CpuFeatureAES           = 1 << 3,
    CpuFeatureARM_CRYPTO    = CpuFeatureAES,    // alias
#elif defined(Q_PROCESSOR_MIPS)
    CpuFeatureDSP           = 1 << 1,
    CpuFeatureDSPR2         = 1 << 2,
#endif
};
380
// Mask of the CpuFeature* bits that the compiler is already generating code
// for in this translation unit, assembled from the compiler's predefined
// macros. qCpuHasFeature() tests this mask before the runtime one.
static const uint64_t qCompilerCpuFeatures = 0
#if defined __ARM_NEON__
 | CpuFeatureNEON
#endif
#if !(defined(Q_OS_LINUX) && defined(Q_PROCESSOR_ARM_64))
 // Yocto Project recipes enable Crypto extension for all ARMv8 configs,
 // even for targets without the Crypto extension. That's wrong, but as
 // the compiler never generates the code for them on their own, most
 // code never notices the problem. But we would. By not setting the
 // bits here, we force a runtime detection.
#if defined __ARM_FEATURE_CRC32
 | CpuFeatureCRC32
#endif
#if defined __ARM_FEATURE_CRYPTO
 | CpuFeatureAES
#endif
#endif // !(Q_OS_LINUX && Q_PROCESSOR_ARM_64)
#if defined __mips_dsp
 | CpuFeatureDSP
#endif
#if defined __mips_dspr2
 | CpuFeatureDSPR2
#endif
 ;
405#endif
406
// Q_ATOMIC(T) names an atomic T in both C++ (std::atomic<T>) and C (_Atomic(T)).
// In C++ the Qt namespace is opened here, atomic_load_explicit and
// memory_order_relaxed are made usable unqualified (as in C), and an
// extern "C" block is opened for the declarations that follow.
#ifdef __cplusplus
# include <atomic>
# define Q_ATOMIC(T) std::atomic<T>
QT_BEGIN_NAMESPACE
using std::atomic_load_explicit;
static constexpr auto memory_order_relaxed = std::memory_order_relaxed;
extern "C" {
#else
# include <stdatomic.h>
# define Q_ATOMIC(T) _Atomic(T)
#endif
418
// QCpuFeatureType is the element type of the runtime feature cache: 64 bits
// on x86, plain unsigned elsewhere. On x86 the compile-time mask and the two
// architecture-level masks are taken from _compilerCpuFeatures, cpu_haswell
// and cpu_skylake_avx512, which are not defined in this file.
#ifdef Q_PROCESSOR_X86
typedef uint64_t QCpuFeatureType;
static const QCpuFeatureType qCompilerCpuFeatures = _compilerCpuFeatures;
static const QCpuFeatureType CpuFeatureArchHaswell = cpu_haswell;
static const QCpuFeatureType CpuFeatureArchSkylakeAvx512 = cpu_skylake_avx512;
#else
typedef unsigned QCpuFeatureType;
#endif
// One-element atomic array holding the runtime feature mask (read by qCpuFeatures()).
extern Q_CORE_EXPORT Q_ATOMIC(QCpuFeatureType) QT_MANGLE_NAMESPACE(qt_cpu_features)[1];
// Runtime detection entry point; qCpuFeatures() uses its return value as the mask.
Q_CORE_EXPORT uint64_t QT_MANGLE_NAMESPACE(qDetectCpuFeatures)();
429
430static inline uint64_t qCpuFeatures()
431{
432#ifdef QT_BOOTSTRAPPED
433 return qCompilerCpuFeatures; // no detection
434#else
435 quint64 features = atomic_load_explicit(QT_MANGLE_NAMESPACE(qt_cpu_features), m: memory_order_relaxed);
436 if (!QT_SUPPORTS_INIT_PRIORITY) {
437 if (Q_UNLIKELY(features == 0))
438 features = QT_MANGLE_NAMESPACE(qDetectCpuFeatures)();
439 }
440 return features;
441#endif
442}
443
// True when every bit of CpuFeature<feature> is set either in the
// compile-time mask (qCompilerCpuFeatures) or in the mask returned by
// qCpuFeatures(). The compile-time mask is tested first, so qCpuFeatures()
// is not called when that test succeeds. The comparison is against the full
// mask rather than non-zero, so a multi-bit feature needs all of its bits.
#define qCpuHasFeature(feature) \
    (((qCompilerCpuFeatures & CpuFeature ## feature) == CpuFeature ## feature) \
     || ((qCpuFeatures() & CpuFeature ## feature) == CpuFeature ## feature))
446
447#ifdef __cplusplus
448} // extern "C"
449
// Hardware random number support. The real qRandomCpu() is only declared on
// x86 when RDRND code can be generated in this translation unit and the build
// is not bootstrapped; qHasHwrng() then reports the RDRND feature check.
// In every other configuration both functions are inline stubs.
# if defined(Q_PROCESSOR_X86) && QT_COMPILER_SUPPORTS_HERE(RDRND) && !defined(QT_BOOTSTRAPPED)
Q_CORE_EXPORT qsizetype qRandomCpu(void *, qsizetype) noexcept;

static inline bool qHasHwrng()
{
 return qCpuHasFeature(RDRND);
}
# else
// Stub: always returns 0.
static inline qsizetype qRandomCpu(void *, qsizetype) noexcept
{
 return 0;
}
// Stub: always returns false.
static inline bool qHasHwrng()
{
 return false;
}
# endif
467
468QT_END_NAMESPACE
469
470#endif // __cplusplus
471
472QT_WARNING_POP
473
474#endif // QSIMD_P_H
475

Provided by KDAB

Privacy Policy
Learn to use CMake with our Intro Training
Find out more

source code of qtbase/src/corelib/global/qsimd_p.h