1 | // Copyright (C) 2021 The Qt Company Ltd. |
2 | // Copyright (C) 2022 Intel Corporation. |
3 | // SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only |
4 | |
5 | #ifndef QSIMD_P_H |
6 | #define QSIMD_P_H |
7 | |
8 | // |
9 | // W A R N I N G |
10 | // ------------- |
11 | // |
12 | // This file is not part of the Qt API. It exists purely as an |
13 | // implementation detail. This header file may change from version to |
14 | // version without notice, or even be removed. |
15 | // |
16 | // We mean it. |
17 | // |
18 | |
19 | #include <QtCore/private/qglobal_p.h> |
20 | #include <QtCore/qsimd.h> |
21 | |
22 | QT_WARNING_PUSH |
23 | QT_WARNING_DISABLE_CLANG("-Wundef" ) |
24 | QT_WARNING_DISABLE_GCC("-Wundef" ) |
25 | QT_WARNING_DISABLE_INTEL(103) |
26 | |
27 | #define ALIGNMENT_PROLOGUE_16BYTES(ptr, i, length) \ |
28 | for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), ((4 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x3)) & 0x3))); ++i) |
29 | |
30 | #define ALIGNMENT_PROLOGUE_32BYTES(ptr, i, length) \ |
31 | for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), ((8 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x7)) & 0x7))); ++i) |
32 | |
33 | #define SIMD_EPILOGUE(i, length, max) \ |
34 | for (int _i = 0; _i < max && i < length; ++i, ++_i) |
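
// A minimal usage sketch (hypothetical; `src` and `dst` are assumed to point
// to 4-byte elements such as QRgb): process the unaligned head scalar-wise,
// vectorize the aligned middle, then mop up the remaining tail:
//
//     int i = 0;
//     ALIGNMENT_PROLOGUE_16BYTES(src, i, length)
//         dst[i] = src[i];
//     for (; i + 4 <= length; i += 4)
//         /* 16-byte SIMD load/store on src + i, dst + i */;
//     SIMD_EPILOGUE(i, length, 3)
//         dst[i] = src[i];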
35 | |
36 | /* |
37 | * Code can use the following constructs to determine compiler support & status: |
 * - #ifdef __XXX__ (e.g., #ifdef __AVX__ or #ifdef __ARM_NEON__)
39 | * If this test passes, then the compiler is already generating code for that |
40 | * given sub-architecture. The intrinsics for that sub-architecture are |
41 | * #included and can be used without restriction or runtime check. |
42 | * |
43 | * - #if QT_COMPILER_SUPPORTS(XXX) |
44 | * If this test passes, then the compiler is able to generate code for that |
45 | * given sub-architecture in another translation unit, given the right set of |
46 | * flags. Use of the intrinsics is not guaranteed. This is useful with |
47 | * runtime detection (see below). |
48 | * |
49 | * - #if QT_COMPILER_SUPPORTS_HERE(XXX) |
50 | * If this test passes, then the compiler is able to generate code for that |
51 | * given sub-architecture in this translation unit, even if it is not doing |
52 | * that now (it might be). Individual functions may be tagged with |
53 | * QT_FUNCTION_TARGET(XXX) to cause the compiler to generate code for that |
 * sub-arch. Only inside such functions is the use of the intrinsics
55 | * guaranteed to work. This is useful with runtime detection (see below). |
56 | * |
57 | * The distinction between QT_COMPILER_SUPPORTS and QT_COMPILER_SUPPORTS_HERE is |
58 | * historical: GCC 4.8 needed the distinction. |
59 | * |
60 | * Runtime detection of a CPU sub-architecture can be done with the |
61 | * qCpuHasFeature(XXX) function. There are two strategies for generating |
62 | * optimized code like that: |
63 | * |
64 | * 1) place the optimized code in a different translation unit (C or assembly |
65 | * sources) and pass the correct flags to the compiler to enable support. Those |
66 | * sources must not include qglobal.h, which means they cannot include this |
67 | * file either. The dispatcher function would look like this: |
68 | * |
69 | * void foo() |
70 | * { |
71 | * #if QT_COMPILER_SUPPORTS(XXX) |
72 | * if (qCpuHasFeature(XXX)) { |
73 | * foo_optimized_xxx(); |
74 | * return; |
75 | * } |
76 | * #endif |
77 | * foo_plain(); |
78 | * } |
79 | * |
80 | * 2) place the optimized code in a function tagged with QT_FUNCTION_TARGET and |
81 | * surrounded by #if QT_COMPILER_SUPPORTS_HERE(XXX). That code can freely use |
82 | * other Qt code. The dispatcher function would look like this: |
83 | * |
84 | * void foo() |
85 | * { |
86 | * #if QT_COMPILER_SUPPORTS_HERE(XXX) |
87 | * if (qCpuHasFeature(XXX)) { |
88 | * foo_optimized_xxx(); |
89 | * return; |
90 | * } |
91 | * #endif |
92 | * foo_plain(); |
93 | * } |
94 | */ |
95 | |
96 | #if defined(__MINGW64_VERSION_MAJOR) || defined(Q_CC_MSVC) |
97 | #include <intrin.h> |
98 | #endif |
99 | |
100 | #define QT_COMPILER_SUPPORTS(x) (QT_COMPILER_SUPPORTS_ ## x - 0) |
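// Note: the "- 0" makes this test usable in #if both when QT_COMPILER_SUPPORTS_x
// is entirely undefined (the preprocessor evaluates the unknown identifier as 0)
// and when it is defined to an empty expansion (leaving "- 0", which is still 0).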
101 | |
102 | #if defined(Q_PROCESSOR_ARM) |
103 | # define QT_COMPILER_SUPPORTS_HERE(x) ((__ARM_FEATURE_ ## x) || (__ ## x ## __) || QT_COMPILER_SUPPORTS(x)) |
104 | # if defined(Q_CC_GNU) |
105 | /* GCC requires attributes for a function */ |
106 | # define QT_FUNCTION_TARGET(x) __attribute__((__target__(QT_FUNCTION_TARGET_STRING_ ## x))) |
107 | # else |
108 | # define QT_FUNCTION_TARGET(x) |
109 | # endif |
110 | #elif defined(Q_PROCESSOR_MIPS) |
111 | # define QT_COMPILER_SUPPORTS_HERE(x) (__ ## x ## __) |
112 | # define QT_FUNCTION_TARGET(x) |
113 | # if !defined(__MIPS_DSP__) && defined(__mips_dsp) && defined(Q_PROCESSOR_MIPS_32) |
114 | # define __MIPS_DSP__ |
115 | # endif |
116 | # if !defined(__MIPS_DSPR2__) && defined(__mips_dspr2) && defined(Q_PROCESSOR_MIPS_32) |
117 | # define __MIPS_DSPR2__ |
118 | # endif |
119 | #elif defined(Q_PROCESSOR_X86) |
120 | # if defined(Q_CC_CLANG) && defined(Q_CC_MSVC) |
121 | # define QT_COMPILER_SUPPORTS_HERE(x) (__ ## x ## __) |
122 | # else |
123 | # define QT_COMPILER_SUPPORTS_HERE(x) ((__ ## x ## __) || QT_COMPILER_SUPPORTS(x)) |
124 | # endif |
125 | # if defined(Q_CC_GNU) |
126 | /* GCC requires attributes for a function */ |
127 | # define QT_FUNCTION_TARGET(x) __attribute__((__target__(QT_FUNCTION_TARGET_STRING_ ## x))) |
128 | # else |
129 | # define QT_FUNCTION_TARGET(x) |
130 | # endif |
131 | #else |
132 | # define QT_COMPILER_SUPPORTS_HERE(x) (__ ## x ## __) |
133 | # define QT_FUNCTION_TARGET(x) |
134 | #endif |
135 | |
136 | #if defined(__SSE2__) && !defined(QT_COMPILER_SUPPORTS_SSE2) && !defined(QT_BOOTSTRAPPED) |
// The compiler is generating SSE2 code, but the configure-time check for
// intrinsic support failed (e.g., in a cross-build), so pretend these
// features don't exist
138 | # undef __SSE__ |
139 | # undef __SSE2__ |
140 | # undef __SSE3__ |
141 | # undef __SSSE3__ |
142 | # undef __SSE4_1__ |
143 | # undef __SSE4_2__ |
144 | # undef __AES__ |
145 | # undef __POPCNT__ |
146 | # undef __AVX__ |
147 | # undef __F16C__ |
148 | # undef __RDRND__ |
149 | # undef __AVX2__ |
150 | # undef __BMI__ |
151 | # undef __BMI2__ |
152 | # undef __FMA__ |
153 | # undef __MOVBE__ |
154 | # undef __RDSEED__ |
155 | # undef __AVX512F__ |
156 | # undef __AVX512ER__ |
157 | # undef __AVX512CD__ |
158 | # undef __AVX512PF__ |
159 | # undef __AVX512DQ__ |
160 | # undef __AVX512BW__ |
161 | # undef __AVX512VL__ |
162 | # undef __AVX512IFMA__ |
163 | # undef __AVX512VBMI__ |
164 | # undef __SHA__ |
165 | # undef __AVX512VBMI2__ |
166 | # undef __AVX512BITALG__ |
167 | # undef __AVX512VNNI__ |
168 | # undef __AVX512VPOPCNTDQ__ |
169 | # undef __GFNI__ |
170 | # undef __VAES__ |
171 | #endif |
172 | |
173 | #ifdef Q_PROCESSOR_X86 |
174 | /* -- x86 intrinsic support -- */ |
175 | |
176 | # if defined(QT_COMPILER_SUPPORTS_RDSEED) && defined(Q_OS_QNX) |
177 | // The compiler for QNX is missing the intrinsic |
178 | # undef QT_COMPILER_SUPPORTS_RDSEED |
179 | # endif |
# if defined(Q_CC_MSVC) && (defined(_M_X64) || _M_IX86_FP >= 2)
// MSVC doesn't define __SSE__ or __SSE2__, so do it ourselves
# define __SSE__ 1
# define __SSE2__ 1
# endif
184 | |
185 | # if defined(Q_OS_WIN) && defined(Q_CC_GNU) && !defined(Q_CC_CLANG) |
186 | // 64-bit GCC on Windows does not support AVX, so we hack around it by forcing |
187 | // it to emit unaligned loads & stores |
188 | // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=49001 |
189 | asm( |
190 | ".macro vmovapd args:vararg\n" |
191 | " vmovupd \\args\n" |
192 | ".endm\n" |
193 | ".macro vmovaps args:vararg\n" |
194 | " vmovups \\args\n" |
195 | ".endm\n" |
196 | ".macro vmovdqa args:vararg\n" |
197 | " vmovdqu \\args\n" |
198 | ".endm\n" |
199 | ".macro vmovdqa32 args:vararg\n" |
200 | " vmovdqu32 \\args\n" |
201 | ".endm\n" |
202 | ".macro vmovdqa64 args:vararg\n" |
203 | " vmovdqu64 \\args\n" |
204 | ".endm\n" |
205 | ); |
206 | # endif |
207 | |
208 | # if defined(Q_CC_GNU) && !defined(Q_OS_WASM) |
// GCC 4.4 and Clang 2.8 added a few more intrinsics in <x86intrin.h>
210 | # include <x86intrin.h> |
211 | # endif |
# ifdef Q_OS_WASM
213 | # include <immintrin.h> |
214 | # endif |
215 | |
216 | # include <QtCore/private/qsimd_x86_p.h> |
217 | |
218 | // x86-64 sub-architecture version 3 |
219 | // |
220 | // The Intel Core 4th generation was codenamed "Haswell" and introduced AVX2, |
221 | // BMI1, BMI2, FMA, LZCNT, MOVBE, which makes it a good divider for a |
222 | // sub-target for us. The first AMD processor with AVX2 support (Zen) has the |
223 | // same features, but had already introduced BMI1 in the previous generation. |
224 | // This feature set was chosen as the version 3 of the x86-64 ISA (x86-64-v3) |
225 | // and is supported by GCC and Clang. |
226 | // |
// macOS's fat binaries support the "x86_64h" sub-architecture, and the GNU libc
228 | // ELF loader also supports a "haswell/" subdir (e.g., /usr/lib/haswell). |
229 | # define ARCH_HASWELL_MACROS (__AVX2__ + __FMA__) |
230 | # if ARCH_HASWELL_MACROS != 0 |
231 | # if ARCH_HASWELL_MACROS != 2 |
232 | # error "Please enable all x86-64-v3 extensions; you probably want to use -march=haswell or -march=x86-64-v3 instead of -mavx2" |
233 | # endif |
static_assert(ARCH_HASWELL_MACROS, "Undeclared identifiers indicate which features are missing.");
235 | # define __haswell__ 1 |
236 | # endif |
237 | # undef ARCH_HASWELL_MACROS |
238 | |
239 | // x86-64 sub-architecture version 4 |
240 | // |
241 | // Similar to the above, x86-64-v4 matches the AVX512 variant of the Intel Core |
242 | // 6th generation (codename "Skylake"). AMD Zen4 is the their first processor |
243 | // with AVX512 support and it includes all of these too. |
244 | // |
245 | # define ARCH_SKX_MACROS (__AVX512F__ + __AVX512BW__ + __AVX512CD__ + __AVX512DQ__ + __AVX512VL__) |
246 | # if ARCH_SKX_MACROS != 0 |
247 | # if ARCH_SKX_MACROS != 5 |
248 | # error "Please enable all x86-64-v4 extensions; you probably want to use -march=skylake-avx512 or -march=x86-64-v4 instead of -mavx512f" |
249 | # endif |
static_assert(ARCH_SKX_MACROS, "Undeclared identifiers indicate which features are missing.");
251 | # define __skylake_avx512__ 1 |
252 | # endif |
253 | # undef ARCH_SKX_MACROS |
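
// Combined with the CpuFeatureArchHaswell / CpuFeatureArchSkylakeAvx512 bits
// declared further down, a dispatcher can (hypothetically) select an entire
// sub-architecture at once instead of testing individual feature bits:
//
//     if (qCpuHasFeature(ArchHaswell))
//         return algo_haswell();  // placeholder for a TU built with -march=haswell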
254 | #endif /* Q_PROCESSOR_X86 */ |
255 | |
256 | // NEON intrinsics |
// note: as of GCC 4.9, function targets are not supported for ARM
258 | #if defined(__ARM_NEON) || defined(__ARM_NEON__) |
259 | #if defined(Q_CC_CLANG) |
260 | #define QT_FUNCTION_TARGET_STRING_NEON "neon" |
261 | #else |
262 | #define QT_FUNCTION_TARGET_STRING_NEON "+neon" // unused: gcc doesn't support function targets on non-aarch64, and on Aarch64 NEON is always available. |
263 | #endif |
264 | #ifndef __ARM_NEON__ |
265 | // __ARM_NEON__ is not defined on AArch64, but we need it in our NEON detection. |
266 | #define __ARM_NEON__ |
267 | #endif |
268 | |
#ifndef Q_PROCESSOR_ARM_64 // vaddv is only available on AArch64
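// Horizontal add of all eight u16 lanes, emulated with pairwise widening adds
// (vpaddl*) because 32-bit ARM lacks the AArch64 vaddv instructions.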
270 | inline uint16_t vaddvq_u16(uint16x8_t v8) |
271 | { |
272 | const uint64x2_t v2 = vpaddlq_u32(vpaddlq_u16(v8)); |
273 | const uint64x1_t v1 = vadd_u64(vget_low_u64(v2), vget_high_u64(v2)); |
274 | return vget_lane_u16(vreinterpret_u16_u64(v1), 0); |
275 | } |
276 | |
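// Likewise for eight u8 lanes: widen pairwise u8 -> u16 -> u32 -> u64 and
// extract the single accumulated lane.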
277 | inline uint8_t vaddv_u8(uint8x8_t v8) |
278 | { |
279 | const uint64x1_t v1 = vpaddl_u32(vpaddl_u16(vpaddl_u8(v8))); |
280 | return vget_lane_u8(vreinterpret_u8_u64(v1), 0); |
281 | } |
282 | #endif |
283 | |
284 | #endif |
285 | |
286 | #if defined(Q_PROCESSOR_ARM) && defined(__ARM_FEATURE_CRC32) |
287 | # include <arm_acle.h> |
288 | #endif |
289 | |
290 | #if defined(Q_PROCESSOR_ARM_64) |
291 | #if defined(Q_CC_CLANG) |
292 | #define QT_FUNCTION_TARGET_STRING_AES "crypto" |
293 | #define QT_FUNCTION_TARGET_STRING_CRC32 "crc" |
294 | #elif defined(Q_CC_GNU) |
295 | #define QT_FUNCTION_TARGET_STRING_AES "+crypto" |
296 | #define QT_FUNCTION_TARGET_STRING_CRC32 "+crc" |
297 | #endif |
298 | #elif defined(Q_PROCESSOR_ARM_32) |
299 | #if defined(Q_CC_CLANG) |
300 | #define QT_FUNCTION_TARGET_STRING_AES "armv8-a,crypto" |
301 | #define QT_FUNCTION_TARGET_STRING_CRC32 "armv8-a,crc" |
302 | #elif defined(Q_CC_GNU) |
303 | #define QT_FUNCTION_TARGET_STRING_AES "arch=armv8-a+crypto" |
304 | #define QT_FUNCTION_TARGET_STRING_CRC32 "arch=armv8-a+crc" |
305 | #endif |
306 | #endif |
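
// A CRC32 helper using these target strings could (as a sketch; crc32Step is
// a placeholder name) be declared as:
//
//     #if QT_COMPILER_SUPPORTS_HERE(CRC32)
//     QT_FUNCTION_TARGET(CRC32)
//     static uint32_t crc32Step(uint32_t crc, uint32_t data)
//     { return __crc32w(crc, data); }   // __crc32w is from <arm_acle.h>
//     #endif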
307 | |
308 | #ifndef Q_PROCESSOR_X86 |
309 | enum CPUFeatures { |
310 | #if defined(Q_PROCESSOR_ARM) |
311 | CpuFeatureNEON = 2, |
312 | CpuFeatureARM_NEON = CpuFeatureNEON, |
313 | CpuFeatureCRC32 = 4, |
314 | CpuFeatureAES = 8, |
315 | CpuFeatureARM_CRYPTO = CpuFeatureAES, |
316 | #elif defined(Q_PROCESSOR_MIPS) |
317 | CpuFeatureDSP = 2, |
318 | CpuFeatureDSPR2 = 4, |
319 | #endif |
320 | }; |
321 | |
322 | static const uint64_t qCompilerCpuFeatures = 0 |
323 | #if defined __ARM_NEON__ |
324 | | CpuFeatureNEON |
325 | #endif |
326 | #if defined __ARM_FEATURE_CRC32 |
327 | | CpuFeatureCRC32 |
328 | #endif |
329 | #if defined __ARM_FEATURE_CRYPTO |
330 | | CpuFeatureAES |
331 | #endif |
332 | #if defined __mips_dsp |
333 | | CpuFeatureDSP |
334 | #endif |
335 | #if defined __mips_dspr2 |
336 | | CpuFeatureDSPR2 |
337 | #endif |
338 | ; |
339 | #endif |
340 | |
341 | #ifdef __cplusplus |
342 | # include <atomic> |
343 | # define Q_ATOMIC(T) std::atomic<T> |
344 | QT_BEGIN_NAMESPACE |
345 | using std::atomic_load_explicit; |
346 | static constexpr auto memory_order_relaxed = std::memory_order_relaxed; |
347 | extern "C" { |
348 | #else |
349 | # include <stdatomic.h> |
350 | # define Q_ATOMIC(T) _Atomic(T) |
351 | #endif |
352 | |
353 | #ifdef Q_PROCESSOR_X86 |
354 | typedef uint64_t QCpuFeatureType; |
355 | static const QCpuFeatureType qCompilerCpuFeatures = _compilerCpuFeatures; |
356 | static const QCpuFeatureType CpuFeatureArchHaswell = cpu_haswell; |
357 | static const QCpuFeatureType CpuFeatureArchSkylakeAvx512 = cpu_skylake_avx512; |
358 | #else |
359 | typedef unsigned QCpuFeatureType; |
360 | #endif |
361 | extern Q_CORE_EXPORT Q_ATOMIC(QCpuFeatureType) QT_MANGLE_NAMESPACE(qt_cpu_features)[1]; |
362 | Q_CORE_EXPORT uint64_t QT_MANGLE_NAMESPACE(qDetectCpuFeatures)(); |
363 | |
364 | static inline uint64_t qCpuFeatures() |
365 | { |
366 | #ifdef QT_BOOTSTRAPPED |
367 | return qCompilerCpuFeatures; // no detection |
368 | #else |
    quint64 features = atomic_load_explicit(QT_MANGLE_NAMESPACE(qt_cpu_features), memory_order_relaxed);
370 | if (!QT_SUPPORTS_INIT_PRIORITY) { |
371 | if (Q_UNLIKELY(features == 0)) |
372 | features = QT_MANGLE_NAMESPACE(qDetectCpuFeatures)(); |
373 | } |
374 | return features; |
375 | #endif |
376 | } |
377 | |
378 | #define qCpuHasFeature(feature) (((qCompilerCpuFeatures & CpuFeature ## feature) == CpuFeature ## feature) \ |
379 | || ((qCpuFeatures() & CpuFeature ## feature) == CpuFeature ## feature)) |
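
// The first operand is a compile-time constant, so when the feature is part of
// the build's target the check folds to true and the branch disappears;
// otherwise the runtime-detected mask is consulted. Hypothetical caller:
//
//     if (qCpuHasFeature(AES))
//         return hashBlocksAccelerated(data, len);   // placeholder name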
380 | |
381 | /* |
382 | Small wrapper around x86's PAUSE and ARM's YIELD instructions. |
383 | |
384 | This is completely different from QThread::yieldCurrentThread(), which is |
385 | an OS-level operation that takes the whole thread off the CPU. |
386 | |
387 | This is just preventing one SMT thread from filling a core's pipeline with |
388 | speculated further loop iterations (which need to be expensively flushed on |
389 | final success) when it could just give those pipeline slots to a second SMT |
390 | thread that can do something useful with the core, such as unblocking this |
391 | SMT thread :) |
392 | |
393 | So, instead of |
394 | |
395 | while (!condition) |
396 | ; |
397 | |
398 | it's better to use |
399 | |
400 | while (!condition) |
401 | qYieldCpu(); |
402 | */ |
403 | static inline void qYieldCpu() |
404 | { |
405 | #if defined(Q_PROCESSOR_X86) |
406 | _mm_pause(); |
407 | #elif defined(Q_PROCESSOR_ARM) && Q_PROCESSOR_ARM >= 7 /* yield was added in ARMv7 */ |
408 | # if __has_builtin(__builtin_arm_yield) /* e.g. Clang */ |
409 | __builtin_arm_yield(); |
410 | # elif defined(Q_OS_INTEGRITY) || \ |
411 | (defined(Q_CC_GNU) && !defined(Q_CC_CLANG)) |
412 | /* |
413 | - Integrity is missing the arm_acle.h header |
414 | - GCC doesn't have __yield() in arm_acle.h |
415 | https://stackoverflow.com/a/70076751/134841 |
416 | https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105416 |
417 | */ |
418 | asm volatile("yield" ); /* this works everywhere */ |
419 | # else |
420 | __yield(); /* this is what should work everywhere */ |
421 | # endif |
422 | #endif |
423 | } |
424 | |
425 | #ifdef __cplusplus |
426 | } // extern "C" |
427 | |
428 | # if defined(Q_PROCESSOR_X86) && QT_COMPILER_SUPPORTS_HERE(RDRND) && !defined(QT_BOOTSTRAPPED) |
429 | Q_CORE_EXPORT qsizetype qRandomCpu(void *, qsizetype) noexcept; |
430 | |
431 | static inline bool qHasHwrng() |
432 | { |
433 | return qCpuHasFeature(RDRND); |
434 | } |
435 | # else |
436 | static inline qsizetype qRandomCpu(void *, qsizetype) noexcept |
437 | { |
438 | return 0; |
439 | } |
440 | static inline bool qHasHwrng() |
441 | { |
442 | return false; |
443 | } |
444 | # endif |
445 | |
446 | QT_END_NAMESPACE |
447 | |
448 | #endif // __cplusplus |
449 | |
450 | QT_WARNING_POP |
451 | |
452 | #endif // QSIMD_P_H |
453 | |