qsimd_p.h source code [qtbase/src/corelib/tools/qsimd_p.h]

1	/****************************************************************************
2	**
3	** Copyright (C) 2016 The Qt Company Ltd.
4	** Copyright (C) 2018 Intel Corporation.
5	** Contact: https://www.qt.io/licensing/
6	**
7	** This file is part of the QtCore module of the Qt Toolkit.
8	**
9	** $QT_BEGIN_LICENSE:LGPL$
10	** Commercial License Usage
11	** Licensees holding valid commercial Qt licenses may use this file in
12	** accordance with the commercial license agreement provided with the
13	** Software or, alternatively, in accordance with the terms contained in
14	** a written agreement between you and The Qt Company. For licensing terms
15	** and conditions see https://www.qt.io/terms-conditions. For further
16	** information use the contact form at https://www.qt.io/contact-us.
17	**
18	** GNU Lesser General Public License Usage
19	** Alternatively, this file may be used under the terms of the GNU Lesser
20	** General Public License version 3 as published by the Free Software
21	** Foundation and appearing in the file LICENSE.LGPL3 included in the
22	** packaging of this file. Please review the following information to
23	** ensure the GNU Lesser General Public License version 3 requirements
24	** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
25	**
26	** GNU General Public License Usage
27	** Alternatively, this file may be used under the terms of the GNU
28	** General Public License version 2.0 or (at your option) the GNU General
29	** Public license version 3 or any later version approved by the KDE Free
30	** Qt Foundation. The licenses are as published by the Free Software
31	** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
32	** included in the packaging of this file. Please review the following
33	** information to ensure the GNU General Public License requirements will
34	** be met: https://www.gnu.org/licenses/gpl-2.0.html and
35	** https://www.gnu.org/licenses/gpl-3.0.html.
36	**
37	** $QT_END_LICENSE$
38	**
39	****************************************************************************/
40
41	#ifndef QSIMD_P_H
42	#define QSIMD_P_H
43
44	//
45	// W A R N I N G
46	// -------------
47	//
48	// This file is not part of the Qt API. It exists purely as an
49	// implementation detail. This header file may change from version to
50	// version without notice, or even be removed.
51	//
52	// We mean it.
53	//
54
55	#include <QtCore/private/qglobal_p.h>
56
57	/*
58	* qt_module_config.prf defines the QT_COMPILER_SUPPORTS_XXX macros.
59	* They mean the compiler supports the necessary flags and the headers
60	* for the x86 and ARM intrinsics:
61	* - GCC: the -mXXX or march=YYY flag is necessary before #include
62	* up to 4.8; GCC >= 4.9 can include unconditionally
63	* - Intel CC: #include can happen unconditionally
64	* - MSVC: #include can happen unconditionally
65	* - RVCT: ???
66	*
67	* We will try to include all headers possible under this configuration.
68	*
69	* MSVC does not define __SSE2__ & family, so we will define them. MSVC 2013 &
70	* up do define __AVX__ if the -arch:AVX option is passed on the command-line.
71	*
72	* Supported XXX are:
73	* Flag     | Arch | GCC   | Intel CC | MSVC   |
74	* ARM_NEON | ARM  | I & C | None     | ?      |
75	* SSE2     | x86  | I & C | I & C    | I & C  |
76	* SSE3     | x86  | I & C | I & C    | I only |
77	* SSSE3    | x86  | I & C | I & C    | I only |
78	* SSE4_1   | x86  | I & C | I & C    | I only |
79	* SSE4_2   | x86  | I & C | I & C    | I only |
80	* AVX      | x86  | I & C | I & C    | I & C  |
81	* AVX2     | x86  | I & C | I & C    | I only |
82	* AVX512xx | x86  | I & C | I & C    | I only |
83	* I = intrinsics; C = code generation
84	*
85	* Code can use the following constructs to determine compiler support & status:
86	* - #ifdef __XXX__ (e.g: #ifdef __AVX__ or #ifdef __ARM_NEON__)
87	* If this test passes, then the compiler is already generating code for that
88	* given sub-architecture. The intrinsics for that sub-architecture are
89	* #included and can be used without restriction or runtime check.
90	*
91	* - #if QT_COMPILER_SUPPORTS(XXX)
92	* If this test passes, then the compiler is able to generate code for that
93	* given sub-architecture in another translation unit, given the right set of
94	* flags. Use of the intrinsics is not guaranteed. This is useful with
95	* runtime detection (see below).
96	*
97	* - #if QT_COMPILER_SUPPORTS_HERE(XXX)
98	* If this test passes, then the compiler is able to generate code for that
99	* given sub-architecture in this translation unit, even if it is not doing
100	* that now (it might be). Individual functions may be tagged with
101	* QT_FUNCTION_TARGET(XXX) to cause the compiler to generate code for that
102	* sub-arch. Only inside such functions is the use of the intrinsics
103	* guaranteed to work. This is useful with runtime detection (see below).
104	*
105	* Runtime detection of a CPU sub-architecture can be done with the
106	* qCpuHasFeature(XXX) function. There are two strategies for generating
107	* optimized code like that:
108	*
109	* 1) place the optimized code in a different translation unit (C or assembly
110	* sources) and pass the correct flags to the compiler to enable support. Those
111	* sources must not include qglobal.h, which means they cannot include this
112	* file either. The dispatcher function would look like this:
113	*
114	* void foo()
115	* {
116	* #if QT_COMPILER_SUPPORTS(XXX)
117	* if (qCpuHasFeature(XXX)) {
118	* foo_optimized_xxx();
119	* return;
120	* }
121	* #endif
122	* foo_plain();
123	* }
124	*
125	* 2) place the optimized code in a function tagged with QT_FUNCTION_TARGET and
126	* surrounded by #if QT_COMPILER_SUPPORTS_HERE(XXX). That code can freely use
127	* other Qt code. The dispatcher function would look like this:
128	*
129	* void foo()
130	* {
131	* #if QT_COMPILER_SUPPORTS_HERE(XXX)
132	* if (qCpuHasFeature(XXX)) {
133	* foo_optimized_xxx();
134	* return;
135	* }
136	* #endif
137	* foo_plain();
138	* }
139	*/
140
/*
 * Compiler-capability macros.
 *
 *   QT_COMPILER_SUPPORTS(x)      - usable in #if; evaluates QT_COMPILER_SUPPORTS_<x>.
 *                                  The "- 0" keeps the expression well-formed when
 *                                  that macro is defined but empty; an undefined
 *                                  macro evaluates to 0 inside #if.
 *   QT_COMPILER_SUPPORTS_HERE(x) - usable in #if; per-architecture test of whether
 *                                  feature <x> can be used in this translation unit.
 *   QT_FUNCTION_TARGET(x)        - function attribute selecting the target string
 *                                  QT_FUNCTION_TARGET_STRING_<x>; expands to nothing
 *                                  on the configurations below that do not use it.
 */
#if defined(__MINGW64_VERSION_MAJOR) || defined(Q_CC_MSVC)
#  include <intrin.h>
#endif

#define QT_COMPILER_SUPPORTS(x)     (QT_COMPILER_SUPPORTS_ ## x - 0)

#if defined(Q_PROCESSOR_ARM)
#  define QT_COMPILER_SUPPORTS_HERE(x)    (__ARM_FEATURE_ ## x)
#  if defined(Q_CC_GNU) && !defined(Q_CC_INTEL) && Q_CC_GNU >= 600
     /* GCC requires attributes for a function */
#    define QT_FUNCTION_TARGET(x)  __attribute__((__target__(QT_FUNCTION_TARGET_STRING_ ## x)))
#  else
#    define QT_FUNCTION_TARGET(x)
#  endif
#  if !defined(__ARM_FEATURE_NEON) && defined(__ARM_NEON__)
     // Also support QT_COMPILER_SUPPORTS_HERE(NEON). The macro must carry a
     // value: an empty definition would make the test expand to "()" in #if.
#    define __ARM_FEATURE_NEON 1
#  endif
#elif defined(Q_PROCESSOR_MIPS)
#  define QT_COMPILER_SUPPORTS_HERE(x)    (__ ## x ## __)
#  define QT_FUNCTION_TARGET(x)
   // As above, give the synthesized macros a value so that
   // QT_COMPILER_SUPPORTS_HERE(MIPS_DSP) / (MIPS_DSPR2) are valid #if operands.
#  if !defined(__MIPS_DSP__) && defined(__mips_dsp) && defined(Q_PROCESSOR_MIPS_32)
#    define __MIPS_DSP__ 1
#  endif
#  if !defined(__MIPS_DSPR2__) && defined(__mips_dspr2) && defined(Q_PROCESSOR_MIPS_32)
#    define __MIPS_DSPR2__ 1
#  endif
#elif defined(Q_PROCESSOR_X86) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS)
   // Either the feature is already enabled for this translation unit, or the
   // compiler can enable it on request.
#  define QT_COMPILER_SUPPORTS_HERE(x)    ((__ ## x ## __) || QT_COMPILER_SUPPORTS(x))
#  if defined(Q_CC_GNU) && !defined(Q_CC_INTEL)
     /* GCC requires attributes for a function */
#    define QT_FUNCTION_TARGET(x)  __attribute__((__target__(QT_FUNCTION_TARGET_STRING_ ## x)))
#  else
#    define QT_FUNCTION_TARGET(x)
#  endif
#else
#  define QT_COMPILER_SUPPORTS_HERE(x)    (__ ## x ## __)
#  define QT_FUNCTION_TARGET(x)
#endif
179
180	#ifdef Q_PROCESSOR_X86
181	/* -- x86 intrinsic support -- */
182
// Synthesize the GCC-style SSE/AVX feature macros that MSVC does not provide
// and pull in the x86 intrinsic headers.
#  if defined(Q_CC_MSVC) && (defined(_M_X64) || _M_IX86_FP >= 2)
// MSVC doesn't define __SSE2__, so do it ourselves
#    define __SSE__                         1
#    define __SSE2__                        1
#  endif

#  ifdef __SSE2__
// #include the intrinsics
#    include <immintrin.h>
#  endif

#  if defined(Q_CC_GNU) && !defined(Q_CC_INTEL)
// GCC 4.4 and Clang 2.8 added a few more intrinsics there
#    include <x86intrin.h>
#  endif

#  if defined(Q_CC_MSVC) && (defined(_M_AVX) || defined(__AVX__))
// Visual Studio defines __AVX__ when /arch:AVX is passed, but not the earlier macros
// See: https://msdn.microsoft.com/en-us/library/b0084kay.aspx
#    define __SSE3__                        1
#    define __SSSE3__                       1
// no Intel CPU supports SSE4a, so don't define it
#    define __SSE4_1__                      1
#    define __SSE4_2__                      1
#    ifndef __AVX__
#      define __AVX__                       1
#    endif
#  endif
211
// Feature macros implied by another feature. The Intel compiler and MSVC do
// not define these themselves, so they are derived here from the macro for the
// feature that implies them.
#  if defined(__SSE4_2__) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS) && (defined(Q_CC_INTEL) || defined(Q_CC_MSVC))
// POPCNT instructions:
// All processors that support SSE4.2 support POPCNT
// (but neither MSVC nor the Intel compiler define this macro)
#    define __POPCNT__                      1
#  endif

// AVX intrinsics
#  if defined(__AVX__) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS) && (defined(Q_CC_INTEL) || defined(Q_CC_MSVC))
// AES, PCLMULQDQ instructions:
// All processors that support AVX support PCLMULQDQ
// (but neither MSVC nor the Intel compiler define this macro)
#    define __PCLMUL__                      1
#  endif

#  if defined(__AVX2__) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS) && (defined(Q_CC_INTEL) || defined(Q_CC_MSVC))
// F16C & RDRAND instructions:
// All processors that support AVX2 support F16C & RDRAND:
// (but neither MSVC nor the Intel compiler define these macros)
#    define __F16C__                        1
#    define __RDRND__                       1
#  endif

#  if defined(__BMI__) && !defined(__BMI2__) && defined(Q_CC_INTEL)
// BMI2 instructions:
// All processors that support BMI support BMI2 (and AVX2)
// (but neither MSVC nor the Intel compiler define this macro)
// Note: this derivation is applied for the Intel compiler only.
#    define __BMI2__                        1
#  endif
241
242	# include "qsimd_x86_p.h"
243
// Haswell sub-architecture
//
// The Intel Core 4th generation was codenamed "Haswell" and introduced AVX2,
// BMI1, BMI2, FMA, LZCNT, MOVBE, which makes it a good divider for a
// sub-target for us. The first AMD processor with AVX2 support (Zen) has the
// same features.
//
// macOS's fat binaries support the "x86_64h" sub-architecture and the GNU libc
// ELF loader also supports a "haswell/" subdir (e.g., /usr/lib/haswell).
//
// __haswell__ is defined only when every feature macro tested below is
// already enabled for this translation unit.
#  define QT_FUNCTION_TARGET_STRING_ARCH_HASWELL    "arch=haswell"
#  if defined(__AVX2__) && defined(__BMI__) && defined(__BMI2__) && defined(__F16C__) && \
    defined(__FMA__) && defined(__LZCNT__) && defined(__RDRND__)
#    define __haswell__       1
#  endif
258
259	// This constant does not include all CPU features found in a Haswell, only
260	// those that we'd have optimized code for.
261	// Note: must use Q_CONSTEXPR here, as this file may be compiled in C mode.
262	QT_BEGIN_NAMESPACE
263	static const quint64 CpuFeatureArchHaswell = `0`
264	\| CpuFeatureSSE2
265	\| CpuFeatureSSE3
266	\| CpuFeatureSSSE3
267	\| CpuFeatureSSE4_1
268	\| CpuFeatureSSE4_2
269	\| CpuFeatureFMA
270	\| CpuFeaturePOPCNT
271	\| CpuFeatureAVX
272	\| CpuFeatureF16C
273	\| CpuFeatureAVX2
274	\| CpuFeatureBMI
275	\| CpuFeatureBMI2;
276	QT_END_NAMESPACE
277
278	#endif /* Q_PROCESSOR_X86 */
279
// Clang compiler fix, see http://lists.llvm.org/pipermail/cfe-commits/Week-of-Mon-20160222/151168.html
// This should be tweaked with an "upper version" of clang once we know which release fixes the
// issue. At that point we can rely on __ARM_FEATURE_CRC32 again.
// Until then the macro is dropped for Clang on Darwin, so the CRC32 code paths
// guarded by it further down are not compiled there.
#if defined(Q_CC_CLANG) && defined(Q_OS_DARWIN) && defined(__ARM_FEATURE_CRC32)
#  undef __ARM_FEATURE_CRC32
#endif
286
// NEON intrinsics
// note: as of GCC 4.9, does not support function targets for ARM
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#  include <arm_neon.h>
// unused: gcc doesn't support function targets on non-aarch64, and on Aarch64 NEON is always available.
#  define QT_FUNCTION_TARGET_STRING_NEON      "+neon"
#  ifndef __ARM_NEON__
// __ARM_NEON__ is not defined on AArch64, but we need it in our NEON detection.
#    define __ARM_NEON__
#  endif
#endif
// AArch64/ARM64
// When the compiler reports CRC32 support on ARMv8, include the ACLE header;
// the function-target string is defined for 64-bit builds only.
#if defined(Q_PROCESSOR_ARM_V8) && defined(__ARM_FEATURE_CRC32)
#  if defined(Q_PROCESSOR_ARM_64)
// only available on aarch64
#    define QT_FUNCTION_TARGET_STRING_CRC32      "+crc"
#  endif
#  include <arm_acle.h>
#endif
305
306	#ifdef __cplusplus
307	#include <qatomic.h>
308
309	QT_BEGIN_NAMESPACE
310
311	#ifndef Q_PROCESSOR_X86
312	enum CPUFeatures {
313	#if defined(Q_PROCESSOR_ARM)
314	CpuFeatureNEON = `2`,
315	CpuFeatureARM_NEON = CpuFeatureNEON,
316	CpuFeatureCRC32 = `4`,
317	#elif defined(Q_PROCESSOR_MIPS)
318	CpuFeatureDSP = `2`,
319	CpuFeatureDSPR2 = `4`,
320	#endif
321
322	// used only to indicate that the CPU detection was initialised
323	QSimdInitialized = `1`
324	};
325
326	static const quint64 qCompilerCpuFeatures = `0`
327	#if defined __ARM_NEON__
328	\| CpuFeatureNEON
329	#endif
330	#if defined __ARM_FEATURE_CRC32
331	\| CpuFeatureCRC32
332	#endif
333	#if defined __mips_dsp
334	\| CpuFeatureDSP
335	#endif
336	#if defined __mips_dspr2
337	\| CpuFeatureDSPR2
338	#endif
339	;
340	#endif
341
342	#ifdef Q_ATOMIC_INT64_IS_SUPPORTED
343	extern Q_CORE_EXPORT QBasicAtomicInteger<quint64> qt_cpu_features[`1`];
344	#else
345	extern Q_CORE_EXPORT QBasicAtomicInteger<unsigned> qt_cpu_features[`2`];
346	#endif
347	Q_CORE_EXPORT quint64 qDetectCpuFeatures();
348
349	#if defined(Q_PROCESSOR_X86) && QT_COMPILER_SUPPORTS_HERE(RDRND) && !defined(QT_BOOTSTRAPPED)
350	Q_CORE_EXPORT qsizetype qRandomCpu(void , qsizetype) noexcept*;
351	#else
352	static inline qsizetype qRandomCpu(void , qsizetype) noexcept*
353	{
354	return `0`;
355	}
356	#endif
357
358	static inline quint64 qCpuFeatures()
359	{
360	quint64 features = qt_cpu_features[`0`].loadRelaxed();
361	#ifndef Q_ATOMIC_INT64_IS_SUPPORTED
362	features \|= quint64(qt_cpu_features[`1`].loadRelaxed()) << `32`;
363	#endif
364	if (Q_UNLIKELY(features == `0`)) {
365	features = qDetectCpuFeatures();
366	Q_ASSUME(features != `0`);
367	}
368	return features;
369	}
370
// True when every bit of CpuFeature<feature> is set either in the
// compile-time mask (qCompilerCpuFeatures) or in the runtime mask returned by
// qCpuFeatures(). The compile-time test comes first, so the runtime lookup is
// skipped whenever the feature is already enabled for this translation unit.
#define qCpuHasFeature(feature)     (((qCompilerCpuFeatures & CpuFeature ## feature) == CpuFeature ## feature) \
                                     || ((qCpuFeatures() & CpuFeature ## feature) == CpuFeature ## feature))
373
374	inline bool qHasHwrng()
375	{
376	#if defined(Q_PROCESSOR_X86) && QT_COMPILER_SUPPORTS_HERE(RDRND)
377	return qCpuHasFeature(RDRND);
378	#else
379	return false;
380	#endif
381	}
382
// Loop headers for the scalar prologue that runs before an aligned SIMD loop.
// Each expands to a "for" statement that advances i (starting from its current
// value) while it is below min(length, steps), where steps is the number of
// 4-byte strides from ptr up to the next 16-byte (resp. 32-byte) boundary:
// (ptr >> 2) & 0x3 is ptr's 4-byte slot within its 16-byte line, and
// (4 - slot) & 0x3 is the distance to the boundary (0 if already aligned).
// NOTE(review): this arithmetic assumes 4-byte elements and a 4-byte-aligned
// ptr - confirm at the call sites.
#define ALIGNMENT_PROLOGUE_16BYTES(ptr, i, length) \
    for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), ((4 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x3)) & 0x3))); ++i)

#define ALIGNMENT_PROLOGUE_32BYTES(ptr, i, length) \
    for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), ((8 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x7)) & 0x7))); ++i)
388
389	QT_END_NAMESPACE
390
391	#endif // __cplusplus
392
// Loop header for the scalar tail after a SIMD loop: expands to a "for"
// statement that advances i (starting from its current value) until it reaches
// length, running at most max iterations.
// The arguments are parenthesized so that compound expressions (for example a
// conditional expression passed as length) keep their intended grouping.
#define SIMD_EPILOGUE(i, length, max) \
    for (int _i = 0; _i < (max) && (i) < (length); ++(i), ++_i)
395
396	#endif // QSIMD_P_H
397

source code of qtbase/src/corelib/tools/qsimd_p.h