1/****************************************************************************
2**
3** Copyright (C) 2016 The Qt Company Ltd.
4** Copyright (C) 2018 Intel Corporation.
5** Contact: https://www.qt.io/licensing/
6**
7** This file is part of the QtCore module of the Qt Toolkit.
8**
9** $QT_BEGIN_LICENSE:LGPL$
10** Commercial License Usage
11** Licensees holding valid commercial Qt licenses may use this file in
12** accordance with the commercial license agreement provided with the
13** Software or, alternatively, in accordance with the terms contained in
14** a written agreement between you and The Qt Company. For licensing terms
15** and conditions see https://www.qt.io/terms-conditions. For further
16** information use the contact form at https://www.qt.io/contact-us.
17**
18** GNU Lesser General Public License Usage
19** Alternatively, this file may be used under the terms of the GNU Lesser
20** General Public License version 3 as published by the Free Software
21** Foundation and appearing in the file LICENSE.LGPL3 included in the
22** packaging of this file. Please review the following information to
23** ensure the GNU Lesser General Public License version 3 requirements
24** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
25**
26** GNU General Public License Usage
27** Alternatively, this file may be used under the terms of the GNU
28** General Public License version 2.0 or (at your option) the GNU General
29** Public license version 3 or any later version approved by the KDE Free
30** Qt Foundation. The licenses are as published by the Free Software
31** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
32** included in the packaging of this file. Please review the following
33** information to ensure the GNU General Public License requirements will
34** be met: https://www.gnu.org/licenses/gpl-2.0.html and
35** https://www.gnu.org/licenses/gpl-3.0.html.
36**
37** $QT_END_LICENSE$
38**
39****************************************************************************/
40
41#ifndef QSIMD_P_H
42#define QSIMD_P_H
43
44//
45// W A R N I N G
46// -------------
47//
48// This file is not part of the Qt API. It exists purely as an
49// implementation detail. This header file may change from version to
50// version without notice, or even be removed.
51//
52// We mean it.
53//
54
55#include <QtCore/private/qglobal_p.h>
56#include <QtCore/qsimd.h>
57
58/*
59 * qt_module_config.prf defines the QT_COMPILER_SUPPORTS_XXX macros.
60 * They mean the compiler supports the necessary flags and the headers
61 * for the x86 and ARM intrinsics:
62 * - GCC: the -mXXX or march=YYY flag is necessary before #include
63 * up to 4.8; GCC >= 4.9 can include unconditionally
64 * - Intel CC: #include can happen unconditionally
65 * - MSVC: #include can happen unconditionally
66 * - RVCT: ???
67 *
68 * We will try to include all headers possible under this configuration.
69 *
70 * MSVC does not define __SSE2__ & family, so we will define them. MSVC 2013 &
71 * up do define __AVX__ if the -arch:AVX option is passed on the command-line.
72 *
73 * Supported XXX are:
74 * Flag | Arch | GCC | Intel CC | MSVC |
75 * ARM_NEON | ARM | I & C | None | ? |
76 * SSE2 | x86 | I & C | I & C | I & C |
77 * SSE3 | x86 | I & C | I & C | I only |
78 * SSSE3 | x86 | I & C | I & C | I only |
79 * SSE4_1 | x86 | I & C | I & C | I only |
80 * SSE4_2 | x86 | I & C | I & C | I only |
81 * AVX | x86 | I & C | I & C | I & C |
82 * AVX2 | x86 | I & C | I & C | I only |
83 * AVX512xx | x86 | I & C | I & C | I only |
84 * I = intrinsics; C = code generation
85 *
86 * Code can use the following constructs to determine compiler support & status:
87 * - #ifdef __XXX__ (e.g: #ifdef __AVX__ or #ifdef __ARM_NEON__)
88 * If this test passes, then the compiler is already generating code for that
89 * given sub-architecture. The intrinsics for that sub-architecture are
90 * #included and can be used without restriction or runtime check.
91 *
92 * - #if QT_COMPILER_SUPPORTS(XXX)
93 * If this test passes, then the compiler is able to generate code for that
94 * given sub-architecture in another translation unit, given the right set of
95 * flags. Use of the intrinsics is not guaranteed. This is useful with
96 * runtime detection (see below).
97 *
98 * - #if QT_COMPILER_SUPPORTS_HERE(XXX)
99 * If this test passes, then the compiler is able to generate code for that
100 * given sub-architecture in this translation unit, even if it is not doing
101 * that now (it might be). Individual functions may be tagged with
102 * QT_FUNCTION_TARGET(XXX) to cause the compiler to generate code for that
103 * sub-arch. Only inside such functions is the use of the intrinsics
104 * guaranteed to work. This is useful with runtime detection (see below).
105 *
106 * Runtime detection of a CPU sub-architecture can be done with the
107 * qCpuHasFeature(XXX) function. There are two strategies for generating
108 * optimized code like that:
109 *
110 * 1) place the optimized code in a different translation unit (C or assembly
111 * sources) and pass the correct flags to the compiler to enable support. Those
112 * sources must not include qglobal.h, which means they cannot include this
113 * file either. The dispatcher function would look like this:
114 *
115 * void foo()
116 * {
117 * #if QT_COMPILER_SUPPORTS(XXX)
118 * if (qCpuHasFeature(XXX)) {
119 * foo_optimized_xxx();
120 * return;
121 * }
122 * #endif
123 * foo_plain();
124 * }
125 *
126 * 2) place the optimized code in a function tagged with QT_FUNCTION_TARGET and
127 * surrounded by #if QT_COMPILER_SUPPORTS_HERE(XXX). That code can freely use
128 * other Qt code. The dispatcher function would look like this:
129 *
130 * void foo()
131 * {
132 * #if QT_COMPILER_SUPPORTS_HERE(XXX)
133 * if (qCpuHasFeature(XXX)) {
134 * foo_optimized_xxx();
135 * return;
136 * }
137 * #endif
138 * foo_plain();
139 * }
140 */
141
142#if defined(__MINGW64_VERSION_MAJOR) || defined(Q_CC_MSVC)
143#include <intrin.h>
144#endif
145
// QT_COMPILER_SUPPORTS(XXX) evaluates QT_COMPILER_SUPPORTS_XXX as a number.
// The trailing "- 0" keeps the expression valid when that macro is defined
// but empty; an undefined macro evaluates to 0 inside #if.
#define QT_COMPILER_SUPPORTS(x)     (QT_COMPILER_SUPPORTS_ ## x - 0)

// QT_COMPILER_SUPPORTS_HERE(XXX): can code for XXX be generated in this
// translation unit?
// QT_FUNCTION_TARGET(XXX): function attribute that switches code generation to
// XXX for one function; it expands to nothing where no attribute is used.
#if defined(Q_PROCESSOR_ARM) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS)
#  define QT_COMPILER_SUPPORTS_HERE(x) \
        ((__ARM_FEATURE_ ## x) || (__ ## x ## __) || QT_COMPILER_SUPPORTS(x))
#  if !defined(Q_CC_GNU)
#    define QT_FUNCTION_TARGET(x)
#  else
     /* GCC requires attributes for a function */
#    define QT_FUNCTION_TARGET(x) \
        __attribute__((__target__(QT_FUNCTION_TARGET_STRING_ ## x)))
#  endif
#elif defined(Q_PROCESSOR_MIPS)
#  define QT_COMPILER_SUPPORTS_HERE(x)    (__ ## x ## __)
#  define QT_FUNCTION_TARGET(x)
   // On 32-bit MIPS, mirror the compiler's lowercase DSP macros into the
   // __XXX__ spelling, unless that spelling is already defined.
#  if defined(Q_PROCESSOR_MIPS_32)
#    if defined(__mips_dsp) && !defined(__MIPS_DSP__)
#      define __MIPS_DSP__
#    endif
#    if defined(__mips_dspr2) && !defined(__MIPS_DSPR2__)
#      define __MIPS_DSPR2__
#    endif
#  endif
#elif defined(Q_PROCESSOR_X86) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS)
   // When both Q_CC_CLANG and Q_CC_MSVC are defined, only what the compiler is
   // already targeting counts; otherwise the build-system flag counts too.
#  if !defined(Q_CC_CLANG) || !defined(Q_CC_MSVC)
#    define QT_COMPILER_SUPPORTS_HERE(x)    ((__ ## x ## __) || QT_COMPILER_SUPPORTS(x))
#  else
#    define QT_COMPILER_SUPPORTS_HERE(x)    (__ ## x ## __)
#  endif
#  if defined(Q_CC_INTEL) || !defined(Q_CC_GNU)
#    define QT_FUNCTION_TARGET(x)
#  else
     /* GCC requires attributes for a function */
#    define QT_FUNCTION_TARGET(x) \
        __attribute__((__target__(QT_FUNCTION_TARGET_STRING_ ## x)))
#  endif
#elif defined(Q_PROCESSOR_ARM)
   // ARM without QT_COMPILER_SUPPORTS_SIMD_ALWAYS: compiler macros only.
#  define QT_COMPILER_SUPPORTS_HERE(x)    ((__ARM_FEATURE_ ## x) || (__ ## x ## __))
#  define QT_FUNCTION_TARGET(x)
#else
   // Every other configuration: compiler macros only, no function targets.
#  define QT_COMPILER_SUPPORTS_HERE(x)    (__ ## x ## __)
#  define QT_FUNCTION_TARGET(x)
#endif
184
185#ifdef Q_PROCESSOR_X86
186/* -- x86 intrinsic support -- */
187
#  if defined(Q_CC_MSVC) && (defined(_M_X64) || _M_IX86_FP >= 2)
// MSVC doesn't define __SSE__ or __SSE2__, so do it ourselves. The condition
// holds for every x64 build (_M_X64) and for 32-bit builds whose _M_IX86_FP
// value is 2 or more. Both macros are needed: code that tests
// "#ifdef __SSE2__" would otherwise never see SSE2 with MSVC.
#    define __SSE__                         1
#    define __SSE2__                        1
#  endif
192
193# if defined(Q_CC_GNU) && !defined(Q_CC_INTEL) && !defined(Q_OS_WASM)
194// GCC 4.4 and Clang 2.8 added a few more intrinsics there
195# include <x86intrin.h>
196# endif
197#ifdef Q_OS_WASM
198# include <immintrin.h>
199# endif
200
// Feature macros that neither MSVC nor the Intel compiler define themselves.
// Each one is derived from a wider feature macro that implies it, and only
// when QT_COMPILER_SUPPORTS_SIMD_ALWAYS is defined.
#  if defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS) && (defined(Q_CC_MSVC) || defined(Q_CC_INTEL))
#    ifdef __SSE4_2__
// POPCNT instructions:
// All processors that support SSE4.2 support POPCNT
#      define __POPCNT__                    1
#    endif

#    ifdef __AVX__
// PCLMULQDQ instructions:
// All processors that support AVX support PCLMULQDQ
#      define __PCLMUL__                    1
#    endif

#    ifdef __AVX2__
// F16C instructions:
// All processors that support AVX2 support F16C
#      define __F16C__                      1
#    endif
#  endif

// BMI2 instructions (this derivation is applied for the Intel compiler only):
// the original rationale is that all processors that support BMI support BMI2
// (and AVX2). An existing __BMI2__ definition is left untouched.
#  if defined(Q_CC_INTEL) && defined(__BMI__) && !defined(__BMI2__)
#    define __BMI2__                        1
#  endif
229
230# include "qsimd_x86_p.h"
231
232// Haswell sub-architecture
233//
234// The Intel Core 4th generation was codenamed "Haswell" and introduced AVX2,
235// BMI1, BMI2, FMA, LZCNT, MOVBE, which makes it a good divider for a
236// sub-target for us. The first AMD processor with AVX2 support (Zen) has the
237// same features.
238//
239// macOS's fat binaries support the "x86_64h" sub-architecture and the GNU libc
240// ELF loader also supports a "haswell/" subdir (e.g., /usr/lib/haswell).
241# define QT_FUNCTION_TARGET_STRING_ARCH_HASWELL "arch=haswell"
242# if defined(__AVX2__) && defined(__BMI__) && defined(__BMI2__) && defined(__F16C__) && \
243 defined(__FMA__) && defined(__LZCNT__) && defined(__RDRND__)
244# define __haswell__ 1
245# endif
246
247// This constant does not include all CPU features found in a Haswell, only
248// those that we'd have optimized code for.
249// Note: must use Q_CONSTEXPR here, as this file may be compiled in C mode.
250QT_BEGIN_NAMESPACE
251static const quint64 CpuFeatureArchHaswell = 0
252 | CpuFeatureSSE2
253 | CpuFeatureSSE3
254 | CpuFeatureSSSE3
255 | CpuFeatureSSE4_1
256 | CpuFeatureSSE4_2
257 | CpuFeatureFMA
258 | CpuFeaturePOPCNT
259 | CpuFeatureAVX
260 | CpuFeatureF16C
261 | CpuFeatureAVX2
262 | CpuFeatureBMI
263 | CpuFeatureBMI2;
264QT_END_NAMESPACE
265
266#endif /* Q_PROCESSOR_X86 */
267
// NEON intrinsics
// note: as of GCC 4.9, does not support function targets for ARM
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#if defined(Q_CC_CLANG)
#define QT_FUNCTION_TARGET_STRING_NEON      "neon"
#else
#define QT_FUNCTION_TARGET_STRING_NEON      "+neon" // unused: gcc doesn't support function targets on non-aarch64, and on Aarch64 NEON is always available.
#endif
#ifndef __ARM_NEON__
// __ARM_NEON__ is not defined on AArch64, but we need it in our NEON detection.
// It is defined to 1 rather than left empty so that it also works inside #if
// expressions: QT_COMPILER_SUPPORTS_HERE(ARM_NEON) expands to
// "(__ARM_NEON__)", which would become "()" with an empty definition.
#define __ARM_NEON__ 1
#endif

#ifndef Q_PROCESSOR_ARM_64 // vaddv is only available on Aarch64
// Fallback for the AArch64-only vaddvq_u16: adds up the eight 16-bit lanes of
// v8 through successive widening pairwise additions, then returns lane 0 of
// the 16-bit view of the 64-bit total (its low 16 bits on little-endian
// targets).
inline uint16_t vaddvq_u16(uint16x8_t v8)
{
    // 8 x u16 -> 4 x u32 -> 2 x u64
    const uint64x2_t v2 = vpaddlq_u32(vpaddlq_u16(v8));
    // fold the two 64-bit halves into a single 64-bit sum
    const uint64x1_t v1 = vadd_u64(vget_low_u64(v2), vget_high_u64(v2));
    return vget_lane_u16(vreinterpret_u16_u64(v1), 0);
}

// Fallback for the AArch64-only vaddv_u8: adds up the eight 8-bit lanes of v8
// through successive widening pairwise additions, then returns lane 0 of the
// 8-bit view of the 64-bit total (its low 8 bits on little-endian targets).
inline uint8_t vaddv_u8(uint8x8_t v8)
{
    // 8 x u8 -> 4 x u16 -> 2 x u32 -> 1 x u64
    const uint64x1_t v1 = vpaddl_u32(vpaddl_u16(vpaddl_u8(v8)));
    return vget_lane_u8(vreinterpret_u8_u64(v1), 0);
}
#endif

#endif
297
298#if defined(Q_PROCESSOR_ARM) && defined(__ARM_FEATURE_CRC32)
299# include <arm_acle.h>
300#endif
301
// Target strings handed to QT_FUNCTION_TARGET() for the ARM AES ("crypto") and
// CRC32 extensions. The spelling differs per compiler, and on 32-bit ARM the
// string also selects the armv8-a architecture. Nothing is defined for other
// compilers or for non-ARM processors.
#if defined(Q_CC_CLANG)
#  if defined(Q_PROCESSOR_ARM_64)
#    define QT_FUNCTION_TARGET_STRING_AES        "crypto"
#    define QT_FUNCTION_TARGET_STRING_CRC32      "crc"
#  elif defined(Q_PROCESSOR_ARM_32)
#    define QT_FUNCTION_TARGET_STRING_AES        "armv8-a,crypto"
#    define QT_FUNCTION_TARGET_STRING_CRC32      "armv8-a,crc"
#  endif
#elif defined(Q_CC_GNU)
#  if defined(Q_PROCESSOR_ARM_64)
#    define QT_FUNCTION_TARGET_STRING_AES        "+crypto"
#    define QT_FUNCTION_TARGET_STRING_CRC32      "+crc"
#  elif defined(Q_PROCESSOR_ARM_32)
#    define QT_FUNCTION_TARGET_STRING_AES        "arch=armv8-a+crypto"
#    define QT_FUNCTION_TARGET_STRING_CRC32      "arch=armv8-a+crc"
#  endif
#endif
319
320
321#ifdef __cplusplus
322#include <qatomic.h>
323
324QT_BEGIN_NAMESPACE
325
326#ifndef Q_PROCESSOR_X86
327enum CPUFeatures {
328#if defined(Q_PROCESSOR_ARM)
329 CpuFeatureNEON = 2,
330 CpuFeatureARM_NEON = CpuFeatureNEON,
331 CpuFeatureCRC32 = 4,
332 CpuFeatureAES = 8,
333 CpuFeatureARM_CRYPTO = CpuFeatureAES,
334#elif defined(Q_PROCESSOR_MIPS)
335 CpuFeatureDSP = 2,
336 CpuFeatureDSPR2 = 4,
337#endif
338
339 // used only to indicate that the CPU detection was initialised
340 QSimdInitialized = 1
341};
342
343static const quint64 qCompilerCpuFeatures = 0
344#if defined __ARM_NEON__
345 | CpuFeatureNEON
346#endif
347#if defined __ARM_FEATURE_CRC32
348 | CpuFeatureCRC32
349#endif
350#if defined __ARM_FEATURE_CRYPTO
351 | CpuFeatureAES
352#endif
353#if defined __mips_dsp
354 | CpuFeatureDSP
355#endif
356#if defined __mips_dspr2
357 | CpuFeatureDSPR2
358#endif
359 ;
360#endif
361
362#ifdef Q_ATOMIC_INT64_IS_SUPPORTED
363extern Q_CORE_EXPORT QBasicAtomicInteger<quint64> qt_cpu_features[1];
364#else
365extern Q_CORE_EXPORT QBasicAtomicInteger<unsigned> qt_cpu_features[2];
366#endif
367Q_CORE_EXPORT quint64 qDetectCpuFeatures();
368
369#if defined(Q_PROCESSOR_X86) && QT_COMPILER_SUPPORTS_HERE(RDRND) && !defined(QT_BOOTSTRAPPED)
370Q_CORE_EXPORT qsizetype qRandomCpu(void *, qsizetype) noexcept;
371#else
372static inline qsizetype qRandomCpu(void *, qsizetype) noexcept
373{
374 return 0;
375}
376#endif
377
378static inline quint64 qCpuFeatures()
379{
380 quint64 features = qt_cpu_features[0].loadRelaxed();
381#ifndef Q_ATOMIC_INT64_IS_SUPPORTED
382 features |= quint64(qt_cpu_features[1].loadRelaxed()) << 32;
383#endif
384 if (Q_UNLIKELY(features == 0)) {
385 features = qDetectCpuFeatures();
386 Q_ASSUME(features != 0);
387 }
388 return features;
389}
390
// qCpuHasFeature(XXX) is true when every bit of CpuFeatureXXX is set in the
// compile-time mask qCompilerCpuFeatures or, failing that, in the runtime mask
// returned by qCpuFeatures(). The runtime mask is queried only when the
// compile-time test fails.
#define qCpuHasFeature(feature) \
    ((CpuFeature ## feature == (qCompilerCpuFeatures & CpuFeature ## feature)) \
     || (CpuFeature ## feature == (qCpuFeatures() & CpuFeature ## feature)))
393
394inline bool qHasHwrng()
395{
396#if defined(Q_PROCESSOR_X86) && QT_COMPILER_SUPPORTS_HERE(RDRND)
397 return qCpuHasFeature(RDRND);
398#else
399 return false;
400#endif
401}
402
// Loop headers that advance i until ptr, stepped in 4-byte units (the address
// is shifted right by 2), reaches the next 16-byte boundary. The bound is the
// number of 4-byte units up to that boundary (0..3), capped at length. The
// loop does not reset i, so a non-zero starting value counts against the
// bound.
#define ALIGNMENT_PROLOGUE_16BYTES(ptr, i, length) \
    for (; (i) < static_cast<int>(qMin(static_cast<quintptr>(length), \
                                       ((4 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x3)) & 0x3))); \
         ++(i))

// Same as above for a 32-byte boundary: up to 7 steps of 4 bytes each.
#define ALIGNMENT_PROLOGUE_32BYTES(ptr, i, length) \
    for (; (i) < static_cast<int>(qMin(static_cast<quintptr>(length), \
                                       ((8 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x7)) & 0x7))); \
         ++(i))
408
409QT_END_NAMESPACE
410
411#endif // __cplusplus
412
// Loop header for the scalar tail of a SIMD loop: runs the body at most max
// times, advancing i, and stops early once i reaches length. The loop does
// not reset i.
// Every macro parameter is parenthesized so that expression arguments (for
// example "cond ? 7 : 3" for max) are compared as a whole instead of being
// split by operator precedence.
#define SIMD_EPILOGUE(i, length, max) \
    for (int _i = 0; _i < (max) && (i) < (length); ++(i), ++_i)
415
416#endif // QSIMD_P_H
417

source code of qtbase/src/corelib/global/qsimd_p.h