1 | /**************************************************************************** |
2 | ** |
3 | ** Copyright (C) 2016 The Qt Company Ltd. |
4 | ** Copyright (C) 2018 Intel Corporation. |
5 | ** Contact: https://www.qt.io/licensing/ |
6 | ** |
7 | ** This file is part of the QtCore module of the Qt Toolkit. |
8 | ** |
9 | ** $QT_BEGIN_LICENSE:LGPL$ |
10 | ** Commercial License Usage |
11 | ** Licensees holding valid commercial Qt licenses may use this file in |
12 | ** accordance with the commercial license agreement provided with the |
13 | ** Software or, alternatively, in accordance with the terms contained in |
14 | ** a written agreement between you and The Qt Company. For licensing terms |
15 | ** and conditions see https://www.qt.io/terms-conditions. For further |
16 | ** information use the contact form at https://www.qt.io/contact-us. |
17 | ** |
18 | ** GNU Lesser General Public License Usage |
19 | ** Alternatively, this file may be used under the terms of the GNU Lesser |
20 | ** General Public License version 3 as published by the Free Software |
21 | ** Foundation and appearing in the file LICENSE.LGPL3 included in the |
22 | ** packaging of this file. Please review the following information to |
23 | ** ensure the GNU Lesser General Public License version 3 requirements |
24 | ** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. |
25 | ** |
26 | ** GNU General Public License Usage |
27 | ** Alternatively, this file may be used under the terms of the GNU |
28 | ** General Public License version 2.0 or (at your option) the GNU General |
29 | ** Public license version 3 or any later version approved by the KDE Free |
30 | ** Qt Foundation. The licenses are as published by the Free Software |
31 | ** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 |
32 | ** included in the packaging of this file. Please review the following |
33 | ** information to ensure the GNU General Public License requirements will |
34 | ** be met: https://www.gnu.org/licenses/gpl-2.0.html and |
35 | ** https://www.gnu.org/licenses/gpl-3.0.html. |
36 | ** |
37 | ** $QT_END_LICENSE$ |
38 | ** |
39 | ****************************************************************************/ |
40 | |
41 | #ifndef QSIMD_P_H |
42 | #define QSIMD_P_H |
43 | |
44 | // |
45 | // W A R N I N G |
46 | // ------------- |
47 | // |
48 | // This file is not part of the Qt API. It exists purely as an |
49 | // implementation detail. This header file may change from version to |
50 | // version without notice, or even be removed. |
51 | // |
52 | // We mean it. |
53 | // |
54 | |
55 | #include <QtCore/private/qglobal_p.h> |
56 | |
57 | /* |
58 | * qt_module_config.prf defines the QT_COMPILER_SUPPORTS_XXX macros. |
59 | * They mean the compiler supports the necessary flags and the headers |
60 | * for the x86 and ARM intrinsics: |
61 | * - GCC: the -mXXX or -march=YYY flag is necessary before #include |
62 | * up to 4.8; GCC >= 4.9 can include unconditionally |
63 | * - Intel CC: #include can happen unconditionally |
64 | * - MSVC: #include can happen unconditionally |
65 | * - RVCT: ??? |
66 | * |
67 | * We will try to include all headers possible under this configuration. |
68 | * |
69 | * MSVC does not define __SSE2__ & family, so we will define them. MSVC 2013 & |
70 | * up do define __AVX__ if the -arch:AVX option is passed on the command-line. |
71 | * |
72 | * Supported XXX are: |
73 | * Flag | Arch | GCC | Intel CC | MSVC | |
74 | * ARM_NEON | ARM | I & C | None | ? | |
75 | * SSE2 | x86 | I & C | I & C | I & C | |
76 | * SSE3 | x86 | I & C | I & C | I only | |
77 | * SSSE3 | x86 | I & C | I & C | I only | |
78 | * SSE4_1 | x86 | I & C | I & C | I only | |
79 | * SSE4_2 | x86 | I & C | I & C | I only | |
80 | * AVX | x86 | I & C | I & C | I & C | |
81 | * AVX2 | x86 | I & C | I & C | I only | |
82 | * AVX512xx | x86 | I & C | I & C | I only | |
83 | * I = intrinsics; C = code generation |
84 | * |
85 | * Code can use the following constructs to determine compiler support & status: |
86 | * - #ifdef __XXX__ (e.g: #ifdef __AVX__ or #ifdef __ARM_NEON__) |
87 | * If this test passes, then the compiler is already generating code for that |
88 | * given sub-architecture. The intrinsics for that sub-architecture are |
89 | * #included and can be used without restriction or runtime check. |
90 | * |
91 | * - #if QT_COMPILER_SUPPORTS(XXX) |
92 | * If this test passes, then the compiler is able to generate code for that |
93 | * given sub-architecture in another translation unit, given the right set of |
94 | * flags. Use of the intrinsics is not guaranteed. This is useful with |
95 | * runtime detection (see below). |
96 | * |
97 | * - #if QT_COMPILER_SUPPORTS_HERE(XXX) |
98 | * If this test passes, then the compiler is able to generate code for that |
99 | * given sub-architecture in this translation unit, even if it is not doing |
100 | * that now (it might be). Individual functions may be tagged with |
101 | * QT_FUNCTION_TARGET(XXX) to cause the compiler to generate code for that |
102 | * sub-arch. Only inside such functions is the use of the intrinsics |
103 | * guaranteed to work. This is useful with runtime detection (see below). |
104 | * |
105 | * Runtime detection of a CPU sub-architecture can be done with the |
106 | * qCpuHasFeature(XXX) function. There are two strategies for generating |
107 | * optimized code like that: |
108 | * |
109 | * 1) place the optimized code in a different translation unit (C or assembly |
110 | * sources) and pass the correct flags to the compiler to enable support. Those |
111 | * sources must not include qglobal.h, which means they cannot include this |
112 | * file either. The dispatcher function would look like this: |
113 | * |
114 | * void foo() |
115 | * { |
116 | * #if QT_COMPILER_SUPPORTS(XXX) |
117 | * if (qCpuHasFeature(XXX)) { |
118 | * foo_optimized_xxx(); |
119 | * return; |
120 | * } |
121 | * #endif |
122 | * foo_plain(); |
123 | * } |
124 | * |
125 | * 2) place the optimized code in a function tagged with QT_FUNCTION_TARGET and |
126 | * surrounded by #if QT_COMPILER_SUPPORTS_HERE(XXX). That code can freely use |
127 | * other Qt code. The dispatcher function would look like this: |
128 | * |
129 | * void foo() |
130 | * { |
131 | * #if QT_COMPILER_SUPPORTS_HERE(XXX) |
132 | * if (qCpuHasFeature(XXX)) { |
133 | * foo_optimized_xxx(); |
134 | * return; |
135 | * } |
136 | * #endif |
137 | * foo_plain(); |
138 | * } |
139 | */ |
140 | |
141 | #if defined(__MINGW64_VERSION_MAJOR) || defined(Q_CC_MSVC) |
142 | #include <intrin.h> |
143 | #endif |
144 | |
145 | #define QT_COMPILER_SUPPORTS(x) (QT_COMPILER_SUPPORTS_ ## x - 0) |
146 | |
147 | #if defined(Q_PROCESSOR_ARM) |
148 | # define QT_COMPILER_SUPPORTS_HERE(x) (__ARM_FEATURE_ ## x) |
149 | # if defined(Q_CC_GNU) && !defined(Q_CC_INTEL) && Q_CC_GNU >= 600 |
150 | /* GCC requires attributes for a function */ |
151 | # define QT_FUNCTION_TARGET(x) __attribute__((__target__(QT_FUNCTION_TARGET_STRING_ ## x))) |
152 | # else |
153 | # define QT_FUNCTION_TARGET(x) |
154 | # endif |
155 | # if !defined(__ARM_FEATURE_NEON) && defined(__ARM_NEON__) |
156 | # define __ARM_FEATURE_NEON // also support QT_COMPILER_SUPPORTS_HERE(NEON) |
157 | # endif |
158 | #elif defined(Q_PROCESSOR_MIPS) |
159 | # define QT_COMPILER_SUPPORTS_HERE(x) (__ ## x ## __) |
160 | # define QT_FUNCTION_TARGET(x) |
161 | # if !defined(__MIPS_DSP__) && defined(__mips_dsp) && defined(Q_PROCESSOR_MIPS_32) |
162 | # define __MIPS_DSP__ |
163 | # endif |
164 | # if !defined(__MIPS_DSPR2__) && defined(__mips_dspr2) && defined(Q_PROCESSOR_MIPS_32) |
165 | # define __MIPS_DSPR2__ |
166 | # endif |
167 | #elif defined(Q_PROCESSOR_X86) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS) |
168 | # define QT_COMPILER_SUPPORTS_HERE(x) ((__ ## x ## __) || QT_COMPILER_SUPPORTS(x)) |
169 | # if defined(Q_CC_GNU) && !defined(Q_CC_INTEL) |
170 | /* GCC requires attributes for a function */ |
171 | # define QT_FUNCTION_TARGET(x) __attribute__((__target__(QT_FUNCTION_TARGET_STRING_ ## x))) |
172 | # else |
173 | # define QT_FUNCTION_TARGET(x) |
174 | # endif |
175 | #else |
176 | # define QT_COMPILER_SUPPORTS_HERE(x) (__ ## x ## __) |
177 | # define QT_FUNCTION_TARGET(x) |
178 | #endif |
179 | |
180 | #ifdef Q_PROCESSOR_X86 |
181 | /* -- x86 intrinsic support -- */ |
182 | |
183 | # if defined(Q_CC_MSVC) && (defined(_M_X64) || _M_IX86_FP >= 2) |
184 | // MSVC doesn't define __SSE2__, so do it ourselves |
185 | # define __SSE__ 1 |
186 | # define __SSE2__ 1 |
187 | # endif |
188 | |
189 | # ifdef __SSE2__ |
190 | // #include the intrinsics |
191 | # include <immintrin.h> |
192 | # endif |
193 | |
194 | # if defined(Q_CC_GNU) && !defined(Q_CC_INTEL) |
195 | // GCC 4.4 and Clang 2.8 added a few more intrinsics there |
196 | # include <x86intrin.h> |
197 | # endif |
198 | |
199 | # if defined(Q_CC_MSVC) && (defined(_M_AVX) || defined(__AVX__)) |
200 | // Visual Studio defines __AVX__ when /arch:AVX is passed, but not the earlier macros |
201 | // See: https://msdn.microsoft.com/en-us/library/b0084kay.aspx |
202 | # define __SSE3__ 1 |
203 | # define __SSSE3__ 1 |
204 | // no Intel CPU supports SSE4a, so don't define it |
205 | # define __SSE4_1__ 1 |
206 | # define __SSE4_2__ 1 |
207 | # ifndef __AVX__ |
208 | # define __AVX__ 1 |
209 | # endif |
210 | # endif |
211 | |
212 | # if defined(__SSE4_2__) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS) && (defined(Q_CC_INTEL) || defined(Q_CC_MSVC)) |
213 | // POPCNT instructions: |
214 | // All processors that support SSE4.2 support POPCNT |
215 | // (but neither MSVC nor the Intel compiler define this macro) |
216 | # define __POPCNT__ 1 |
217 | # endif |
218 | |
219 | // AVX intrinsics |
220 | # if defined(__AVX__) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS) && (defined(Q_CC_INTEL) || defined(Q_CC_MSVC)) |
221 | // AES, PCLMULQDQ instructions: |
222 | // All processors that support AVX support PCLMULQDQ |
223 | // (but neither MSVC nor the Intel compiler define this macro) |
224 | # define __PCLMUL__ 1 |
225 | # endif |
226 | |
227 | # if defined(__AVX2__) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS) && (defined(Q_CC_INTEL) || defined(Q_CC_MSVC)) |
228 | // F16C & RDRAND instructions: |
229 | // All processors that support AVX2 support F16C & RDRAND: |
230 | // (but neither MSVC nor the Intel compiler define these macros) |
231 | # define __F16C__ 1 |
232 | # define __RDRND__ 1 |
233 | # endif |
234 | |
235 | # if defined(__BMI__) && !defined(__BMI2__) && defined(Q_CC_INTEL) |
236 | // BMI2 instructions: |
237 | // All processors that support BMI support BMI2 (and AVX2) |
238 | // (but neither MSVC nor the Intel compiler define this macro) |
239 | # define __BMI2__ 1 |
240 | # endif |
241 | |
242 | # include "qsimd_x86_p.h" |
243 | |
244 | // Haswell sub-architecture |
245 | // |
246 | // The Intel Core 4th generation was codenamed "Haswell" and introduced AVX2, |
247 | // BMI1, BMI2, FMA, LZCNT, MOVBE, which makes it a good divider for a |
248 | // sub-target for us. The first AMD processor with AVX2 support (Zen) has the |
249 | // same features. |
250 | // |
251 | // macOS's fat binaries support the "x86_64h" sub-architecture and the GNU libc |
252 | // ELF loader also supports a "haswell/" subdir (e.g., /usr/lib/haswell). |
253 | # define QT_FUNCTION_TARGET_STRING_ARCH_HASWELL "arch=haswell" |
254 | # if defined(__AVX2__) && defined(__BMI__) && defined(__BMI2__) && defined(__F16C__) && \ |
255 | defined(__FMA__) && defined(__LZCNT__) && defined(__RDRND__) |
256 | # define __haswell__ 1 |
257 | # endif |
258 | |
259 | // This constant does not include all CPU features found in a Haswell, only |
260 | // those that we'd have optimized code for. |
261 | // Note: must use Q_CONSTEXPR here, as this file may be compiled in C mode. |
262 | QT_BEGIN_NAMESPACE |
263 | static const quint64 CpuFeatureArchHaswell = 0 |
264 | | CpuFeatureSSE2 |
265 | | CpuFeatureSSE3 |
266 | | CpuFeatureSSSE3 |
267 | | CpuFeatureSSE4_1 |
268 | | CpuFeatureSSE4_2 |
269 | | CpuFeatureFMA |
270 | | CpuFeaturePOPCNT |
271 | | CpuFeatureAVX |
272 | | CpuFeatureF16C |
273 | | CpuFeatureAVX2 |
274 | | CpuFeatureBMI |
275 | | CpuFeatureBMI2; |
276 | QT_END_NAMESPACE |
277 | |
278 | #endif /* Q_PROCESSOR_X86 */ |
279 | |
280 | // Clang compiler fix, see http://lists.llvm.org/pipermail/cfe-commits/Week-of-Mon-20160222/151168.html |
281 | // This should be tweaked with an "upper version" of clang once we know which release fixes the |
282 | // issue. At that point we can rely on __ARM_FEATURE_CRC32 again. |
283 | #if defined(Q_CC_CLANG) && defined(Q_OS_DARWIN) && defined (__ARM_FEATURE_CRC32) |
284 | # undef __ARM_FEATURE_CRC32 |
285 | #endif |
286 | |
287 | // NEON intrinsics |
288 | // note: as of GCC 4.9, does not support function targets for ARM |
289 | #if defined(__ARM_NEON) || defined(__ARM_NEON__) |
290 | #include <arm_neon.h> |
291 | #define QT_FUNCTION_TARGET_STRING_NEON "+neon" // unused: gcc doesn't support function targets on non-aarch64, and on Aarch64 NEON is always available. |
292 | #ifndef __ARM_NEON__ |
293 | // __ARM_NEON__ is not defined on AArch64, but we need it in our NEON detection. |
294 | #define __ARM_NEON__ |
295 | #endif |
296 | #endif |
297 | // AArch64/ARM64 |
298 | #if defined(Q_PROCESSOR_ARM_V8) && defined(__ARM_FEATURE_CRC32) |
299 | #if defined(Q_PROCESSOR_ARM_64) |
300 | // only available on aarch64 |
301 | #define QT_FUNCTION_TARGET_STRING_CRC32 "+crc" |
302 | #endif |
303 | # include <arm_acle.h> |
304 | #endif |
305 | |
306 | #ifdef __cplusplus |
307 | #include <qatomic.h> |
308 | |
309 | QT_BEGIN_NAMESPACE |
310 | |
311 | #ifndef Q_PROCESSOR_X86 |
312 | enum CPUFeatures { |
313 | #if defined(Q_PROCESSOR_ARM) |
314 | CpuFeatureNEON = 2, |
315 | CpuFeatureARM_NEON = CpuFeatureNEON, |
316 | CpuFeatureCRC32 = 4, |
317 | #elif defined(Q_PROCESSOR_MIPS) |
318 | CpuFeatureDSP = 2, |
319 | CpuFeatureDSPR2 = 4, |
320 | #endif |
321 | |
322 | // used only to indicate that the CPU detection was initialised |
323 | QSimdInitialized = 1 |
324 | }; |
325 | |
326 | static const quint64 qCompilerCpuFeatures = 0 |
327 | #if defined __ARM_NEON__ |
328 | | CpuFeatureNEON |
329 | #endif |
330 | #if defined __ARM_FEATURE_CRC32 |
331 | | CpuFeatureCRC32 |
332 | #endif |
333 | #if defined __mips_dsp |
334 | | CpuFeatureDSP |
335 | #endif |
336 | #if defined __mips_dspr2 |
337 | | CpuFeatureDSPR2 |
338 | #endif |
339 | ; |
340 | #endif |
341 | |
342 | #ifdef Q_ATOMIC_INT64_IS_SUPPORTED |
343 | extern Q_CORE_EXPORT QBasicAtomicInteger<quint64> qt_cpu_features[1]; |
344 | #else |
345 | extern Q_CORE_EXPORT QBasicAtomicInteger<unsigned> qt_cpu_features[2]; |
346 | #endif |
347 | Q_CORE_EXPORT quint64 qDetectCpuFeatures(); |
348 | |
349 | #if defined(Q_PROCESSOR_X86) && QT_COMPILER_SUPPORTS_HERE(RDRND) && !defined(QT_BOOTSTRAPPED) |
350 | Q_CORE_EXPORT qsizetype qRandomCpu(void *, qsizetype) noexcept; |
351 | #else |
352 | static inline qsizetype qRandomCpu(void *, qsizetype) noexcept |
353 | { |
354 | return 0; |
355 | } |
356 | #endif |
357 | |
358 | static inline quint64 qCpuFeatures() |
359 | { |
360 | quint64 features = qt_cpu_features[0].loadRelaxed(); |
361 | #ifndef Q_ATOMIC_INT64_IS_SUPPORTED |
362 | features |= quint64(qt_cpu_features[1].loadRelaxed()) << 32; |
363 | #endif |
364 | if (Q_UNLIKELY(features == 0)) { |
365 | features = qDetectCpuFeatures(); |
366 | Q_ASSUME(features != 0); |
367 | } |
368 | return features; |
369 | } |
370 | |
371 | #define qCpuHasFeature(feature) (((qCompilerCpuFeatures & CpuFeature ## feature) == CpuFeature ## feature) \ |
372 | || ((qCpuFeatures() & CpuFeature ## feature) == CpuFeature ## feature)) |
373 | |
374 | inline bool qHasHwrng() |
375 | { |
376 | #if defined(Q_PROCESSOR_X86) && QT_COMPILER_SUPPORTS_HERE(RDRND) |
377 | return qCpuHasFeature(RDRND); |
378 | #else |
379 | return false; |
380 | #endif |
381 | } |
382 | |
383 | #define ALIGNMENT_PROLOGUE_16BYTES(ptr, i, length) \ |
384 | for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), ((4 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x3)) & 0x3))); ++i) |
385 | |
386 | #define ALIGNMENT_PROLOGUE_32BYTES(ptr, i, length) \ |
387 | for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), ((8 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x7)) & 0x7))); ++i) |
388 | |
389 | QT_END_NAMESPACE |
390 | |
391 | #endif // __cplusplus |
392 | |
393 | #define SIMD_EPILOGUE(i, length, max) \ |
394 | for (int _i = 0; _i < max && i < length; ++i, ++_i) |
395 | |
396 | #endif // QSIMD_P_H |
397 | |