| 1 | // This file is part of Eigen, a lightweight C++ template library |
| 2 | // for linear algebra. |
| 3 | // |
| 4 | // Copyright (C) 2008-2018 Gael Guennebaud <gael.guennebaud@inria.fr> |
| 5 | // Copyright (C) 2020, Arm Limited and Contributors |
| 6 | // |
| 7 | // This Source Code Form is subject to the terms of the Mozilla |
| 8 | // Public License v. 2.0. If a copy of the MPL was not distributed |
| 9 | // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. |
| 10 | |
| 11 | #ifndef EIGEN_CONFIGURE_VECTORIZATION_H |
| 12 | #define EIGEN_CONFIGURE_VECTORIZATION_H |
| 13 | |
| 14 | //------------------------------------------------------------------------------------------ |
| 15 | // Static and dynamic alignment control |
| 16 | // |
| 17 | // The main purpose of this section is to define EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES |
| 18 | // as the maximal boundary in bytes on which dynamically and statically allocated data may be aligned, respectively. |
| 19 | // The values of EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES can be specified by the user. If not, |
| 20 | // a default value is automatically computed based on architecture, compiler, and OS. |
| 21 | // |
| 22 | // This section also defines macros EIGEN_ALIGN_TO_BOUNDARY(N) and the shortcuts EIGEN_ALIGN{8,16,32,_MAX} |
| 23 | // to be used to declare statically aligned buffers. |
| 24 | //------------------------------------------------------------------------------------------ |
| 25 | |
| 26 | |
| 27 | /* EIGEN_ALIGN_TO_BOUNDARY(n) forces data to be n-byte aligned. This is used to satisfy SIMD requirements. |
| 28 | * However, we do that EVEN if vectorization (EIGEN_VECTORIZE) is disabled, |
| 29 | * so that vectorization doesn't affect binary compatibility. |
| 30 | * |
| 31 | * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link |
| 32 | * vectorized and non-vectorized code. |
| 33 | * |
| 34 | * FIXME: this code can be cleaned up once we switch to proper C++11 only. |
| 35 | */ |
| 36 | #if (defined EIGEN_CUDACC) |
| 37 | #define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n) |
| 38 | #define EIGEN_ALIGNOF(x) __alignof(x) |
| 39 | #elif EIGEN_HAS_ALIGNAS |
| 40 | #define EIGEN_ALIGN_TO_BOUNDARY(n) alignas(n) |
| 41 | #define EIGEN_ALIGNOF(x) alignof(x) |
| 42 | #elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM |
| 43 | #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n))) |
| 44 | #define EIGEN_ALIGNOF(x) __alignof(x) |
| 45 | #elif EIGEN_COMP_MSVC |
| 46 | #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n)) |
| 47 | #define EIGEN_ALIGNOF(x) __alignof(x) |
| 48 | #elif EIGEN_COMP_SUNCC |
| 49 | // FIXME not sure about this one: |
| 50 | #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n))) |
| 51 | #define EIGEN_ALIGNOF(x) __alignof(x) |
| 52 | #else |
| 53 | #error Please tell me what is the equivalent of alignas(n) and alignof(x) for your compiler |
| 54 | #endif |
| 55 | |
// EIGEN_IDEAL_MAX_ALIGN_BYTES: the alignment (in bytes) suited to the widest enabled x86 SIMD
// extension, with 16 as the fallback. When the user explicitly disables vectorization
// (EIGEN_DONT_VECTORIZE), alignment is disabled as well (value 0), except for GPU compilers.
#if !defined(EIGEN_DONT_VECTORIZE)
  #if defined(__AVX512F__)
    // 64 bytes static alignment is preferred only if really required
    #define EIGEN_IDEAL_MAX_ALIGN_BYTES 64
  #elif defined(__AVX__)
    // 32 bytes static alignment is preferred only if really required
    #define EIGEN_IDEAL_MAX_ALIGN_BYTES 32
  #else
    #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16
  #endif
#elif defined(EIGEN_GPUCC)
  // GPU code is always vectorized and requires memory alignment for
  // statically allocated buffers.
  #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16
#else
  #define EIGEN_IDEAL_MAX_ALIGN_BYTES 0
#endif


// EIGEN_MIN_ALIGN_BYTES is the smallest value for which the notion of explicit alignment makes sense
#define EIGEN_MIN_ALIGN_BYTES 16
| 78 | |
// Handling of the deprecated static-alignment switches.
//
// EIGEN_DONT_ALIGN_STATICALLY and EIGEN_DONT_ALIGN are deprecated: either of them implies
// EIGEN_MAX_STATIC_ALIGN_BYTES=0. Combining one of them with a strictly positive
// EIGEN_MAX_STATIC_ALIGN_BYTES is contradictory and is rejected at compile time.
#if defined(EIGEN_DONT_ALIGN) || defined(EIGEN_DONT_ALIGN_STATICALLY)
  #if defined(EIGEN_MAX_STATIC_ALIGN_BYTES) && EIGEN_MAX_STATIC_ALIGN_BYTES>0
    #error EIGEN_MAX_STATIC_ALIGN_BYTES and EIGEN_DONT_ALIGN[_STATICALLY] are both defined with EIGEN_MAX_STATIC_ALIGN_BYTES!=0. Use EIGEN_MAX_STATIC_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN_STATICALLY.
  #endif
  // #undef is a no-op when the macro is not defined, so no guard is needed.
  #undef EIGEN_MAX_STATIC_ALIGN_BYTES
  #define EIGEN_MAX_STATIC_ALIGN_BYTES 0
#endif
| 95 | |
| 96 | #ifndef EIGEN_MAX_STATIC_ALIGN_BYTES |
| 97 | |
| 98 | // Try to automatically guess what is the best default value for EIGEN_MAX_STATIC_ALIGN_BYTES |
| 99 | |
| 100 | // 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable |
| 101 | // 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always |
| 102 | // enable alignment, but it can be a cause of problems on some platforms, so we just disable it in |
| 103 | // certain common platform (compiler+architecture combinations) to avoid these problems. |
| 104 | // Only static alignment is really problematic (relies on nonstandard compiler extensions), |
| 105 | // try to keep heap alignment even when we have to disable static alignment. |
| 106 | #if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64 || EIGEN_ARCH_MIPS) |
| 107 | #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1 |
| 108 | #elif EIGEN_ARCH_ARM_OR_ARM64 && EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_MOST(4, 6) |
| 109 | // Old versions of GCC on ARM, at least 4.4, were once seen to have buggy static alignment support. |
| 110 | // Not sure which version fixed it, hopefully it doesn't affect 4.7, which is still somewhat in use. |
| 111 | // 4.8 and newer seem definitely unaffected. |
| 112 | #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1 |
| 113 | #else |
| 114 | #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0 |
| 115 | #endif |
| 116 | |
| 117 | // static alignment is completely disabled with GCC 3, Sun Studio, and QCC/QNX |
| 118 | #if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT \ |
| 119 | && !EIGEN_GCC3_OR_OLDER \ |
| 120 | && !EIGEN_COMP_SUNCC \ |
| 121 | && !EIGEN_OS_QNX |
| 122 | #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 1 |
| 123 | #else |
| 124 | #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0 |
| 125 | #endif |
| 126 | |
| 127 | #if EIGEN_ARCH_WANTS_STACK_ALIGNMENT |
| 128 | #define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES |
| 129 | #else |
| 130 | #define EIGEN_MAX_STATIC_ALIGN_BYTES 0 |
| 131 | #endif |
| 132 | |
| 133 | #endif |
| 134 | |
// When EIGEN_MAX_ALIGN_BYTES is already defined at this point, it acts as an upper bound
// for EIGEN_MAX_STATIC_ALIGN_BYTES.
#ifdef EIGEN_MAX_ALIGN_BYTES
  #if EIGEN_MAX_ALIGN_BYTES<EIGEN_MAX_STATIC_ALIGN_BYTES
    #undef EIGEN_MAX_STATIC_ALIGN_BYTES
    #define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES
  #endif
#endif

// With static alignment turned off (EIGEN_MAX_STATIC_ALIGN_BYTES==0),
// EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT gets defined unless it already is.
#if EIGEN_MAX_STATIC_ALIGN_BYTES==0
  #ifndef EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT
    #define EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT
  #endif
#endif
| 144 | |
| 145 | // At this stage, EIGEN_MAX_STATIC_ALIGN_BYTES>0 is the true test whether we want to align arrays on the stack or not. |
| 146 | // It takes into account both the user choice to explicitly enable/disable alignment (by setting EIGEN_MAX_STATIC_ALIGN_BYTES) |
| 147 | // and the architecture config (EIGEN_ARCH_WANTS_STACK_ALIGNMENT). |
| 148 | // Henceforth, only EIGEN_MAX_STATIC_ALIGN_BYTES should be used. |
| 149 | |
| 150 | |
// Fixed-size shortcuts to EIGEN_ALIGN_TO_BOUNDARY
#define EIGEN_ALIGN64 EIGEN_ALIGN_TO_BOUNDARY(64)
#define EIGEN_ALIGN32 EIGEN_ALIGN_TO_BOUNDARY(32)
#define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16)
#define EIGEN_ALIGN8 EIGEN_ALIGN_TO_BOUNDARY(8)

// EIGEN_ALIGN_MAX aligns on EIGEN_MAX_STATIC_ALIGN_BYTES, and expands to nothing
// when static alignment is disabled.
#if !(EIGEN_MAX_STATIC_ALIGN_BYTES>0)
  #define EIGEN_ALIGN_MAX
#else
  #define EIGEN_ALIGN_MAX EIGEN_ALIGN_TO_BOUNDARY(EIGEN_MAX_STATIC_ALIGN_BYTES)
#endif
| 161 | |
| 162 | |
// Dynamic alignment control
//
// EIGEN_DONT_ALIGN forces EIGEN_MAX_ALIGN_BYTES to 0 and is incompatible with a strictly
// positive user-provided EIGEN_MAX_ALIGN_BYTES. Without any user setting,
// EIGEN_MAX_ALIGN_BYTES falls back to EIGEN_IDEAL_MAX_ALIGN_BYTES.
#ifdef EIGEN_DONT_ALIGN
  #if defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES>0
    #error EIGEN_MAX_ALIGN_BYTES and EIGEN_DONT_ALIGN are both defined with EIGEN_MAX_ALIGN_BYTES!=0. Use EIGEN_MAX_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN.
  #endif
  // #undef is a no-op when the macro is not defined, so no guard is needed.
  #undef EIGEN_MAX_ALIGN_BYTES
  #define EIGEN_MAX_ALIGN_BYTES 0
#elif !defined(EIGEN_MAX_ALIGN_BYTES)
  #define EIGEN_MAX_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
#endif

// EIGEN_DEFAULT_ALIGN_BYTES is the larger of EIGEN_MAX_ALIGN_BYTES and EIGEN_IDEAL_MAX_ALIGN_BYTES.
#if EIGEN_MAX_ALIGN_BYTES >= EIGEN_IDEAL_MAX_ALIGN_BYTES
  #define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES
#else
  #define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
#endif
| 183 | |
| 184 | |
// EIGEN_UNALIGNED_VECTORIZE defaults to 1 unless it has already been defined.
#if !defined(EIGEN_UNALIGNED_VECTORIZE)
  #define EIGEN_UNALIGNED_VECTORIZE 1
#endif

//----------------------------------------------------------------------

// Disabled alignment implies disabled vectorization. EIGEN_MAX_ALIGN_BYTES is the proper
// check here: it accounts for both the user's will (EIGEN_MAX_ALIGN_BYTES, EIGEN_DONT_ALIGN)
// and our own platform checks.
#if (EIGEN_MAX_ALIGN_BYTES==0) && !defined(EIGEN_DONT_VECTORIZE)
  #define EIGEN_DONT_VECTORIZE
#endif
| 198 | |
| 199 | |
| 200 | // The following (except #include <malloc.h> and _M_IX86_FP ??) can likely be |
| 201 | // removed as gcc 4.1 and msvc 2008 are not supported anyways. |
| 202 | #if EIGEN_COMP_MSVC |
| 203 | #include <malloc.h> // for _aligned_malloc -- need it regardless of whether vectorization is enabled |
| 204 | #if (EIGEN_COMP_MSVC >= 1500) // 2008 or later |
| 205 | // a user reported that in 64-bit mode, MSVC doesn't care to define _M_IX86_FP. |
| 206 | #if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || EIGEN_ARCH_x86_64 |
| 207 | #define EIGEN_SSE2_ON_MSVC_2008_OR_LATER |
| 208 | #endif |
| 209 | #endif |
| 210 | #else |
| 211 | #if (defined __SSE2__) && ( (!EIGEN_COMP_GNUC) || EIGEN_COMP_ICC || EIGEN_GNUC_AT_LEAST(4,2) ) |
| 212 | #define EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC |
| 213 | #endif |
| 214 | #endif |
| 215 | |
// SIMD backend selection.
//
// Nothing is selected when vectorization is disabled (EIGEN_DONT_VECTORIZE) or when
// EIGEN_GPUCC is defined. Otherwise the first matching branch of the chain below defines
// EIGEN_VECTORIZE together with the EIGEN_VECTORIZE_<ISA> symbols and includes the
// corresponding intrinsics headers.
#if !(defined(EIGEN_DONT_VECTORIZE) || defined(EIGEN_GPUCC))

  #if defined (EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER)

    // Defines symbols for compile-time detection of which instructions are
    // used.
    // EIGEN_VECTORIZE_YY is defined if and only if the instruction set YY is used
    #define EIGEN_VECTORIZE
    #define EIGEN_VECTORIZE_SSE
    #define EIGEN_VECTORIZE_SSE2

    // Detect sse3/ssse3/sse4:
    // gcc and icc define __SSE3__, ...
    // There is no way to know about this on msvc. You can define EIGEN_VECTORIZE_SSE* if you
    // want to force the use of those instructions with msvc.
    #ifdef __SSE3__
      #define EIGEN_VECTORIZE_SSE3
    #endif
    #ifdef __SSSE3__
      #define EIGEN_VECTORIZE_SSSE3
    #endif
    #ifdef __SSE4_1__
      #define EIGEN_VECTORIZE_SSE4_1
    #endif
    #ifdef __SSE4_2__
      #define EIGEN_VECTORIZE_SSE4_2
    #endif
    // AVX implies every SSE level. The AVX symbol itself is withheld for SYCL builds.
    #ifdef __AVX__
      #ifndef EIGEN_USE_SYCL
        #define EIGEN_VECTORIZE_AVX
      #endif
      #define EIGEN_VECTORIZE_SSE3
      #define EIGEN_VECTORIZE_SSSE3
      #define EIGEN_VECTORIZE_SSE4_1
      #define EIGEN_VECTORIZE_SSE4_2
    #endif
    #ifdef __AVX2__
      #ifndef EIGEN_USE_SYCL
        #define EIGEN_VECTORIZE_AVX2
        #define EIGEN_VECTORIZE_AVX
      #endif
      #define EIGEN_VECTORIZE_SSE3
      #define EIGEN_VECTORIZE_SSSE3
      #define EIGEN_VECTORIZE_SSE4_1
      #define EIGEN_VECTORIZE_SSE4_2
    #endif
    #if defined(__FMA__) || (EIGEN_COMP_MSVC && defined(__AVX2__))
      // MSVC does not expose a switch dedicated for FMA
      // For MSVC, AVX2 => FMA
      #define EIGEN_VECTORIZE_FMA
    #endif
    #if defined(__AVX512F__)
      // AVX512 without FMA is rejected (bug 1638).
      #ifndef EIGEN_VECTORIZE_FMA
        #if EIGEN_COMP_GNUC
          #error Please add -mfma to your compiler flags: compiling with -mavx512f alone without SSE/AVX FMA is not supported (bug 1638).
        #else
          #error Please enable FMA in your compiler flags (e.g. -mfma): compiling with AVX512 alone without SSE/AVX FMA is not supported (bug 1638).
        #endif
      #endif
      #ifndef EIGEN_USE_SYCL
        #define EIGEN_VECTORIZE_AVX512
        #define EIGEN_VECTORIZE_AVX2
        #define EIGEN_VECTORIZE_AVX
      #endif
      #define EIGEN_VECTORIZE_FMA
      #define EIGEN_VECTORIZE_SSE3
      #define EIGEN_VECTORIZE_SSSE3
      #define EIGEN_VECTORIZE_SSE4_1
      #define EIGEN_VECTORIZE_SSE4_2
      #ifndef EIGEN_USE_SYCL
        #ifdef __AVX512DQ__
          #define EIGEN_VECTORIZE_AVX512DQ
        #endif
        #ifdef __AVX512ER__
          #define EIGEN_VECTORIZE_AVX512ER
        #endif
        #ifdef __AVX512BF16__
          #define EIGEN_VECTORIZE_AVX512BF16
        #endif
      #endif
    #endif

    // Disable AVX support on broken xcode versions
    #if defined(__apple_build_version__) && (__apple_build_version__ == 11000033 ) && ( __MAC_OS_X_VERSION_MIN_REQUIRED == 101500 )
      // A nasty bug in the clang compiler shipped with XCode 11.0 shows up in a common compilation
      // situation, namely when the Mac deployment target is macOS 10.15.
      // See https://trac.macports.org/ticket/58776#no1
      #ifdef EIGEN_VECTORIZE_AVX
        #undef EIGEN_VECTORIZE_AVX
        #warning "Disabling AVX support: clang compiler shipped with XCode 11.[012] generates broken assembly with -macosx-version-min=10.15 and AVX enabled. "
        // Everything that builds on top of AVX has to be withdrawn as well.
        #ifdef EIGEN_VECTORIZE_AVX2
          #undef EIGEN_VECTORIZE_AVX2
        #endif
        #ifdef EIGEN_VECTORIZE_FMA
          #undef EIGEN_VECTORIZE_FMA
        #endif
        #ifdef EIGEN_VECTORIZE_AVX512
          #undef EIGEN_VECTORIZE_AVX512
        #endif
        #ifdef EIGEN_VECTORIZE_AVX512DQ
          #undef EIGEN_VECTORIZE_AVX512DQ
        #endif
        #ifdef EIGEN_VECTORIZE_AVX512ER
          #undef EIGEN_VECTORIZE_AVX512ER
        #endif
        // Keep the AVX512 sub-feature symbols consistent: BF16 is defined next to DQ and ER
        // above, so it must not outlive EIGEN_VECTORIZE_AVX512 either.
        #ifdef EIGEN_VECTORIZE_AVX512BF16
          #undef EIGEN_VECTORIZE_AVX512BF16
        #endif
      #endif
      // NOTE: Confirmed test failures in XCode 11.0, and XCode 11.2 with -macosx-version-min=10.15 and AVX
      // NOTE using -macosx-version-min=10.15 with Xcode 11.0 results in runtime segmentation faults in many tests, 11.2 produce core dumps in 3 tests
      // NOTE using -macosx-version-min=10.14 produces functioning and passing tests in all cases
      // NOTE __clang_version__ "11.0.0 (clang-1100.0.33.8)" XCode 11.0 <- Produces many segfault and core dumping tests
      // with -macosx-version-min=10.15 and AVX
      // NOTE __clang_version__ "11.0.0 (clang-1100.0.33.12)" XCode 11.2 <- Produces 3 core dumping tests with
      // -macosx-version-min=10.15 and AVX
    #endif

    // include files

    // This extern "C" works around a MINGW-w64 compilation issue
    // https://sourceforge.net/tracker/index.php?func=detail&aid=3018394&group_id=202880&atid=983354
    // In essence, intrin.h is included by windows.h and also declares intrinsics (just as emmintrin.h etc. below do).
    // However, intrin.h uses an extern "C" declaration, and g++ thus complains of duplicate declarations
    // with conflicting linkage. The linkage for intrinsics doesn't matter, but at that stage the compiler doesn't know;
    // so, to avoid compile errors when windows.h is included after Eigen/Core, ensure intrinsics are extern "C" here too.
    // Notice that since these are C headers, the extern "C" is theoretically needed anyways.
    extern "C" {
      // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly.
      // Doing so triggers some issues with ICC. However old gcc versions seems to not have this file, thus:
      #if EIGEN_COMP_ICC >= 1110
        #include <immintrin.h>
      #else
        #include <mmintrin.h>
        #include <emmintrin.h>
        #include <xmmintrin.h>
        #ifdef EIGEN_VECTORIZE_SSE3
        #include <pmmintrin.h>
        #endif
        #ifdef EIGEN_VECTORIZE_SSSE3
        #include <tmmintrin.h>
        #endif
        #ifdef EIGEN_VECTORIZE_SSE4_1
        #include <smmintrin.h>
        #endif
        #ifdef EIGEN_VECTORIZE_SSE4_2
        #include <nmmintrin.h>
        #endif
        #if defined(EIGEN_VECTORIZE_AVX) || defined(EIGEN_VECTORIZE_AVX512)
        #include <immintrin.h>
        #endif
      #endif
    } // end extern "C"

  #elif defined __VSX__

    #define EIGEN_VECTORIZE
    #define EIGEN_VECTORIZE_VSX
    #include <altivec.h>
    // We need to #undef all these ugly tokens defined in <altivec.h>
    // => use __vector instead of vector
    #undef bool
    #undef vector
    #undef pixel

  #elif defined __ALTIVEC__

    #define EIGEN_VECTORIZE
    #define EIGEN_VECTORIZE_ALTIVEC
    #include <altivec.h>
    // We need to #undef all these ugly tokens defined in <altivec.h>
    // => use __vector instead of vector
    #undef bool
    #undef vector
    #undef pixel

  #elif ((defined __ARM_NEON) || (defined __ARM_NEON__)) && !(defined EIGEN_ARM64_USE_SVE)

    #define EIGEN_VECTORIZE
    #define EIGEN_VECTORIZE_NEON
    #include <arm_neon.h>

  // We currently require SVE to be enabled explicitly via EIGEN_ARM64_USE_SVE and
  // will not select the backend automatically
  #elif (defined __ARM_FEATURE_SVE) && (defined EIGEN_ARM64_USE_SVE)

    #define EIGEN_VECTORIZE
    #define EIGEN_VECTORIZE_SVE
    #include <arm_sve.h>

    // Since we depend on knowing SVE vector lengths at compile-time, we need
    // to ensure a fixed length is set
    #if defined __ARM_FEATURE_SVE_BITS
      #define EIGEN_ARM64_SVE_VL __ARM_FEATURE_SVE_BITS
    #else
      #error "Eigen requires a fixed SVE vector length but EIGEN_ARM64_SVE_VL is not set."
    #endif

  #elif (defined __s390x__ && defined __VEC__)

    #define EIGEN_VECTORIZE
    #define EIGEN_VECTORIZE_ZVECTOR
    #include <vecintrin.h>

  #elif defined __mips_msa

    // Limit MSA optimizations to little-endian CPUs for now.
    // TODO: Perhaps, eventually support MSA optimizations on big-endian CPUs?
    #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
      #if defined(__LP64__)
        #define EIGEN_MIPS_64
      #else
        #define EIGEN_MIPS_32
      #endif
      #define EIGEN_VECTORIZE
      #define EIGEN_VECTORIZE_MSA
      #include <msa.h>
    #endif

  #endif
#endif
| 433 | |
// Following the Arm ACLE arm_neon.h should also include arm_fp16.h but not all
// compilers seem to follow this. We therefore include it explicitly.
// See also: https://bugs.llvm.org/show_bug.cgi?id=47955
#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
  #include <arm_fp16.h>
#endif

// EIGEN_HAS_FP16_C: the F16C conversion instructions are available (__F16C__), we are not
// compiling with a GPU compiler, and the compiler is either not clang or clang >= 3.8.
//
// EIGEN_COMP_CLANG is tested by VALUE rather than with defined(): this file uses the
// EIGEN_COMP_* macros as 0/non-zero values, so a defined() test would treat a macro defined
// to 0 as "clang" and never enable this path on other compilers. An undefined macro
// evaluates to 0 in #if, so testing the value stays correct in that case too.
#if defined(__F16C__) && !defined(EIGEN_GPUCC) && (!EIGEN_COMP_CLANG || EIGEN_COMP_CLANG>=380)
  // We can use the optimized fp16 to float and float to fp16 conversion routines
  #define EIGEN_HAS_FP16_C

  // The FP16C intrinsics are made available through immintrin.h, as opposed to
  // emmintrin.h as suggested by Intel:
  // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#othertechs=FP16C&expand=1711
  // Include it here so the intrinsics are declared even if no vectorization branch above
  // pulled it in.
  #include <immintrin.h>
#endif
| 452 | |
// CUDA: flag GPU vectorization and pull in the vector types. EIGEN_HAS_CUDA_FP16 is
// defined when EIGEN_CUDA_SDK_VER is at least 70500.
#ifdef EIGEN_CUDACC
  #define EIGEN_VECTORIZE_GPU
  #include <vector_types.h>
  #if !(EIGEN_CUDA_SDK_VER < 70500)
    #define EIGEN_HAS_CUDA_FP16
  #endif
#endif

// The fp16 headers are included whenever EIGEN_HAS_CUDA_FP16 is defined,
// whether it was defined just above or beforehand.
#ifdef EIGEN_HAS_CUDA_FP16
  #include <cuda_runtime_api.h>
  #include <cuda_fp16.h>
#endif

// HIP: flag GPU vectorization and fp16 support, and include the matching headers.
#ifdef EIGEN_HIPCC
  #define EIGEN_VECTORIZE_GPU
  #include <hip/hip_vector_types.h>
  #define EIGEN_HAS_HIP_FP16
  #include <hip/hip_fp16.h>
#endif
| 472 | |
| 473 | |
/** \brief Namespace containing all symbols from the %Eigen library. */
namespace Eigen {

/** \returns a string literal listing, comma-separated, the SIMD instruction sets selected at
  * compile time through the EIGEN_VECTORIZE_* macros, or "None" when none of them is defined.
  *
  * Only the first matching macro in the chain below determines the result, so the
  * list reflects the highest detected level together with the levels it implies.
  */
inline static const char *SimdInstructionSetsInUse(void) {
#if defined(EIGEN_VECTORIZE_AVX512)
  return "AVX512, FMA, AVX2, AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
#elif defined(EIGEN_VECTORIZE_AVX)
  // The separator after "AVX" was missing, which made the list read "AVX SSE, ...".
  return "AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
#elif defined(EIGEN_VECTORIZE_SSE4_2)
  return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
#elif defined(EIGEN_VECTORIZE_SSE4_1)
  return "SSE, SSE2, SSE3, SSSE3, SSE4.1";
#elif defined(EIGEN_VECTORIZE_SSSE3)
  return "SSE, SSE2, SSE3, SSSE3";
#elif defined(EIGEN_VECTORIZE_SSE3)
  return "SSE, SSE2, SSE3";
#elif defined(EIGEN_VECTORIZE_SSE2)
  return "SSE, SSE2";
#elif defined(EIGEN_VECTORIZE_ALTIVEC)
  return "AltiVec";
#elif defined(EIGEN_VECTORIZE_VSX)
  return "VSX";
#elif defined(EIGEN_VECTORIZE_NEON)
  return "ARM NEON";
#elif defined(EIGEN_VECTORIZE_SVE)
  return "ARM SVE";
#elif defined(EIGEN_VECTORIZE_ZVECTOR)
  return "S390X ZVECTOR";
#elif defined(EIGEN_VECTORIZE_MSA)
  return "MIPS MSA";
#else
  return "None";
#endif
}

} // end namespace Eigen
| 510 | |
| 511 | |
| 512 | #endif // EIGEN_CONFIGURE_VECTORIZATION_H |
| 513 | |