// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "platform.h"

#if defined(__WIN32__)
#include <intrin.h>
#endif

#if defined(__ARM_NEON)
#include "../simd/arm/emulation.h"
#else
#include <immintrin.h>
#if defined(__EMSCRIPTEN__)
#include "../simd/wasm/emulation.h"
#endif
#endif

#if defined(__BMI__) && defined(__GNUC__) && !defined(__INTEL_COMPILER)
  #if !defined(_tzcnt_u32)
    #define _tzcnt_u32 __tzcnt_u32
  #endif
  #if !defined(_tzcnt_u64)
    #define _tzcnt_u64 __tzcnt_u64
  #endif
#endif

#if defined(__LZCNT__)
  #if !defined(_lzcnt_u32)
    #define _lzcnt_u32 __lzcnt32
  #endif
  #if !defined(_lzcnt_u64)
    #define _lzcnt_u64 __lzcnt64
  #endif
#endif

#if defined(__WIN32__)
#  if !defined(NOMINMAX)
#    define NOMINMAX
#  endif
#  include <windows.h>
#endif

/* normally defined in pmmintrin.h, but we always need this */
#if !defined(_MM_SET_DENORMALS_ZERO_MODE)
#define _MM_DENORMALS_ZERO_ON   (0x0040)
#define _MM_DENORMALS_ZERO_OFF  (0x0000)
#define _MM_DENORMALS_ZERO_MASK (0x0040)
#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
#endif

namespace embree
{

////////////////////////////////////////////////////////////////////////////////
/// Windows Platform
////////////////////////////////////////////////////////////////////////////////

#if defined(__WIN32__)

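  /* read a time stamp; on Windows this uses QueryPerformanceCounter rather than the raw TSC */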
  __forceinline size_t read_tsc()
  {
    LARGE_INTEGER li;
    QueryPerformanceCounter(&li);
    return (size_t)li.QuadPart;
  }

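  /* bsf: bit scan forward, returns the index of the least significant set bit */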
  __forceinline int bsf(int v) {
#if defined(__AVX2__)
    return _tzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanForward(&r,v); return r;
#endif
  }

  __forceinline unsigned bsf(unsigned v) {
#if defined(__AVX2__)
    return _tzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanForward(&r,v); return r;
#endif
  }

#if defined(__X86_64__)
  __forceinline size_t bsf(size_t v) {
#if defined(__AVX2__)
    return _tzcnt_u64(v);
#else
    unsigned long r = 0; _BitScanForward64(&r,v); return r;
#endif
  }
#endif

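  /* bscf: bit scan forward and clear, returns the index of the lowest set bit and clears it in v */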
  __forceinline int bscf(int& v)
  {
    int i = bsf(v);
    v &= v-1;
    return i;
  }

  __forceinline unsigned bscf(unsigned& v)
  {
    unsigned i = bsf(v);
    v &= v-1;
    return i;
  }

#if defined(__X86_64__)
  __forceinline size_t bscf(size_t& v)
  {
    size_t i = bsf(v);
    v &= v-1;
    return i;
  }
#endif

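  /* bsr: bit scan reverse, returns the index of the most significant set bit */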
  __forceinline int bsr(int v) {
#if defined(__AVX2__)
    return 31 - _lzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanReverse(&r,v); return r;
#endif
  }

  __forceinline unsigned bsr(unsigned v) {
#if defined(__AVX2__)
    return 31 - _lzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanReverse(&r,v); return r;
#endif
  }

#if defined(__X86_64__)
  __forceinline size_t bsr(size_t v) {
#if defined(__AVX2__)
    return 63 - _lzcnt_u64(v);
#else
    unsigned long r = 0; _BitScanReverse64(&r, v); return r;
#endif
  }
#endif

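  /* lzcnt: number of leading zero bits (32 for x == 0) */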
  __forceinline int lzcnt(const int x)
  {
#if defined(__AVX2__)
    return _lzcnt_u32(x);
#else
    if (unlikely(x == 0)) return 32;
    return 31 - bsr(x);
#endif
  }

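  /* btc/bts/btr: return v with bit i complemented, set, or cleared */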
  __forceinline int btc(int v, int i) {
    long r = v; _bittestandcomplement(&r,i); return r;
  }

  __forceinline int bts(int v, int i) {
    long r = v; _bittestandset(&r,i); return r;
  }

  __forceinline int btr(int v, int i) {
    long r = v; _bittestandreset(&r,i); return r;
  }

#if defined(__X86_64__)

  __forceinline size_t btc(size_t v, size_t i) {
    size_t r = v; _bittestandcomplement64((__int64*)&r,i); return r;
  }

  __forceinline size_t bts(size_t v, size_t i) {
    __int64 r = v; _bittestandset64(&r,i); return r;
  }

  __forceinline size_t btr(size_t v, size_t i) {
    __int64 r = v; _bittestandreset64(&r,i); return r;
  }

#endif

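  /* atomic compare-and-swap: returns the previous value of *p */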
  __forceinline int32_t atomic_cmpxchg(volatile int32_t* p, const int32_t c, const int32_t v) {
    return _InterlockedCompareExchange((volatile long*)p,v,c);
  }

////////////////////////////////////////////////////////////////////////////////
/// Unix Platform
////////////////////////////////////////////////////////////////////////////////

#else

#if defined(__i386__) && defined(__PIC__)

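  /* on 32-bit PIC builds %ebx holds the GOT pointer, so it is saved and restored around cpuid */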
  __forceinline void __cpuid(int out[4], int op)
  {
    asm volatile ("xchg{l}\t{%%}ebx, %1\n\t"
                  "cpuid\n\t"
                  "xchg{l}\t{%%}ebx, %1\n\t"
                  : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
                  : "0"(op));
  }

  __forceinline void __cpuid_count(int out[4], int op1, int op2)
  {
    asm volatile ("xchg{l}\t{%%}ebx, %1\n\t"
                  "cpuid\n\t"
                  "xchg{l}\t{%%}ebx, %1\n\t"
                  : "=a" (out[0]), "=r" (out[1]), "=c" (out[2]), "=d" (out[3])
                  : "0" (op1), "2" (op2));
  }

#elif defined(__X86_ASM__)

  __forceinline void __cpuid(int out[4], int op) {
    asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op));
  }

  __forceinline void __cpuid_count(int out[4], int op1, int op2) {
    asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op1), "c"(op2));
  }

#endif

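  /* read the CPU time stamp counter via rdtsc; returns 0 on targets without x86 assembly */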
  __forceinline uint64_t read_tsc() {
#if defined(__X86_ASM__)
    uint32_t high,low;
    asm volatile ("rdtsc" : "=d"(high), "=a"(low));
    return (((uint64_t)high) << 32) + (uint64_t)low;
#else
    /* Not supported yet, meaning measuring traversal cost per pixel does not work. */
    return 0;
#endif
  }

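  /* bsf: bit scan forward, returns the index of the least significant set bit */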
  __forceinline int bsf(int v) {
#if defined(__AVX2__)
    return _tzcnt_u32(v);
#elif defined(__X86_ASM__)
    int r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_ctz(v);
#endif
  }

#if defined(__64BIT__)
  __forceinline unsigned bsf(unsigned v)
  {
#if defined(__AVX2__)
    return _tzcnt_u32(v);
#elif defined(__X86_ASM__)
    unsigned r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_ctz(v);
#endif
  }
#endif

  __forceinline size_t bsf(size_t v) {
#if defined(__AVX2__)
#if defined(__X86_64__)
    return _tzcnt_u64(v);
#else
    return _tzcnt_u32(v);
#endif
#elif defined(__X86_ASM__)
    size_t r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_ctzl(v);
#endif
  }

  __forceinline int bscf(int& v)
  {
    int i = bsf(v);
    v &= v-1;
    return i;
  }

#if defined(__64BIT__)
  __forceinline unsigned int bscf(unsigned int& v)
  {
    unsigned int i = bsf(v);
    v &= v-1;
    return i;
  }
#endif

  __forceinline size_t bscf(size_t& v)
  {
    size_t i = bsf(v);
    v &= v-1;
    return i;
  }

  __forceinline int bsr(int v) {
#if defined(__AVX2__)
    return 31 - _lzcnt_u32(v);
#elif defined(__X86_ASM__)
    int r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_clz(v) ^ 31;
#endif
  }

#if defined(__64BIT__) || defined(__EMSCRIPTEN__)
  __forceinline unsigned bsr(unsigned v) {
#if defined(__AVX2__)
    return 31 - _lzcnt_u32(v);
#elif defined(__X86_ASM__)
    unsigned r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_clz(v) ^ 31;
#endif
  }
#endif

  __forceinline size_t bsr(size_t v) {
#if defined(__AVX2__)
#if defined(__X86_64__)
    return 63 - _lzcnt_u64(v);
#else
    return 31 - _lzcnt_u32(v);
#endif
#elif defined(__X86_ASM__)
    size_t r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return (sizeof(v) * 8 - 1) - __builtin_clzl(v);
#endif
  }

  __forceinline int lzcnt(const int x)
  {
#if defined(__AVX2__)
    return _lzcnt_u32(x);
#else
    if (unlikely(x == 0)) return 32;
    return 31 - bsr(x);
#endif
  }

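  /* blsr: clear the lowest set bit of v */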
  __forceinline size_t blsr(size_t v) {
#if defined(__AVX2__)
#if defined(__INTEL_COMPILER)
    return _blsr_u64(v);
#else
#if defined(__X86_64__)
    return __blsr_u64(v);
#else
    return __blsr_u32(v);
#endif
#endif
#else
    return v & (v-1);
#endif
  }

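  /* btc/bts/btr: return v with bit i complemented, set, or cleared */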
  __forceinline int btc(int v, int i) {
#if defined(__X86_ASM__)
    int r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r;
#else
    return (v ^ (1 << i));
#endif
  }

  __forceinline int bts(int v, int i) {
#if defined(__X86_ASM__)
    int r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#else
    return (v | (1 << i));
#endif
  }

  __forceinline int btr(int v, int i) {
#if defined(__X86_ASM__)
    int r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#else
    return (v & ~(1 << i));
#endif
  }

  __forceinline size_t btc(size_t v, size_t i) {
#if defined(__X86_ASM__)
    size_t r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r;
#else
    /* use a size_t-wide shift so bit indices >= 32 are handled correctly */
    return (v ^ (size_t(1) << i));
#endif
  }

  __forceinline size_t bts(size_t v, size_t i) {
#if defined(__X86_ASM__)
    size_t r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#else
    return (v | (size_t(1) << i));
#endif
  }

  __forceinline size_t btr(size_t v, size_t i) {
#if defined(__X86_ASM__)
    size_t r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#else
    return (v & ~(size_t(1) << i));
#endif
  }

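  /* atomic compare-and-swap: returns the previous value of *value */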
  __forceinline int32_t atomic_cmpxchg(int32_t volatile* value, int32_t comparand, const int32_t input) {
    return __sync_val_compare_and_swap(value, comparand, input);
  }

#endif

////////////////////////////////////////////////////////////////////////////////
/// All Platforms
////////////////////////////////////////////////////////////////////////////////

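/* fallbacks for compilers that lack the _mm*_undefined_* intrinsics (return zeroed registers instead) */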
#if defined(__clang__) || defined(__GNUC__)
#if !defined(_mm_undefined_ps)
  __forceinline __m128 _mm_undefined_ps() { return _mm_setzero_ps(); }
#endif
#if !defined(_mm_undefined_si128)
  __forceinline __m128i _mm_undefined_si128() { return _mm_setzero_si128(); }
#endif
#if !defined(_mm256_undefined_ps) && defined(__AVX__)
  __forceinline __m256 _mm256_undefined_ps() { return _mm256_setzero_ps(); }
#endif
#if !defined(_mm256_undefined_si256) && defined(__AVX__)
  __forceinline __m256i _mm256_undefined_si256() { return _mm256_setzero_si256(); }
#endif
#if !defined(_mm512_undefined_ps) && defined(__AVX512F__)
  __forceinline __m512 _mm512_undefined_ps() { return _mm512_setzero_ps(); }
#endif
#if !defined(_mm512_undefined_epi32) && defined(__AVX512F__)
  __forceinline __m512i _mm512_undefined_epi32() { return _mm512_setzero_si512(); }
#endif
#endif

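/* popcnt: number of set bits */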
#if defined(__SSE4_2__)

  __forceinline int popcnt(int in) {
    return _mm_popcnt_u32(in);
  }

  __forceinline unsigned popcnt(unsigned in) {
    return _mm_popcnt_u32(in);
  }

#if defined(__64BIT__)
  __forceinline size_t popcnt(size_t in) {
    return _mm_popcnt_u64(in);
  }
#endif

#endif

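/* serialized time stamp read: cpuid acts as a barrier around the rdtsc */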
#if defined(__X86_ASM__)
  __forceinline uint64_t rdtsc()
  {
    int dummy[4];
    __cpuid(dummy,0);
    uint64_t clock = read_tsc();
    __cpuid(dummy,0);
    return clock;
  }
#endif

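  /* executes N pause instructions to back off in spin-wait loops */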
  __forceinline void pause_cpu(const size_t N = 8)
  {
    for (size_t i=0; i<N; i++)
      _mm_pause();
  }

  /* prefetches */
  __forceinline void prefetchL1 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T0); }
  __forceinline void prefetchL2 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T1); }
  __forceinline void prefetchL3 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T2); }
  __forceinline void prefetchNTA(const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_NTA); }
  __forceinline void prefetchEX (const void* ptr) {
#if defined(__INTEL_COMPILER)
    _mm_prefetch((const char*)ptr,_MM_HINT_ET0);
#else
    _mm_prefetch((const char*)ptr,_MM_HINT_T0);
#endif
  }

  __forceinline void prefetchL1EX(const void* ptr) {
    prefetchEX(ptr);
  }

  __forceinline void prefetchL2EX(const void* ptr) {
    prefetchEX(ptr);
  }

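/* BMI2 parallel bit extract (pext) and deposit (pdep) */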
#if defined(__AVX2__)
  __forceinline unsigned int pext(unsigned int a, unsigned int b) { return _pext_u32(a, b); }
  __forceinline unsigned int pdep(unsigned int a, unsigned int b) { return _pdep_u32(a, b); }
#if defined(__X86_64__)
  __forceinline size_t pext(size_t a, size_t b) { return _pext_u64(a, b); }
  __forceinline size_t pdep(size_t a, size_t b) { return _pdep_u64(a, b); }
#endif
#endif

#if defined(__AVX512F__)
#if defined(__INTEL_COMPILER)
  __forceinline float mm512_cvtss_f32(__m512 v) {
    return _mm512_cvtss_f32(v);
  }
  __forceinline int mm512_mask2int(__mmask16 k1) {
    return _mm512_mask2int(k1);
  }
  __forceinline __mmask16 mm512_int2mask(int mask) {
    return _mm512_int2mask(mask);
  }
#else
  __forceinline float mm512_cvtss_f32(__m512 v) { // FIXME: _mm512_cvtss_f32 neither supported by clang v4.0.0 nor GCC 6.3
    return _mm_cvtss_f32(_mm512_castps512_ps128(v));
  }
  __forceinline int mm512_mask2int(__mmask16 k1) { // FIXME: _mm512_mask2int not yet supported by GCC 6.3
    return (int)k1;
  }
  __forceinline __mmask16 mm512_int2mask(int mask) { // FIXME: _mm512_int2mask not yet supported by GCC 6.3
    return (__mmask16)mask;
  }
#endif
#endif
}

// source code of qtquick3d/src/3rdparty/embree/common/sys/intrinsics.h