// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "platform.h"

#if defined(__WIN32__)
#include <intrin.h>
#endif

#if defined(__ARM_NEON)
#include "../simd/arm/emulation.h"
#else
#include <immintrin.h>
#if defined(__EMSCRIPTEN__)
#include "../simd/wasm/emulation.h"
#endif
#endif

#if defined(__BMI__) && defined(__GNUC__) && !defined(__INTEL_COMPILER)
#if !defined(_tzcnt_u32)
#define _tzcnt_u32 __tzcnt_u32
#endif
#if !defined(_tzcnt_u64)
#define _tzcnt_u64 __tzcnt_u64
#endif
#endif

#if defined(__LZCNT__)
#if !defined(_lzcnt_u32)
#define _lzcnt_u32 __lzcnt32
#endif
#if !defined(_lzcnt_u64)
#define _lzcnt_u64 __lzcnt64
#endif
#endif

#if defined(__WIN32__)
#  if !defined(NOMINMAX)
#    define NOMINMAX
#  endif
#  include <windows.h>
#endif

/* normally defined in pmmintrin.h, but we always need this */
#if !defined(_MM_SET_DENORMALS_ZERO_MODE)
#define _MM_DENORMALS_ZERO_ON   (0x0040)
#define _MM_DENORMALS_ZERO_OFF  (0x0000)
#define _MM_DENORMALS_ZERO_MASK (0x0040)
#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
#endif
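
/* Illustrative usage (e.g. at thread startup), typically combined with flush-to-zero
   to avoid the performance penalty of denormalized floats:
     _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
     _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
*/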

namespace embree
{

  ////////////////////////////////////////////////////////////////////////////////
  /// Windows Platform
  ////////////////////////////////////////////////////////////////////////////////

#if defined(__WIN32__)
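
  /* note: on Windows this reads the OS performance counter rather than the raw time stamp counter */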

  __forceinline size_t read_tsc()
  {
    LARGE_INTEGER li;
    QueryPerformanceCounter(&li);
    return (size_t)li.QuadPart;
  }

  __forceinline int bsf(int v) {
#if defined(__AVX2__)
    return _tzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanForward(&r,v); return r;
#endif
  }

  __forceinline unsigned bsf(unsigned v) {
#if defined(__AVX2__)
    return _tzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanForward(&r,v); return r;
#endif
  }

#if defined(__X86_64__)
  __forceinline size_t bsf(size_t v) {
#if defined(__AVX2__)
    return _tzcnt_u64(v);
#else
    unsigned long r = 0; _BitScanForward64(&r,v); return r;
#endif
  }
#endif
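
  /* bscf ("bit scan forward and clear"): returns the index of the lowest set bit of v and
     clears that bit in v. A typical (illustrative) use is iterating over all set bits of a mask:
       for (unsigned mask = m; mask != 0; ) {
         const unsigned i = bscf(mask);
         ...; // process bit i
       }
     As with bsf, the result is unspecified for v == 0. */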

  __forceinline int bscf(int& v)
  {
    int i = bsf(v);
    v &= v-1;
    return i;
  }

  __forceinline unsigned bscf(unsigned& v)
  {
    unsigned i = bsf(v);
    v &= v-1;
    return i;
  }

#if defined(__X86_64__)
  __forceinline size_t bscf(size_t& v)
  {
    size_t i = bsf(v);
    v &= v-1;
    return i;
  }
#endif

  __forceinline int bsr(int v) {
#if defined(__AVX2__)
    return 31 - _lzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanReverse(&r,v); return r;
#endif
  }

  __forceinline unsigned bsr(unsigned v) {
#if defined(__AVX2__)
    return 31 - _lzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanReverse(&r,v); return r;
#endif
  }

#if defined(__X86_64__)
  __forceinline size_t bsr(size_t v) {
#if defined(__AVX2__)
    return 63 - _lzcnt_u64(v);
#else
    unsigned long r = 0; _BitScanReverse64(&r, v); return r;
#endif
  }
#endif

  __forceinline int lzcnt(const int x)
  {
#if defined(__AVX2__)
    return _lzcnt_u32(x);
#else
    if (unlikely(x == 0)) return 32;
    return 31 - bsr(x);
#endif
  }
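
  /* single-bit operations: btc toggles, bts sets, and btr clears bit i of v, returning the modified value */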

  __forceinline int btc(int v, int i) {
    long r = v; _bittestandcomplement(&r,i); return r;
  }

  __forceinline int bts(int v, int i) {
    long r = v; _bittestandset(&r,i); return r;
  }

  __forceinline int btr(int v, int i) {
    long r = v; _bittestandreset(&r,i); return r;
  }

#if defined(__X86_64__)

  __forceinline size_t btc(size_t v, size_t i) {
    size_t r = v; _bittestandcomplement64((__int64*)&r,i); return r;
  }

  __forceinline size_t bts(size_t v, size_t i) {
    __int64 r = v; _bittestandset64(&r,i); return r;
  }

  __forceinline size_t btr(size_t v, size_t i) {
    __int64 r = v; _bittestandreset64(&r,i); return r;
  }

#endif
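
  /* atomic compare-and-swap: if *p equals c, stores v into *p; returns the previous value of *p */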

  __forceinline int32_t atomic_cmpxchg(volatile int32_t* p, const int32_t c, const int32_t v) {
    return _InterlockedCompareExchange((volatile long*)p,v,c);
  }

  ////////////////////////////////////////////////////////////////////////////////
  /// Unix Platform
  ////////////////////////////////////////////////////////////////////////////////

#else

#if defined(__i386__) && defined(__PIC__)

  __forceinline void __cpuid(int out[4], int op)
  {
    asm volatile ("xchg{l}\t{%%}ebx, %1\n\t"
                  "cpuid\n\t"
                  "xchg{l}\t{%%}ebx, %1\n\t"
                  : "=a" (out[0]), "=r" (out[1]), "=c" (out[2]), "=d" (out[3])
                  : "0" (op));
  }

  __forceinline void __cpuid_count(int out[4], int op1, int op2)
  {
    asm volatile ("xchg{l}\t{%%}ebx, %1\n\t"
                  "cpuid\n\t"
                  "xchg{l}\t{%%}ebx, %1\n\t"
                  : "=a" (out[0]), "=r" (out[1]), "=c" (out[2]), "=d" (out[3])
                  : "0" (op1), "2" (op2));
  }

#elif defined(__X86_ASM__)

  __forceinline void __cpuid(int out[4], int op) {
    asm volatile ("cpuid" : "=a" (out[0]), "=b" (out[1]), "=c" (out[2]), "=d" (out[3]) : "a" (op));
  }

  __forceinline void __cpuid_count(int out[4], int op1, int op2) {
    asm volatile ("cpuid" : "=a" (out[0]), "=b" (out[1]), "=c" (out[2]), "=d" (out[3]) : "a" (op1), "c" (op2));
  }

#endif

  __forceinline uint64_t read_tsc() {
#if defined(__X86_ASM__)
    uint32_t high,low;
    asm volatile ("rdtsc" : "=d" (high), "=a" (low));
    return (((uint64_t)high) << 32) + (uint64_t)low;
#else
    /* rdtsc is not supported on this platform yet; measuring traversal cost per pixel will not work */
    return 0;
#endif
  }

  __forceinline int bsf(int v) {
#if defined(__AVX2__)
    return _tzcnt_u32(v);
#elif defined(__X86_ASM__)
    int r = 0; asm ("bsf %1,%0" : "=r" (r) : "r" (v)); return r;
#else
    return __builtin_ctz(v);
#endif
  }
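
  /* on 32-bit targets size_t is typically the same type as unsigned int, in which case this overload
     would collide with bsf(size_t) below; it is therefore only compiled on 64-bit builds */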

#if defined(__64BIT__)
  __forceinline unsigned bsf(unsigned v)
  {
#if defined(__AVX2__)
    return _tzcnt_u32(v);
#elif defined(__X86_ASM__)
    unsigned r = 0; asm ("bsf %1,%0" : "=r" (r) : "r" (v)); return r;
#else
    return __builtin_ctz(v);
#endif
  }
#endif

  __forceinline size_t bsf(size_t v) {
#if defined(__AVX2__)
#if defined(__X86_64__)
    return _tzcnt_u64(v);
#else
    return _tzcnt_u32(v);
#endif
#elif defined(__X86_ASM__)
    size_t r = 0; asm ("bsf %1,%0" : "=r" (r) : "r" (v)); return r;
#else
    return __builtin_ctzl(v);
#endif
  }

  __forceinline int bscf(int& v)
  {
    int i = bsf(v);
    v &= v-1;
    return i;
  }

#if defined(__64BIT__)
  __forceinline unsigned int bscf(unsigned int& v)
  {
    unsigned int i = bsf(v);
    v &= v-1;
    return i;
  }
#endif

  __forceinline size_t bscf(size_t& v)
  {
    size_t i = bsf(v);
    v &= v-1;
    return i;
  }

  __forceinline int bsr(int v) {
#if defined(__AVX2__)
    return 31 - _lzcnt_u32(v);
#elif defined(__X86_ASM__)
    int r = 0; asm ("bsr %1,%0" : "=r" (r) : "r" (v)); return r;
#else
    return __builtin_clz(v) ^ 31;
#endif
  }

#if defined(__64BIT__) || defined(__EMSCRIPTEN__)
  __forceinline unsigned bsr(unsigned v) {
#if defined(__AVX2__)
    return 31 - _lzcnt_u32(v);
#elif defined(__X86_ASM__)
    unsigned r = 0; asm ("bsr %1,%0" : "=r" (r) : "r" (v)); return r;
#else
    return __builtin_clz(v) ^ 31;
#endif
  }
#endif

  __forceinline size_t bsr(size_t v) {
#if defined(__AVX2__)
#if defined(__X86_64__)
    return 63 - _lzcnt_u64(v);
#else
    return 31 - _lzcnt_u32(v);
#endif
#elif defined(__X86_ASM__)
    size_t r = 0; asm ("bsr %1,%0" : "=r" (r) : "r" (v)); return r;
#else
    return (sizeof(v) * 8 - 1) - __builtin_clzl(v);
#endif
  }

  __forceinline int lzcnt(const int x)
  {
#if defined(__AVX2__)
    return _lzcnt_u32(x);
#else
    if (unlikely(x == 0)) return 32;
    return 31 - bsr(x);
#endif
  }

  __forceinline size_t blsr(size_t v) {
#if defined(__AVX2__)
#if defined(__INTEL_COMPILER)
    return _blsr_u64(v);
#else
#if defined(__X86_64__)
    return __blsr_u64(v);
#else
    return __blsr_u32(v);
#endif
#endif
#else
    return v & (v-1);
#endif
  }

  __forceinline int btc(int v, int i) {
#if defined(__X86_ASM__)
    int r = 0; asm ("btc %1,%0" : "=r" (r) : "r" (i), "0" (v) : "flags" ); return r;
#else
    return (v ^ (1 << i));
#endif
  }

  __forceinline int bts(int v, int i) {
#if defined(__X86_ASM__)
    int r = 0; asm ("bts %1,%0" : "=r" (r) : "r" (i), "0" (v) : "flags" ); return r;
#else
    return (v | (1 << i));
#endif
  }

  __forceinline int btr(int v, int i) {
#if defined(__X86_ASM__)
    int r = 0; asm ("btr %1,%0" : "=r" (r) : "r" (i), "0" (v) : "flags" ); return r;
#else
    return (v & ~(1 << i));
#endif
  }

  __forceinline size_t btc(size_t v, size_t i) {
#if defined(__X86_ASM__)
    size_t r = 0; asm ("btc %1,%0" : "=r" (r) : "r" (i), "0" (v) : "flags" ); return r;
#else
    return (v ^ (size_t(1) << i)); // use a size_t literal so the shift is valid for i >= 32
#endif
  }

  __forceinline size_t bts(size_t v, size_t i) {
#if defined(__X86_ASM__)
    size_t r = 0; asm ("bts %1,%0" : "=r" (r) : "r" (i), "0" (v) : "flags" ); return r;
#else
    return (v | (size_t(1) << i));
#endif
  }

  __forceinline size_t btr(size_t v, size_t i) {
#if defined(__X86_ASM__)
    size_t r = 0; asm ("btr %1,%0" : "=r" (r) : "r" (i), "0" (v) : "flags" ); return r;
#else
    return (v & ~(size_t(1) << i));
#endif
  }

  __forceinline int32_t atomic_cmpxchg(int32_t volatile* value, int32_t comparand, const int32_t input) {
    return __sync_val_compare_and_swap(value, comparand, input);
  }

#endif

  ////////////////////////////////////////////////////////////////////////////////
  /// All Platforms
  ////////////////////////////////////////////////////////////////////////////////
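
  /* Some GCC/Clang versions do not ship the *_undefined intrinsics; the stand-ins below return
     zero-initialized registers instead, which is semantically safe at a marginal cost. */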

#if defined(__clang__) || defined(__GNUC__)
#if !defined(_mm_undefined_ps)
  __forceinline __m128 _mm_undefined_ps() { return _mm_setzero_ps(); }
#endif
#if !defined(_mm_undefined_si128)
  __forceinline __m128i _mm_undefined_si128() { return _mm_setzero_si128(); }
#endif
#if !defined(_mm256_undefined_ps) && defined(__AVX__)
  __forceinline __m256 _mm256_undefined_ps() { return _mm256_setzero_ps(); }
#endif
#if !defined(_mm256_undefined_si256) && defined(__AVX__)
  __forceinline __m256i _mm256_undefined_si256() { return _mm256_setzero_si256(); }
#endif
#if !defined(_mm512_undefined_ps) && defined(__AVX512F__)
  __forceinline __m512 _mm512_undefined_ps() { return _mm512_setzero_ps(); }
#endif
#if !defined(_mm512_undefined_epi32) && defined(__AVX512F__)
  __forceinline __m512i _mm512_undefined_epi32() { return _mm512_setzero_si512(); }
#endif
#endif

#if defined(__SSE4_2__)

  __forceinline int popcnt(int in) {
    return _mm_popcnt_u32(in);
  }

  __forceinline unsigned popcnt(unsigned in) {
    return _mm_popcnt_u32(in);
  }

#if defined(__64BIT__)
  __forceinline size_t popcnt(size_t in) {
    return _mm_popcnt_u64(in);
  }
#endif

#endif

#if defined(__X86_ASM__)
  __forceinline uint64_t rdtsc()
  {
    int dummy[4];
    __cpuid(dummy,0); // cpuid serializes execution so the time stamp read is not reordered
    uint64_t clock = read_tsc();
    __cpuid(dummy,0);
    return clock;
  }
#endif
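
  /* executes N pause instructions; intended as a back-off primitive inside spin-wait loops */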

  __forceinline void pause_cpu(const size_t N = 8)
  {
    for (size_t i=0; i<N; i++)
      _mm_pause();
  }

  /* prefetches */
  __forceinline void prefetchL1 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T0); }
  __forceinline void prefetchL2 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T1); }
  __forceinline void prefetchL3 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T2); }
  __forceinline void prefetchNTA(const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_NTA); }
  __forceinline void prefetchEX (const void* ptr) {
#if defined(__INTEL_COMPILER)
    _mm_prefetch((const char*)ptr,_MM_HINT_ET0);
#else
    _mm_prefetch((const char*)ptr,_MM_HINT_T0);
#endif
  }

  __forceinline void prefetchL1EX(const void* ptr) {
    prefetchEX(ptr);
  }

  __forceinline void prefetchL2EX(const void* ptr) {
    prefetchEX(ptr);
  }
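
  /* BMI2 parallel bit extract/deposit: pext gathers the bits of 'a' selected by mask 'b' into the
     low bits of the result; pdep scatters the low bits of 'a' to the bit positions selected by mask 'b' */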
#if defined(__AVX2__)
  __forceinline unsigned int pext(unsigned int a, unsigned int b) { return _pext_u32(a, b); }
  __forceinline unsigned int pdep(unsigned int a, unsigned int b) { return _pdep_u32(a, b); }
#if defined(__X86_64__)
  __forceinline size_t pext(size_t a, size_t b) { return _pext_u64(a, b); }
  __forceinline size_t pdep(size_t a, size_t b) { return _pdep_u64(a, b); }
#endif
#endif

#if defined(__AVX512F__)
#if defined(__INTEL_COMPILER)
  __forceinline float mm512_cvtss_f32(__m512 v) {
    return _mm512_cvtss_f32(v);
  }
  __forceinline int mm512_mask2int(__mmask16 k1) {
    return _mm512_mask2int(k1);
  }
  __forceinline __mmask16 mm512_int2mask(int mask) {
    return _mm512_int2mask(mask);
  }
#else
  __forceinline float mm512_cvtss_f32(__m512 v) { // FIXME: _mm512_cvtss_f32 is supported by neither clang v4.0.0 nor GCC 6.3
    return _mm_cvtss_f32(_mm512_castps512_ps128(v));
  }
  __forceinline int mm512_mask2int(__mmask16 k1) { // FIXME: _mm512_mask2int not yet supported by GCC 6.3
    return (int)k1;
  }
  __forceinline __mmask16 mm512_int2mask(int mask) { // FIXME: _mm512_int2mask not yet supported by GCC 6.3
    return (__mmask16)mask;
  }
#endif
#endif
}