/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/
16
17#ifndef __TBB_detail__machine_H
18#define __TBB_detail__machine_H
19
20#include "_config.h"
21#include "_assert.h"
22
23#include <atomic>
24#include <climits>
25#include <cstdint>
26#include <cstddef>
27
28#ifdef _WIN32
29#include <intrin.h>
30#ifdef __TBBMALLOC_BUILD
31#define WIN32_LEAN_AND_MEAN
32#define NOMINMAX
33#include <windows.h> // SwitchToThread()
34#endif
35#ifdef _MSC_VER
36#if __TBB_x86_64 || __TBB_x86_32
37#pragma intrinsic(__rdtsc)
38#endif
39#endif
40#endif
41#if __TBB_x86_64 || __TBB_x86_32
42#include <immintrin.h> // _mm_pause
43#endif
44#if (_WIN32)
45#include <float.h> // _control87
46#endif
47
48#if __TBB_GLIBCXX_THIS_THREAD_YIELD_BROKEN
49#include <sched.h> // sched_yield
50#else
51#include <thread> // std::this_thread::yield()
52#endif
53
54namespace tbb {
55namespace detail {
56inline namespace d0 {
57
58//--------------------------------------------------------------------------------------------------
59// Yield implementation
60//--------------------------------------------------------------------------------------------------
61
// Makes an unqualified yield() available in this namespace. Exactly one of the three
// alternatives below is selected at preprocessing time.
#if __TBB_GLIBCXX_THIS_THREAD_YIELD_BROKEN
// Configuration says std::this_thread::yield() cannot be used: call sched_yield() directly
// and assert that it reported success.
static inline void yield() {
    int err = sched_yield();
    __TBB_ASSERT_EX(err == 0, "sched_yield has failed");
}
#elif __TBBMALLOC_BUILD && _WIN32
// Use Windows API for yield in tbbmalloc to avoid dependency on C++ runtime with some implementations.
static inline void yield() {
    SwitchToThread();
}
#else
// Default: re-export the standard library implementation.
using std::this_thread::yield;
#endif
75
76//--------------------------------------------------------------------------------------------------
77// atomic_fence implementation
78//--------------------------------------------------------------------------------------------------
79
#if _MSC_VER && (__TBB_x86_64 || __TBB_x86_32)
#pragma intrinsic(_mm_mfence)
#endif

// Issues a thread fence with the requested memory ordering.
// On MSVC x86/x64 builds the seq_cst, acq_rel, acquire and release orderings are served by an
// explicit _mm_mfence(); every other case is forwarded to std::atomic_thread_fence().
static inline void atomic_fence(std::memory_order order) {
#if _MSC_VER && (__TBB_x86_64 || __TBB_x86_32)
    if (order == std::memory_order_seq_cst ||
        order == std::memory_order_acq_rel ||
        order == std::memory_order_acquire ||
        order == std::memory_order_release )
    {
        _mm_mfence();
        return;
    }
#endif /*_MSC_VER && (__TBB_x86_64 || __TBB_x86_32)*/
    std::atomic_thread_fence(order);
}
97
98//--------------------------------------------------------------------------------------------------
99// Pause implementation
100//--------------------------------------------------------------------------------------------------
101
102static inline void machine_pause(int32_t delay) {
103#if __TBB_x86_64 || __TBB_x86_32
104 while (delay-- > 0) { _mm_pause(); }
105#elif __ARM_ARCH_7A__ || __aarch64__
106 while (delay-- > 0) { __asm__ __volatile__("yield" ::: "memory"); }
107#else /* Generic */
108 (void)delay; // suppress without including _template_helpers.h
109 yield();
110#endif
111}
112
113////////////////////////////////////////////////////////////////////////////////////////////////////
114// tbb::detail::log2() implementation
115////////////////////////////////////////////////////////////////////////////////////////////////////
116// TODO: Use log2p1() function that will be available in C++20 standard
117
#if defined(__GNUC__) || defined(__clang__)
// Overload set that selects the count-leading-zeros builtin matching the argument type.
// The compiler builtins leave the result undefined for x == 0, and so do these wrappers.
namespace gnu_builtins {
    inline std::uintptr_t clz(unsigned int x) { return static_cast<std::uintptr_t>(__builtin_clz(x)); }
    inline std::uintptr_t clz(unsigned long int x) { return static_cast<std::uintptr_t>(__builtin_clzl(x)); }
    inline std::uintptr_t clz(unsigned long long int x) { return static_cast<std::uintptr_t>(__builtin_clzll(x)); }
}
#elif defined(_MSC_VER)
#pragma intrinsic(__TBB_W(_BitScanReverse))
namespace msvc_intrinsics {
    // Returns the index that the _BitScanReverse intrinsic stores for i.
    // NOTE(review): `j` is returned as written by the intrinsic; for i == 0 it may be left
    // unset - confirm callers never pass zero.
    static inline std::uintptr_t bit_scan_reverse(std::uintptr_t i) {
        unsigned long j;
        __TBB_W(_BitScanReverse)( &j, i );
        return j;
    }
}
#endif
134
// Width of T in bits (sizeof(T) * CHAR_BIT), usable in constant expressions.
template <typename T>
constexpr std::uintptr_t number_of_bits() {
    return sizeof(T) * CHAR_BIT;
}
139
140// logarithm is the index of the most significant non-zero bit
141static inline uintptr_t machine_log2(uintptr_t x) {
142#if defined(__GNUC__) || defined(__clang__)
143 // If P is a power of 2 and x<P, then (P-1)-x == (P-1) XOR x
144 return (number_of_bits<decltype(x)>() - 1) ^ gnu_builtins::clz(x);
145#elif defined(_MSC_VER)
146 return msvc_intrinsics::bit_scan_reverse(x);
147#elif __i386__ || __i386 /*for Sun OS*/ || __MINGW32__
148 uintptr_t j, i = x;
149 __asm__("bsr %1,%0" : "=r"(j) : "r"(i));
150 return j;
151#elif __powerpc__ || __POWERPC__
152 #if __TBB_WORDSIZE==8
153 __asm__ __volatile__ ("cntlzd %0,%0" : "+r"(x));
154 return 63 - static_cast<intptr_t>(x);
155 #else
156 __asm__ __volatile__ ("cntlzw %0,%0" : "+r"(x));
157 return 31 - static_cast<intptr_t>(x);
158 #endif /*__TBB_WORDSIZE*/
159#elif __sparc
160 uint64_t count;
161 // one hot encode
162 x |= (x >> 1);
163 x |= (x >> 2);
164 x |= (x >> 4);
165 x |= (x >> 8);
166 x |= (x >> 16);
167 x |= (x >> 32);
168 // count 1's
169 __asm__ ("popc %1, %0" : "=r"(count) : "r"(x) );
170 return count - 1;
171#else
172 intptr_t result = 0;
173
174 if( sizeof(x) > 4 && (uintptr_t tmp = x >> 32) ) { x = tmp; result += 32; }
175 if( uintptr_t tmp = x >> 16 ) { x = tmp; result += 16; }
176 if( uintptr_t tmp = x >> 8 ) { x = tmp; result += 8; }
177 if( uintptr_t tmp = x >> 4 ) { x = tmp; result += 4; }
178 if( uintptr_t tmp = x >> 2 ) { x = tmp; result += 2; }
179
180 return (x & 2) ? result + 1 : result;
181#endif
182}
183
184////////////////////////////////////////////////////////////////////////////////////////////////////
185// tbb::detail::reverse_bits() implementation
186////////////////////////////////////////////////////////////////////////////////////////////////////
#if TBB_USE_CLANG_BITREVERSE_BUILTINS
// Overload set mapping each fixed-width unsigned type onto the clang bit-reverse builtin of
// the same width.
namespace llvm_builtins {
    inline std::uint8_t  builtin_bitreverse(std::uint8_t  x) { return __builtin_bitreverse8 (x); }
    inline std::uint16_t builtin_bitreverse(std::uint16_t x) { return __builtin_bitreverse16(x); }
    inline std::uint32_t builtin_bitreverse(std::uint32_t x) { return __builtin_bitreverse32(x); }
    inline std::uint64_t builtin_bitreverse(std::uint64_t x) { return __builtin_bitreverse64(x); }
}
#else // generic
// Holder for the byte-reversal lookup table. It is a class template so that the static data
// member can be defined in this header.
template<typename T>
struct reverse {
    static const T byte_table[256];
};

// byte_table[b] is b with the order of its eight bits reversed (e.g. 0x01 -> 0x80).
template<typename T>
const T reverse<T>::byte_table[256] = {
    0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0,
    0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8,
    0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4,
    0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC,
    0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2,
    0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA,
    0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6,
    0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE,
    0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, 0xF1,
    0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9,
    0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5,
    0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD,
    0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3,
    0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB,
    0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7,
    0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF
};

// Returns src with its bit order reversed, via a single table lookup.
inline unsigned char reverse_byte(unsigned char src) {
    return reverse<unsigned char>::byte_table[src];
}
#endif // TBB_USE_CLANG_BITREVERSE_BUILTINS
224
225template<typename T>
226T machine_reverse_bits(T src) {
227#if TBB_USE_CLANG_BITREVERSE_BUILTINS
228 return builtin_bitreverse(fixed_width_cast(src));
229#else /* Generic */
230 T dst;
231 unsigned char *original = (unsigned char *) &src;
232 unsigned char *reversed = (unsigned char *) &dst;
233
234 for ( int i = sizeof(T) - 1; i >= 0; i-- ) {
235 reversed[i] = reverse_byte( src: original[sizeof(T) - i - 1] );
236 }
237
238 return dst;
239#endif // TBB_USE_CLANG_BITREVERSE_BUILTINS
240}
241
242} // inline namespace d0
243
244namespace d1 {
245
246#if (_WIN32)
247// API to retrieve/update FPU control setting
248#define __TBB_CPU_CTL_ENV_PRESENT 1
249struct cpu_ctl_env {
250 unsigned int x87cw{};
251#if (__TBB_x86_64)
252 // Changing the infinity mode or the floating-point precision is not supported on x64.
253 // The attempt causes an assertion. See
254 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/control87-controlfp-control87-2
255 static constexpr unsigned int X87CW_CONTROL_MASK = _MCW_DN | _MCW_EM | _MCW_RC;
256#else
257 static constexpr unsigned int X87CW_CONTROL_MASK = ~0U;
258#endif
259#if (__TBB_x86_32 || __TBB_x86_64)
260 unsigned int mxcsr{};
261 static constexpr unsigned int MXCSR_CONTROL_MASK = ~0x3fu; /* all except last six status bits */
262#endif
263
264 bool operator!=( const cpu_ctl_env& ctl ) const {
265 return
266#if (__TBB_x86_32 || __TBB_x86_64)
267 mxcsr != ctl.mxcsr ||
268#endif
269 x87cw != ctl.x87cw;
270 }
271 void get_env() {
272 x87cw = _control87(0, 0);
273#if (__TBB_x86_32 || __TBB_x86_64)
274 mxcsr = _mm_getcsr();
275#endif
276 }
277 void set_env() const {
278 _control87(x87cw, X87CW_CONTROL_MASK);
279#if (__TBB_x86_32 || __TBB_x86_64)
280 _mm_setcsr(mxcsr & MXCSR_CONTROL_MASK);
281#endif
282 }
283};
284#elif (__TBB_x86_32 || __TBB_x86_64)
// API to retrieve/update FPU control setting
#define __TBB_CPU_CTL_ENV_PRESENT 1
// x86 variant for compilers with GNU-style inline assembly: a value-type snapshot of MXCSR
// (control bits only) and of the x87 control word.
struct cpu_ctl_env {
    int mxcsr{};
    short x87cw{};
    static constexpr int MXCSR_CONTROL_MASK = ~0x3f; /* all except last six status bits */

    // True when either stored register value differs.
    bool operator!=(const cpu_ctl_env& ctl) const {
        return mxcsr != ctl.mxcsr || x87cw != ctl.x87cw;
    }
    // Stores MXCSR and the x87 control word into this object, then clears the six MXCSR status
    // bits so that only control settings are kept and compared.
    void get_env() {
        __asm__ __volatile__(
            "stmxcsr %0\n\t"
            "fstcw %1"
            : "=m"(mxcsr), "=m"(x87cw)
        );
        mxcsr &= MXCSR_CONTROL_MASK;
    }
    // Loads the stored values back into MXCSR and the x87 control word.
    void set_env() const {
        __asm__ __volatile__(
            "ldmxcsr %0\n\t"
            "fldcw %1"
            : : "m"(mxcsr), "m"(x87cw)
        );
    }
};
311#endif
312
313} // namespace d1
314
315} // namespace detail
316} // namespace tbb
317
318#if !__TBB_CPU_CTL_ENV_PRESENT
319#include <fenv.h>
320
321#include <cstring>
322
323namespace tbb {
324namespace detail {
325
326namespace r1 {
327void* __TBB_EXPORTED_FUNC cache_aligned_allocate(std::size_t size);
328void __TBB_EXPORTED_FUNC cache_aligned_deallocate(void* p);
329} // namespace r1
330
331namespace d1 {
332
333class cpu_ctl_env {
334 fenv_t *my_fenv_ptr;
335public:
336 cpu_ctl_env() : my_fenv_ptr(NULL) {}
337 ~cpu_ctl_env() {
338 if ( my_fenv_ptr )
339 r1::cache_aligned_deallocate( (void*)my_fenv_ptr );
340 }
341 // It is possible not to copy memory but just to copy pointers but the following issues should be addressed:
342 // 1. The arena lifetime and the context lifetime are independent;
343 // 2. The user is allowed to recapture different FPU settings to context so 'current FPU settings' inside
344 // dispatch loop may become invalid.
345 // But do we really want to improve the fenv implementation? It seems to be better to replace the fenv implementation
346 // with a platform specific implementation.
347 cpu_ctl_env( const cpu_ctl_env &src ) : my_fenv_ptr(NULL) {
348 *this = src;
349 }
350 cpu_ctl_env& operator=( const cpu_ctl_env &src ) {
351 __TBB_ASSERT( src.my_fenv_ptr, NULL );
352 if ( !my_fenv_ptr )
353 my_fenv_ptr = (fenv_t*)r1::cache_aligned_allocate(sizeof(fenv_t));
354 *my_fenv_ptr = *src.my_fenv_ptr;
355 return *this;
356 }
357 bool operator!=( const cpu_ctl_env &ctl ) const {
358 __TBB_ASSERT( my_fenv_ptr, "cpu_ctl_env is not initialized." );
359 __TBB_ASSERT( ctl.my_fenv_ptr, "cpu_ctl_env is not initialized." );
360 return std::memcmp( (void*)my_fenv_ptr, (void*)ctl.my_fenv_ptr, sizeof(fenv_t) );
361 }
362 void get_env () {
363 if ( !my_fenv_ptr )
364 my_fenv_ptr = (fenv_t*)r1::cache_aligned_allocate(sizeof(fenv_t));
365 fegetenv( my_fenv_ptr );
366 }
367 const cpu_ctl_env& set_env () const {
368 __TBB_ASSERT( my_fenv_ptr, "cpu_ctl_env is not initialized." );
369 fesetenv( my_fenv_ptr );
370 return *this;
371 }
372};
373
374} // namespace d1
375} // namespace detail
376} // namespace tbb
377
378#endif /* !__TBB_CPU_CTL_ENV_PRESENT */
379
380#endif // __TBB_detail__machine_H
381

// source code of include/oneapi/tbb/detail/_machine.h