/*
 * kmp_affinity.h -- header for affinity management
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_AFFINITY_H
#define KMP_AFFINITY_H

#include "kmp.h"
#include "kmp_os.h"
#include <limits>

#if KMP_AFFINITY_SUPPORTED
#if KMP_USE_HWLOC
class KMPHwlocAffinity : public KMPAffinity {
public:
  class Mask : public KMPAffinity::Mask {
    hwloc_cpuset_t mask;

  public:
    Mask() {
      mask = hwloc_bitmap_alloc();
      this->zero();
    }
    ~Mask() { hwloc_bitmap_free(mask); }
    void set(int i) override { hwloc_bitmap_set(mask, i); }
    bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
    void clear(int i) override { hwloc_bitmap_clr(mask, i); }
    void zero() override { hwloc_bitmap_zero(mask); }
    bool empty() const override { return hwloc_bitmap_iszero(mask); }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      hwloc_bitmap_copy(mask, convert->mask);
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_and(mask, mask, convert->mask);
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_or(mask, mask, convert->mask);
    }
    void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      return hwloc_bitmap_isequal(mask, convert->mask);
    }
    int begin() const override { return hwloc_bitmap_first(mask); }
    int end() const override { return -1; }
    int next(int previous) const override {
      return hwloc_bitmap_next(mask, previous);
    }
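    // begin()/next()/end() implement forward iteration over the set bits of
    // the mask. For hwloc, end() is -1 because hwloc_bitmap_next() returns -1
    // once no set bit remains. An illustrative traversal (process_proc is a
    // hypothetical callback, not part of the runtime):
    //   for (int i = m.begin(); i != m.end(); i = m.next(i))
    //     process_proc(i);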
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
      long retval =
          hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "hwloc_get_cpubind()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
      long retval =
          hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#if KMP_OS_WINDOWS
    int set_process_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set process affinity operation when not capable");
      int error = 0;
      const hwloc_topology_support *support =
          hwloc_topology_get_support(__kmp_hwloc_topology);
      if (support->cpubind->set_proc_cpubind) {
        int retval;
        retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
                                   HWLOC_CPUBIND_PROCESS);
        if (retval >= 0)
          return 0;
        error = errno;
        if (abort_on_error)
          __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
                      KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#endif
    int get_proc_group() const override {
      int group = -1;
#if KMP_OS_WINDOWS
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        // On Windows, the long type is always 32 bits
        unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
        unsigned long second_32_bits =
            hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
        if (first_32_bits == 0 && second_32_bits == 0) {
          continue;
        }
        if (group >= 0) {
          return -1;
        }
        group = i;
      }
#endif /* KMP_OS_WINDOWS */
      return group;
    }
  };
  void determine_capable(const char *var) override {
    const hwloc_topology_support *topology_support;
    if (__kmp_hwloc_topology == NULL) {
      if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity.flags.verbose) {
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
        }
      }
      if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity.flags.verbose) {
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
        }
      }
    }
    topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
    // Is the system capable of setting/getting this thread's affinity?
    // Also, is topology discovery possible? (pu indicates ability to discover
    // processing units). And finally, were there no errors when calling any
    // hwloc_* API functions?
    if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
        topology_support->cpubind->get_thisthread_cpubind &&
        topology_support->discovery->pu && !__kmp_hwloc_error) {
      // enables affinity according to KMP_AFFINITY_CAPABLE() macro
      KMP_AFFINITY_ENABLE(TRUE);
    } else {
      // indicate that hwloc didn't work and disable affinity
      __kmp_hwloc_error = TRUE;
      KMP_AFFINITY_DISABLE();
    }
  }
  void bind_thread(int which) override {
    KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                "Illegal set affinity operation when not capable");
    KMPAffinity::Mask *mask;
    KMP_CPU_ALLOC_ON_STACK(mask);
    KMP_CPU_ZERO(mask);
    KMP_CPU_SET(which, mask);
    __kmp_set_system_affinity(mask, TRUE);
    KMP_CPU_FREE_FROM_STACK(mask);
  }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    delete[] hwloc_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    return &(hwloc_array[index]);
  }
  api_type get_api_type() const override { return HWLOC; }
};
#endif /* KMP_USE_HWLOC */

#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY ||    \
    KMP_OS_AIX
#if KMP_OS_LINUX
/* On some of the older OS's that we build on, these constants aren't present
   in <asm/unistd.h> #included from <sys/syscall.h>. They must be the same on
   all systems of the same arch where they are defined, and they cannot
   change: they are set in stone forever. */
#include <sys/syscall.h>
#if KMP_ARCH_X86 || KMP_ARCH_ARM
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 241
#elif __NR_sched_setaffinity != 241
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 242
#elif __NR_sched_getaffinity != 242
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_AARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_X86_64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 203
#elif __NR_sched_setaffinity != 203
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 204
#elif __NR_sched_getaffinity != 204
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_PPC64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 222
#elif __NR_sched_setaffinity != 222
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 223
#elif __NR_sched_getaffinity != 223
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 4239
#elif __NR_sched_setaffinity != 4239
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 4240
#elif __NR_sched_getaffinity != 4240
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 5195
#elif __NR_sched_setaffinity != 5195
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 5196
#elif __NR_sched_getaffinity != 5196
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_LOONGARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_RISCV64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_VE
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 203
#elif __NR_sched_setaffinity != 203
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 204
#elif __NR_sched_getaffinity != 204
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_S390X
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 239
#elif __NR_sched_setaffinity != 239
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 240
#elif __NR_sched_getaffinity != 240
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#else
#error Unknown or unsupported architecture
#endif /* KMP_ARCH_* */
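/* For reference, these __NR_* numbers are used to invoke the affinity system
   calls directly, bypassing any libc wrapper that may be missing on older
   systems; an illustrative call, mirroring get_system_affinity() below:
     syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask); */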
#elif KMP_OS_FREEBSD || KMP_OS_DRAGONFLY
#include <pthread.h>
#include <pthread_np.h>
#elif KMP_OS_NETBSD
#include <pthread.h>
#include <sched.h>
#elif KMP_OS_AIX
#include <sys/dr.h>
#include <sys/rset.h>
#define VMI_MAXRADS 64 // Maximum number of RADs allowed by AIX.
#endif
class KMPNativeAffinity : public KMPAffinity {
  class Mask : public KMPAffinity::Mask {
    typedef unsigned long mask_t;
    typedef decltype(__kmp_affin_mask_size) mask_size_type;
    static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    static const mask_t ONE = 1;
    mask_size_type get_num_mask_types() const {
      return __kmp_affin_mask_size / sizeof(mask_t);
    }

  public:
    mask_t *mask;
    Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
    }
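    // Illustrative arithmetic for the word/bit split above, assuming a 64-bit
    // mask_t: for i = 70, i / BITS_PER_MASK_T == 1 and
    // i % BITS_PER_MASK_T == 6, so set(70) sets bit 6 of mask[1].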
    void zero() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = (mask_t)0;
    }
    bool empty() const override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        if (mask[i] != (mask_t)0)
          return false;
      return true;
    }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] |= convert->mask[i];
    }
    void bitwise_not() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = ~(mask[i]);
    }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        if (mask[i] != convert->mask[i])
          return false;
      return true;
    }
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override {
      int e;
      __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
      return e;
    }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
#if KMP_OS_AIX
    // On AIX, we don't have a way to get the CPU(s) a thread is bound to.
    // This routine is only used to get the full mask.
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");

      (void)abort_on_error;

      // Set the mask with all CPUs that are available.
      for (int i = 0; i < __kmp_xproc; ++i)
        KMP_CPU_SET(i, this);
      return 0;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");

      int location;
      int gtid = __kmp_entry_gtid();
      int tid = thread_self();

      // Unbind the thread if it was bound to any processors before so that
      // we can bind the thread to the CPUs specified by the mask, not others.
      int retval = bindprocessor(BINDTHREAD, tid, PROCESSOR_CLASS_ANY);

      // On AIX, the bindprocessor() system call binds to a single CPU at a
      // time, not to a set of CPUs.
      KMP_CPU_SET_ITERATE(location, this) {
        if (KMP_CPU_ISSET(location, this)) {
          retval = bindprocessor(BINDTHREAD, tid, location);
          if (retval == -1 && errno == 1) {
            rsid_t rsid;
            rsethandle_t rsh;
            // Put something in rsh to prevent a compiler warning
            // about uninitialized use
            rsh = rs_alloc(RS_EMPTY);
            rsid.at_pid = getpid();
            if (RS_DEFAULT_RSET != ra_getrset(R_PROCESS, rsid, 0, rsh)) {
              retval = ra_detachrset(R_PROCESS, rsid, 0);
              retval = bindprocessor(BINDTHREAD, tid, location);
            }
          }
          if (retval == 0) {
            KA_TRACE(10, ("__kmp_set_system_affinity: Done binding "
                          "T#%d to cpu=%d.\n",
                          gtid, location));
            continue;
          }
          int error = errno;
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "bindprocessor()"),
                        KMP_ERR(error), __kmp_msg_null);
            KA_TRACE(10, ("__kmp_set_system_affinity: Error binding "
                          "T#%d to cpu=%d, errno=%d.\n",
                          gtid, location, error));
            return error;
          }
        }
      }
      return 0;
    }
#else // !KMP_OS_AIX
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
      int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "pthread_getaffinity_np()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
      int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "pthread_setaffinity_np()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#endif // KMP_OS_AIX
  };
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override {
    KMPNativeAffinity::Mask *retval = new Mask();
    return retval;
  }
  void deallocate_mask(KMPAffinity::Mask *m) override {
    KMPNativeAffinity::Mask *native_mask =
        static_cast<KMPNativeAffinity::Mask *>(m);
    delete native_mask;
  }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *linux_array = static_cast<Mask *>(array);
    delete[] linux_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *linux_array = static_cast<Mask *>(array);
    return &(linux_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
#endif /* KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY \
          || KMP_OS_AIX */

#if KMP_OS_WINDOWS
class KMPNativeAffinity : public KMPAffinity {
  class Mask : public KMPAffinity::Mask {
    typedef ULONG_PTR mask_t;
    static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    mask_t *mask;

  public:
    Mask() {
      mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
    }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
    }
    void zero() override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = 0;
    }
    bool empty() const override {
      for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
        if (mask[i])
          return false;
      return true;
    }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] |= convert->mask[i];
    }
    void bitwise_not() override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = ~(mask[i]);
    }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
        if (mask[i] != convert->mask[i])
          return false;
      return true;
    }
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int set_process_affinity(bool abort_on_error) const override {
      if (__kmp_num_proc_groups <= 1) {
        if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      }
      return 0;
    }
    int set_system_affinity(bool abort_on_error) const override {
      if (__kmp_num_proc_groups > 1) {
        // Check for a valid mask.
        GROUP_AFFINITY ga;
        int group = get_proc_group();
        if (group < 0) {
          if (abort_on_error) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
          }
          return -1;
        }
        // Transform the bit vector into a GROUP_AFFINITY struct
        // and make the system call to set affinity.
        ga.Group = group;
        ga.Mask = mask[group];
        ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;

        KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
        if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      } else {
        if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      }
      return 0;
    }
    int get_system_affinity(bool abort_on_error) override {
      if (__kmp_num_proc_groups > 1) {
        this->zero();
        GROUP_AFFINITY ga;
        KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
        if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
            (ga.Mask == 0)) {
          return -1;
        }
        mask[ga.Group] = ga.Mask;
      } else {
        mask_t newMask, sysMask, retval;
        if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
        if (!retval) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
        if (!newMask) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
        }
        *mask = retval;
      }
      return 0;
    }
    int get_proc_group() const override {
      int group = -1;
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        if (mask[i] == 0)
          continue;
        if (group >= 0)
          return -1;
        group = i;
      }
      return group;
    }
  };
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *windows_array = static_cast<Mask *>(array);
    delete[] windows_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *windows_array = static_cast<Mask *>(array);
    return &(windows_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
#endif /* KMP_OS_WINDOWS */
#endif /* KMP_AFFINITY_SUPPORTED */

// Describe an attribute for a level in the machine topology
struct kmp_hw_attr_t {
  int core_type : 8;
  int core_eff : 8;
  unsigned valid : 1;
  unsigned reserved : 15;

  static const int UNKNOWN_CORE_EFF = -1;

  kmp_hw_attr_t()
      : core_type(KMP_HW_CORE_TYPE_UNKNOWN), core_eff(UNKNOWN_CORE_EFF),
        valid(0), reserved(0) {}
  void set_core_type(kmp_hw_core_type_t type) {
    valid = 1;
    core_type = type;
  }
  void set_core_eff(int eff) {
    valid = 1;
    core_eff = eff;
  }
  kmp_hw_core_type_t get_core_type() const {
    return (kmp_hw_core_type_t)core_type;
  }
  int get_core_eff() const { return core_eff; }
  bool is_core_type_valid() const {
    return core_type != KMP_HW_CORE_TYPE_UNKNOWN;
  }
  bool is_core_eff_valid() const { return core_eff != UNKNOWN_CORE_EFF; }
  operator bool() const { return valid; }
  void clear() {
    core_type = KMP_HW_CORE_TYPE_UNKNOWN;
    core_eff = UNKNOWN_CORE_EFF;
    valid = 0;
  }
  bool contains(const kmp_hw_attr_t &other) const {
    if (!valid && !other.valid)
      return true;
    if (valid && other.valid) {
      if (other.is_core_type_valid()) {
        if (!is_core_type_valid() || (get_core_type() != other.get_core_type()))
          return false;
      }
      if (other.is_core_eff_valid()) {
        if (!is_core_eff_valid() || (get_core_eff() != other.get_core_eff()))
          return false;
      }
      return true;
    }
    return false;
  }
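  // Illustrative semantics of contains(): with A = {core_type set, core_eff
  // unset} and B = {the same core_type, core_eff set}, B.contains(A) is true
  // (A only specifies the core type, which B matches), while A.contains(B)
  // is false (A cannot match B's core_eff). Exactly one side being invalid
  // also yields false.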
#if KMP_AFFINITY_SUPPORTED
  bool contains(const kmp_affinity_attrs_t &attr) const {
    if (!valid && !attr.valid)
      return true;
    if (valid && attr.valid) {
      if (attr.core_type != KMP_HW_CORE_TYPE_UNKNOWN)
        return (is_core_type_valid() &&
                (get_core_type() == (kmp_hw_core_type_t)attr.core_type));
      if (attr.core_eff != UNKNOWN_CORE_EFF)
        return (is_core_eff_valid() && (get_core_eff() == attr.core_eff));
      return true;
    }
    return false;
  }
#endif // KMP_AFFINITY_SUPPORTED
  bool operator==(const kmp_hw_attr_t &rhs) const {
    return (rhs.valid == valid && rhs.core_eff == core_eff &&
            rhs.core_type == core_type);
  }
  bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); }
};

#if KMP_AFFINITY_SUPPORTED
KMP_BUILD_ASSERT(sizeof(kmp_hw_attr_t) == sizeof(kmp_affinity_attrs_t));
#endif

class kmp_hw_thread_t {
public:
  static const int UNKNOWN_ID = -1;
  static const int MULTIPLE_ID = -2;
  static int compare_ids(const void *a, const void *b);
  static int compare_compact(const void *a, const void *b);
  int ids[KMP_HW_LAST];
  int sub_ids[KMP_HW_LAST];
  bool leader;
  int os_id;
  kmp_hw_attr_t attrs;

  void print() const;
  void clear() {
    for (int i = 0; i < (int)KMP_HW_LAST; ++i)
      ids[i] = UNKNOWN_ID;
    leader = false;
    attrs.clear();
  }
};

class kmp_topology_t {

  struct flags_t {
    int uniform : 1;
    int reserved : 31;
  };

  int depth;

  // The following arrays are all 'depth' long and have been
  // allocated to hold up to KMP_HW_LAST number of objects if
  // needed so layers can be added without reallocation of any array

  // Ordered array of the types in the topology
  kmp_hw_t *types;

  // Quick topology ratios. For non-uniform topologies, each entry
  // holds the max number of itemAs per itemB,
  // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
  int *ratio;

  // Storage containing the absolute number of each topology layer
  int *count;
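  // For the ratio example above, count would be {4, 24, 48}: 4 packages,
  // 24 cores, and 48 hardware threads in total (illustrative, assuming a
  // uniform machine).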

  // The number of core efficiencies. This is only useful for hybrid
  // topologies. Core efficiencies will range from 0 to num efficiencies - 1
  int num_core_efficiencies;
  int num_core_types;
  kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES];

  // The hardware threads array
  // hw_threads is num_hw_threads long
  // Each hw_thread's ids and sub_ids are depth deep
  int num_hw_threads;
  kmp_hw_thread_t *hw_threads;

  // Equivalence hash where the key is the hardware topology item
  // and the value is the equivalent hardware topology type in the
  // types[] array; if the value is KMP_HW_UNKNOWN, then there is no
  // known equivalence for the topology type
  kmp_hw_t equivalent[KMP_HW_LAST];

  // Flags describing the topology
  flags_t flags;

  // Compact value used during sort_compact()
  int compact;

  // Insert a new topology layer after allocation
  void _insert_layer(kmp_hw_t type, const int *ids);

#if KMP_GROUP_AFFINITY
  // Insert topology information about Windows Processor groups
  void _insert_windows_proc_groups();
#endif

  // Count each item & get the num x's per y
  // e.g., get the number of cores and the number of threads per core
  // for each (x, y) in (KMP_HW_*, KMP_HW_*)
  void _gather_enumeration_information();

  // Remove layers that don't add information to the topology.
  // This is done by having the layer take on the id = UNKNOWN_ID (-1)
  void _remove_radix1_layers();

  // Find out if the topology is uniform
  void _discover_uniformity();

  // Set all the sub_ids for each hardware thread
  void _set_sub_ids();

  // Set global affinity variables describing the number of threads per
  // core, the number of packages, the number of cores per package, and
  // the number of cores.
  void _set_globals();

  // Set the last level cache equivalent type
  void _set_last_level_cache();

  // Return the number of cores with a particular attribute, 'attr'.
  // If 'find_all' is true, then find all cores on the machine, otherwise find
  // all cores per the layer 'above'
  int _get_ncores_with_attr(const kmp_hw_attr_t &attr, int above,
                            bool find_all = false) const;

public:
  // Force use of allocate()/deallocate()
  kmp_topology_t() = delete;
  kmp_topology_t(const kmp_topology_t &t) = delete;
  kmp_topology_t(kmp_topology_t &&t) = delete;
  kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
  kmp_topology_t &operator=(kmp_topology_t &&t) = delete;

  static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
  static void deallocate(kmp_topology_t *);

  // Functions used in create_map() routines
  kmp_hw_thread_t &at(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
    return hw_threads[index];
  }
  const kmp_hw_thread_t &at(int index) const {
    KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
    return hw_threads[index];
  }
  int get_num_hw_threads() const { return num_hw_threads; }
  void sort_ids() {
    qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
          kmp_hw_thread_t::compare_ids);
  }
  // Check if the hardware ids are unique; return true if they are,
  // false otherwise
  bool check_ids() const;

  // Function to call after the create_map() routine
  void canonicalize();
  void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);

  // Functions used after canonicalize() called

#if KMP_AFFINITY_SUPPORTED
  // Set the granularity for affinity settings
  void set_granularity(kmp_affinity_t &stgs) const;
  bool is_close(int hwt1, int hwt2, const kmp_affinity_t &stgs) const;
  bool restrict_to_mask(const kmp_affin_mask_t *mask);
  bool filter_hw_subset();
#endif
  bool is_uniform() const { return flags.uniform; }
  // Tell whether a type is a valid type in the topology;
  // returns KMP_HW_UNKNOWN when there is no equivalent type
  kmp_hw_t get_equivalent_type(kmp_hw_t type) const {
    if (type == KMP_HW_UNKNOWN)
      return KMP_HW_UNKNOWN;
    return equivalent[type];
  }
  // Set type1 = type2
  void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
    kmp_hw_t real_type2 = equivalent[type2];
    if (real_type2 == KMP_HW_UNKNOWN)
      real_type2 = type2;
    equivalent[type1] = real_type2;
    // This loop is required since any of the types may have been set to
    // be equivalent to type1. They all must be checked and reset to type2.
    KMP_FOREACH_HW_TYPE(type) {
      if (equivalent[type] == type1) {
        equivalent[type] = real_type2;
      }
    }
  }
  // Calculate number of types corresponding to level1
  // per types corresponding to level2 (e.g., number of threads per core)
  int calculate_ratio(int level1, int level2) const {
    KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
    KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
    int r = 1;
    for (int level = level1; level > level2; --level)
      r *= ratio[level];
    return r;
  }
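  // Illustrative: using the uniform example above with ratio = {4, 6, 2} for
  // the {package, core, thread} levels, calculate_ratio(2, 0) returns
  // ratio[2] * ratio[1] = 2 * 6 = 12 threads per package.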
  int get_ratio(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return ratio[level];
  }
  int get_depth() const { return depth; }
  kmp_hw_t get_type(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return types[level];
  }
  int get_level(kmp_hw_t type) const {
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
    int eq_type = equivalent[type];
    if (eq_type == KMP_HW_UNKNOWN)
      return -1;
    for (int i = 0; i < depth; ++i)
      if (types[i] == eq_type)
        return i;
    return -1;
  }
  int get_count(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return count[level];
  }
  // Return the total number of cores with attribute 'attr'
  int get_ncores_with_attr(const kmp_hw_attr_t &attr) const {
    return _get_ncores_with_attr(attr, -1, true);
  }
  // Return the number of cores with attribute
  // 'attr' per topology level 'above'
  int get_ncores_with_attr_per(const kmp_hw_attr_t &attr, int above) const {
    return _get_ncores_with_attr(attr, above, false);
  }

#if KMP_AFFINITY_SUPPORTED
  friend int kmp_hw_thread_t::compare_compact(const void *a, const void *b);
  void sort_compact(kmp_affinity_t &affinity) {
    compact = affinity.compact;
    qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
          kmp_hw_thread_t::compare_compact);
  }
#endif
  void print(const char *env_var = "KMP_AFFINITY") const;
  void dump() const;
};
extern kmp_topology_t *__kmp_topology;

class kmp_hw_subset_t {
  const static size_t MAX_ATTRS = KMP_HW_MAX_NUM_CORE_EFFS;

public:
  // Describe a machine topology item in KMP_HW_SUBSET
  struct item_t {
    kmp_hw_t type;
    int num_attrs;
    int num[MAX_ATTRS];
    int offset[MAX_ATTRS];
    kmp_hw_attr_t attr[MAX_ATTRS];
  };
  // Put parentheses around max to avoid accidental use of the Windows max
  // macro.
  const static int USE_ALL = (std::numeric_limits<int>::max)();

private:
  int depth;
  int capacity;
  item_t *items;
  kmp_uint64 set;
  bool absolute;
  // The set must be able to handle up to KMP_HW_LAST number of layers
  KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
  // Sorting the KMP_HW_SUBSET items to follow topology order
  // All unknown topology types will be at the beginning of the subset
  static int hw_subset_compare(const void *i1, const void *i2) {
    kmp_hw_t type1 = ((const item_t *)i1)->type;
    kmp_hw_t type2 = ((const item_t *)i2)->type;
    int level1 = __kmp_topology->get_level(type1);
    int level2 = __kmp_topology->get_level(type2);
    return level1 - level2;
  }

public:
  // Force use of allocate()/deallocate()
  kmp_hw_subset_t() = delete;
  kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
  kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
  kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
  kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;

  static kmp_hw_subset_t *allocate() {
    int initial_capacity = 5;
    kmp_hw_subset_t *retval =
        (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
    retval->depth = 0;
    retval->capacity = initial_capacity;
    retval->set = 0ull;
    retval->absolute = false;
    retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
    return retval;
  }
  static void deallocate(kmp_hw_subset_t *subset) {
    __kmp_free(subset->items);
    __kmp_free(subset);
  }
  void set_absolute() { absolute = true; }
  bool is_absolute() const { return absolute; }
  void push_back(int num, kmp_hw_t type, int offset, kmp_hw_attr_t attr) {
    for (int i = 0; i < depth; ++i) {
      // Found an existing item for this layer type
      // Add the num, offset, and attr to this item
      if (items[i].type == type) {
        int idx = items[i].num_attrs++;
        if ((size_t)idx >= MAX_ATTRS)
          return;
        items[i].num[idx] = num;
        items[i].offset[idx] = offset;
        items[i].attr[idx] = attr;
        return;
      }
    }
    if (depth == capacity - 1) {
      capacity *= 2;
      item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
      for (int i = 0; i < depth; ++i)
        new_items[i] = items[i];
      __kmp_free(items);
      items = new_items;
    }
    items[depth].num_attrs = 1;
    items[depth].type = type;
    items[depth].num[0] = num;
    items[depth].offset[0] = offset;
    items[depth].attr[0] = attr;
    depth++;
    set |= (1ull << type);
  }
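  // Illustrative sketch of how a parsed KMP_HW_SUBSET=2s,4c,2t might land
  // here (the actual parser lives elsewhere in the runtime; offset 0 is used
  // when no offset is specified):
  //   subset->push_back(2, KMP_HW_SOCKET, 0, kmp_hw_attr_t{});
  //   subset->push_back(4, KMP_HW_CORE, 0, kmp_hw_attr_t{});
  //   subset->push_back(2, KMP_HW_THREAD, 0, kmp_hw_attr_t{});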
  int get_depth() const { return depth; }
  const item_t &at(int index) const {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    return items[index];
  }
  item_t &at(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    return items[index];
  }
  void remove(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    set &= ~(1ull << items[index].type);
    for (int j = index + 1; j < depth; ++j) {
      items[j - 1] = items[j];
    }
    depth--;
  }
  void sort() {
    KMP_DEBUG_ASSERT(__kmp_topology);
    qsort(items, depth, sizeof(item_t), hw_subset_compare);
  }
  bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }

  // Canonicalize the KMP_HW_SUBSET value if it is not an absolute subset.
  // This means putting each of {sockets, cores, threads} in the topology if
  // they are not specified:
  // e.g., 1s,2c => 1s,2c,*t | 2c,1t => *s,2c,1t | 1t => *s,*c,1t | etc.
  // e.g., 3module => *s,3module,*c,*t
  // By doing this, the runtime assumes users who fiddle with KMP_HW_SUBSET
  // are expecting the traditional sockets/cores/threads topology. For newer
  // hardware, there can be intervening layers like dies/tiles/modules
  // (usually corresponding to a cache level). So when a user asks for
  // 1s,6c,2t and the topology is really 1s,2modules,4cores,2threads, the user
  // should get 12 hardware threads across 6 cores and effectively ignore the
  // module layer.
  void canonicalize(const kmp_topology_t *top) {
    // Layers to target for KMP_HW_SUBSET canonicalization
    kmp_hw_t targeted[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};

    // Do not target-layer-canonicalize absolute KMP_HW_SUBSETS
    if (is_absolute())
      return;

    // Do not target-layer-canonicalize KMP_HW_SUBSETS when the
    // topology doesn't have these layers
    for (kmp_hw_t type : targeted)
      if (top->get_level(type) == KMP_HW_UNKNOWN)
        return;

    // Put targeted layers in topology if they do not exist
    for (kmp_hw_t type : targeted) {
      bool found = false;
      for (int i = 0; i < get_depth(); ++i) {
        if (top->get_equivalent_type(items[i].type) == type) {
          found = true;
          break;
        }
      }
      if (!found) {
        push_back(USE_ALL, type, 0, kmp_hw_attr_t{});
      }
    }
    sort();
    // Set as an absolute topology that only targets the targeted layers
    set_absolute();
  }
  void dump() const {
    printf("**********************\n");
    printf("*** kmp_hw_subset: ***\n");
    printf("* depth: %d\n", depth);
    printf("* items:\n");
    for (int i = 0; i < depth; ++i) {
      printf("  type: %s\n", __kmp_hw_get_keyword(items[i].type));
      for (int j = 0; j < items[i].num_attrs; ++j) {
        printf("    num: %d, offset: %d, attr: ", items[i].num[j],
               items[i].offset[j]);
        if (!items[i].attr[j]) {
          printf(" (none)\n");
        } else {
          printf(
              " core_type = %s, core_eff = %d\n",
              __kmp_hw_get_core_type_string(items[i].attr[j].get_core_type()),
              items[i].attr[j].get_core_eff());
        }
      }
    }
    printf("* set: 0x%llx\n", set);
    printf("* absolute: %d\n", absolute);
    printf("**********************\n");
  }
};
extern kmp_hw_subset_t *__kmp_hw_subset;

/* A structure for holding machine-specific hierarchy info to be computed once
   at init. This structure represents a mapping of threads to the actual
   machine hierarchy, or to our best guess at what the hierarchy might be, for
   the purpose of performing an efficient barrier. In the worst case, when
   there is no machine hierarchy information, it produces a tree suitable for
   a barrier, similar to the tree used in the hyper barrier. */
class hierarchy_info {
public:
  /* Good default values for number of leaves and branching factor, given no
     affinity information. Behaves a bit like hyper barrier. */
  static const kmp_uint32 maxLeaves = 4;
  static const kmp_uint32 minBranch = 4;
  /** Number of levels in the hierarchy. Typical levels are threads/core,
      cores/package or socket, packages/node, nodes/machine, etc. We don't
      want to get specific with nomenclature. When the machine is
      oversubscribed we add levels to duplicate the hierarchy, doubling the
      thread capacity of the hierarchy each time we add a level. */
  kmp_uint32 maxLevels;

  /** This is specifically the depth of the machine configuration hierarchy,
      in terms of the number of levels along the longest path from root to
      any leaf. It corresponds to the number of entries in numPerLevel if we
      exclude all but one trailing 1. */
  kmp_uint32 depth;
  kmp_uint32 base_num_threads;
  enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
  volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
  // 2=initialization in progress
  volatile kmp_int8 resizing; // 0=not resizing, 1=resizing

  /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children
      the parent of a node at level i has. For example, if we have a machine
      with 4 packages, 4 cores/package and 2 HT per core, then numPerLevel =
      {2, 4, 4, 1, 1}. All empty levels are set to 1. */
  kmp_uint32 *numPerLevel;
  kmp_uint32 *skipPerLevel;
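  /** skipPerLevel[i] is the stride between consecutive node indices at level
      i; init() below computes it as the running product
      numPerLevel[0] * ... * numPerLevel[i-1], with each oversubscription
      level doubling the previous stride. */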

  void deriveLevels() {
    int hier_depth = __kmp_topology->get_depth();
    for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
      numPerLevel[level] = __kmp_topology->get_ratio(i);
    }
  }

  hierarchy_info()
      : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}

  void fini() {
    if (!uninitialized && numPerLevel) {
      __kmp_free(numPerLevel);
      numPerLevel = NULL;
      uninitialized = not_initialized;
    }
  }

  void init(int num_addrs) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
        &uninitialized, not_initialized, initializing);
    if (bool_result == 0) { // Wait for initialization
      while (TCR_1(uninitialized) != initialized)
        KMP_CPU_PAUSE();
      return;
    }
    KMP_DEBUG_ASSERT(bool_result == 1);

    /* Explicitly initialize the data fields here to prevent use of dirty
       values observed when a static library is re-initialized multiple times
       (e.g. when a non-OpenMP thread repeatedly launches/joins a thread that
       uses OpenMP). */
    depth = 1;
    resizing = 0;
    maxLevels = 7;
    numPerLevel =
        (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
    skipPerLevel = &(numPerLevel[maxLevels]);
    for (kmp_uint32 i = 0; i < maxLevels;
         ++i) { // init numPerLevel[*] to 1 item per level
      numPerLevel[i] = 1;
      skipPerLevel[i] = 1;
    }

    // Sort table by physical ID
    if (__kmp_topology && __kmp_topology->get_depth() > 0) {
      deriveLevels();
    } else {
      numPerLevel[0] = maxLeaves;
      numPerLevel[1] = num_addrs / maxLeaves;
      if (num_addrs % maxLeaves)
        numPerLevel[1]++;
    }

    base_num_threads = num_addrs;
    for (int i = maxLevels - 1; i >= 0;
         --i) // count non-empty levels to get depth
      if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
        depth++;

    kmp_uint32 branch = minBranch;
    if (numPerLevel[0] == 1)
      branch = num_addrs / maxLeaves;
    if (branch < minBranch)
      branch = minBranch;
    for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
      while (numPerLevel[d] > branch ||
             (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
        if (numPerLevel[d] & 1)
          numPerLevel[d]++;
        numPerLevel[d] = numPerLevel[d] >> 1;
        if (numPerLevel[d + 1] == 1)
          depth++;
        numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
      }
      if (numPerLevel[0] == 1) {
        branch = branch >> 1;
        if (branch < 4)
          branch = minBranch;
      }
    }

    for (kmp_uint32 i = 1; i < depth; ++i)
      skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
    // Fill in hierarchy in the case of oversubscription
    for (kmp_uint32 i = depth; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    uninitialized = initialized; // One writer
  }

  // Resize the hierarchy if nproc changes to something larger than before
  void resize(kmp_uint32 nproc) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    while (bool_result == 0) { // someone else is trying to resize
      KMP_CPU_PAUSE();
      if (nproc <= base_num_threads) // happy with other thread's resize
        return;
      else // try to resize
        bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    }
    KMP_DEBUG_ASSERT(bool_result != 0);
    if (nproc <= base_num_threads)
      return; // happy with other thread's resize

    // Calculate new maxLevels
    kmp_uint32 old_sz = skipPerLevel[depth - 1];
    kmp_uint32 incs = 0, old_maxLevels = maxLevels;
    // First see if old maxLevels is enough to contain new size
    for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];
      numPerLevel[i - 1] *= 2;
      old_sz *= 2;
      depth++;
    }
    if (nproc > old_sz) { // Not enough space, need to expand hierarchy
      while (nproc > old_sz) {
        old_sz *= 2;
        incs++;
        depth++;
      }
      maxLevels += incs;

      // Resize arrays
      kmp_uint32 *old_numPerLevel = numPerLevel;
      kmp_uint32 *old_skipPerLevel = skipPerLevel;
      numPerLevel = skipPerLevel = NULL;
      numPerLevel =
          (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
      skipPerLevel = &(numPerLevel[maxLevels]);

      // Copy old elements from old arrays
      for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
        numPerLevel[i] = old_numPerLevel[i];
        skipPerLevel[i] = old_skipPerLevel[i];
      }

      // Init new elements in arrays to 1
      for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
        numPerLevel[i] = 1;
        skipPerLevel[i] = 1;
      }

      // Free old arrays
      __kmp_free(old_numPerLevel);
    }

    // Fill in oversubscription levels of hierarchy
    for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    base_num_threads = nproc;
    resizing = 0; // One writer
  }
};
#endif // KMP_AFFINITY_H