/*
 * kmp_affinity.h -- header for affinity management
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_AFFINITY_H
#define KMP_AFFINITY_H

#include "kmp.h"
#include "kmp_os.h"
#include <limits>

#if KMP_AFFINITY_SUPPORTED
#if KMP_USE_HWLOC
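// Affinity implementation backed by the hwloc library: each mask wraps an
// hwloc_cpuset_t and binding goes through hwloc_get_cpubind()/hwloc_set_cpubind().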
class KMPHwlocAffinity : public KMPAffinity {
public:
  class Mask : public KMPAffinity::Mask {
    hwloc_cpuset_t mask;

  public:
    Mask() {
      mask = hwloc_bitmap_alloc();
      this->zero();
    }
    Mask(const Mask &other) = delete;
    Mask &operator=(const Mask &other) = delete;
    ~Mask() { hwloc_bitmap_free(mask); }
    void set(int i) override { hwloc_bitmap_set(mask, i); }
    bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
    void clear(int i) override { hwloc_bitmap_clr(mask, i); }
    void zero() override { hwloc_bitmap_zero(mask); }
    bool empty() const override { return hwloc_bitmap_iszero(mask); }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      hwloc_bitmap_copy(mask, convert->mask);
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_and(mask, mask, convert->mask);
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_or(mask, mask, convert->mask);
    }
    void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      return hwloc_bitmap_isequal(mask, convert->mask);
    }
    int begin() const override { return hwloc_bitmap_first(mask); }
    int end() const override { return -1; }
    int next(int previous) const override {
      return hwloc_bitmap_next(mask, previous);
    }
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
      long retval =
          hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "hwloc_get_cpubind()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
      long retval =
          hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#if KMP_OS_WINDOWS
    int set_process_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set process affinity operation when not capable");
      int error = 0;
      const hwloc_topology_support *support =
          hwloc_topology_get_support(__kmp_hwloc_topology);
      if (support->cpubind->set_proc_cpubind) {
        int retval;
        retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
                                   HWLOC_CPUBIND_PROCESS);
        if (retval >= 0)
          return 0;
        error = errno;
        if (abort_on_error)
          __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
                      KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#endif
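    // Determine which Windows processor group this mask belongs to.
    // Returns 1 when only one group exists, the group index when every set
    // bit falls within a single group, and -1 when the mask spans multiple
    // groups (or on non-Windows builds, where processor groups do not apply).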
    int get_proc_group() const override {
      int group = -1;
#if KMP_OS_WINDOWS
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        // On Windows, the long type is always 32 bits
        unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
        unsigned long second_32_bits =
            hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
        if (first_32_bits == 0 && second_32_bits == 0) {
          continue;
        }
        if (group >= 0) {
          return -1;
        }
        group = i;
      }
#endif /* KMP_OS_WINDOWS */
      return group;
    }
  };
  void determine_capable(const char *var) override {
    const hwloc_topology_support *topology_support;
    if (__kmp_hwloc_topology == NULL) {
      if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity.flags.verbose) {
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
        }
      }
      if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity.flags.verbose) {
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
        }
      }
    }
    topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
    // Is the system capable of setting/getting this thread's affinity?
    // Also, is topology discovery possible? (pu indicates ability to discover
    // processing units). And finally, were there no errors when calling any
    // hwloc_* API functions?
    if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
        topology_support->cpubind->get_thisthread_cpubind &&
        topology_support->discovery->pu && !__kmp_hwloc_error) {
      // enables affinity according to KMP_AFFINITY_CAPABLE() macro
      KMP_AFFINITY_ENABLE(TRUE);
    } else {
      // indicate that hwloc didn't work and disable affinity
      __kmp_hwloc_error = TRUE;
      KMP_AFFINITY_DISABLE();
    }
  }
  void bind_thread(int which) override {
    KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                "Illegal set affinity operation when not capable");
    KMPAffinity::Mask *mask;
    KMP_CPU_ALLOC_ON_STACK(mask);
    KMP_CPU_ZERO(mask);
    KMP_CPU_SET(which, mask);
    __kmp_set_system_affinity(mask, TRUE);
    KMP_CPU_FREE_FROM_STACK(mask);
  }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    delete[] hwloc_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    return &(hwloc_array[index]);
  }
  api_type get_api_type() const override { return HWLOC; }
};
#endif /* KMP_USE_HWLOC */

#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY ||    \
    KMP_OS_AIX
#if KMP_OS_LINUX
/* On some of the older OS's that we build on, these constants aren't present
   in <asm/unistd.h> #included from <sys/syscall.h>. They must be the same on
   all systems of the same arch where they are defined, and they cannot
   change: they are set in stone forever. */
#include <sys/syscall.h>
#if KMP_ARCH_X86 || KMP_ARCH_ARM
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 241
#elif __NR_sched_setaffinity != 241
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 242
#elif __NR_sched_getaffinity != 242
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_AARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_X86_64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 203
#elif __NR_sched_setaffinity != 203
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 204
#elif __NR_sched_getaffinity != 204
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_PPC64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 222
#elif __NR_sched_setaffinity != 222
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 223
#elif __NR_sched_getaffinity != 223
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 4239
#elif __NR_sched_setaffinity != 4239
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 4240
#elif __NR_sched_getaffinity != 4240
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 5195
#elif __NR_sched_setaffinity != 5195
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 5196
#elif __NR_sched_getaffinity != 5196
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_LOONGARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_RISCV64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_VE
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 203
#elif __NR_sched_setaffinity != 203
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 204
#elif __NR_sched_getaffinity != 204
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_S390X
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 239
#elif __NR_sched_setaffinity != 239
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 240
#elif __NR_sched_getaffinity != 240
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_SPARC
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 261
#elif __NR_sched_setaffinity != 261
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 260
#elif __NR_sched_getaffinity != 260
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#else
#error Unknown or unsupported architecture
#endif /* KMP_ARCH_* */
#elif KMP_OS_FREEBSD || KMP_OS_DRAGONFLY
#include <pthread.h>
#include <pthread_np.h>
#elif KMP_OS_NETBSD
#include <pthread.h>
#include <sched.h>
#elif KMP_OS_AIX
#include <sys/dr.h>
#include <sys/rset.h>
#define VMI_MAXRADS 64 // Maximum number of RADs allowed by AIX.
#define GET_NUMBER_SMT_SETS 0x0004
extern "C" int syssmt(int flags, int, int, int *);
#endif
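// Affinity implementation using the operating system's native interfaces:
// the sched_{set,get}affinity syscalls on Linux, pthread_{set,get}affinity_np
// on the BSDs, and bindprocessor() on AIX. A mask is stored as an array of
// unsigned long words covering __kmp_affin_mask_size bytes.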
class KMPNativeAffinity : public KMPAffinity {
  class Mask : public KMPAffinity::Mask {
    typedef unsigned long mask_t;
    typedef decltype(__kmp_affin_mask_size) mask_size_type;
    static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    static const mask_t ONE = 1;
    mask_size_type get_num_mask_types() const {
      return __kmp_affin_mask_size / sizeof(mask_t);
    }

  public:
    mask_t *mask;
    Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
    }
    void zero() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = (mask_t)0;
    }
    bool empty() const override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        if (mask[i] != (mask_t)0)
          return false;
      return true;
    }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] |= convert->mask[i];
    }
    void bitwise_not() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = ~(mask[i]);
    }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        if (mask[i] != convert->mask[i])
          return false;
      return true;
    }
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override {
      int e;
      __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
      return e;
    }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
#if KMP_OS_AIX
    // On AIX, we don't have a way to get CPU(s) a thread is bound to.
    // This routine is only used to get the full mask.
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");

      (void)abort_on_error;

      // Set the mask with all CPUs that are available.
      for (int i = 0; i < __kmp_xproc; ++i)
        KMP_CPU_SET(i, this);
      return 0;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");

      int location;
      int gtid = __kmp_entry_gtid();
      int tid = thread_self();

      // Unbind the thread if it was bound to any processors before so that
      // we can bind the thread to CPUs specified by the mask not others.
      int retval = bindprocessor(BINDTHREAD, tid, PROCESSOR_CLASS_ANY);

      // On AIX, we can only bind to one instead of a set of CPUs with the
      // bindprocessor() system call.
      KMP_CPU_SET_ITERATE(location, this) {
        if (KMP_CPU_ISSET(location, this)) {
          retval = bindprocessor(BINDTHREAD, tid, location);
          if (retval == -1 && errno == 1) {
            rsid_t rsid;
            rsethandle_t rsh;
            // Put something in rsh to prevent compiler warning
            // about uninitialized use
            rsh = rs_alloc(RS_EMPTY);
            rsid.at_pid = getpid();
            if (RS_DEFAULT_RSET != ra_getrset(R_PROCESS, rsid, 0, rsh)) {
              retval = ra_detachrset(R_PROCESS, rsid, 0);
              retval = bindprocessor(BINDTHREAD, tid, location);
            }
          }
          if (retval == 0) {
            KA_TRACE(10, ("__kmp_set_system_affinity: Done binding "
                          "T#%d to cpu=%d.\n",
                          gtid, location));
            continue;
          }
          int error = errno;
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "bindprocessor()"),
                        KMP_ERR(error), __kmp_msg_null);
            KA_TRACE(10, ("__kmp_set_system_affinity: Error binding "
                          "T#%d to cpu=%d, errno=%d.\n",
                          gtid, location, error));
            return error;
          }
        }
      }
      return 0;
    }
#else // !KMP_OS_AIX
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
      int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "pthread_getaffinity_np()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
      int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "pthread_setaffinity_np()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#endif // KMP_OS_AIX
  };
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override {
    KMPNativeAffinity::Mask *retval = new Mask();
    return retval;
  }
  void deallocate_mask(KMPAffinity::Mask *m) override {
    KMPNativeAffinity::Mask *native_mask =
        static_cast<KMPNativeAffinity::Mask *>(m);
    delete native_mask;
  }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *linux_array = static_cast<Mask *>(array);
    delete[] linux_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *linux_array = static_cast<Mask *>(array);
    return &(linux_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
#endif /* KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY \
          || KMP_OS_AIX */

#if KMP_OS_WINDOWS
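// Affinity implementation using the native Windows API. The mask is an array
// of ULONG_PTR words, one per processor group; binding uses
// SetThreadAffinityMask() on single-group systems and SetThreadGroupAffinity()
// when multiple processor groups are present.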
class KMPNativeAffinity : public KMPAffinity {
  class Mask : public KMPAffinity::Mask {
    typedef ULONG_PTR mask_t;
    static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    mask_t *mask;

  public:
    Mask() {
      mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
    }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
    }
    void zero() override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = 0;
    }
    bool empty() const override {
      for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
        if (mask[i])
          return false;
      return true;
    }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] |= convert->mask[i];
    }
    void bitwise_not() override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = ~(mask[i]);
    }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
        if (mask[i] != convert->mask[i])
          return false;
      return true;
    }
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
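    // Set the process-wide affinity mask. Only attempted when there is a
    // single processor group; with multiple groups this is a no-op.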
    int set_process_affinity(bool abort_on_error) const override {
      if (__kmp_num_proc_groups <= 1) {
        if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      }
      return 0;
    }
    int set_system_affinity(bool abort_on_error) const override {
      if (__kmp_num_proc_groups > 1) {
        // Check for a valid mask.
        GROUP_AFFINITY ga;
        int group = get_proc_group();
        if (group < 0) {
          if (abort_on_error) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
          }
          return -1;
        }
        // Transform the bit vector into a GROUP_AFFINITY struct
        // and make the system call to set affinity.
        ga.Group = group;
        ga.Mask = mask[group];
        ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;

        KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
        if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      } else {
        if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      }
      return 0;
    }
    int get_system_affinity(bool abort_on_error) override {
      if (__kmp_num_proc_groups > 1) {
        this->zero();
        GROUP_AFFINITY ga;
        KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
        if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
            (ga.Mask == 0)) {
          return -1;
        }
        mask[ga.Group] = ga.Mask;
      } else {
        mask_t newMask, sysMask, retval;
        if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
        if (!retval) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
        if (!newMask) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
        }
        *mask = retval;
      }
      return 0;
    }
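    // Return 1 when only one processor group exists; otherwise return the
    // index of the single group covered by the mask, or -1 if the mask is
    // empty or spans multiple groups.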
    int get_proc_group() const override {
      int group = -1;
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        if (mask[i] == 0)
          continue;
        if (group >= 0)
          return -1;
        group = i;
      }
      return group;
    }
  };
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *windows_array = static_cast<Mask *>(array);
    delete[] windows_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *windows_array = static_cast<Mask *>(array);
    return &(windows_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
#endif /* KMP_OS_WINDOWS */
#endif /* KMP_AFFINITY_SUPPORTED */

// Describe an attribute for a level in the machine topology
struct kmp_hw_attr_t {
  int core_type : 8;
  int core_eff : 8;
  unsigned valid : 1;
  unsigned reserved : 15;

  static const int UNKNOWN_CORE_EFF = -1;

  kmp_hw_attr_t()
      : core_type(KMP_HW_CORE_TYPE_UNKNOWN), core_eff(UNKNOWN_CORE_EFF),
        valid(0), reserved(0) {}
  void set_core_type(kmp_hw_core_type_t type) {
    valid = 1;
    core_type = type;
  }
  void set_core_eff(int eff) {
    valid = 1;
    core_eff = eff;
  }
  kmp_hw_core_type_t get_core_type() const {
    return (kmp_hw_core_type_t)core_type;
  }
  int get_core_eff() const { return core_eff; }
  bool is_core_type_valid() const {
    return core_type != KMP_HW_CORE_TYPE_UNKNOWN;
  }
  bool is_core_eff_valid() const { return core_eff != UNKNOWN_CORE_EFF; }
  operator bool() const { return valid; }
  void clear() {
    core_type = KMP_HW_CORE_TYPE_UNKNOWN;
    core_eff = UNKNOWN_CORE_EFF;
    valid = 0;
  }
  bool contains(const kmp_hw_attr_t &other) const {
    if (!valid && !other.valid)
      return true;
    if (valid && other.valid) {
      if (other.is_core_type_valid()) {
        if (!is_core_type_valid() || (get_core_type() != other.get_core_type()))
          return false;
      }
      if (other.is_core_eff_valid()) {
        if (!is_core_eff_valid() || (get_core_eff() != other.get_core_eff()))
          return false;
      }
      return true;
    }
    return false;
  }
#if KMP_AFFINITY_SUPPORTED
  bool contains(const kmp_affinity_attrs_t &attr) const {
    if (!valid && !attr.valid)
      return true;
    if (valid && attr.valid) {
      if (attr.core_type != KMP_HW_CORE_TYPE_UNKNOWN)
        return (is_core_type_valid() &&
                (get_core_type() == (kmp_hw_core_type_t)attr.core_type));
      if (attr.core_eff != UNKNOWN_CORE_EFF)
        return (is_core_eff_valid() && (get_core_eff() == attr.core_eff));
      return true;
    }
    return false;
  }
#endif // KMP_AFFINITY_SUPPORTED
  bool operator==(const kmp_hw_attr_t &rhs) const {
    return (rhs.valid == valid && rhs.core_eff == core_eff &&
            rhs.core_type == core_type);
  }
  bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); }
};

#if KMP_AFFINITY_SUPPORTED
KMP_BUILD_ASSERT(sizeof(kmp_hw_attr_t) == sizeof(kmp_affinity_attrs_t));
#endif

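// A single hardware thread (OS processor), with its id at every topology
// level, its sub-ids within the enclosing layers, and its core attributes.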
class kmp_hw_thread_t {
public:
  static const int UNKNOWN_ID = -1;
  static const int MULTIPLE_ID = -2;
  static int compare_ids(const void *a, const void *b);
  static int compare_compact(const void *a, const void *b);
  int ids[KMP_HW_LAST];
  int sub_ids[KMP_HW_LAST];
  bool leader;
  int os_id;
  int original_idx;
  kmp_hw_attr_t attrs;

  void print() const;
  void clear() {
    for (int i = 0; i < (int)KMP_HW_LAST; ++i)
      ids[i] = UNKNOWN_ID;
    leader = false;
    attrs.clear();
  }
};

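// The detected machine topology: an ordered set of layers (e.g., socket,
// core, thread), per-layer ratios and counts, and the flat array of hardware
// threads that the create_map() routines fill in.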
class kmp_topology_t {

  struct flags_t {
    int uniform : 1;
    int reserved : 31;
  };

  int depth;

  // The following arrays are all 'depth' long and have been
  // allocated to hold up to KMP_HW_LAST number of objects if
  // needed so layers can be added without reallocation of any array

  // Ordered array of the types in the topology
  kmp_hw_t *types;

  // Keep quick topology ratios, for non-uniform topologies,
  // this ratio holds the max number of itemAs per itemB
  // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
  int *ratio;

  // Storage containing the absolute number of each topology layer
  int *count;

  // The number of core efficiencies. This is only useful for hybrid
  // topologies. Core efficiencies will range from 0 to num efficiencies - 1
  int num_core_efficiencies;
  int num_core_types;
  kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES];

  // The hardware threads array
  // hw_threads is num_hw_threads long
  // Each hw_thread's ids and sub_ids are depth deep
  int num_hw_threads;
  kmp_hw_thread_t *hw_threads;

  // Equivalence hash where the key is the hardware topology item
  // and the value is the equivalent hardware topology type in the
  // types[] array, if the value is KMP_HW_UNKNOWN, then there is no
  // known equivalence for the topology type
  kmp_hw_t equivalent[KMP_HW_LAST];

  // Flags describing the topology
  flags_t flags;

  // Compact value used during sort_compact()
  int compact;

#if KMP_GROUP_AFFINITY
  // Insert topology information about Windows Processor groups
  void _insert_windows_proc_groups();
#endif

  // Count each item & get the num x's per y
  // e.g., get the number of cores and the number of threads per core
  // for each (x, y) in (KMP_HW_* , KMP_HW_*)
  void _gather_enumeration_information();

  // Remove layers that don't add information to the topology.
  // This is done by having the layer take on the id = UNKNOWN_ID (-1)
  void _remove_radix1_layers();

  // Find out if the topology is uniform
  void _discover_uniformity();

  // Set all the sub_ids for each hardware thread
  void _set_sub_ids();

  // Set global affinity variables describing the number of threads per
  // core, the number of packages, the number of cores per package, and
  // the number of cores.
  void _set_globals();

  // Set the last level cache equivalent type
  void _set_last_level_cache();

  // Return the number of cores with a particular attribute, 'attr'.
  // If 'find_all' is true, then find all cores on the machine, otherwise find
  // all cores per the layer 'above'
  int _get_ncores_with_attr(const kmp_hw_attr_t &attr, int above,
                            bool find_all = false) const;

public:
  // Force use of allocate()/deallocate()
  kmp_topology_t() = delete;
  kmp_topology_t(const kmp_topology_t &t) = delete;
  kmp_topology_t(kmp_topology_t &&t) = delete;
  kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
  kmp_topology_t &operator=(kmp_topology_t &&t) = delete;

  static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
  static void deallocate(kmp_topology_t *);

  // Functions used in create_map() routines
  kmp_hw_thread_t &at(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
    return hw_threads[index];
  }
  const kmp_hw_thread_t &at(int index) const {
    KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
    return hw_threads[index];
  }
  int get_num_hw_threads() const { return num_hw_threads; }
  void sort_ids() {
    qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
          kmp_hw_thread_t::compare_ids);
  }

  // Insert a new topology layer after allocation
  void insert_layer(kmp_hw_t type, const int *ids);

  // Check if the hardware ids are unique, if they are
  // return true, otherwise return false
  bool check_ids() const;

  // Function to call after the create_map() routine
  void canonicalize();
  void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);

// Functions used after canonicalize() called

#if KMP_AFFINITY_SUPPORTED
  // Set the granularity for affinity settings
  void set_granularity(kmp_affinity_t &stgs) const;
  bool is_close(int hwt1, int hwt2, const kmp_affinity_t &stgs) const;
  bool restrict_to_mask(const kmp_affin_mask_t *mask);
  bool filter_hw_subset();
#endif
  bool is_uniform() const { return flags.uniform; }
  // Tell whether a type is a valid type in the topology
  // returns KMP_HW_UNKNOWN when there is no equivalent type
  kmp_hw_t get_equivalent_type(kmp_hw_t type) const {
    if (type == KMP_HW_UNKNOWN)
      return KMP_HW_UNKNOWN;
    return equivalent[type];
  }
  // Set type1 = type2
  void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
    kmp_hw_t real_type2 = equivalent[type2];
    if (real_type2 == KMP_HW_UNKNOWN)
      real_type2 = type2;
    equivalent[type1] = real_type2;
    // This loop is required since any of the types may have been set to
    // be equivalent to type1. They all must be checked and reset to type2.
    KMP_FOREACH_HW_TYPE(type) {
      if (equivalent[type] == type1) {
        equivalent[type] = real_type2;
      }
    }
  }
  // Calculate number of types corresponding to level1
  // per types corresponding to level2 (e.g., number of threads per core)
  int calculate_ratio(int level1, int level2) const {
    KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
    KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
    int r = 1;
    for (int level = level1; level > level2; --level)
      r *= ratio[level];
    return r;
  }
  int get_ratio(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return ratio[level];
  }
  int get_depth() const { return depth; };
  kmp_hw_t get_type(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return types[level];
  }
  int get_level(kmp_hw_t type) const {
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
    int eq_type = equivalent[type];
    if (eq_type == KMP_HW_UNKNOWN)
      return -1;
    for (int i = 0; i < depth; ++i)
      if (types[i] == eq_type)
        return i;
    return -1;
  }
  int get_count(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return count[level];
  }
  // Return the total number of cores with attribute 'attr'
  int get_ncores_with_attr(const kmp_hw_attr_t &attr) const {
    return _get_ncores_with_attr(attr, -1, true);
  }
  // Return the number of cores with attribute
  // 'attr' per topology level 'above'
  int get_ncores_with_attr_per(const kmp_hw_attr_t &attr, int above) const {
    return _get_ncores_with_attr(attr, above, false);
  }

#if KMP_AFFINITY_SUPPORTED
  friend int kmp_hw_thread_t::compare_compact(const void *a, const void *b);
  void sort_compact(kmp_affinity_t &affinity) {
    compact = affinity.compact;
    qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
          kmp_hw_thread_t::compare_compact);
  }
#endif
  void print(const char *env_var = "KMP_AFFINITY") const;
  void dump() const;
};
extern kmp_topology_t *__kmp_topology;

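// Parsed representation of the KMP_HW_SUBSET environment variable: a list of
// topology layers, each carrying the requested number of items, an offset,
// and optional core attributes.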
class kmp_hw_subset_t {
  const static size_t MAX_ATTRS = KMP_HW_MAX_NUM_CORE_EFFS;

public:
  // Describe a machine topology item in KMP_HW_SUBSET
  struct item_t {
    kmp_hw_t type;
    int num_attrs;
    int num[MAX_ATTRS];
    int offset[MAX_ATTRS];
    kmp_hw_attr_t attr[MAX_ATTRS];
  };
  // Put parenthesis around max to avoid accidental use of Windows max macro.
  const static int USE_ALL = (std::numeric_limits<int>::max)();

private:
  int depth;
  int capacity;
  item_t *items;
  kmp_uint64 set;
  bool absolute;
  // The set must be able to handle up to KMP_HW_LAST number of layers
  KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
  // Sorting the KMP_HW_SUBSET items to follow topology order
  // All unknown topology types will be at the beginning of the subset
  static int hw_subset_compare(const void *i1, const void *i2) {
    kmp_hw_t type1 = ((const item_t *)i1)->type;
    kmp_hw_t type2 = ((const item_t *)i2)->type;
    int level1 = __kmp_topology->get_level(type1);
    int level2 = __kmp_topology->get_level(type2);
    return level1 - level2;
  }

public:
  // Force use of allocate()/deallocate()
  kmp_hw_subset_t() = delete;
  kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
  kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
  kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
  kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;

  static kmp_hw_subset_t *allocate() {
    int initial_capacity = 5;
    kmp_hw_subset_t *retval =
        (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
    retval->depth = 0;
    retval->capacity = initial_capacity;
    retval->set = 0ull;
    retval->absolute = false;
    retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
    return retval;
  }
  static void deallocate(kmp_hw_subset_t *subset) {
    __kmp_free(subset->items);
    __kmp_free(subset);
  }
  void set_absolute() { absolute = true; }
  bool is_absolute() const { return absolute; }
  void push_back(int num, kmp_hw_t type, int offset, kmp_hw_attr_t attr) {
    for (int i = 0; i < depth; ++i) {
      // Found an existing item for this layer type
      // Add the num, offset, and attr to this item
      if (items[i].type == type) {
        int idx = items[i].num_attrs++;
        if ((size_t)idx >= MAX_ATTRS)
          return;
        items[i].num[idx] = num;
        items[i].offset[idx] = offset;
        items[i].attr[idx] = attr;
        return;
      }
    }
    if (depth == capacity - 1) {
      capacity *= 2;
      item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
      for (int i = 0; i < depth; ++i)
        new_items[i] = items[i];
      __kmp_free(items);
      items = new_items;
    }
    items[depth].num_attrs = 1;
    items[depth].type = type;
    items[depth].num[0] = num;
    items[depth].offset[0] = offset;
    items[depth].attr[0] = attr;
    depth++;
    set |= (1ull << type);
  }
  int get_depth() const { return depth; }
  const item_t &at(int index) const {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    return items[index];
  }
  item_t &at(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    return items[index];
  }
  void remove(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    set &= ~(1ull << items[index].type);
    for (int j = index + 1; j < depth; ++j) {
      items[j - 1] = items[j];
    }
    depth--;
  }
  void sort() {
    KMP_DEBUG_ASSERT(__kmp_topology);
    qsort(items, depth, sizeof(item_t), hw_subset_compare);
  }
  bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }

  // Canonicalize the KMP_HW_SUBSET value if it is not an absolute subset.
  // This means putting each of {sockets, cores, threads} in the topology if
  // they are not specified:
  // e.g., 1s,2c => 1s,2c,*t | 2c,1t => *s,2c,1t | 1t => *s,*c,1t | etc.
  // e.g., 3module => *s,3module,*c,*t
  // By doing this, the runtime assumes users who fiddle with KMP_HW_SUBSET
  // are expecting the traditional sockets/cores/threads topology. For newer
  // hardware, there can be intervening layers like dies/tiles/modules
  // (usually corresponding to a cache level). So when a user asks for
  // 1s,6c,2t and the topology is really 1s,2modules,4cores,2threads, the user
  // should get 12 hardware threads across 6 cores and effectively ignore the
  // module layer.
  void canonicalize(const kmp_topology_t *top) {
    // Layers to target for KMP_HW_SUBSET canonicalization
    kmp_hw_t targeted[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};

    // Do not target-layer-canonicalize absolute KMP_HW_SUBSETS
    if (is_absolute())
      return;

    // Do not target-layer-canonicalize KMP_HW_SUBSETS when the
    // topology doesn't have these layers
    for (kmp_hw_t type : targeted)
      if (top->get_level(type) == KMP_HW_UNKNOWN)
        return;

    // Put targeted layers in topology if they do not exist
    for (kmp_hw_t type : targeted) {
      bool found = false;
      for (int i = 0; i < get_depth(); ++i) {
        if (top->get_equivalent_type(items[i].type) == type) {
          found = true;
          break;
        }
      }
      if (!found) {
        push_back(USE_ALL, type, 0, kmp_hw_attr_t{});
      }
    }
    sort();
    // Set as an absolute topology that only targets the targeted layers
    set_absolute();
  }
  void dump() const {
    printf("**********************\n");
    printf("*** kmp_hw_subset: ***\n");
    printf("* depth: %d\n", depth);
    printf("* items:\n");
    for (int i = 0; i < depth; ++i) {
      printf("  type: %s\n", __kmp_hw_get_keyword(items[i].type));
      for (int j = 0; j < items[i].num_attrs; ++j) {
        printf("    num: %d, offset: %d, attr: ", items[i].num[j],
               items[i].offset[j]);
        if (!items[i].attr[j]) {
          printf(" (none)\n");
        } else {
          printf(
              " core_type = %s, core_eff = %d\n",
              __kmp_hw_get_core_type_string(items[i].attr[j].get_core_type()),
              items[i].attr[j].get_core_eff());
        }
      }
    }
    printf("* set: 0x%llx\n", set);
    printf("* absolute: %d\n", absolute);
    printf("**********************\n");
  }
};
extern kmp_hw_subset_t *__kmp_hw_subset;

/* A structure for holding machine-specific hierarchy info to be computed once
   at init. This structure represents a mapping of threads to the actual machine
   hierarchy, or to our best guess at what the hierarchy might be, for the
   purpose of performing an efficient barrier. In the worst case, when there is
   no machine hierarchy information, it produces a tree suitable for a barrier,
   similar to the tree used in the hyper barrier. */
class hierarchy_info {
public:
  /* Good default values for number of leaves and branching factor, given no
     affinity information. Behaves a bit like hyper barrier. */
  static const kmp_uint32 maxLeaves = 4;
  static const kmp_uint32 minBranch = 4;
  /** Number of levels in the hierarchy. Typical levels are threads/core,
      cores/package or socket, packages/node, nodes/machine, etc. We don't want
      to get specific with nomenclature. When the machine is oversubscribed we
      add levels to duplicate the hierarchy, doubling the thread capacity of the
      hierarchy each time we add a level. */
  kmp_uint32 maxLevels;

  /** This is specifically the depth of the machine configuration hierarchy, in
      terms of the number of levels along the longest path from root to any
      leaf. It corresponds to the number of entries in numPerLevel if we exclude
      all but one trailing 1. */
  kmp_uint32 depth;
  kmp_uint32 base_num_threads = 0;
  enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
  volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
  // 2=initialization in progress
  volatile kmp_int8 resizing; // 0=not resizing, 1=resizing

  /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children
      the parent of a node at level i has. For example, if we have a machine
      with 4 packages, 4 cores/package and 2 HT per core, then numPerLevel =
      {2, 4, 4, 1, 1}. All empty levels are set to 1. */
  kmp_uint32 *numPerLevel = nullptr;
  kmp_uint32 *skipPerLevel = nullptr;

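  // Derive the per-level branching factors from the detected machine
  // topology; level 0 is the innermost layer (e.g., threads per core).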
  void deriveLevels() {
    int hier_depth = __kmp_topology->get_depth();
    for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
      numPerLevel[level] = __kmp_topology->get_ratio(i);
    }
  }

  hierarchy_info()
      : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}

  void fini() {
    if (!uninitialized && numPerLevel) {
      __kmp_free(numPerLevel);
      numPerLevel = NULL;
      uninitialized = not_initialized;
    }
  }

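  // Build the hierarchy for num_addrs threads. Safe to call concurrently:
  // only one caller performs the initialization, the rest wait for it.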
  void init(int num_addrs) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
        &uninitialized, not_initialized, initializing);
    if (bool_result == 0) { // Wait for initialization
      while (TCR_1(uninitialized) != initialized)
        KMP_CPU_PAUSE();
      return;
    }
    KMP_DEBUG_ASSERT(bool_result == 1);

    /* Added explicit initialization of the data fields here to prevent usage of
       dirty value observed when static library is re-initialized multiple times
       (e.g. when non-OpenMP thread repeatedly launches/joins thread that uses
       OpenMP). */
    depth = 1;
    resizing = 0;
    maxLevels = 7;
    numPerLevel =
        (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
    skipPerLevel = &(numPerLevel[maxLevels]);
    for (kmp_uint32 i = 0; i < maxLevels;
         ++i) { // init numPerLevel[*] to 1 item per level
      numPerLevel[i] = 1;
      skipPerLevel[i] = 1;
    }

    // Sort table by physical ID
    if (__kmp_topology && __kmp_topology->get_depth() > 0) {
      deriveLevels();
    } else {
      numPerLevel[0] = maxLeaves;
      numPerLevel[1] = num_addrs / maxLeaves;
      if (num_addrs % maxLeaves)
        numPerLevel[1]++;
    }

    base_num_threads = num_addrs;
    for (int i = maxLevels - 1; i >= 0;
         --i) // count non-empty levels to get depth
      if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
        depth++;

    kmp_uint32 branch = minBranch;
    if (numPerLevel[0] == 1)
      branch = num_addrs / maxLeaves;
    if (branch < minBranch)
      branch = minBranch;
    for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
      while (numPerLevel[d] > branch ||
             (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
        if (numPerLevel[d] & 1)
          numPerLevel[d]++;
        numPerLevel[d] = numPerLevel[d] >> 1;
        if (numPerLevel[d + 1] == 1)
          depth++;
        numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
      }
      if (numPerLevel[0] == 1) {
        branch = branch >> 1;
        if (branch < 4)
          branch = minBranch;
      }
    }

    for (kmp_uint32 i = 1; i < depth; ++i)
      skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
    // Fill in hierarchy in the case of oversubscription
    for (kmp_uint32 i = depth; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    uninitialized = initialized; // One writer
  }

  // Resize the hierarchy if nproc changes to something larger than before
  void resize(kmp_uint32 nproc) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    while (bool_result == 0) { // someone else is trying to resize
      KMP_CPU_PAUSE();
      if (nproc <= base_num_threads) // happy with other thread's resize
        return;
      else // try to resize
        bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    }
    KMP_DEBUG_ASSERT(bool_result != 0);
    if (nproc <= base_num_threads)
      return; // happy with other thread's resize

    // Calculate new maxLevels
    kmp_uint32 old_sz = skipPerLevel[depth - 1];
    kmp_uint32 incs = 0, old_maxLevels = maxLevels;
    // First see if old maxLevels is enough to contain new size
    for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];
      numPerLevel[i - 1] *= 2;
      old_sz *= 2;
      depth++;
    }
    if (nproc > old_sz) { // Not enough space, need to expand hierarchy
      while (nproc > old_sz) {
        old_sz *= 2;
        incs++;
        depth++;
      }
      maxLevels += incs;

      // Resize arrays
      kmp_uint32 *old_numPerLevel = numPerLevel;
      kmp_uint32 *old_skipPerLevel = skipPerLevel;
      numPerLevel = skipPerLevel = NULL;
      numPerLevel =
          (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
      skipPerLevel = &(numPerLevel[maxLevels]);

      // Copy old elements from old arrays
      for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
        // init numPerLevel[*] to 1 item per level
        numPerLevel[i] = old_numPerLevel[i];
        skipPerLevel[i] = old_skipPerLevel[i];
      }

      // Init new elements in arrays to 1
      for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
        // init numPerLevel[*] to 1 item per level
        numPerLevel[i] = 1;
        skipPerLevel[i] = 1;
      }

      // Free old arrays
      __kmp_free(old_numPerLevel);
    }

    // Fill in oversubscription levels of hierarchy
    for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    base_num_threads = nproc;
    resizing = 0; // One writer
  }
};
#endif // KMP_AFFINITY_H