/*
 * kmp_affinity.h -- header for affinity management
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_AFFINITY_H
#define KMP_AFFINITY_H

#include "kmp.h"
#include "kmp_os.h"
#include <limits>

#if KMP_AFFINITY_SUPPORTED
#if KMP_USE_HWLOC
class KMPHwlocAffinity : public KMPAffinity {
public:
  class Mask : public KMPAffinity::Mask {
    hwloc_cpuset_t mask;

  public:
    Mask() {
      mask = hwloc_bitmap_alloc();
      this->zero();
    }
    Mask(const Mask &other) = delete;
    Mask &operator=(const Mask &other) = delete;
    ~Mask() { hwloc_bitmap_free(mask); }
    void set(int i) override { hwloc_bitmap_set(mask, i); }
    bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
    void clear(int i) override { hwloc_bitmap_clr(mask, i); }
    void zero() override { hwloc_bitmap_zero(mask); }
    bool empty() const override { return hwloc_bitmap_iszero(mask); }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      hwloc_bitmap_copy(mask, convert->mask);
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_and(mask, mask, convert->mask);
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_or(mask, mask, convert->mask);
    }
    void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      return hwloc_bitmap_isequal(mask, convert->mask);
    }
    int begin() const override { return hwloc_bitmap_first(mask); }
    int end() const override { return -1; }
    int next(int previous) const override {
      return hwloc_bitmap_next(mask, previous);
    }
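    // A minimal iteration sketch (illustrative, not part of the interface
    // contract): end() follows hwloc's -1 sentinel convention, so a full
    // traversal looks like
    //   for (int i = m.begin(); i != m.end(); i = m.next(i))
    //     use(i);
    // i.e. end() is the value hwloc_bitmap_next() yields past the last set
    // bit, not a past-the-end index.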
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
      long retval =
          hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "hwloc_get_cpubind()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
      long retval =
          hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#if KMP_OS_WINDOWS
    int set_process_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set process affinity operation when not capable");
      int error = 0;
      const hwloc_topology_support *support =
          hwloc_topology_get_support(__kmp_hwloc_topology);
      if (support->cpubind->set_proc_cpubind) {
        int retval;
        retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
                                   HWLOC_CPUBIND_PROCESS);
        if (retval >= 0)
          return 0;
        error = errno;
        if (abort_on_error)
          __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
                      KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#endif
    int get_proc_group() const override {
      int group = -1;
#if KMP_OS_WINDOWS
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        // On Windows, the unsigned long type is always 32 bits.
        unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
        unsigned long second_32_bits =
            hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
        if (first_32_bits == 0 && second_32_bits == 0) {
          continue;
        }
        if (group >= 0) {
          return -1;
        }
        group = i;
      }
#endif /* KMP_OS_WINDOWS */
      return group;
    }
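    // Worked example of the group probe above: a Windows processor group
    // spans up to 64 logical processors, i.e. bits [64*i, 64*i + 63] of the
    // bitmap for group i. Because a Windows unsigned long is 32 bits, those
    // bits come back as the two ulongs at indices 2*i and 2*i + 1, which is
    // why the loop reads the bitmap in pairs.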
  };
  void determine_capable(const char *var) override {
    const hwloc_topology_support *topology_support;
    if (__kmp_hwloc_topology == NULL) {
      if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity.flags.verbose) {
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
        }
      }
      if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity.flags.verbose) {
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
        }
      }
    }
    topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
    // Is the system capable of setting/getting this thread's affinity?
    // Also, is topology discovery possible? (pu indicates ability to discover
    // processing units). And finally, were there no errors when calling any
    // hwloc_* API functions?
    if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
        topology_support->cpubind->get_thisthread_cpubind &&
        topology_support->discovery->pu && !__kmp_hwloc_error) {
      // enables affinity according to KMP_AFFINITY_CAPABLE() macro
      KMP_AFFINITY_ENABLE(TRUE);
    } else {
      // indicate that hwloc didn't work and disable affinity
      __kmp_hwloc_error = TRUE;
      KMP_AFFINITY_DISABLE();
    }
  }
  void bind_thread(int which) override {
    KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                "Illegal set affinity operation when not capable");
    KMPAffinity::Mask *mask;
    KMP_CPU_ALLOC_ON_STACK(mask);
    KMP_CPU_ZERO(mask);
    KMP_CPU_SET(which, mask);
    __kmp_set_system_affinity(mask, TRUE);
    KMP_CPU_FREE_FROM_STACK(mask);
  }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    delete[] hwloc_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    return &(hwloc_array[index]);
  }
  api_type get_api_type() const override { return HWLOC; }
};
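// A minimal lifecycle sketch for the KMPAffinity interface (illustrative
// only; the runtime normally drives these methods through the KMP_CPU_*
// macros in kmp.h rather than calling them directly):
//   KMPAffinity *affinity = new KMPHwlocAffinity();
//   affinity->determine_capable("KMP_AFFINITY");
//   KMPAffinity::Mask *m = affinity->allocate_mask();
//   m->zero();
//   m->set(0); // restrict the calling thread to logical CPU 0
//   m->set_system_affinity(/*abort_on_error=*/false);
//   affinity->deallocate_mask(m);
//   delete affinity;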
#endif /* KMP_USE_HWLOC */

#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY ||    \
    KMP_OS_AIX
#if KMP_OS_LINUX
/* On some of the older OS's that we build on, these constants aren't present
   in <asm/unistd.h> #included from <sys/syscall.h>. They must be the same on
   all systems of the same arch where they are defined, and they cannot
   change: they are set in stone forever. */
#include <sys/syscall.h>
#if KMP_ARCH_X86 || KMP_ARCH_ARM
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 241
#elif __NR_sched_setaffinity != 241
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 242
#elif __NR_sched_getaffinity != 242
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_AARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_X86_64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 203
#elif __NR_sched_setaffinity != 203
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 204
#elif __NR_sched_getaffinity != 204
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_PPC64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 222
#elif __NR_sched_setaffinity != 222
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 223
#elif __NR_sched_getaffinity != 223
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 4239
#elif __NR_sched_setaffinity != 4239
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 4240
#elif __NR_sched_getaffinity != 4240
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 5195
#elif __NR_sched_setaffinity != 5195
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 5196
#elif __NR_sched_getaffinity != 5196
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_LOONGARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_RISCV64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_VE
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 203
#elif __NR_sched_setaffinity != 203
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 204
#elif __NR_sched_getaffinity != 204
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_S390X
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 239
#elif __NR_sched_setaffinity != 239
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 240
#elif __NR_sched_getaffinity != 240
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_SPARC
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 261
#elif __NR_sched_setaffinity != 261
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 260
#elif __NR_sched_getaffinity != 260
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#else
#error Unknown or unsupported architecture
#endif /* KMP_ARCH_* */
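/* These numbers exist so the Linux Mask methods below can invoke
   sched_{set,get}affinity directly via syscall(2), passing
   __kmp_affin_mask_size explicitly instead of relying on libc wrappers that
   may be absent on older systems. A standalone sketch of the same call (not
   the runtime's code; the buffer size here is arbitrary):
     unsigned long bits[16] = {0};
     syscall(__NR_sched_getaffinity, 0, sizeof(bits), bits);
   where pid 0 means "the calling thread". */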
#elif KMP_OS_FREEBSD || KMP_OS_DRAGONFLY
#include <pthread.h>
#include <pthread_np.h>
#elif KMP_OS_NETBSD
#include <pthread.h>
#include <sched.h>
#elif KMP_OS_AIX
#include <sys/dr.h>
#include <sys/rset.h>
#define VMI_MAXRADS 64 // Maximum number of RADs allowed by AIX.
#define GET_NUMBER_SMT_SETS 0x0004
extern "C" int syssmt(int flags, int, int, int *);
#endif
class KMPNativeAffinity : public KMPAffinity {
  class Mask : public KMPAffinity::Mask {
    typedef unsigned long mask_t;
    typedef decltype(__kmp_affin_mask_size) mask_size_type;
    static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    static const mask_t ONE = 1;
    mask_size_type get_num_mask_types() const {
      return __kmp_affin_mask_size / sizeof(mask_t);
    }

  public:
    mask_t *mask;
    Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
    }
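    // Worked example of the indexing above: with a 64-bit mask_t, set(70)
    // touches word 70 / 64 == 1 and bit 70 % 64 == 6, i.e.
    //   mask[1] |= (ONE << 6);
    // ONE is a mask_t rather than a plain int literal so the shift stays
    // well-defined for bit positions beyond what an int shift allows.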
    void zero() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = (mask_t)0;
    }
    bool empty() const override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        if (mask[i] != (mask_t)0)
          return false;
      return true;
    }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] |= convert->mask[i];
    }
    void bitwise_not() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = ~(mask[i]);
    }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        if (mask[i] != convert->mask[i])
          return false;
      return true;
    }
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override {
      int e;
      __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
      return e;
    }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
#if KMP_OS_AIX
    // On AIX, we don't have a way to get the CPU(s) a thread is bound to.
    // This routine is only used to get the full mask.
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");

      (void)abort_on_error;

      // Set the mask with all CPUs that are available.
      for (int i = 0; i < __kmp_xproc; ++i)
        KMP_CPU_SET(i, this);
      return 0;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");

      int location;
      int gtid = __kmp_entry_gtid();
      int tid = thread_self();

      // Unbind the thread if it was bound to any processors before, so that
      // we can bind it only to the CPUs specified by the mask.
      int retval = bindprocessor(BINDTHREAD, tid, PROCESSOR_CLASS_ANY);

      // On AIX, we can only bind to one instead of a set of CPUs with the
      // bindprocessor() system call.
      KMP_CPU_SET_ITERATE(location, this) {
        if (KMP_CPU_ISSET(location, this)) {
          retval = bindprocessor(BINDTHREAD, tid, location);
          if (retval == -1 && errno == 1) { // EPERM: likely attached to an rset
            rsid_t rsid;
            rsethandle_t rsh;
            // Put something in rsh to prevent compiler warning
            // about uninitialized use
            rsh = rs_alloc(RS_EMPTY);
            rsid.at_pid = getpid();
            if (RS_DEFAULT_RSET != ra_getrset(R_PROCESS, rsid, 0, rsh)) {
              retval = ra_detachrset(R_PROCESS, rsid, 0);
              retval = bindprocessor(BINDTHREAD, tid, location);
            }
          }
          if (retval == 0) {
            KA_TRACE(10, ("__kmp_set_system_affinity: Done binding "
                          "T#%d to cpu=%d.\n",
                          gtid, location));
            continue;
          }
          int error = errno;
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "bindprocessor()"),
                        KMP_ERR(error), __kmp_msg_null);
            KA_TRACE(10, ("__kmp_set_system_affinity: Error binding "
                          "T#%d to cpu=%d, errno=%d.\n",
                          gtid, location, error));
            return error;
          }
        }
      }
      return 0;
    }
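    // Note on semantics (AIX-specific, hedged): because bindprocessor()
    // attaches a thread to a single logical CPU and each call replaces the
    // previous binding, the loop above effectively leaves the thread bound
    // to the last CPU in the mask that was successfully passed in; a mask
    // spanning several CPUs cannot be expressed exactly through this
    // interface.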
#else // !KMP_OS_AIX
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
      int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "pthread_getaffinity_np()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
      int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "pthread_setaffinity_np()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#endif // KMP_OS_AIX
  };
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override {
    KMPNativeAffinity::Mask *retval = new Mask();
    return retval;
  }
  void deallocate_mask(KMPAffinity::Mask *m) override {
    KMPNativeAffinity::Mask *native_mask =
        static_cast<KMPNativeAffinity::Mask *>(m);
    delete native_mask;
  }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *linux_array = static_cast<Mask *>(array);
    delete[] linux_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *linux_array = static_cast<Mask *>(array);
    return &(linux_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
#endif /* KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY \
          || KMP_OS_AIX */

#if KMP_OS_WINDOWS
class KMPNativeAffinity : public KMPAffinity {
  class Mask : public KMPAffinity::Mask {
    typedef ULONG_PTR mask_t;
    static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    mask_t *mask;

  public:
    Mask() {
      mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
    }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
    }
    void zero() override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = 0;
    }
    bool empty() const override {
      for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
        if (mask[i])
          return false;
      return true;
    }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] |= convert->mask[i];
    }
    void bitwise_not() override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = ~(mask[i]);
    }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
        if (mask[i] != convert->mask[i])
          return false;
      return true;
    }
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int set_process_affinity(bool abort_on_error) const override {
      if (__kmp_num_proc_groups <= 1) {
        if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      }
      return 0;
    }
    int set_system_affinity(bool abort_on_error) const override {
      if (__kmp_num_proc_groups > 1) {
        // Check for a valid mask.
        GROUP_AFFINITY ga;
        int group = get_proc_group();
        if (group < 0) {
          if (abort_on_error) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
          }
          return -1;
        }
        // Transform the bit vector into a GROUP_AFFINITY struct
        // and make the system call to set affinity.
        ga.Group = group;
        ga.Mask = mask[group];
        ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;

        KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
        if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      } else {
        if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      }
      return 0;
    }
    int get_system_affinity(bool abort_on_error) override {
      if (__kmp_num_proc_groups > 1) {
        this->zero();
        GROUP_AFFINITY ga;
        KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
        if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        // Valid group indices are 0 .. __kmp_num_proc_groups - 1.
        if ((ga.Group < 0) || (ga.Group >= __kmp_num_proc_groups) ||
            (ga.Mask == 0)) {
          return -1;
        }
        mask[ga.Group] = ga.Mask;
      } else {
        mask_t newMask, sysMask, retval;
        if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
        if (!retval) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
        if (!newMask) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
        }
        *mask = retval;
      }
      return 0;
    }
    int get_proc_group() const override {
      int group = -1;
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        if (mask[i] == 0)
          continue;
        if (group >= 0)
          return -1;
        group = i;
      }
      return group;
    }
  };
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *windows_array = static_cast<Mask *>(array);
    delete[] windows_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *windows_array = static_cast<Mask *>(array);
    return &(windows_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
#endif /* KMP_OS_WINDOWS */
#endif /* KMP_AFFINITY_SUPPORTED */

// Describe an attribute for a level in the machine topology
struct kmp_hw_attr_t {
  int core_type : 8;
  int core_eff : 8;
  unsigned valid : 1;
  unsigned reserved : 15;

  static const int UNKNOWN_CORE_EFF = -1;

  kmp_hw_attr_t()
      : core_type(KMP_HW_CORE_TYPE_UNKNOWN), core_eff(UNKNOWN_CORE_EFF),
        valid(0), reserved(0) {}
  void set_core_type(kmp_hw_core_type_t type) {
    valid = 1;
    core_type = type;
  }
  void set_core_eff(int eff) {
    valid = 1;
    core_eff = eff;
  }
  kmp_hw_core_type_t get_core_type() const {
    return (kmp_hw_core_type_t)core_type;
  }
  int get_core_eff() const { return core_eff; }
  bool is_core_type_valid() const {
    return core_type != KMP_HW_CORE_TYPE_UNKNOWN;
  }
  bool is_core_eff_valid() const { return core_eff != UNKNOWN_CORE_EFF; }
  operator bool() const { return valid; }
  void clear() {
    core_type = KMP_HW_CORE_TYPE_UNKNOWN;
    core_eff = UNKNOWN_CORE_EFF;
    valid = 0;
  }
  bool contains(const kmp_hw_attr_t &other) const {
    if (!valid && !other.valid)
      return true;
    if (valid && other.valid) {
      if (other.is_core_type_valid()) {
        if (!is_core_type_valid() || (get_core_type() != other.get_core_type()))
          return false;
      }
      if (other.is_core_eff_valid()) {
        if (!is_core_eff_valid() || (get_core_eff() != other.get_core_eff()))
          return false;
      }
      return true;
    }
    return false;
  }
#if KMP_AFFINITY_SUPPORTED
  bool contains(const kmp_affinity_attrs_t &attr) const {
    if (!valid && !attr.valid)
      return true;
    if (valid && attr.valid) {
      if (attr.core_type != KMP_HW_CORE_TYPE_UNKNOWN)
        return (is_core_type_valid() &&
                (get_core_type() == (kmp_hw_core_type_t)attr.core_type));
      if (attr.core_eff != UNKNOWN_CORE_EFF)
        return (is_core_eff_valid() && (get_core_eff() == attr.core_eff));
      return true;
    }
    return false;
  }
#endif // KMP_AFFINITY_SUPPORTED
  bool operator==(const kmp_hw_attr_t &rhs) const {
    return (rhs.valid == valid && rhs.core_eff == core_eff &&
            rhs.core_type == core_type);
  }
  bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); }
};
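// Worked example of contains() (using the x86 hybrid core types as an
// illustration): a fully-specified attribute contains a less-specific one,
// but not vice versa. Given
//   kmp_hw_attr_t a, b;
//   a.set_core_type(KMP_HW_CORE_TYPE_CORE);
//   a.set_core_eff(1);
//   b.set_core_type(KMP_HW_CORE_TYPE_CORE);
// a.contains(b) is true (b only constrains the core type, which matches),
// while b.contains(a) is false (a also constrains core_eff, which b does not
// satisfy).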

#if KMP_AFFINITY_SUPPORTED
KMP_BUILD_ASSERT(sizeof(kmp_hw_attr_t) == sizeof(kmp_affinity_attrs_t));
#endif

class kmp_hw_thread_t {
public:
  static const int UNKNOWN_ID = -1;
  static const int MULTIPLE_ID = -2;
  static int compare_ids(const void *a, const void *b);
  static int compare_compact(const void *a, const void *b);
  int ids[KMP_HW_LAST];
  int sub_ids[KMP_HW_LAST];
  bool leader;
  int os_id;
  int original_idx;
  kmp_hw_attr_t attrs;

  void print() const;
  void clear() {
    for (int i = 0; i < (int)KMP_HW_LAST; ++i)
      ids[i] = UNKNOWN_ID;
    leader = false;
    attrs.clear();
  }
};

class kmp_topology_t {

  struct flags_t {
    int uniform : 1;
    int reserved : 31;
  };

  int depth;

  // The following arrays are all 'depth' long. They are allocated to hold up
  // to KMP_HW_LAST objects so layers can be added without reallocating any
  // array.

  // Ordered array of the types in the topology
  kmp_hw_t *types;

  // Keep quick topology ratios; for non-uniform topologies, this ratio holds
  // the max number of itemAs per itemB,
  // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
  int *ratio;

  // Storage containing the absolute number of each topology layer
  int *count;

  // The number of core efficiencies. This is only useful for hybrid
  // topologies. Core efficiencies will range from 0 to num efficiencies - 1.
  int num_core_efficiencies;
  int num_core_types;
  kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES];

  // The hardware threads array
  // hw_threads is num_hw_threads long
  // Each hw_thread's ids and sub_ids are depth deep
  int num_hw_threads;
  kmp_hw_thread_t *hw_threads;

  // Equivalence hash where the key is the hardware topology item
  // and the value is the equivalent hardware topology type in the
  // types[] array; if the value is KMP_HW_UNKNOWN, then there is no
  // known equivalence for the topology type
  kmp_hw_t equivalent[KMP_HW_LAST];

  // Flags describing the topology
  flags_t flags;

  // Compact value used during sort_compact()
  int compact;

#if KMP_GROUP_AFFINITY
  // Insert topology information about Windows Processor groups
  void _insert_windows_proc_groups();
#endif

  // Count each item & get the num x's per y
  // e.g., get the number of cores and the number of threads per core
  // for each (x, y) in (KMP_HW_*, KMP_HW_*)
  void _gather_enumeration_information();

  // Remove layers that don't add information to the topology.
  // This is done by having the layer take on the id = UNKNOWN_ID (-1)
  void _remove_radix1_layers();

  // Find out if the topology is uniform
  void _discover_uniformity();

  // Set all the sub_ids for each hardware thread
  void _set_sub_ids();

  // Set global affinity variables describing the number of threads per
  // core, the number of packages, the number of cores per package, and
  // the number of cores.
  void _set_globals();

  // Set the last level cache equivalent type
  void _set_last_level_cache();

  // Return the number of cores with a particular attribute, 'attr'.
  // If 'find_all' is true, then find all cores on the machine, otherwise find
  // all cores per the layer 'above'.
  int _get_ncores_with_attr(const kmp_hw_attr_t &attr, int above,
                            bool find_all = false) const;

public:
  // Force use of allocate()/deallocate()
  kmp_topology_t() = delete;
  kmp_topology_t(const kmp_topology_t &t) = delete;
  kmp_topology_t(kmp_topology_t &&t) = delete;
  kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
  kmp_topology_t &operator=(kmp_topology_t &&t) = delete;

  static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
  static void deallocate(kmp_topology_t *);

  // Functions used in create_map() routines
  kmp_hw_thread_t &at(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
    return hw_threads[index];
  }
  const kmp_hw_thread_t &at(int index) const {
    KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
    return hw_threads[index];
  }
  int get_num_hw_threads() const { return num_hw_threads; }
  void sort_ids() {
    qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
          kmp_hw_thread_t::compare_ids);
  }

  // Insert a new topology layer after allocation
  void insert_layer(kmp_hw_t type, const int *ids);

  // Check if the hardware ids are unique; if they are,
  // return true, otherwise return false
  bool check_ids() const;

  // Function to call after the create_map() routine
  void canonicalize();
  void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);

  // Functions used after canonicalize() called

#if KMP_AFFINITY_SUPPORTED
  // Set the granularity for affinity settings
  void set_granularity(kmp_affinity_t &stgs) const;
  bool is_close(int hwt1, int hwt2, const kmp_affinity_t &stgs) const;
  bool restrict_to_mask(const kmp_affin_mask_t *mask);
  bool filter_hw_subset();
#endif
  bool is_uniform() const { return flags.uniform; }
  // Tell whether a type is a valid type in the topology;
  // returns KMP_HW_UNKNOWN when there is no equivalent type
  kmp_hw_t get_equivalent_type(kmp_hw_t type) const {
    if (type == KMP_HW_UNKNOWN)
      return KMP_HW_UNKNOWN;
    return equivalent[type];
  }
  // Set type1 = type2
  void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
    kmp_hw_t real_type2 = equivalent[type2];
    if (real_type2 == KMP_HW_UNKNOWN)
      real_type2 = type2;
    equivalent[type1] = real_type2;
    // This loop is required since any of the types may have been set to
    // be equivalent to type1. They all must be checked and reset to type2.
    KMP_FOREACH_HW_TYPE(type) {
      if (equivalent[type] == type1) {
        equivalent[type] = real_type2;
      }
    }
  }
  // Calculate number of types corresponding to level1
  // per types corresponding to level2 (e.g., number of threads per core)
  int calculate_ratio(int level1, int level2) const {
    KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
    KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
    int r = 1;
    for (int level = level1; level > level2; --level)
      r *= ratio[level];
    return r;
  }
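  // Worked example: for [ 4 sockets | 6 cores/socket | 2 threads/core ],
  // ratio = {4, 6, 2}. With level1 = 2 (threads) and level2 = 0 (sockets),
  // calculate_ratio(2, 0) multiplies ratio[2] * ratio[1] = 2 * 6 = 12
  // threads per socket.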
  int get_ratio(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return ratio[level];
  }
  int get_depth() const { return depth; }
  kmp_hw_t get_type(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return types[level];
  }
  int get_level(kmp_hw_t type) const {
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
    int eq_type = equivalent[type];
    if (eq_type == KMP_HW_UNKNOWN)
      return -1;
    for (int i = 0; i < depth; ++i)
      if (types[i] == eq_type)
        return i;
    return -1;
  }
  int get_count(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return count[level];
  }
  // Return the total number of cores with attribute 'attr'
  int get_ncores_with_attr(const kmp_hw_attr_t &attr) const {
    return _get_ncores_with_attr(attr, -1, /*find_all=*/true);
  }
  // Return the number of cores with attribute
  // 'attr' per topology level 'above'
  int get_ncores_with_attr_per(const kmp_hw_attr_t &attr, int above) const {
    return _get_ncores_with_attr(attr, above, /*find_all=*/false);
  }

#if KMP_AFFINITY_SUPPORTED
  friend int kmp_hw_thread_t::compare_compact(const void *a, const void *b);
  void sort_compact(kmp_affinity_t &affinity) {
    compact = affinity.compact;
    qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
          kmp_hw_thread_t::compare_compact);
  }
#endif
  void print(const char *env_var = "KMP_AFFINITY") const;
  void dump() const;
};
extern kmp_topology_t *__kmp_topology;

class kmp_hw_subset_t {
  const static size_t MAX_ATTRS = KMP_HW_MAX_NUM_CORE_EFFS;

public:
  // Describe a machine topology item in KMP_HW_SUBSET
  struct item_t {
    kmp_hw_t type;
    int num_attrs;
    int num[MAX_ATTRS];
    int offset[MAX_ATTRS];
    kmp_hw_attr_t attr[MAX_ATTRS];
  };
  // Put parentheses around max to avoid accidental use of the Windows max
  // macro.
  const static int USE_ALL = (std::numeric_limits<int>::max)();

private:
  int depth;
  int capacity;
  item_t *items;
  kmp_uint64 set;
  bool absolute;
  // The set must be able to handle up to KMP_HW_LAST number of layers
  KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
  // Sorting the KMP_HW_SUBSET items to follow topology order
  // All unknown topology types will be at the beginning of the subset
  static int hw_subset_compare(const void *i1, const void *i2) {
    kmp_hw_t type1 = ((const item_t *)i1)->type;
    kmp_hw_t type2 = ((const item_t *)i2)->type;
    int level1 = __kmp_topology->get_level(type1);
    int level2 = __kmp_topology->get_level(type2);
    return level1 - level2;
  }

public:
  // Force use of allocate()/deallocate()
  kmp_hw_subset_t() = delete;
  kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
  kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
  kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
  kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;

  static kmp_hw_subset_t *allocate() {
    int initial_capacity = 5;
    kmp_hw_subset_t *retval =
        (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
    retval->depth = 0;
    retval->capacity = initial_capacity;
    retval->set = 0ull;
    retval->absolute = false;
    retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
    return retval;
  }
  static void deallocate(kmp_hw_subset_t *subset) {
    __kmp_free(subset->items);
    __kmp_free(subset);
  }
  void set_absolute() { absolute = true; }
  bool is_absolute() const { return absolute; }
  void push_back(int num, kmp_hw_t type, int offset, kmp_hw_attr_t attr) {
    for (int i = 0; i < depth; ++i) {
      // Found an existing item for this layer type;
      // add the num, offset, and attr to this item
      if (items[i].type == type) {
        int idx = items[i].num_attrs++;
        if ((size_t)idx >= MAX_ATTRS)
          return;
        items[i].num[idx] = num;
        items[i].offset[idx] = offset;
        items[i].attr[idx] = attr;
        return;
      }
    }
    if (depth == capacity - 1) {
      capacity *= 2;
      item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
      for (int i = 0; i < depth; ++i)
        new_items[i] = items[i];
      __kmp_free(items);
      items = new_items;
    }
    items[depth].num_attrs = 1;
    items[depth].type = type;
    items[depth].num[0] = num;
    items[depth].offset[0] = offset;
    items[depth].attr[0] = attr;
    depth++;
    set |= (1ull << type);
  }
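  // Illustrative example (hypothetical input): parsing KMP_HW_SUBSET=2s,4c,2t
  // elsewhere in the runtime would amount to three calls,
  //   push_back(2, KMP_HW_SOCKET, 0, kmp_hw_attr_t{});
  //   push_back(4, KMP_HW_CORE, 0, kmp_hw_attr_t{});
  //   push_back(2, KMP_HW_THREAD, 0, kmp_hw_attr_t{});
  // after which depth == 3 and specified() is true for all three types.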
  int get_depth() const { return depth; }
  const item_t &at(int index) const {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    return items[index];
  }
  item_t &at(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    return items[index];
  }
  void remove(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    set &= ~(1ull << items[index].type);
    for (int j = index + 1; j < depth; ++j) {
      items[j - 1] = items[j];
    }
    depth--;
  }
  void sort() {
    KMP_DEBUG_ASSERT(__kmp_topology);
    qsort(items, depth, sizeof(item_t), hw_subset_compare);
  }
  bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }

  // Canonicalize the KMP_HW_SUBSET value if it is not an absolute subset.
  // This means putting each of {sockets, cores, threads} in the topology if
  // they are not specified:
  // e.g., 1s,2c => 1s,2c,*t | 2c,1t => *s,2c,1t | 1t => *s,*c,1t | etc.
  // e.g., 3module => *s,3module,*c,*t
  // By doing this, the runtime assumes users who fiddle with KMP_HW_SUBSET
  // are expecting the traditional sockets/cores/threads topology. For newer
  // hardware, there can be intervening layers like dies/tiles/modules
  // (usually corresponding to a cache level). So when a user asks for
  // 1s,6c,2t and the topology is really 1s,2modules,4cores,2threads, the user
  // should get 12 hardware threads across 6 cores and effectively ignore the
  // module layer.
  void canonicalize(const kmp_topology_t *top) {
    // Layers to target for KMP_HW_SUBSET canonicalization
    kmp_hw_t targeted[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};

    // Do not target-layer-canonicalize absolute KMP_HW_SUBSETS
    if (is_absolute())
      return;

    // Do not target-layer-canonicalize KMP_HW_SUBSETS when the
    // topology doesn't have these layers
    for (kmp_hw_t type : targeted)
      if (top->get_level(type) == KMP_HW_UNKNOWN)
        return;

    // Put targeted layers in topology if they do not exist
    for (kmp_hw_t type : targeted) {
      bool found = false;
      for (int i = 0; i < get_depth(); ++i) {
        if (top->get_equivalent_type(items[i].type) == type) {
          found = true;
          break;
        }
      }
      if (!found) {
        push_back(USE_ALL, type, 0, kmp_hw_attr_t{});
      }
    }
    sort();
    // Set as an absolute topology that only targets the targeted layers
    set_absolute();
  }
  void dump() const {
    printf("**********************\n");
    printf("*** kmp_hw_subset: ***\n");
    printf("* depth: %d\n", depth);
    printf("* items:\n");
    for (int i = 0; i < depth; ++i) {
      printf("  type: %s\n", __kmp_hw_get_keyword(items[i].type));
      for (int j = 0; j < items[i].num_attrs; ++j) {
        printf("    num: %d, offset: %d, attr: ", items[i].num[j],
               items[i].offset[j]);
        if (!items[i].attr[j]) {
          printf(" (none)\n");
        } else {
          printf(
              " core_type = %s, core_eff = %d\n",
              __kmp_hw_get_core_type_string(items[i].attr[j].get_core_type()),
              items[i].attr[j].get_core_eff());
        }
      }
    }
    printf("* set: 0x%llx\n", set);
    printf("* absolute: %d\n", absolute);
    printf("**********************\n");
  }
};
extern kmp_hw_subset_t *__kmp_hw_subset;

/* A structure for holding machine-specific hierarchy info to be computed once
   at init. This structure represents a mapping of threads to the actual
   machine hierarchy, or to our best guess at what the hierarchy might be, for
   the purpose of performing an efficient barrier. In the worst case, when
   there is no machine hierarchy information, it produces a tree suitable for
   a barrier, similar to the tree used in the hyper barrier. */
class hierarchy_info {
public:
  /* Good default values for number of leaves and branching factor, given no
     affinity information. Behaves a bit like hyper barrier. */
  static const kmp_uint32 maxLeaves = 4;
  static const kmp_uint32 minBranch = 4;
  /** Number of levels in the hierarchy. Typical levels are threads/core,
      cores/package or socket, packages/node, nodes/machine, etc. We don't
      want to get specific with nomenclature. When the machine is
      oversubscribed we add levels to duplicate the hierarchy, doubling the
      thread capacity of the hierarchy each time we add a level. */
  kmp_uint32 maxLevels;

  /** This is specifically the depth of the machine configuration hierarchy,
      in terms of the number of levels along the longest path from root to
      any leaf. It corresponds to the number of entries in numPerLevel if we
      exclude all but one trailing 1. */
  kmp_uint32 depth;
  kmp_uint32 base_num_threads = 0;
  enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
  volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
                                   // 2=initialization in progress
  volatile kmp_int8 resizing; // 0=not resizing, 1=resizing

  /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children
      the parent of a node at level i has. For example, if we have a machine
      with 4 packages, 4 cores/package and 2 HT per core, then numPerLevel =
      {2, 4, 4, 1, 1}. All empty levels are set to 1. */
  kmp_uint32 *numPerLevel = nullptr;
  kmp_uint32 *skipPerLevel = nullptr;

  void deriveLevels() {
    int hier_depth = __kmp_topology->get_depth();
    for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
      numPerLevel[level] = __kmp_topology->get_ratio(i);
    }
  }

  hierarchy_info()
      : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}

  void fini() {
    if (!uninitialized && numPerLevel) {
      __kmp_free(numPerLevel);
      numPerLevel = NULL;
      uninitialized = not_initialized;
    }
  }

  void init(int num_addrs) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
        &uninitialized, not_initialized, initializing);
    if (bool_result == 0) { // Wait for initialization
      while (TCR_1(uninitialized) != initialized)
        KMP_CPU_PAUSE();
      return;
    }
    KMP_DEBUG_ASSERT(bool_result == 1);

    /* Explicitly initialize the data fields here to prevent use of dirty
       values observed when a static library is re-initialized multiple times
       (e.g., when a non-OpenMP thread repeatedly launches/joins a thread
       that uses OpenMP). */
    depth = 1;
    resizing = 0;
    maxLevels = 7;
    numPerLevel =
        (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
    skipPerLevel = &(numPerLevel[maxLevels]);
    for (kmp_uint32 i = 0; i < maxLevels;
         ++i) { // init numPerLevel[*] to 1 item per level
      numPerLevel[i] = 1;
      skipPerLevel[i] = 1;
    }

    // Derive levels from the machine topology if available; otherwise guess.
    if (__kmp_topology && __kmp_topology->get_depth() > 0) {
      deriveLevels();
    } else {
      numPerLevel[0] = maxLeaves;
      numPerLevel[1] = num_addrs / maxLeaves;
      if (num_addrs % maxLeaves)
        numPerLevel[1]++;
    }

    base_num_threads = num_addrs;
    for (int i = maxLevels - 1; i >= 0;
         --i) // count non-empty levels to get depth
      if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
        depth++;

    kmp_uint32 branch = minBranch;
    if (numPerLevel[0] == 1)
      branch = num_addrs / maxLeaves;
    if (branch < minBranch)
      branch = minBranch;
    for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
      while (numPerLevel[d] > branch ||
             (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
        if (numPerLevel[d] & 1)
          numPerLevel[d]++;
        numPerLevel[d] = numPerLevel[d] >> 1;
        if (numPerLevel[d + 1] == 1)
          depth++;
        numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
      }
      if (numPerLevel[0] == 1) {
        branch = branch >> 1;
        if (branch < 4)
          branch = minBranch;
      }
    }

    for (kmp_uint32 i = 1; i < depth; ++i)
      skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
    // Fill in hierarchy in the case of oversubscription
    for (kmp_uint32 i = depth; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    uninitialized = initialized; // One writer
  }
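  // Worked example (no topology information, hypothetical input): init(16)
  // with maxLeaves == 4 yields numPerLevel = {4, 4, 1, 1, 1, 1, 1} and
  // depth == 3, so skipPerLevel = {1, 4, 16, 32, 64, 128, 256}; entries at
  // index >= depth are the doubled oversubscription levels.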

  // Resize the hierarchy if nproc changes to something larger than before
  void resize(kmp_uint32 nproc) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    while (bool_result == 0) { // someone else is trying to resize
      KMP_CPU_PAUSE();
      if (nproc <= base_num_threads) // happy with other thread's resize
        return;
      else // try to resize
        bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    }
    KMP_DEBUG_ASSERT(bool_result != 0);
    if (nproc <= base_num_threads)
      return; // happy with other thread's resize

    // Calculate new maxLevels
    kmp_uint32 old_sz = skipPerLevel[depth - 1];
    kmp_uint32 incs = 0, old_maxLevels = maxLevels;
    // First see if old maxLevels is enough to contain new size
    for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];
      numPerLevel[i - 1] *= 2;
      old_sz *= 2;
      depth++;
    }
    if (nproc > old_sz) { // Not enough space, need to expand hierarchy
      while (nproc > old_sz) {
        old_sz *= 2;
        incs++;
        depth++;
      }
      maxLevels += incs;

      // Resize arrays
      kmp_uint32 *old_numPerLevel = numPerLevel;
      kmp_uint32 *old_skipPerLevel = skipPerLevel;
      numPerLevel = skipPerLevel = NULL;
      numPerLevel =
          (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
      skipPerLevel = &(numPerLevel[maxLevels]);

      // Copy old elements from old arrays
      for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
        numPerLevel[i] = old_numPerLevel[i];
        skipPerLevel[i] = old_skipPerLevel[i];
      }

      // Init new elements in arrays to 1
      for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
        numPerLevel[i] = 1;
        skipPerLevel[i] = 1;
      }

      // Free old arrays
      __kmp_free(old_numPerLevel);
    }

    // Fill in oversubscription levels of hierarchy
    for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    base_num_threads = nproc;
    resizing = 0; // One writer
  }
};
#endif // KMP_AFFINITY_H