/*
 * kmp_affinity.h -- header for affinity management
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_AFFINITY_H
#define KMP_AFFINITY_H

#include "kmp.h"
#include "kmp_os.h"
#include <limits>

#if KMP_AFFINITY_SUPPORTED
#if KMP_USE_HWLOC
class KMPHwlocAffinity : public KMPAffinity {
public:
  class Mask : public KMPAffinity::Mask {
    hwloc_cpuset_t mask;

  public:
    Mask() {
      mask = hwloc_bitmap_alloc();
      this->zero();
    }
    Mask(const Mask &other) = delete;
    Mask &operator=(const Mask &other) = delete;
    ~Mask() { hwloc_bitmap_free(mask); }
    void set(int i) override { hwloc_bitmap_set(mask, i); }
    bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
    void clear(int i) override { hwloc_bitmap_clr(mask, i); }
    void zero() override { hwloc_bitmap_zero(mask); }
    bool empty() const override { return hwloc_bitmap_iszero(mask); }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      hwloc_bitmap_copy(mask, convert->mask);
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_and(mask, mask, convert->mask);
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_or(mask, mask, convert->mask);
    }
    void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      return hwloc_bitmap_isequal(mask, convert->mask);
    }
    int begin() const override { return hwloc_bitmap_first(mask); }
    int end() const override { return -1; }
    int next(int previous) const override {
      return hwloc_bitmap_next(mask, previous);
    }
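    // A minimal iteration sketch (illustrative, not part of the interface
    // contract): end() follows hwloc's -1 sentinel convention, so a full
    // traversal looks like
    //   for (int i = m.begin(); i != m.end(); i = m.next(i))
    //     use(i);
    // i.e. end() is the value hwloc_bitmap_next() yields past the last set
    // bit, not a past-the-end index.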
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
      long retval =
          hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "hwloc_get_cpubind()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
      long retval =
          hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#if KMP_OS_WINDOWS
    int set_process_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set process affinity operation when not capable");
      int error = 0;
      const hwloc_topology_support *support =
          hwloc_topology_get_support(__kmp_hwloc_topology);
      if (support->cpubind->set_proc_cpubind) {
        int retval;
        retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
                                   HWLOC_CPUBIND_PROCESS);
        if (retval >= 0)
          return 0;
        error = errno;
        if (abort_on_error)
          __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
                      KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#endif
    int get_proc_group() const override {
      int group = -1;
#if KMP_OS_WINDOWS
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        // On Windows, the unsigned long type is always 32 bits.
        unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
        unsigned long second_32_bits =
            hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
        if (first_32_bits == 0 && second_32_bits == 0) {
          continue;
        }
        if (group >= 0) {
          return -1;
        }
        group = i;
      }
#endif /* KMP_OS_WINDOWS */
      return group;
    }
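    // Worked example of the group probe above: a Windows processor group
    // spans up to 64 logical processors, i.e. bits [64*i, 64*i + 63] of the
    // bitmap for group i. Because a Windows unsigned long is 32 bits, those
    // bits come back as the two ulongs at indices 2*i and 2*i + 1, which is
    // why the loop reads the bitmap in pairs.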
  };
  void determine_capable(const char *var) override {
    const hwloc_topology_support *topology_support;
    if (__kmp_hwloc_topology == NULL) {
      if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity.flags.verbose) {
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
        }
      }
      if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity.flags.verbose) {
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
        }
      }
    }
    topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
    // Is the system capable of setting/getting this thread's affinity?
    // Also, is topology discovery possible? (pu indicates ability to discover
    // processing units). And finally, were there no errors when calling any
    // hwloc_* API functions?
    if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
        topology_support->cpubind->get_thisthread_cpubind &&
        topology_support->discovery->pu && !__kmp_hwloc_error) {
      // enables affinity according to KMP_AFFINITY_CAPABLE() macro
      KMP_AFFINITY_ENABLE(TRUE);
    } else {
      // indicate that hwloc didn't work and disable affinity
      __kmp_hwloc_error = TRUE;
      KMP_AFFINITY_DISABLE();
    }
  }
  void bind_thread(int which) override {
    KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                "Illegal set affinity operation when not capable");
    KMPAffinity::Mask *mask;
    KMP_CPU_ALLOC_ON_STACK(mask);
    KMP_CPU_ZERO(mask);
    KMP_CPU_SET(which, mask);
    __kmp_set_system_affinity(mask, TRUE);
    KMP_CPU_FREE_FROM_STACK(mask);
  }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    delete[] hwloc_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    return &(hwloc_array[index]);
  }
  api_type get_api_type() const override { return HWLOC; }
};
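// A minimal lifecycle sketch for the KMPAffinity interface (illustrative
// only; the runtime normally drives these methods through the KMP_CPU_*
// macros in kmp.h rather than calling them directly):
//   KMPAffinity *affinity = new KMPHwlocAffinity();
//   affinity->determine_capable("KMP_AFFINITY");
//   KMPAffinity::Mask *m = affinity->allocate_mask();
//   m->zero();
//   m->set(0); // restrict the calling thread to logical CPU 0
//   m->set_system_affinity(/*abort_on_error=*/false);
//   affinity->deallocate_mask(m);
//   delete affinity;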
#endif /* KMP_USE_HWLOC */

#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY ||    \
    KMP_OS_AIX
#if KMP_OS_LINUX
/* On some of the older OS's that we build on, these constants aren't present
   in <asm/unistd.h> #included from <sys/syscall.h>. They must be the same on
   all systems of the same arch where they are defined, and they cannot
   change: they are set in stone forever. */
#include <sys/syscall.h>
#if KMP_ARCH_X86 || KMP_ARCH_ARM
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 241
#elif __NR_sched_setaffinity != 241
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 242
#elif __NR_sched_getaffinity != 242
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_AARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_X86_64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 203
#elif __NR_sched_setaffinity != 203
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 204
#elif __NR_sched_getaffinity != 204
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_PPC64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 222
#elif __NR_sched_setaffinity != 222
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 223
#elif __NR_sched_getaffinity != 223
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 4239
#elif __NR_sched_setaffinity != 4239
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 4240
#elif __NR_sched_getaffinity != 4240
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 5195
#elif __NR_sched_setaffinity != 5195
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 5196
#elif __NR_sched_getaffinity != 5196
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_LOONGARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_RISCV64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_VE
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 203
#elif __NR_sched_setaffinity != 203
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 204
#elif __NR_sched_getaffinity != 204
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_S390X
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 239
#elif __NR_sched_setaffinity != 239
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 240
#elif __NR_sched_getaffinity != 240
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_SPARC
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 261
#elif __NR_sched_setaffinity != 261
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 260
#elif __NR_sched_getaffinity != 260
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#else
#error Unknown or unsupported architecture
#endif /* KMP_ARCH_* */
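/* These numbers exist so the Linux Mask methods below can invoke
   sched_{set,get}affinity directly via syscall(2), passing
   __kmp_affin_mask_size explicitly instead of relying on libc wrappers that
   may be absent on older systems. A standalone sketch of the same call (not
   the runtime's code; the buffer size here is arbitrary):
     unsigned long bits[16] = {0};
     syscall(__NR_sched_getaffinity, 0, sizeof(bits), bits);
   where pid 0 means "the calling thread". */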
#elif KMP_OS_FREEBSD || KMP_OS_DRAGONFLY
#include <pthread.h>
#include <pthread_np.h>
#elif KMP_OS_NETBSD
#include <pthread.h>
#include <sched.h>
#elif KMP_OS_AIX
#include <sys/dr.h>
#include <sys/rset.h>
#define VMI_MAXRADS 64 // Maximum number of RADs allowed by AIX.
#define GET_NUMBER_SMT_SETS 0x0004
extern "C" int syssmt(int flags, int, int, int *);
#endif
class KMPNativeAffinity : public KMPAffinity {
  class Mask : public KMPAffinity::Mask {
    typedef unsigned long mask_t;
    typedef decltype(__kmp_affin_mask_size) mask_size_type;
    static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    static const mask_t ONE = 1;
    mask_size_type get_num_mask_types() const {
      return __kmp_affin_mask_size / sizeof(mask_t);
    }

  public:
    mask_t *mask;
    Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
    }
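    // Worked example of the indexing above: with a 64-bit mask_t, set(70)
    // touches word 70 / 64 == 1 and bit 70 % 64 == 6, i.e.
    //   mask[1] |= (ONE << 6);
    // ONE is a mask_t rather than a plain int literal so the shift stays
    // well-defined for bit positions beyond what an int shift allows.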
    void zero() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = (mask_t)0;
    }
    bool empty() const override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        if (mask[i] != (mask_t)0)
          return false;
      return true;
    }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] |= convert->mask[i];
    }
    void bitwise_not() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = ~(mask[i]);
    }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        if (mask[i] != convert->mask[i])
          return false;
      return true;
    }
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override {
      int e;
      __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
      return e;
    }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
#if KMP_OS_AIX
    // On AIX, we don't have a way to get the CPU(s) a thread is bound to.
    // This routine is only used to get the full mask.
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");

      (void)abort_on_error;

      // Set the mask with all CPUs that are available.
      for (int i = 0; i < __kmp_xproc; ++i)
        KMP_CPU_SET(i, this);
      return 0;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");

      int location;
      int gtid = __kmp_entry_gtid();
      int tid = thread_self();

      // Unbind the thread if it was bound to any processors before, so that
      // we can bind it only to the CPUs specified by the mask.
      int retval = bindprocessor(BINDTHREAD, tid, PROCESSOR_CLASS_ANY);

      // On AIX, we can only bind to one instead of a set of CPUs with the
      // bindprocessor() system call.
      KMP_CPU_SET_ITERATE(location, this) {
        if (KMP_CPU_ISSET(location, this)) {
          retval = bindprocessor(BINDTHREAD, tid, location);
          if (retval == -1 && errno == 1) { // EPERM: likely attached to an rset
            rsid_t rsid;
            rsethandle_t rsh;
            // Put something in rsh to prevent compiler warning
            // about uninitialized use
            rsh = rs_alloc(RS_EMPTY);
            rsid.at_pid = getpid();
            if (RS_DEFAULT_RSET != ra_getrset(R_PROCESS, rsid, 0, rsh)) {
              retval = ra_detachrset(R_PROCESS, rsid, 0);
              retval = bindprocessor(BINDTHREAD, tid, location);
            }
          }
          if (retval == 0) {
            KA_TRACE(10, ("__kmp_set_system_affinity: Done binding "
                          "T#%d to cpu=%d.\n",
                          gtid, location));
            continue;
          }
          int error = errno;
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "bindprocessor()"),
                        KMP_ERR(error), __kmp_msg_null);
            KA_TRACE(10, ("__kmp_set_system_affinity: Error binding "
                          "T#%d to cpu=%d, errno=%d.\n",
                          gtid, location, error));
            return error;
          }
        }
      }
      return 0;
    }
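    // Note on semantics (AIX-specific, hedged): because bindprocessor()
    // attaches a thread to a single logical CPU and each call replaces the
    // previous binding, the loop above effectively leaves the thread bound
    // to the last CPU in the mask that was successfully passed in; a mask
    // spanning several CPUs cannot be expressed exactly through this
    // interface.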
#else // !KMP_OS_AIX
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
      int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "pthread_getaffinity_np()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
      int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "pthread_setaffinity_np()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#endif // KMP_OS_AIX
  };
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override {
    KMPNativeAffinity::Mask *retval = new Mask();
    return retval;
  }
  void deallocate_mask(KMPAffinity::Mask *m) override {
    KMPNativeAffinity::Mask *native_mask =
        static_cast<KMPNativeAffinity::Mask *>(m);
    delete native_mask;
  }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *linux_array = static_cast<Mask *>(array);
    delete[] linux_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *linux_array = static_cast<Mask *>(array);
    return &(linux_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
#endif /* KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY \
          || KMP_OS_AIX */

#if KMP_OS_WINDOWS
class KMPNativeAffinity : public KMPAffinity {
  class Mask : public KMPAffinity::Mask {
    typedef ULONG_PTR mask_t;
    static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    mask_t *mask;

  public:
    Mask() {
      mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
    }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
    }
    void zero() override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = 0;
    }
    bool empty() const override {
      for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
        if (mask[i])
          return false;
      return true;
    }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] |= convert->mask[i];
    }
    void bitwise_not() override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = ~(mask[i]);
    }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
        if (mask[i] != convert->mask[i])
          return false;
      return true;
    }
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int set_process_affinity(bool abort_on_error) const override {
      if (__kmp_num_proc_groups <= 1) {
        if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      }
      return 0;
    }
    int set_system_affinity(bool abort_on_error) const override {
      if (__kmp_num_proc_groups > 1) {
        // Check for a valid mask.
        GROUP_AFFINITY ga;
        int group = get_proc_group();
        if (group < 0) {
          if (abort_on_error) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
          }
          return -1;
        }
        // Transform the bit vector into a GROUP_AFFINITY struct
        // and make the system call to set affinity.
        ga.Group = group;
        ga.Mask = mask[group];
        ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;

        KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
        if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      } else {
        if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      }
      return 0;
    }
    int get_system_affinity(bool abort_on_error) override {
      if (__kmp_num_proc_groups > 1) {
        this->zero();
        GROUP_AFFINITY ga;
        KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
        if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        // Valid group indices are 0 .. __kmp_num_proc_groups - 1.
        if ((ga.Group < 0) || (ga.Group >= __kmp_num_proc_groups) ||
            (ga.Mask == 0)) {
          return -1;
        }
        mask[ga.Group] = ga.Mask;
      } else {
        mask_t newMask, sysMask, retval;
        if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
        if (!retval) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
        if (!newMask) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
        }
        *mask = retval;
      }
      return 0;
    }
    int get_proc_group() const override {
      int group = -1;
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        if (mask[i] == 0)
          continue;
        if (group >= 0)
          return -1;
        group = i;
      }
      return group;
    }
  };
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *windows_array = static_cast<Mask *>(array);
    delete[] windows_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *windows_array = static_cast<Mask *>(array);
    return &(windows_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
#endif /* KMP_OS_WINDOWS */
#endif /* KMP_AFFINITY_SUPPORTED */

// Describe an attribute for a level in the machine topology
struct kmp_hw_attr_t {
  int core_type : 8;
  int core_eff : 8;
  unsigned valid : 1;
  unsigned reserved : 15;

  static const int UNKNOWN_CORE_EFF = -1;

  kmp_hw_attr_t()
      : core_type(KMP_HW_CORE_TYPE_UNKNOWN), core_eff(UNKNOWN_CORE_EFF),
        valid(0), reserved(0) {}
  void set_core_type(kmp_hw_core_type_t type) {
    valid = 1;
    core_type = type;
  }
  void set_core_eff(int eff) {
    valid = 1;
    core_eff = eff;
  }
  kmp_hw_core_type_t get_core_type() const {
    return (kmp_hw_core_type_t)core_type;
  }
  int get_core_eff() const { return core_eff; }
  bool is_core_type_valid() const {
    return core_type != KMP_HW_CORE_TYPE_UNKNOWN;
  }
  bool is_core_eff_valid() const { return core_eff != UNKNOWN_CORE_EFF; }
  operator bool() const { return valid; }
  void clear() {
    core_type = KMP_HW_CORE_TYPE_UNKNOWN;
    core_eff = UNKNOWN_CORE_EFF;
    valid = 0;
  }
  bool contains(const kmp_hw_attr_t &other) const {
    if (!valid && !other.valid)
      return true;
    if (valid && other.valid) {
      if (other.is_core_type_valid()) {
        if (!is_core_type_valid() || (get_core_type() != other.get_core_type()))
          return false;
      }
      if (other.is_core_eff_valid()) {
        if (!is_core_eff_valid() || (get_core_eff() != other.get_core_eff()))
          return false;
      }
      return true;
    }
    return false;
  }
#if KMP_AFFINITY_SUPPORTED
  bool contains(const kmp_affinity_attrs_t &attr) const {
    if (!valid && !attr.valid)
      return true;
    if (valid && attr.valid) {
      if (attr.core_type != KMP_HW_CORE_TYPE_UNKNOWN)
        return (is_core_type_valid() &&
                (get_core_type() == (kmp_hw_core_type_t)attr.core_type));
      if (attr.core_eff != UNKNOWN_CORE_EFF)
        return (is_core_eff_valid() && (get_core_eff() == attr.core_eff));
      return true;
    }
    return false;
  }
#endif // KMP_AFFINITY_SUPPORTED
  bool operator==(const kmp_hw_attr_t &rhs) const {
    return (rhs.valid == valid && rhs.core_eff == core_eff &&
            rhs.core_type == core_type);
  }
  bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); }
};
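// Worked example of contains() (using the x86 hybrid core types as an
// illustration): a fully-specified attribute contains a less-specific one,
// but not vice versa. Given
//   kmp_hw_attr_t a, b;
//   a.set_core_type(KMP_HW_CORE_TYPE_CORE);
//   a.set_core_eff(1);
//   b.set_core_type(KMP_HW_CORE_TYPE_CORE);
// a.contains(b) is true (b only constrains the core type, which matches),
// while b.contains(a) is false (a also constrains core_eff, which b does not
// satisfy).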

#if KMP_AFFINITY_SUPPORTED
KMP_BUILD_ASSERT(sizeof(kmp_hw_attr_t) == sizeof(kmp_affinity_attrs_t));
#endif

class kmp_hw_thread_t {
public:
  static const int UNKNOWN_ID = -1;
  static const int MULTIPLE_ID = -2;
  static int compare_ids(const void *a, const void *b);
  static int compare_compact(const void *a, const void *b);
  int ids[KMP_HW_LAST];
  int sub_ids[KMP_HW_LAST];
  bool leader;
  int os_id;
  int original_idx;
  kmp_hw_attr_t attrs;

  void print() const;
  void clear() {
    for (int i = 0; i < (int)KMP_HW_LAST; ++i)
      ids[i] = UNKNOWN_ID;
    leader = false;
    attrs.clear();
  }
};

class kmp_topology_t {

  struct flags_t {
    int uniform : 1;
    int reserved : 31;
  };

  int depth;

  // The following arrays are all 'depth' long. They are allocated to hold up
  // to KMP_HW_LAST objects so layers can be added without reallocating any
  // array.

  // Ordered array of the types in the topology
  kmp_hw_t *types;

  // Keep quick topology ratios; for non-uniform topologies, this ratio holds
  // the max number of itemAs per itemB,
  // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
  int *ratio;

  // Storage containing the absolute number of each topology layer
  int *count;

  // The number of core efficiencies. This is only useful for hybrid
  // topologies. Core efficiencies will range from 0 to num efficiencies - 1.
  int num_core_efficiencies;
  int num_core_types;
  kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES];

  // The hardware threads array
  // hw_threads is num_hw_threads long
  // Each hw_thread's ids and sub_ids are depth deep
  int num_hw_threads;
  kmp_hw_thread_t *hw_threads;

  // Equivalence hash where the key is the hardware topology item
  // and the value is the equivalent hardware topology type in the
  // types[] array; if the value is KMP_HW_UNKNOWN, then there is no
  // known equivalence for the topology type
  kmp_hw_t equivalent[KMP_HW_LAST];

  // Flags describing the topology
  flags_t flags;

  // Compact value used during sort_compact()
  int compact;

#if KMP_GROUP_AFFINITY
  // Insert topology information about Windows Processor groups
  void _insert_windows_proc_groups();
#endif

  // Count each item & get the num x's per y
  // e.g., get the number of cores and the number of threads per core
  // for each (x, y) in (KMP_HW_*, KMP_HW_*)
  void _gather_enumeration_information();

  // Remove layers that don't add information to the topology.
  // This is done by having the layer take on the id = UNKNOWN_ID (-1)
  void _remove_radix1_layers();

  // Find out if the topology is uniform
  void _discover_uniformity();

  // Set all the sub_ids for each hardware thread
  void _set_sub_ids();

  // Set global affinity variables describing the number of threads per
  // core, the number of packages, the number of cores per package, and
  // the number of cores.
  void _set_globals();

  // Set the last level cache equivalent type
  void _set_last_level_cache();

  // Return the number of cores with a particular attribute, 'attr'.
  // If 'find_all' is true, then find all cores on the machine, otherwise find
  // all cores per the layer 'above'.
  int _get_ncores_with_attr(const kmp_hw_attr_t &attr, int above,
                            bool find_all = false) const;

public:
  // Force use of allocate()/deallocate()
  kmp_topology_t() = delete;
  kmp_topology_t(const kmp_topology_t &t) = delete;
  kmp_topology_t(kmp_topology_t &&t) = delete;
  kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
  kmp_topology_t &operator=(kmp_topology_t &&t) = delete;

  static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
  static void deallocate(kmp_topology_t *);

  // Functions used in create_map() routines
  kmp_hw_thread_t &at(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
    return hw_threads[index];
  }
  const kmp_hw_thread_t &at(int index) const {
    KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
    return hw_threads[index];
  }
  int get_num_hw_threads() const { return num_hw_threads; }
  void sort_ids() {
    qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
          kmp_hw_thread_t::compare_ids);
  }

  // Insert a new topology layer after allocation
  void insert_layer(kmp_hw_t type, const int *ids);

  // Check if the hardware ids are unique; if they are,
  // return true, otherwise return false
  bool check_ids() const;

  // Function to call after the create_map() routine
  void canonicalize();
  void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);

  // Functions used after canonicalize() called

#if KMP_AFFINITY_SUPPORTED
  // Set the granularity for affinity settings
  void set_granularity(kmp_affinity_t &stgs) const;
  bool is_close(int hwt1, int hwt2, const kmp_affinity_t &stgs) const;
  bool restrict_to_mask(const kmp_affin_mask_t *mask);
  bool filter_hw_subset();
#endif
  bool is_uniform() const { return flags.uniform; }
  // Tell whether a type is a valid type in the topology;
  // returns KMP_HW_UNKNOWN when there is no equivalent type
  kmp_hw_t get_equivalent_type(kmp_hw_t type) const {
    if (type == KMP_HW_UNKNOWN)
      return KMP_HW_UNKNOWN;
    return equivalent[type];
  }
  // Set type1 = type2
  void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
    kmp_hw_t real_type2 = equivalent[type2];
    if (real_type2 == KMP_HW_UNKNOWN)
      real_type2 = type2;
    equivalent[type1] = real_type2;
    // This loop is required since any of the types may have been set to
    // be equivalent to type1. They all must be checked and reset to type2.
    KMP_FOREACH_HW_TYPE(type) {
      if (equivalent[type] == type1) {
        equivalent[type] = real_type2;
      }
    }
  }
  // Calculate number of types corresponding to level1
  // per types corresponding to level2 (e.g., number of threads per core)
  int calculate_ratio(int level1, int level2) const {
    KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
    KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
    int r = 1;
    for (int level = level1; level > level2; --level)
      r *= ratio[level];
    return r;
  }
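  // Worked example: for [ 4 sockets | 6 cores/socket | 2 threads/core ],
  // ratio = {4, 6, 2}. With level1 = 2 (threads) and level2 = 0 (sockets),
  // calculate_ratio(2, 0) multiplies ratio[2] * ratio[1] = 2 * 6 = 12
  // threads per socket.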
  int get_ratio(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return ratio[level];
  }
  int get_depth() const { return depth; }
  kmp_hw_t get_type(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return types[level];
  }
  int get_level(kmp_hw_t type) const {
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
    int eq_type = equivalent[type];
    if (eq_type == KMP_HW_UNKNOWN)
      return -1;
    for (int i = 0; i < depth; ++i)
      if (types[i] == eq_type)
        return i;
    return -1;
  }
  int get_count(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return count[level];
  }
  // Return the total number of cores with attribute 'attr'
  int get_ncores_with_attr(const kmp_hw_attr_t &attr) const {
    return _get_ncores_with_attr(attr, -1, /*find_all=*/true);
  }
  // Return the number of cores with attribute
  // 'attr' per topology level 'above'
  int get_ncores_with_attr_per(const kmp_hw_attr_t &attr, int above) const {
    return _get_ncores_with_attr(attr, above, /*find_all=*/false);
  }

#if KMP_AFFINITY_SUPPORTED
  friend int kmp_hw_thread_t::compare_compact(const void *a, const void *b);
  void sort_compact(kmp_affinity_t &affinity) {
    compact = affinity.compact;
    qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
          kmp_hw_thread_t::compare_compact);
  }
#endif
  void print(const char *env_var = "KMP_AFFINITY") const;
  void dump() const;
};
extern kmp_topology_t *__kmp_topology;

class kmp_hw_subset_t {
  const static size_t MAX_ATTRS = KMP_HW_MAX_NUM_CORE_EFFS;

public:
  // Describe a machine topology item in KMP_HW_SUBSET
  struct item_t {
    kmp_hw_t type;
    int num_attrs;
    int num[MAX_ATTRS];
    int offset[MAX_ATTRS];
    kmp_hw_attr_t attr[MAX_ATTRS];
  };
  // Put parentheses around max to avoid accidental use of the Windows max
  // macro.
  const static int USE_ALL = (std::numeric_limits<int>::max)();

private:
  int depth;
  int capacity;
  item_t *items;
  kmp_uint64 set;
  bool absolute;
  // The set must be able to handle up to KMP_HW_LAST number of layers
  KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
  // Sorting the KMP_HW_SUBSET items to follow topology order
  // All unknown topology types will be at the beginning of the subset
  static int hw_subset_compare(const void *i1, const void *i2) {
    kmp_hw_t type1 = ((const item_t *)i1)->type;
    kmp_hw_t type2 = ((const item_t *)i2)->type;
    int level1 = __kmp_topology->get_level(type1);
    int level2 = __kmp_topology->get_level(type2);
    return level1 - level2;
  }

public:
  // Force use of allocate()/deallocate()
  kmp_hw_subset_t() = delete;
  kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
  kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
  kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
  kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;

  static kmp_hw_subset_t *allocate() {
    int initial_capacity = 5;
    kmp_hw_subset_t *retval =
        (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
    retval->depth = 0;
    retval->capacity = initial_capacity;
    retval->set = 0ull;
    retval->absolute = false;
    retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
    return retval;
  }
  static void deallocate(kmp_hw_subset_t *subset) {
    __kmp_free(subset->items);
    __kmp_free(subset);
  }
  void set_absolute() { absolute = true; }
  bool is_absolute() const { return absolute; }
  void push_back(int num, kmp_hw_t type, int offset, kmp_hw_attr_t attr) {
    for (int i = 0; i < depth; ++i) {
      // Found an existing item for this layer type;
      // add the num, offset, and attr to this item
      if (items[i].type == type) {
        int idx = items[i].num_attrs++;
        if ((size_t)idx >= MAX_ATTRS)
          return;
        items[i].num[idx] = num;
        items[i].offset[idx] = offset;
        items[i].attr[idx] = attr;
        return;
      }
    }
    if (depth == capacity - 1) {
      capacity *= 2;
      item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
      for (int i = 0; i < depth; ++i)
        new_items[i] = items[i];
      __kmp_free(items);
      items = new_items;
    }
    items[depth].num_attrs = 1;
    items[depth].type = type;
    items[depth].num[0] = num;
    items[depth].offset[0] = offset;
    items[depth].attr[0] = attr;
    depth++;
    set |= (1ull << type);
  }
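  // Illustrative example (hypothetical input): parsing KMP_HW_SUBSET=2s,4c,2t
  // elsewhere in the runtime would amount to three calls,
  //   push_back(2, KMP_HW_SOCKET, 0, kmp_hw_attr_t{});
  //   push_back(4, KMP_HW_CORE, 0, kmp_hw_attr_t{});
  //   push_back(2, KMP_HW_THREAD, 0, kmp_hw_attr_t{});
  // after which depth == 3 and specified() is true for all three types.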
  int get_depth() const { return depth; }
  const item_t &at(int index) const {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    return items[index];
  }
  item_t &at(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    return items[index];
  }
  void remove(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    set &= ~(1ull << items[index].type);
    for (int j = index + 1; j < depth; ++j) {
      items[j - 1] = items[j];
    }
    depth--;
  }
  void sort() {
    KMP_DEBUG_ASSERT(__kmp_topology);
    qsort(items, depth, sizeof(item_t), hw_subset_compare);
  }
  bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }

  // Canonicalize the KMP_HW_SUBSET value if it is not an absolute subset.
  // This means putting each of {sockets, cores, threads} in the topology if
  // they are not specified:
  // e.g., 1s,2c => 1s,2c,*t | 2c,1t => *s,2c,1t | 1t => *s,*c,1t | etc.
  // e.g., 3module => *s,3module,*c,*t
  // By doing this, the runtime assumes users who fiddle with KMP_HW_SUBSET
  // are expecting the traditional sockets/cores/threads topology. For newer
  // hardware, there can be intervening layers like dies/tiles/modules
  // (usually corresponding to a cache level). So when a user asks for
  // 1s,6c,2t and the topology is really 1s,2modules,4cores,2threads, the user
  // should get 12 hardware threads across 6 cores and effectively ignore the
  // module layer.
  void canonicalize(const kmp_topology_t *top) {
    // Layers to target for KMP_HW_SUBSET canonicalization
    kmp_hw_t targeted[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};

    // Do not target-layer-canonicalize absolute KMP_HW_SUBSETS
    if (is_absolute())
      return;

    // Do not target-layer-canonicalize KMP_HW_SUBSETS when the
    // topology doesn't have these layers
    for (kmp_hw_t type : targeted)
      if (top->get_level(type) == KMP_HW_UNKNOWN)
        return;

    // Put targeted layers in topology if they do not exist
    for (kmp_hw_t type : targeted) {
      bool found = false;
      for (int i = 0; i < get_depth(); ++i) {
        if (top->get_equivalent_type(items[i].type) == type) {
          found = true;
          break;
        }
      }
      if (!found) {
        push_back(USE_ALL, type, 0, kmp_hw_attr_t{});
      }
    }
    sort();
    // Set as an absolute topology that only targets the targeted layers
    set_absolute();
  }
  void dump() const {
    printf("**********************\n");
    printf("*** kmp_hw_subset: ***\n");
    printf("* depth: %d\n", depth);
    printf("* items:\n");
    for (int i = 0; i < depth; ++i) {
      printf("  type: %s\n", __kmp_hw_get_keyword(items[i].type));
      for (int j = 0; j < items[i].num_attrs; ++j) {
        printf("    num: %d, offset: %d, attr: ", items[i].num[j],
               items[i].offset[j]);
        if (!items[i].attr[j]) {
          printf(" (none)\n");
        } else {
          printf(
              " core_type = %s, core_eff = %d\n",
              __kmp_hw_get_core_type_string(items[i].attr[j].get_core_type()),
              items[i].attr[j].get_core_eff());
        }
      }
    }
    printf("* set: 0x%llx\n", set);
    printf("* absolute: %d\n", absolute);
    printf("**********************\n");
  }
};
extern kmp_hw_subset_t *__kmp_hw_subset;

/* A structure for holding machine-specific hierarchy info to be computed once
   at init. This structure represents a mapping of threads to the actual
   machine hierarchy, or to our best guess at what the hierarchy might be, for
   the purpose of performing an efficient barrier. In the worst case, when
   there is no machine hierarchy information, it produces a tree suitable for
   a barrier, similar to the tree used in the hyper barrier. */
class hierarchy_info {
public:
  /* Good default values for number of leaves and branching factor, given no
     affinity information. Behaves a bit like hyper barrier. */
  static const kmp_uint32 maxLeaves = 4;
  static const kmp_uint32 minBranch = 4;
  /** Number of levels in the hierarchy. Typical levels are threads/core,
      cores/package or socket, packages/node, nodes/machine, etc. We don't
      want to get specific with nomenclature. When the machine is
      oversubscribed we add levels to duplicate the hierarchy, doubling the
      thread capacity of the hierarchy each time we add a level. */
  kmp_uint32 maxLevels;

  /** This is specifically the depth of the machine configuration hierarchy,
      in terms of the number of levels along the longest path from root to
      any leaf. It corresponds to the number of entries in numPerLevel if we
      exclude all but one trailing 1. */
  kmp_uint32 depth;
  kmp_uint32 base_num_threads = 0;
  enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
  volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
                                   // 2=initialization in progress
  volatile kmp_int8 resizing; // 0=not resizing, 1=resizing

  /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children
      the parent of a node at level i has. For example, if we have a machine
      with 4 packages, 4 cores/package and 2 HT per core, then numPerLevel =
      {2, 4, 4, 1, 1}. All empty levels are set to 1. */
  kmp_uint32 *numPerLevel = nullptr;
  kmp_uint32 *skipPerLevel = nullptr;

  void deriveLevels() {
    int hier_depth = __kmp_topology->get_depth();
    for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
      numPerLevel[level] = __kmp_topology->get_ratio(i);
    }
  }

  hierarchy_info()
      : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}

  void fini() {
    if (!uninitialized && numPerLevel) {
      __kmp_free(numPerLevel);
      numPerLevel = NULL;
      uninitialized = not_initialized;
    }
  }

  void init(int num_addrs) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
        &uninitialized, not_initialized, initializing);
    if (bool_result == 0) { // Wait for initialization
      while (TCR_1(uninitialized) != initialized)
        KMP_CPU_PAUSE();
      return;
    }
    KMP_DEBUG_ASSERT(bool_result == 1);

    /* Explicitly initialize the data fields here to prevent use of dirty
       values observed when a static library is re-initialized multiple times
       (e.g., when a non-OpenMP thread repeatedly launches/joins a thread
       that uses OpenMP). */
    depth = 1;
    resizing = 0;
    maxLevels = 7;
    numPerLevel =
        (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
    skipPerLevel = &(numPerLevel[maxLevels]);
    for (kmp_uint32 i = 0; i < maxLevels;
         ++i) { // init numPerLevel[*] to 1 item per level
      numPerLevel[i] = 1;
      skipPerLevel[i] = 1;
    }

    // Derive levels from the machine topology if available; otherwise guess.
    if (__kmp_topology && __kmp_topology->get_depth() > 0) {
      deriveLevels();
    } else {
      numPerLevel[0] = maxLeaves;
      numPerLevel[1] = num_addrs / maxLeaves;
      if (num_addrs % maxLeaves)
        numPerLevel[1]++;
    }

    base_num_threads = num_addrs;
    for (int i = maxLevels - 1; i >= 0;
         --i) // count non-empty levels to get depth
      if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
        depth++;

    kmp_uint32 branch = minBranch;
    if (numPerLevel[0] == 1)
      branch = num_addrs / maxLeaves;
    if (branch < minBranch)
      branch = minBranch;
    for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
      while (numPerLevel[d] > branch ||
             (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
        if (numPerLevel[d] & 1)
          numPerLevel[d]++;
        numPerLevel[d] = numPerLevel[d] >> 1;
        if (numPerLevel[d + 1] == 1)
          depth++;
        numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
      }
      if (numPerLevel[0] == 1) {
        branch = branch >> 1;
        if (branch < 4)
          branch = minBranch;
      }
    }

    for (kmp_uint32 i = 1; i < depth; ++i)
      skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
    // Fill in hierarchy in the case of oversubscription
    for (kmp_uint32 i = depth; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    uninitialized = initialized; // One writer
  }
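  // Worked example (no topology information, hypothetical input): init(16)
  // with maxLeaves == 4 yields numPerLevel = {4, 4, 1, 1, 1, 1, 1} and
  // depth == 3, so skipPerLevel = {1, 4, 16, 32, 64, 128, 256}; entries at
  // index >= depth are the doubled oversubscription levels.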

  // Resize the hierarchy if nproc changes to something larger than before
  void resize(kmp_uint32 nproc) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    while (bool_result == 0) { // someone else is trying to resize
      KMP_CPU_PAUSE();
      if (nproc <= base_num_threads) // happy with other thread's resize
        return;
      else // try to resize
        bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    }
    KMP_DEBUG_ASSERT(bool_result != 0);
    if (nproc <= base_num_threads)
      return; // happy with other thread's resize

    // Calculate new maxLevels
    kmp_uint32 old_sz = skipPerLevel[depth - 1];
    kmp_uint32 incs = 0, old_maxLevels = maxLevels;
    // First see if old maxLevels is enough to contain new size
    for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];
      numPerLevel[i - 1] *= 2;
      old_sz *= 2;
      depth++;
    }
    if (nproc > old_sz) { // Not enough space, need to expand hierarchy
      while (nproc > old_sz) {
        old_sz *= 2;
        incs++;
        depth++;
      }
      maxLevels += incs;

      // Resize arrays
      kmp_uint32 *old_numPerLevel = numPerLevel;
      kmp_uint32 *old_skipPerLevel = skipPerLevel;
      numPerLevel = skipPerLevel = NULL;
      numPerLevel =
          (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
      skipPerLevel = &(numPerLevel[maxLevels]);

      // Copy old elements from old arrays
      for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
        numPerLevel[i] = old_numPerLevel[i];
        skipPerLevel[i] = old_skipPerLevel[i];
      }

      // Init new elements in arrays to 1
      for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
        numPerLevel[i] = 1;
        skipPerLevel[i] = 1;
      }

      // Free old arrays
      __kmp_free(old_numPerLevel);
    }

    // Fill in oversubscription levels of hierarchy
    for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    base_num_threads = nproc;
    resizing = 0; // One writer
  }
};
#endif // KMP_AFFINITY_H