1/*
2 * kmp_runtime.cpp -- KPTS runtime support library
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13#include "kmp.h"
14#include "kmp_affinity.h"
15#include "kmp_atomic.h"
16#include "kmp_environment.h"
17#include "kmp_error.h"
18#include "kmp_i18n.h"
19#include "kmp_io.h"
20#include "kmp_itt.h"
21#include "kmp_settings.h"
22#include "kmp_stats.h"
23#include "kmp_str.h"
24#include "kmp_wait_release.h"
25#include "kmp_wrapper_getpid.h"
26#include "kmp_dispatch.h"
27#include "kmp_utils.h"
28#if KMP_USE_HIER_SCHED
29#include "kmp_dispatch_hier.h"
30#endif
31
32#if OMPT_SUPPORT
33#include "ompt-specific.h"
34#endif
35#if OMPD_SUPPORT
36#include "ompd-specific.h"
37#endif
38
39#if OMP_PROFILING_SUPPORT
40#include "llvm/Support/TimeProfiler.h"
41static char *ProfileTraceFile = nullptr;
42#endif
43
44/* these are temporary issues to be dealt with */
45#define KMP_USE_PRCTL 0
46
47#if KMP_OS_WINDOWS
48#include <process.h>
49#endif
50
51#ifndef KMP_USE_SHM
52// Windows and WASI do not need these include files as they don't use shared
53// memory.
54#else
55#include <sys/mman.h>
56#include <sys/stat.h>
57#include <fcntl.h>
58#define SHM_SIZE 1024
59#endif
60
61#if defined(KMP_GOMP_COMPAT)
62char const __kmp_version_alt_comp[] =
63 KMP_VERSION_PREFIX "alternative compiler support: yes";
64#endif /* defined(KMP_GOMP_COMPAT) */
65
66char const __kmp_version_omp_api[] =
67 KMP_VERSION_PREFIX "API version: 5.0 (201611)";
68
69#ifdef KMP_DEBUG
70char const __kmp_version_lock[] =
71 KMP_VERSION_PREFIX "lock type: run time selectable";
72#endif /* KMP_DEBUG */
73
74#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
75
76/* ------------------------------------------------------------------------ */
77
78#if KMP_USE_MONITOR
79kmp_info_t __kmp_monitor;
80#endif
81
82/* Forward declarations */
83
84void __kmp_cleanup(void);
85
86static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
87 int gtid);
88static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
89 kmp_internal_control_t *new_icvs,
90 ident_t *loc);
91#if KMP_AFFINITY_SUPPORTED
92static void __kmp_partition_places(kmp_team_t *team,
93 int update_master_only = 0);
94#endif
95static void __kmp_do_serial_initialize(void);
96void __kmp_fork_barrier(int gtid, int tid);
97void __kmp_join_barrier(int gtid);
98void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
99 kmp_internal_control_t *new_icvs, ident_t *loc);
100
101#ifdef USE_LOAD_BALANCE
102static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
103#endif
104
105static int __kmp_expand_threads(int nNeed);
106#if KMP_OS_WINDOWS
107static int __kmp_unregister_root_other_thread(int gtid);
108#endif
109static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
110kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
111
112void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
113 int new_nthreads);
114void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
115
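// Build a per-thread override of the nested nthreads (OMP_NUM_THREADS) list:
// entries up to and including 'level' are zero-filled, and the thread's
// th_set_nested_nth values (starting at index 1) are copied in from index
// level + 1 onward. The caller owns the returned allocation.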
116static kmp_nested_nthreads_t *__kmp_override_nested_nth(kmp_info_t *thr,
117 int level) {
118 kmp_nested_nthreads_t *new_nested_nth =
119 (kmp_nested_nthreads_t *)KMP_INTERNAL_MALLOC(
120 sizeof(kmp_nested_nthreads_t));
121 int new_size = level + thr->th.th_set_nested_nth_sz;
122 new_nested_nth->nth = (int *)KMP_INTERNAL_MALLOC(new_size * sizeof(int));
123 for (int i = 0; i < level + 1; ++i)
124 new_nested_nth->nth[i] = 0;
125 for (int i = level + 1, j = 1; i < new_size; ++i, ++j)
126 new_nested_nth->nth[i] = thr->th.th_set_nested_nth[j];
127 new_nested_nth->size = new_nested_nth->used = new_size;
128 return new_nested_nth;
129}
130
/* Calculate the identifier of the current thread */
/* Fast (and somewhat portable) way to get a unique identifier for the
   executing thread. Returns KMP_GTID_DNE if we haven't been assigned a
   gtid. */
134int __kmp_get_global_thread_id() {
135 int i;
136 kmp_info_t **other_threads;
137 size_t stack_data;
138 char *stack_addr;
139 size_t stack_size;
140 char *stack_base;
141
142 KA_TRACE(
143 1000,
144 ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
145 __kmp_nth, __kmp_all_nth));
146
  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior
     to a parallel region, this returns KMP_GTID_DNE to force the caller to run
     serial_initialize. KMP_GTID_DNE must then be handled at all call sites, or
     __kmp_init_gtid must be guaranteed, for this to work. */
151
152 if (!TCR_4(__kmp_init_gtid))
153 return KMP_GTID_DNE;
154
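  /* Three lookup strategies, selected by __kmp_gtid_mode: a native TLS
     variable (mode >= 3), an OS TLS key (mode >= 2), and otherwise a stack
     address search over the registered threads below. */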
155#ifdef KMP_TDATA_GTID
156 if (TCR_4(__kmp_gtid_mode) >= 3) {
157 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
158 return __kmp_gtid;
159 }
160#endif
161 if (TCR_4(__kmp_gtid_mode) >= 2) {
162 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
163 return __kmp_gtid_get_specific();
164 }
165 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
166
167 stack_addr = (char *)&stack_data;
168 other_threads = __kmp_threads;
169
170 /* ATT: The code below is a source of potential bugs due to unsynchronized
171 access to __kmp_threads array. For example:
172 1. Current thread loads other_threads[i] to thr and checks it, it is
173 non-NULL.
174 2. Current thread is suspended by OS.
175 3. Another thread unregisters and finishes (debug versions of free()
176 may fill memory with something like 0xEF).
177 4. Current thread is resumed.
178 5. Current thread reads junk from *thr.
179 TODO: Fix it. --ln */
180
181 for (i = 0; i < __kmp_threads_capacity; i++) {
182
183 kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
184 if (!thr)
185 continue;
186
187 stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
188 stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
189
190 /* stack grows down -- search through all of the active threads */
191
192 if (stack_addr <= stack_base) {
193 size_t stack_diff = stack_base - stack_addr;
194
195 if (stack_diff <= stack_size) {
196 /* The only way we can be closer than the allocated */
197 /* stack size is if we are running on this thread. */
198 // __kmp_gtid_get_specific can return negative value because this
199 // function can be called by thread destructor. However, before the
200 // thread destructor is called, the value of the corresponding
201 // thread-specific data will be reset to NULL.
202 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() < 0 ||
203 __kmp_gtid_get_specific() == i);
204 return i;
205 }
206 }
207 }
208
209 /* get specific to try and determine our gtid */
210 KA_TRACE(1000,
211 ("*** __kmp_get_global_thread_id: internal alg. failed to find "
212 "thread, using TLS\n"));
213 i = __kmp_gtid_get_specific();
214
215 /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
216
  /* if we haven't been assigned a gtid, then return the error code */
218 if (i < 0)
219 return i;
220
  // other_threads[i] can be nullptr at this point because the corresponding
  // thread could have already been destroyed. This can happen when this
  // function is called from the library shutdown routine.
224 if (!TCR_SYNC_PTR(other_threads[i]))
225 return i;
226
227 /* dynamically updated stack window for uber threads to avoid get_specific
228 call */
229 if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
230 KMP_FATAL(StackOverflow, i);
231 }
232
233 stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
234 if (stack_addr > stack_base) {
235 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
236 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
237 other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
238 stack_base);
239 } else {
240 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
241 stack_base - stack_addr);
242 }
243
244 /* Reprint stack bounds for ubermaster since they have been refined */
245 if (__kmp_storage_map) {
246 char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
247 char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
    __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
                                 other_threads[i]->th.th_info.ds.ds_stacksize,
                                 "th_%d stack (refinement)", i);
251 }
252 return i;
253}
254
255int __kmp_get_global_thread_id_reg() {
256 int gtid;
257
258 if (!__kmp_init_serial) {
259 gtid = KMP_GTID_DNE;
260 } else
261#ifdef KMP_TDATA_GTID
262 if (TCR_4(__kmp_gtid_mode) >= 3) {
263 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
264 gtid = __kmp_gtid;
265 } else
266#endif
267 if (TCR_4(__kmp_gtid_mode) >= 2) {
268 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
269 gtid = __kmp_gtid_get_specific();
270 } else {
271 KA_TRACE(1000,
272 ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
273 gtid = __kmp_get_global_thread_id();
274 }
275
276 /* we must be a new uber master sibling thread */
277 if (gtid == KMP_GTID_DNE) {
278 KA_TRACE(10,
279 ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
280 "Registering a new gtid.\n"));
    __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
282 if (!__kmp_init_serial) {
283 __kmp_do_serial_initialize();
284 gtid = __kmp_gtid_get_specific();
285 } else {
286 gtid = __kmp_register_root(FALSE);
287 }
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
289 /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
290 }
291
292 KMP_DEBUG_ASSERT(gtid >= 0);
293
294 return gtid;
295}
296
297/* caller must hold forkjoin_lock */
298void __kmp_check_stack_overlap(kmp_info_t *th) {
299 int f;
300 char *stack_beg = NULL;
301 char *stack_end = NULL;
302 int gtid;
303
304 KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
305 if (__kmp_storage_map) {
306 stack_end = (char *)th->th.th_info.ds.ds_stackbase;
307 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
308
    gtid = __kmp_gtid_from_thread(th);
310
311 if (gtid == KMP_GTID_MONITOR) {
      __kmp_print_storage_map_gtid(
          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
          "th_%s stack (%s)", "mon",
          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
    } else {
      __kmp_print_storage_map_gtid(
          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
          "th_%d stack (%s)", gtid,
          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
321 }
322 }
323
324 /* No point in checking ubermaster threads since they use refinement and
325 * cannot overlap */
  gtid = __kmp_gtid_from_thread(th);
327 if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
328 KA_TRACE(10,
329 ("__kmp_check_stack_overlap: performing extensive checking\n"));
330 if (stack_beg == NULL) {
331 stack_end = (char *)th->th.th_info.ds.ds_stackbase;
332 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
333 }
334
335 for (f = 0; f < __kmp_threads_capacity; f++) {
336 kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
337
338 if (f_th && f_th != th) {
339 char *other_stack_end =
340 (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
341 char *other_stack_beg =
342 other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
343 if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
344 (stack_end > other_stack_beg && stack_end < other_stack_end)) {
345
346 /* Print the other stack values before the abort */
347 if (__kmp_storage_map)
            __kmp_print_storage_map_gtid(
                -1, other_stack_beg, other_stack_end,
                (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
                "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
352
353 __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
354 __kmp_msg_null);
355 }
356 }
357 }
358 }
359 KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
360}
361
362/* ------------------------------------------------------------------------ */
363
364void __kmp_infinite_loop(void) {
365 static int done = FALSE;
366
367 while (!done) {
368 KMP_YIELD(TRUE);
369 }
370}
371
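// MAX_MESSAGE bounds the stack buffers used below for storage-map and warning
// messages.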
372#define MAX_MESSAGE 512
373
374void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
375 char const *format, ...) {
376 char buffer[MAX_MESSAGE];
377 va_list ap;
378
379 va_start(ap, format);
  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
               p2, (unsigned long)size, format);
  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
  __kmp_vprintf(kmp_err, buffer, ap);
384#if KMP_PRINT_DATA_PLACEMENT
385 int node;
386 if (gtid >= 0) {
387 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
388 if (__kmp_storage_map_verbose) {
389 node = __kmp_get_host_node(p1);
390 if (node < 0) /* doesn't work, so don't try this next time */
391 __kmp_storage_map_verbose = FALSE;
392 else {
393 char *last;
394 int lastNode;
395 int localProc = __kmp_get_cpu_from_gtid(gtid);
396
397 const int page_size = KMP_GET_PAGE_SIZE();
398
399 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
400 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
401 if (localProc >= 0)
402 __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
403 localProc >> 1);
404 else
405 __kmp_printf_no_lock(" GTID %d\n", gtid);
406#if KMP_USE_PRCTL
407 /* The more elaborate format is disabled for now because of the prctl
408 * hanging bug. */
409 do {
410 last = p1;
411 lastNode = node;
412 /* This loop collates adjacent pages with the same host node. */
413 do {
414 (char *)p1 += page_size;
415 } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
416 __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
417 lastNode);
418 } while (p1 <= p2);
419#else
420 __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
421 (char *)p1 + (page_size - 1),
422 __kmp_get_host_node(p1));
423 if (p1 < p2) {
424 __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
425 (char *)p2 + (page_size - 1),
426 __kmp_get_host_node(p2));
427 }
428#endif
429 }
430 }
431 } else
432 __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
433 }
434#endif /* KMP_PRINT_DATA_PLACEMENT */
  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
436
437 va_end(ap);
438}
439
440void __kmp_warn(char const *format, ...) {
441 char buffer[MAX_MESSAGE];
442 va_list ap;
443
444 if (__kmp_generate_warnings == kmp_warnings_off) {
445 return;
446 }
447
448 va_start(ap, format);
449
  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
  __kmp_vprintf(kmp_err, buffer, ap);
  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
454
455 va_end(ap);
456}
457
458void __kmp_abort_process() {
459 // Later threads may stall here, but that's ok because abort() will kill them.
  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
461
462 if (__kmp_debug_buf) {
463 __kmp_dump_debug_buffer();
464 }
465
466#if KMP_OS_WINDOWS
467 // Let other threads know of abnormal termination and prevent deadlock
468 // if abort happened during library initialization or shutdown
469 __kmp_global.g.g_abort = SIGABRT;
470
  /* On Windows* OS, abort() by default causes a pop-up error box, which stalls
     nightly testing. Unfortunately, we cannot reliably suppress pop-up error
     boxes. _set_abort_behavior() works well, but this function is not
     available in VS7 (this is not a problem for the DLL, but it is a problem
     for the static OpenMP RTL). SetErrorMode (and so, the timelimit utility)
     does not help, at least in some versions of the MS C RTL.

     It seems the following sequence is the only way to simulate abort() and
     avoid the pop-up error box. */
480 raise(SIGABRT);
481 _exit(3); // Just in case, if signal ignored, exit anyway.
482#else
483 __kmp_unregister_library();
484 abort();
485#endif
486
487 __kmp_infinite_loop();
  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
489
490} // __kmp_abort_process
491
492void __kmp_abort_thread(void) {
493 // TODO: Eliminate g_abort global variable and this function.
494 // In case of abort just call abort(), it will kill all the threads.
495 __kmp_infinite_loop();
496} // __kmp_abort_thread
497
498/* Print out the storage map for the major kmp_info_t thread data structures
499 that are allocated together. */
500
501static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
                               gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
                               sizeof(kmp_desc_t), "th_%d.th_info", gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
                               sizeof(kmp_local_t), "th_%d.th_local", gtid);

  __kmp_print_storage_map_gtid(
      gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
      sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
                               &thr->th.th_bar[bs_plain_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
                               gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
                               &thr->th.th_bar[bs_forkjoin_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
                               gtid);

#if KMP_FAST_REDUCTION_BARRIER
  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
                               &thr->th.th_bar[bs_reduction_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
                               gtid);
530#endif // KMP_FAST_REDUCTION_BARRIER
531}
532
533/* Print out the storage map for the major kmp_team_t team data structures
534 that are allocated together. */
535
536static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
537 int team_id, int num_thr) {
538 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
                               header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
                               &team->t.t_bar[bs_last_barrier],
                               sizeof(kmp_balign_team_t) * bs_last_barrier,
                               "%s_%d.t_bar", header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
                               &team->t.t_bar[bs_plain_barrier + 1],
                               sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
                               header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
                               &team->t.t_bar[bs_forkjoin_barrier + 1],
                               sizeof(kmp_balign_team_t),
                               "%s_%d.t_bar[forkjoin]", header, team_id);

#if KMP_FAST_REDUCTION_BARRIER
  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
                               &team->t.t_bar[bs_reduction_barrier + 1],
                               sizeof(kmp_balign_team_t),
                               "%s_%d.t_bar[reduction]", header, team_id);
#endif // KMP_FAST_REDUCTION_BARRIER

  __kmp_print_storage_map_gtid(
      -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
      sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);

  __kmp_print_storage_map_gtid(
      -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
      sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
                               &team->t.t_disp_buffer[num_disp_buff],
                               sizeof(dispatch_shared_info_t) * num_disp_buff,
                               "%s_%d.t_disp_buffer", header, team_id);
576}
577
578static void __kmp_init_allocator() {
579 __kmp_init_memkind();
580 __kmp_init_target_mem();
581}
582static void __kmp_fini_allocator() {
583 __kmp_fini_target_mem();
584 __kmp_fini_memkind();
585}
586
587/* ------------------------------------------------------------------------ */
588
589#if ENABLE_LIBOMPTARGET
590static void __kmp_init_omptarget() {
591 __kmp_init_target_task();
592}
593#endif
594
595/* ------------------------------------------------------------------------ */
596
597#if KMP_DYNAMIC_LIB
598#if KMP_OS_WINDOWS
599
600BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
601 //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
602
603 switch (fdwReason) {
604
605 case DLL_PROCESS_ATTACH:
606 KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
607
608 return TRUE;
609
610 case DLL_PROCESS_DETACH:
611 KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
612
613 // According to Windows* documentation for DllMain entry point:
614 // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
615 // lpReserved == NULL when FreeLibrary() is called,
616 // lpReserved != NULL when the process is terminated.
617 // When FreeLibrary() is called, worker threads remain alive. So the
618 // runtime's state is consistent and executing proper shutdown is OK.
619 // When the process is terminated, worker threads have exited or been
620 // forcefully terminated by the OS and only the shutdown thread remains.
621 // This can leave the runtime in an inconsistent state.
622 // Hence, only attempt proper cleanup when FreeLibrary() is called.
623 // Otherwise, rely on OS to reclaim resources.
624 if (lpReserved == NULL)
625 __kmp_internal_end_library(__kmp_gtid_get_specific());
626
627 return TRUE;
628
629 case DLL_THREAD_ATTACH:
630 KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
631
632 /* if we want to register new siblings all the time here call
633 * __kmp_get_gtid(); */
634 return TRUE;
635
636 case DLL_THREAD_DETACH:
637 KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
638
639 __kmp_internal_end_thread(__kmp_gtid_get_specific());
640 return TRUE;
641 }
642
643 return TRUE;
644}
645
646#endif /* KMP_OS_WINDOWS */
647#endif /* KMP_DYNAMIC_LIB */
648
649/* __kmp_parallel_deo -- Wait until it's our turn. */
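/* When BUILD_PARALLEL_ORDERED is defined and the team is not serialized, this
   spins until the team's ordered ticket (t_ordered.dt.t_value) equals the
   calling thread's tid; __kmp_parallel_dxo then passes the ticket to the next
   tid. */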
650void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
651 int gtid = *gtid_ref;
652#ifdef BUILD_PARALLEL_ORDERED
653 kmp_team_t *team = __kmp_team_from_gtid(gtid);
654#endif /* BUILD_PARALLEL_ORDERED */
655
656 if (__kmp_env_consistency_check) {
657 if (__kmp_threads[gtid]->th.th_root->r.r_active)
658#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
660#else
661 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
662#endif
663 }
664#ifdef BUILD_PARALLEL_ORDERED
665 if (!team->t.t_serialized) {
666 KMP_MB();
    KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
             NULL);
669 KMP_MB();
670 }
671#endif /* BUILD_PARALLEL_ORDERED */
672}
673
674/* __kmp_parallel_dxo -- Signal the next task. */
675void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
676 int gtid = *gtid_ref;
677#ifdef BUILD_PARALLEL_ORDERED
678 int tid = __kmp_tid_from_gtid(gtid);
679 kmp_team_t *team = __kmp_team_from_gtid(gtid);
680#endif /* BUILD_PARALLEL_ORDERED */
681
682 if (__kmp_env_consistency_check) {
683 if (__kmp_threads[gtid]->th.th_root->r.r_active)
      __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
685 }
686#ifdef BUILD_PARALLEL_ORDERED
687 if (!team->t.t_serialized) {
688 KMP_MB(); /* Flush all pending memory write invalidates. */
689
690 /* use the tid of the next thread in this team */
691 /* TODO replace with general release procedure */
692 team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
693
694 KMP_MB(); /* Flush all pending memory write invalidates. */
695 }
696#endif /* BUILD_PARALLEL_ORDERED */
697}
698
699/* ------------------------------------------------------------------------ */
700/* The BARRIER for a SINGLE process section is always explicit */
701
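/* __kmp_enter_single returns nonzero if the calling thread should execute the
   single block: either the team is serialized, or this thread won the atomic
   compare-and-store race on team->t.t_construct. */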
702int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
703 int status;
704 kmp_info_t *th;
705 kmp_team_t *team;
706
707 if (!TCR_4(__kmp_init_parallel))
708 __kmp_parallel_initialize();
709 __kmp_resume_if_soft_paused();
710
711 th = __kmp_threads[gtid];
712 team = th->th.th_team;
713 status = 0;
714
715 th->th.th_ident = id_ref;
716
717 if (team->t.t_serialized) {
718 status = 1;
719 } else {
720 kmp_int32 old_this = th->th.th_local.this_construct;
721
722 ++th->th.th_local.this_construct;
723 /* try to set team count to thread count--success means thread got the
724 single block */
725 /* TODO: Should this be acquire or release? */
726 if (team->t.t_construct == old_this) {
      status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
                                              th->th.th_local.this_construct);
729 }
730#if USE_ITT_BUILD
731 if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
732 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
733 team->t.t_active_level == 1) {
734 // Only report metadata by primary thread of active team at level 1
      __kmp_itt_metadata_single(id_ref);
736 }
737#endif /* USE_ITT_BUILD */
738 }
739
740 if (__kmp_env_consistency_check) {
741 if (status && push_ws) {
      __kmp_push_workshare(gtid, ct_psingle, id_ref);
    } else {
      __kmp_check_workshare(gtid, ct_psingle, id_ref);
745 }
746 }
747#if USE_ITT_BUILD
748 if (status) {
749 __kmp_itt_single_start(gtid);
750 }
751#endif /* USE_ITT_BUILD */
752 return status;
753}
754
755void __kmp_exit_single(int gtid) {
756#if USE_ITT_BUILD
757 __kmp_itt_single_end(gtid);
758#endif /* USE_ITT_BUILD */
759 if (__kmp_env_consistency_check)
    __kmp_pop_workshare(gtid, ct_psingle, NULL);
761}
762
/* Determine if we can go parallel or must use a serialized parallel region,
 * and how many threads we can use.
 * set_nthreads is the number of threads requested for the team.
 * Returns 1 if we should serialize or use only one thread,
 * otherwise the number of threads to use.
 * The forkjoin lock is held by the caller. */
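/* The requested thread count is clipped, in order, by: the dynamic adjustment
   mode when dyn-var is set (load balance, thread limit, or random), then
   KMP_DEVICE_THREAD_LIMIT (__kmp_max_nth), then OMP_THREAD_LIMIT (the
   contention group's cg_thread_limit), and finally the free capacity of the
   __kmp_threads array. */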
769static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
770 int master_tid, int set_nthreads,
771 int enter_teams) {
772 int capacity;
773 int new_nthreads;
774 KMP_DEBUG_ASSERT(__kmp_init_serial);
775 KMP_DEBUG_ASSERT(root && parent_team);
776 kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
777
778 // If dyn-var is set, dynamically adjust the number of desired threads,
779 // according to the method specified by dynamic_mode.
780 new_nthreads = set_nthreads;
781 if (!get__dynamic_2(parent_team, master_tid)) {
782 ;
783 }
784#ifdef USE_LOAD_BALANCE
785 else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
    new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
787 if (new_nthreads == 1) {
788 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
789 "reservation to 1 thread\n",
790 master_tid));
791 return 1;
792 }
793 if (new_nthreads < set_nthreads) {
794 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
795 "reservation to %d threads\n",
796 master_tid, new_nthreads));
797 }
798 }
799#endif /* USE_LOAD_BALANCE */
800 else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
801 new_nthreads = __kmp_avail_proc - __kmp_nth +
802 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
803 if (new_nthreads <= 1) {
804 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
805 "reservation to 1 thread\n",
806 master_tid));
807 return 1;
808 }
809 if (new_nthreads < set_nthreads) {
810 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
811 "reservation to %d threads\n",
812 master_tid, new_nthreads));
813 } else {
814 new_nthreads = set_nthreads;
815 }
816 } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
817 if (set_nthreads > 2) {
      new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
819 new_nthreads = (new_nthreads % set_nthreads) + 1;
820 if (new_nthreads == 1) {
821 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
822 "reservation to 1 thread\n",
823 master_tid));
824 return 1;
825 }
826 if (new_nthreads < set_nthreads) {
827 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
828 "reservation to %d threads\n",
829 master_tid, new_nthreads));
830 }
831 }
832 } else {
833 KMP_ASSERT(0);
834 }
835
836 // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
837 if (__kmp_nth + new_nthreads -
838 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
839 __kmp_max_nth) {
840 int tl_nthreads = __kmp_max_nth - __kmp_nth +
841 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
842 if (tl_nthreads <= 0) {
843 tl_nthreads = 1;
844 }
845
846 // If dyn-var is false, emit a 1-time warning.
847 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
848 __kmp_reserve_warn = 1;
849 __kmp_msg(kmp_ms_warning,
850 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
851 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
852 }
853 if (tl_nthreads == 1) {
854 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
855 "reduced reservation to 1 thread\n",
856 master_tid));
857 return 1;
858 }
859 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
860 "reservation to %d threads\n",
861 master_tid, tl_nthreads));
862 new_nthreads = tl_nthreads;
863 }
864
865 // Respect OMP_THREAD_LIMIT
866 int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
867 int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
868 if (cg_nthreads + new_nthreads -
869 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
870 max_cg_threads) {
871 int tl_nthreads = max_cg_threads - cg_nthreads +
872 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
873 if (tl_nthreads <= 0) {
874 tl_nthreads = 1;
875 }
876
877 // If dyn-var is false, emit a 1-time warning.
878 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
879 __kmp_reserve_warn = 1;
880 __kmp_msg(kmp_ms_warning,
881 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
882 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
883 }
884 if (tl_nthreads == 1) {
885 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
886 "reduced reservation to 1 thread\n",
887 master_tid));
888 return 1;
889 }
890 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
891 "reservation to %d threads\n",
892 master_tid, tl_nthreads));
893 new_nthreads = tl_nthreads;
894 }
895
896 // Check if the threads array is large enough, or needs expanding.
897 // See comment in __kmp_register_root() about the adjustment if
898 // __kmp_threads[0] == NULL.
899 capacity = __kmp_threads_capacity;
900 if (TCR_PTR(__kmp_threads[0]) == NULL) {
901 --capacity;
902 }
903 // If it is not for initializing the hidden helper team, we need to take
904 // __kmp_hidden_helper_threads_num out of the capacity because it is included
905 // in __kmp_threads_capacity.
906 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
907 capacity -= __kmp_hidden_helper_threads_num;
908 }
909 if (__kmp_nth + new_nthreads -
910 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
911 capacity) {
912 // Expand the threads array.
913 int slotsRequired = __kmp_nth + new_nthreads -
914 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
915 capacity;
    int slotsAdded = __kmp_expand_threads(slotsRequired);
917 if (slotsAdded < slotsRequired) {
918 // The threads array was not expanded enough.
919 new_nthreads -= (slotsRequired - slotsAdded);
920 KMP_ASSERT(new_nthreads >= 1);
921
922 // If dyn-var is false, emit a 1-time warning.
923 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
924 __kmp_reserve_warn = 1;
925 if (__kmp_tp_cached) {
926 __kmp_msg(kmp_ms_warning,
927 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
928 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
929 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
930 } else {
931 __kmp_msg(kmp_ms_warning,
932 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
933 KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
934 }
935 }
936 }
937 }
938
939#ifdef KMP_DEBUG
940 if (new_nthreads == 1) {
941 KC_TRACE(10,
942 ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
943 "dead roots and rechecking; requested %d threads\n",
944 __kmp_get_gtid(), set_nthreads));
945 } else {
946 KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
947 " %d threads\n",
948 __kmp_get_gtid(), new_nthreads, set_nthreads));
949 }
950#endif // KMP_DEBUG
951
952 if (this_thr->th.th_nt_strict && new_nthreads < set_nthreads) {
    __kmpc_error(this_thr->th.th_nt_loc, this_thr->th.th_nt_sev,
                 this_thr->th.th_nt_msg);
955 }
956 return new_nthreads;
957}
958
/* Allocate threads from the thread pool and assign them to the new team. We
   are assured that there are enough threads available, because we checked
   earlier while holding the forkjoin lock. */
962static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
963 kmp_info_t *master_th, int master_gtid,
964 int fork_teams_workers) {
965 int i;
966 int use_hot_team;
967
968 KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
969 KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
970 KMP_MB();
971
972 /* first, let's setup the primary thread */
973 master_th->th.th_info.ds.ds_tid = 0;
974 master_th->th.th_team = team;
975 master_th->th.th_team_nproc = team->t.t_nproc;
976 master_th->th.th_team_master = master_th;
977 master_th->th.th_team_serialized = FALSE;
978 master_th->th.th_dispatch = &team->t.t_dispatch[0];
979
980/* make sure we are not the optimized hot team */
981#if KMP_NESTED_HOT_TEAMS
982 use_hot_team = 0;
983 kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
984 if (hot_teams) { // hot teams array is not allocated if
985 // KMP_HOT_TEAMS_MAX_LEVEL=0
986 int level = team->t.t_active_level - 1; // index in array of hot teams
987 if (master_th->th.th_teams_microtask) { // are we inside the teams?
988 if (master_th->th.th_teams_size.nteams > 1) {
989 ++level; // level was not increased in teams construct for
990 // team_of_masters
991 }
992 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
993 master_th->th.th_teams_level == team->t.t_level) {
994 ++level; // level was not increased in teams construct for
995 // team_of_workers before the parallel
996 } // team->t.t_level will be increased inside parallel
997 }
998 if (level < __kmp_hot_teams_max_level) {
999 if (hot_teams[level].hot_team) {
1000 // hot team has already been allocated for given level
1001 KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1002 use_hot_team = 1; // the team is ready to use
1003 } else {
1004 use_hot_team = 0; // AC: threads are not allocated yet
1005 hot_teams[level].hot_team = team; // remember new hot team
1006 hot_teams[level].hot_team_nth = team->t.t_nproc;
1007 }
1008 } else {
1009 use_hot_team = 0;
1010 }
1011 }
1012#else
1013 use_hot_team = team == root->r.r_hot_team;
1014#endif
1015 if (!use_hot_team) {
1016
1017 /* install the primary thread */
1018 team->t.t_threads[0] = master_th;
    __kmp_initialize_info(master_th, team, 0, master_gtid);
1020
1021 /* now, install the worker threads */
1022 for (i = 1; i < team->t.t_nproc; i++) {
1023
1024 /* fork or reallocate a new thread and install it in team */
      kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1026 team->t.t_threads[i] = thr;
1027 KMP_DEBUG_ASSERT(thr);
1028 KMP_DEBUG_ASSERT(thr->th.th_team == team);
1029 /* align team and thread arrived states */
1030 KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1031 "T#%d(%d:%d) join =%llu, plain=%llu\n",
1032 __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1033 __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1034 team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1035 team->t.t_bar[bs_plain_barrier].b_arrived));
1036 thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1037 thr->th.th_teams_level = master_th->th.th_teams_level;
1038 thr->th.th_teams_size = master_th->th.th_teams_size;
1039 { // Initialize threads' barrier data.
1040 int b;
1041 kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1042 for (b = 0; b < bs_last_barrier; ++b) {
1043 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1044 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1045#if USE_DEBUGGER
1046 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1047#endif
1048 }
1049 }
1050 }
1051
1052#if KMP_AFFINITY_SUPPORTED
1053 // Do not partition the places list for teams construct workers who
1054 // haven't actually been forked to do real work yet. This partitioning
1055 // will take place in the parallel region nested within the teams construct.
1056 if (!fork_teams_workers) {
1057 __kmp_partition_places(team);
1058 }
1059#endif
1060
1061 if (team->t.t_nproc > 1 &&
1062 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
      team->t.b->update_num_threads(team->t.t_nproc);
      __kmp_add_threads_to_team(team, team->t.t_nproc);
1065 }
1066 }
1067
1068 // Take care of primary thread's task state
1069 if (__kmp_tasking_mode != tskm_immediate_exec) {
1070 if (use_hot_team) {
1071 KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team->t.t_parent, master_th);
1072 KA_TRACE(
1073 20,
1074 ("__kmp_fork_team_threads: Primary T#%d pushing task_team %p / team "
1075 "%p, new task_team %p / team %p\n",
1076 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
1077 team->t.t_parent, team->t.t_task_team[master_th->th.th_task_state],
1078 team));
1079
1080 // Store primary thread's current task state on new team
1081 KMP_CHECK_UPDATE(team->t.t_primary_task_state,
1082 master_th->th.th_task_state);
1083
1084 // Restore primary thread's task state to hot team's state
1085 // by using thread 1's task state
1086 if (team->t.t_nproc > 1) {
1087 KMP_DEBUG_ASSERT(team->t.t_threads[1]->th.th_task_state == 0 ||
1088 team->t.t_threads[1]->th.th_task_state == 1);
1089 KMP_CHECK_UPDATE(master_th->th.th_task_state,
1090 team->t.t_threads[1]->th.th_task_state);
1091 } else {
1092 master_th->th.th_task_state = 0;
1093 }
1094 } else {
1095 // Store primary thread's current task_state on new team
1096 KMP_CHECK_UPDATE(team->t.t_primary_task_state,
1097 master_th->th.th_task_state);
1098 // Are not using hot team, so set task state to 0.
1099 master_th->th.th_task_state = 0;
1100 }
1101 }
1102
1103 if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1104 for (i = 0; i < team->t.t_nproc; i++) {
1105 kmp_info_t *thr = team->t.t_threads[i];
1106 if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1107 thr->th.th_prev_level != team->t.t_level) {
1108 team->t.t_display_affinity = 1;
1109 break;
1110 }
1111 }
1112 }
1113
1114 KMP_MB();
1115}
1116
1117#if KMP_ARCH_X86 || KMP_ARCH_X86_64
// Propagate any changes to the floating point control registers out to the
// team. We try to avoid unnecessary writes to the relevant cache line in the
// team structure, so we don't make changes unless they are needed.
inline static void propagateFPControl(kmp_team_t *team) {
  if (__kmp_inherit_fp_control) {
    kmp_int16 x87_fpu_control_word;
    kmp_uint32 mxcsr;

    // Get the primary thread's values of the FPU control flags (both X87 and
    // vector)
    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
    __kmp_store_mxcsr(&mxcsr);
1129 mxcsr &= KMP_X86_MXCSR_MASK;
1130
1131 // There is no point looking at t_fp_control_saved here.
1132 // If it is TRUE, we still have to update the values if they are different
1133 // from those we now have. If it is FALSE we didn't save anything yet, but
1134 // our objective is the same. We have to ensure that the values in the team
1135 // are the same as those we have.
1136 // So, this code achieves what we need whether or not t_fp_control_saved is
1137 // true. By checking whether the value needs updating we avoid unnecessary
1138 // writes that would put the cache-line into a written state, causing all
1139 // threads in the team to have to read it again.
1140 KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1141 KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1142 // Although we don't use this value, other code in the runtime wants to know
1143 // whether it should restore them. So we must ensure it is correct.
1144 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1145 } else {
1146 // Similarly here. Don't write to this cache-line in the team structure
1147 // unless we have to.
1148 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1149 }
1150}
1151
1152// Do the opposite, setting the hardware registers to the updated values from
1153// the team.
1154inline static void updateHWFPControl(kmp_team_t *team) {
  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
    // Only reset the fp control regs if they have been changed in the team,
    // i.e. in the parallel region that we are exiting.
    kmp_int16 x87_fpu_control_word;
    kmp_uint32 mxcsr;
    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
    __kmp_store_mxcsr(&mxcsr);
    mxcsr &= KMP_X86_MXCSR_MASK;

    if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
      __kmp_clear_x87_fpu_status_word();
      __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
    }

    if (team->t.t_mxcsr != mxcsr) {
      __kmp_load_mxcsr(&team->t.t_mxcsr);
1171 }
1172 }
1173}
1174#else
1175#define propagateFPControl(x) ((void)0)
1176#define updateHWFPControl(x) ((void)0)
1177#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1178
1179static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1180 int realloc); // forward declaration
1181
/* Run a parallel region that has been serialized, so it runs only in a team
   of the single primary thread. */
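/* On the first serialized level the thread's cached th_serial_team is
   installed as the current team (allocating a fresh one if the cached team is
   already in use); on deeper nesting t_serialized is simply incremented and a
   new dispatch buffer and task team node are pushed. */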
1184void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1185 kmp_info_t *this_thr;
1186 kmp_team_t *serial_team;
1187
1188 KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1189
1190 /* Skip all this code for autopar serialized loops since it results in
1191 unacceptable overhead */
1192 if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1193 return;
1194
1195 if (!TCR_4(__kmp_init_parallel))
1196 __kmp_parallel_initialize();
1197 __kmp_resume_if_soft_paused();
1198
1199 this_thr = __kmp_threads[global_tid];
1200 serial_team = this_thr->th.th_serial_team;
1201
1202 /* utilize the serialized team held by this thread */
1203 KMP_DEBUG_ASSERT(serial_team);
1204 KMP_MB();
1205
1206 kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1207 if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1208 proc_bind = proc_bind_false;
1209 } else if (proc_bind == proc_bind_default) {
1210 // No proc_bind clause was specified, so use the current value
1211 // of proc-bind-var for this parallel region.
1212 proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1213 }
1214 // Reset for next parallel region
1215 this_thr->th.th_set_proc_bind = proc_bind_default;
1216
1217 // Reset num_threads for next parallel region
1218 this_thr->th.th_set_nproc = 0;
1219
1220#if OMPT_SUPPORT
1221 ompt_data_t ompt_parallel_data = ompt_data_none;
1222 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1223 if (ompt_enabled.enabled &&
1224 this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1225
1226 ompt_task_info_t *parent_task_info;
1227 parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1228
1229 parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1230 if (ompt_enabled.ompt_callback_parallel_begin) {
1231 int team_size = 1;
1232
1233 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1234 &(parent_task_info->task_data), &(parent_task_info->frame),
1235 &ompt_parallel_data, team_size,
1236 ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1237 }
1238 }
1239#endif // OMPT_SUPPORT
1240
1241 if (this_thr->th.th_team != serial_team) {
1242 // Nested level will be an index in the nested nthreads array
1243 int level = this_thr->th.th_team->t.t_level;
1244
1245 if (serial_team->t.t_serialized) {
      /* this serial team was already used
         TODO increase performance by making these locks more specific */
      kmp_team_t *new_team;

      __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);

      new_team =
          __kmp_allocate_team(this_thr->th.th_root, 1, 1,
#if OMPT_SUPPORT
                              ompt_parallel_data,
#endif
                              proc_bind, &this_thr->th.th_current_task->td_icvs,
                              0 USE_NESTED_HOT_ARG(NULL));
      __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1260 KMP_ASSERT(new_team);
1261
1262 /* setup new serialized team and install it */
1263 new_team->t.t_threads[0] = this_thr;
1264 new_team->t.t_parent = this_thr->th.th_team;
1265 serial_team = new_team;
1266 this_thr->th.th_serial_team = serial_team;
1267
1268 KF_TRACE(
1269 10,
1270 ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1271 global_tid, serial_team));
1272
1273 /* TODO the above breaks the requirement that if we run out of resources,
1274 then we can still guarantee that serialized teams are ok, since we may
1275 need to allocate a new one */
1276 } else {
1277 KF_TRACE(
1278 10,
1279 ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1280 global_tid, serial_team));
1281 }
1282
1283 /* we have to initialize this serial team */
1284 KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1285 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1286 KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1287 serial_team->t.t_ident = loc;
1288 serial_team->t.t_serialized = 1;
1289 serial_team->t.t_nproc = 1;
1290 serial_team->t.t_parent = this_thr->th.th_team;
1291 if (this_thr->th.th_team->t.t_nested_nth)
1292 serial_team->t.t_nested_nth = this_thr->th.th_team->t.t_nested_nth;
1293 else
1294 serial_team->t.t_nested_nth = &__kmp_nested_nth;
1295 // Save previous team's task state on serial team structure
1296 serial_team->t.t_primary_task_state = this_thr->th.th_task_state;
1297 serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1298 this_thr->th.th_team = serial_team;
1299 serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1300
1301 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1302 this_thr->th.th_current_task));
1303 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1304 this_thr->th.th_current_task->td_flags.executing = 0;
1305
    __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1307
1308 /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1309 implicit task for each serialized task represented by
1310 team->t.t_serialized? */
    copy_icvs(&this_thr->th.th_current_task->td_icvs,
              &this_thr->th.th_current_task->td_parent->td_icvs);
1313
1314 // Thread value exists in the nested nthreads array for the next nested
1315 // level
1316 kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth;
1317 if (this_thr->th.th_team->t.t_nested_nth)
1318 nested_nth = this_thr->th.th_team->t.t_nested_nth;
1319 if (nested_nth->used && (level + 1 < nested_nth->used)) {
1320 this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1];
1321 }
1322
1323 if (__kmp_nested_proc_bind.used &&
1324 (level + 1 < __kmp_nested_proc_bind.used)) {
1325 this_thr->th.th_current_task->td_icvs.proc_bind =
1326 __kmp_nested_proc_bind.bind_types[level + 1];
1327 }
1328
1329#if USE_DEBUGGER
1330 serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1331#endif
1332 this_thr->th.th_info.ds.ds_tid = 0;
1333
1334 /* set thread cache values */
1335 this_thr->th.th_team_nproc = 1;
1336 this_thr->th.th_team_master = this_thr;
1337 this_thr->th.th_team_serialized = 1;
1338 this_thr->th.th_task_team = NULL;
1339 this_thr->th.th_task_state = 0;
1340
1341 serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1342 serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1343 serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1344
    propagateFPControl(serial_team);
1346
1347 /* check if we need to allocate dispatch buffers stack */
1348 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1349 if (!serial_team->t.t_dispatch->th_disp_buffer) {
1350 serial_team->t.t_dispatch->th_disp_buffer =
1351 (dispatch_private_info_t *)__kmp_allocate(
1352 sizeof(dispatch_private_info_t));
1353 }
1354 this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1355
1356 KMP_MB();
1357
1358 } else {
1359 /* this serialized team is already being used,
1360 * that's fine, just add another nested level */
1361 KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1362 KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1363 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1364 ++serial_team->t.t_serialized;
1365 this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1366
1367 // Nested level will be an index in the nested nthreads array
1368 int level = this_thr->th.th_team->t.t_level;
1369 // Thread value exists in the nested nthreads array for the next nested
1370 // level
1371
1372 kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth;
1373 if (serial_team->t.t_nested_nth)
1374 nested_nth = serial_team->t.t_nested_nth;
1375 if (nested_nth->used && (level + 1 < nested_nth->used)) {
1376 this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1];
1377 }
1378
1379 serial_team->t.t_level++;
1380 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1381 "of serial team %p to %d\n",
1382 global_tid, serial_team, serial_team->t.t_level));
1383
1384 /* allocate/push dispatch buffers stack */
1385 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1386 {
1387 dispatch_private_info_t *disp_buffer =
1388 (dispatch_private_info_t *)__kmp_allocate(
1389 sizeof(dispatch_private_info_t));
1390 disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1391 serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1392 }
1393 this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1394
1395 /* allocate/push task team stack */
    __kmp_push_task_team_node(this_thr, serial_team);
1397
1398 KMP_MB();
1399 }
1400 KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1401
1402 // Perform the display affinity functionality for
1403 // serialized parallel regions
1404 if (__kmp_display_affinity) {
1405 if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1406 this_thr->th.th_prev_num_threads != 1) {
1407 // NULL means use the affinity-format-var ICV
      __kmp_aux_display_affinity(global_tid, NULL);
1409 this_thr->th.th_prev_level = serial_team->t.t_level;
1410 this_thr->th.th_prev_num_threads = 1;
1411 }
1412 }
1413
1414 if (__kmp_env_consistency_check)
    __kmp_push_parallel(global_tid, NULL);
1416#if OMPT_SUPPORT
1417 serial_team->t.ompt_team_info.master_return_address = codeptr;
1418 if (ompt_enabled.enabled &&
1419 this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1420 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1421 OMPT_GET_FRAME_ADDRESS(0);
1422
1423 ompt_lw_taskteam_t lw_taskteam;
    __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
                            &ompt_parallel_data, codeptr);

    __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
    // Don't use lw_taskteam after linking. Content was swapped.

    /* OMPT implicit task begin */
    if (ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
          OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
      OMPT_CUR_TASK_INFO(this_thr)->thread_num =
          __kmp_tid_from_gtid(global_tid);
1438 }
1439
1440 /* OMPT state */
1441 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1442 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1443 OMPT_GET_FRAME_ADDRESS(0);
1444 }
1445#endif
1446}
1447
1448// Test if this fork is for a team closely nested in a teams construct
1449static inline bool __kmp_is_fork_in_teams(kmp_info_t *master_th,
1450 microtask_t microtask, int level,
1451 int teams_level, kmp_va_list ap) {
1452 return (master_th->th.th_teams_microtask && ap &&
1453 microtask != (microtask_t)__kmp_teams_master && level == teams_level);
1454}
1455
1456// Test if this fork is for the teams construct, i.e. to form the outer league
1457// of teams
1458static inline bool __kmp_is_entering_teams(int active_level, int level,
1459 int teams_level, kmp_va_list ap) {
1460 return ((ap == NULL && active_level == 0) ||
1461 (ap && teams_level > 0 && teams_level == level));
1462}
1463
// AC: This is the start of a parallel that is nested inside a teams construct.
// The team is actual (hot); all workers are ready at the fork barrier.
// No lock is needed to initialize the team a bit, then release the workers.
1467static inline int
1468__kmp_fork_in_teams(ident_t *loc, int gtid, kmp_team_t *parent_team,
1469 kmp_int32 argc, kmp_info_t *master_th, kmp_root_t *root,
1470 enum fork_context_e call_context, microtask_t microtask,
1471 launch_t invoker, int master_set_numthreads, int level,
1472#if OMPT_SUPPORT
1473 ompt_data_t ompt_parallel_data, void *return_address,
1474#endif
1475 kmp_va_list ap) {
1476 void **argv;
1477 int i;
1478
1479 parent_team->t.t_ident = loc;
  __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1481 parent_team->t.t_argc = argc;
1482 argv = (void **)parent_team->t.t_argv;
1483 for (i = argc - 1; i >= 0; --i) {
1484 *argv++ = va_arg(kmp_va_deref(ap), void *);
1485 }
  // Increment our nested depth levels, but do not increase the serialization
1487 if (parent_team == master_th->th.th_serial_team) {
1488 // AC: we are in serialized parallel
    __kmpc_serialized_parallel(loc, gtid);
1490 KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1491
1492 if (call_context == fork_context_gnu) {
1493 // AC: need to decrement t_serialized for enquiry functions to work
1494 // correctly, will restore at join time
1495 parent_team->t.t_serialized--;
1496 return TRUE;
1497 }
1498
1499#if OMPD_SUPPORT
1500 parent_team->t.t_pkfn = microtask;
1501#endif
1502
1503#if OMPT_SUPPORT
1504 void *dummy;
1505 void **exit_frame_p;
1506 ompt_data_t *implicit_task_data;
1507 ompt_lw_taskteam_t lw_taskteam;
1508
1509 if (ompt_enabled.enabled) {
      __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
                              &ompt_parallel_data, return_address);
      exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);

      __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1515 // Don't use lw_taskteam after linking. Content was swapped.
1516
1517 /* OMPT implicit task begin */
1518 implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1519 if (ompt_enabled.ompt_callback_implicit_task) {
1520 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1521 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1522 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), implicit_task_data,
1523 1, OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1524 }
1525
1526 /* OMPT state */
1527 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1528 } else {
1529 exit_frame_p = &dummy;
1530 }
1531#endif
1532
1533 // AC: need to decrement t_serialized for enquiry functions to work
1534 // correctly, will restore at join time
1535 parent_team->t.t_serialized--;
1536
1537 {
1538 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1539 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
      __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
#if OMPT_SUPPORT
                             ,
                             exit_frame_p
1544#endif
1545 );
1546 }
1547
1548#if OMPT_SUPPORT
1549 if (ompt_enabled.enabled) {
1550 *exit_frame_p = NULL;
1551 OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1552 if (ompt_enabled.ompt_callback_implicit_task) {
1553 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1554 ompt_scope_end, NULL, implicit_task_data, 1,
1555 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1556 }
1557 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
      __ompt_lw_taskteam_unlink(master_th);
1559 if (ompt_enabled.ompt_callback_parallel_end) {
1560 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1561 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1562 OMPT_INVOKER(call_context) | ompt_parallel_team, return_address);
1563 }
1564 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1565 }
1566#endif
1567 return TRUE;
1568 }
1569
1570 parent_team->t.t_pkfn = microtask;
1571 parent_team->t.t_invoke = invoker;
1572 KMP_ATOMIC_INC(&root->r.r_in_parallel);
1573 parent_team->t.t_active_level++;
1574 parent_team->t.t_level++;
1575 parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1576
1577 // If the threads allocated to the team are less than the thread limit, update
1578 // the thread limit here. th_teams_size.nth is specific to this team nested
1579 // in a teams construct, the team is fully created, and we're about to do
1580 // the actual fork. Best to do this here so that the subsequent uses below
1581 // and in the join have the correct value.
1582 master_th->th.th_teams_size.nth = parent_team->t.t_nproc;
1583
1584#if OMPT_SUPPORT
1585 if (ompt_enabled.enabled) {
1586 ompt_lw_taskteam_t lw_taskteam;
1587    __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data,
1588                            return_address);
1589    __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1590 }
1591#endif
1592
1593 /* Change number of threads in the team if requested */
1594 if (master_set_numthreads) { // The parallel has num_threads clause
1595 if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1596      // AC: can only reduce the number of threads dynamically, cannot increase
1597 kmp_info_t **other_threads = parent_team->t.t_threads;
1598 // NOTE: if using distributed barrier, we need to run this code block
1599 // even when the team size appears not to have changed from the max.
1600 int old_proc = master_th->th.th_teams_size.nth;
1601 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1602        __kmp_resize_dist_barrier(parent_team, old_proc, master_set_numthreads);
1603        __kmp_add_threads_to_team(parent_team, master_set_numthreads);
1604 }
1605 parent_team->t.t_nproc = master_set_numthreads;
1606 for (i = 0; i < master_set_numthreads; ++i) {
1607 other_threads[i]->th.th_team_nproc = master_set_numthreads;
1608 }
1609 }
1610 // Keep extra threads hot in the team for possible next parallels
1611 master_th->th.th_set_nproc = 0;
1612 }
1613
1614#if USE_DEBUGGER
1615 if (__kmp_debugging) { // Let debugger override number of threads.
1616 int nth = __kmp_omp_num_threads(loc);
1617 if (nth > 0) { // 0 means debugger doesn't want to change num threads
1618 master_set_numthreads = nth;
1619 }
1620 }
1621#endif
1622
1623 // Figure out the proc_bind policy for the nested parallel within teams
1624 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1625 // proc_bind_default means don't update
1626 kmp_proc_bind_t proc_bind_icv = proc_bind_default;
1627 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1628 proc_bind = proc_bind_false;
1629 } else {
1630 // No proc_bind clause specified; use current proc-bind-var
1631 if (proc_bind == proc_bind_default) {
1632 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1633 }
1634 /* else: The proc_bind policy was specified explicitly on parallel clause.
1635 This overrides proc-bind-var for this parallel region, but does not
1636 change proc-bind-var. */
1637 // Figure the value of proc-bind-var for the child threads.
1638 if ((level + 1 < __kmp_nested_proc_bind.used) &&
1639 (__kmp_nested_proc_bind.bind_types[level + 1] !=
1640 master_th->th.th_current_task->td_icvs.proc_bind)) {
1641 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1642 }
1643 }
1644 KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
1645 // Need to change the bind-var ICV to correct value for each implicit task
1646 if (proc_bind_icv != proc_bind_default &&
1647 master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
1648 kmp_info_t **other_threads = parent_team->t.t_threads;
1649 for (i = 0; i < master_th->th.th_team_nproc; ++i) {
1650 other_threads[i]->th.th_current_task->td_icvs.proc_bind = proc_bind_icv;
1651 }
1652 }
1653 // Reset for next parallel region
1654 master_th->th.th_set_proc_bind = proc_bind_default;
1655
1656#if USE_ITT_BUILD && USE_ITT_NOTIFY
1657 if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1658 KMP_ITT_DEBUG) &&
1659 __kmp_forkjoin_frames_mode == 3 &&
1660 parent_team->t.t_active_level == 1 // only report frames at level 1
1661 && master_th->th.th_teams_size.nteams == 1) {
1662 kmp_uint64 tmp_time = __itt_get_timestamp();
1663 master_th->th.th_frame_time = tmp_time;
1664 parent_team->t.t_region_time = tmp_time;
1665 }
1666 if (__itt_stack_caller_create_ptr) {
1667 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1668 // create new stack stitching id before entering fork barrier
1669 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1670 }
1671#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1672#if KMP_AFFINITY_SUPPORTED
1673  __kmp_partition_places(parent_team);
1674#endif
1675
1676 KF_TRACE(10, ("__kmp_fork_in_teams: before internal fork: root=%p, team=%p, "
1677 "master_th=%p, gtid=%d\n",
1678 root, parent_team, master_th, gtid));
1679  __kmp_internal_fork(loc, gtid, parent_team);
1680 KF_TRACE(10, ("__kmp_fork_in_teams: after internal fork: root=%p, team=%p, "
1681 "master_th=%p, gtid=%d\n",
1682 root, parent_team, master_th, gtid));
1683
1684 if (call_context == fork_context_gnu)
1685 return TRUE;
1686
1687 /* Invoke microtask for PRIMARY thread */
1688 KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) invoke microtask = %p\n", gtid,
1689 parent_team->t.t_id, parent_team->t.t_pkfn));
1690
1691 if (!parent_team->t.t_invoke(gtid)) {
1692 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1693 }
1694 KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) done microtask = %p\n", gtid,
1695 parent_team->t.t_id, parent_team->t.t_pkfn));
1696 KMP_MB(); /* Flush all pending memory write invalidates. */
1697
1698 KA_TRACE(20, ("__kmp_fork_in_teams: parallel exit T#%d\n", gtid));
1699
1700 return TRUE;
1701}
1702
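// Illustrative sketch (not part of the runtime build): typical cases in which
// the fork is serialized and a single thread runs the region, e.g. an if(0)
// clause, num_threads(1), or exhausted active levels. Assuming a standard
// OpenMP compiler and <omp.h>:
#if 0
#include <omp.h>
void example_serialized_parallel(int cond) {
  omp_set_max_active_levels(1); // regions nested below level 1 become inactive
  #pragma omp parallel if(cond) num_threads(4) // serialized when cond == 0
  {
    #pragma omp parallel // inner region gets nthreads == 1 and is serialized,
    {                    // i.e. it goes through __kmp_serial_fork_call()
    }
  }
}
#endif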
1703// Create a serialized parallel region
1704static inline int
1705__kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context,
1706 kmp_int32 argc, microtask_t microtask, launch_t invoker,
1707 kmp_info_t *master_th, kmp_team_t *parent_team,
1708#if OMPT_SUPPORT
1709 ompt_data_t *ompt_parallel_data, void **return_address,
1710 ompt_data_t **parent_task_data,
1711#endif
1712 kmp_va_list ap) {
1713 kmp_team_t *team;
1714 int i;
1715 void **argv;
1716
1717/* josh todo: hypothetical question: what do we do for OS X*? */
1718#if KMP_OS_LINUX && \
1719 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1720 SimpleVLA<void *> args(argc);
1721#else
1722 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1723#endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1724 KMP_ARCH_AARCH64) */
1725
1726 KA_TRACE(
1727 20, ("__kmp_serial_fork_call: T#%d serializing parallel region\n", gtid));
1728
1729  __kmpc_serialized_parallel(loc, gtid);
1730
1731#if OMPD_SUPPORT
1732 master_th->th.th_serial_team->t.t_pkfn = microtask;
1733#endif
1734
1735 if (call_context == fork_context_intel) {
1736 /* TODO this sucks, use the compiler itself to pass args! :) */
1737 master_th->th.th_serial_team->t.t_ident = loc;
1738 if (!ap) {
1739 // revert change made in __kmpc_serialized_parallel()
1740 master_th->th.th_serial_team->t.t_level--;
1741// Get args from parent team for teams construct
1742
1743#if OMPT_SUPPORT
1744 void *dummy;
1745 void **exit_frame_p;
1746 ompt_task_info_t *task_info;
1747 ompt_lw_taskteam_t lw_taskteam;
1748
1749 if (ompt_enabled.enabled) {
1750        __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1751                                ompt_parallel_data, *return_address);
1752
1753        __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1754        // don't use lw_taskteam after linking. content was swapped
1755 task_info = OMPT_CUR_TASK_INFO(master_th);
1756 exit_frame_p = &(task_info->frame.exit_frame.ptr);
1757 if (ompt_enabled.ompt_callback_implicit_task) {
1758 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1759 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1760 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1761 &(task_info->task_data), 1,
1762 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1763 }
1764
1765 /* OMPT state */
1766 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1767 } else {
1768 exit_frame_p = &dummy;
1769 }
1770#endif
1771
1772 {
1773 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1774 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1775        __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1776#if OMPT_SUPPORT
1777 ,
1778                               exit_frame_p
1779#endif
1780 );
1781 }
1782
1783#if OMPT_SUPPORT
1784 if (ompt_enabled.enabled) {
1785 *exit_frame_p = NULL;
1786 if (ompt_enabled.ompt_callback_implicit_task) {
1787 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1788 ompt_scope_end, NULL, &(task_info->task_data), 1,
1789 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1790 }
1791 *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1792        __ompt_lw_taskteam_unlink(master_th);
1793 if (ompt_enabled.ompt_callback_parallel_end) {
1794 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1795 ompt_parallel_data, *parent_task_data,
1796 OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1797 }
1798 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1799 }
1800#endif
1801 } else if (microtask == (microtask_t)__kmp_teams_master) {
1802 KMP_DEBUG_ASSERT(master_th->th.th_team == master_th->th.th_serial_team);
1803 team = master_th->th.th_team;
1804 // team->t.t_pkfn = microtask;
1805 team->t.t_invoke = invoker;
1806 __kmp_alloc_argv_entries(argc, team, TRUE);
1807 team->t.t_argc = argc;
1808 argv = (void **)team->t.t_argv;
1809 for (i = argc - 1; i >= 0; --i)
1810 *argv++ = va_arg(kmp_va_deref(ap), void *);
1811 // AC: revert change made in __kmpc_serialized_parallel()
1812 // because initial code in teams should have level=0
1813 team->t.t_level--;
1814 // AC: call special invoker for outer "parallel" of teams construct
1815 invoker(gtid);
1816#if OMPT_SUPPORT
1817 if (ompt_enabled.enabled) {
1818 ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1819 if (ompt_enabled.ompt_callback_implicit_task) {
1820 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1821 ompt_scope_end, NULL, &(task_info->task_data), 0,
1822 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1823 }
1824 if (ompt_enabled.ompt_callback_parallel_end) {
1825 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1826 ompt_parallel_data, *parent_task_data,
1827 OMPT_INVOKER(call_context) | ompt_parallel_league,
1828 *return_address);
1829 }
1830 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1831 }
1832#endif
1833 } else {
1834 argv = args;
1835 for (i = argc - 1; i >= 0; --i)
1836 *argv++ = va_arg(kmp_va_deref(ap), void *);
1837 KMP_MB();
1838
1839#if OMPT_SUPPORT
1840 void *dummy;
1841 void **exit_frame_p;
1842 ompt_task_info_t *task_info;
1843 ompt_lw_taskteam_t lw_taskteam;
1844 ompt_data_t *implicit_task_data;
1845
1846 if (ompt_enabled.enabled) {
1847        __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1848                                ompt_parallel_data, *return_address);
1849        __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1850        // don't use lw_taskteam after linking. content was swapped
1851 task_info = OMPT_CUR_TASK_INFO(master_th);
1852 exit_frame_p = &(task_info->frame.exit_frame.ptr);
1853
1854 /* OMPT implicit task begin */
1855 implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1856 if (ompt_enabled.ompt_callback_implicit_task) {
1857 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1858 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1859 implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1860 ompt_task_implicit);
1861 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1862 }
1863
1864 /* OMPT state */
1865 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1866 } else {
1867 exit_frame_p = &dummy;
1868 }
1869#endif
1870
1871 {
1872 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1873 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1874        __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1875#if OMPT_SUPPORT
1876 ,
1877                               exit_frame_p
1878#endif
1879 );
1880 }
1881
1882#if OMPT_SUPPORT
1883 if (ompt_enabled.enabled) {
1884 *exit_frame_p = NULL;
1885 if (ompt_enabled.ompt_callback_implicit_task) {
1886 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1887 ompt_scope_end, NULL, &(task_info->task_data), 1,
1888 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1889 }
1890
1891 *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1892        __ompt_lw_taskteam_unlink(master_th);
1893 if (ompt_enabled.ompt_callback_parallel_end) {
1894 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1895 ompt_parallel_data, *parent_task_data,
1896 OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1897 }
1898 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1899 }
1900#endif
1901 }
1902 } else if (call_context == fork_context_gnu) {
1903#if OMPT_SUPPORT
1904 if (ompt_enabled.enabled) {
1905 ompt_lw_taskteam_t lwt;
1906      __ompt_lw_taskteam_init(&lwt, master_th, gtid, ompt_parallel_data,
1907                              *return_address);
1908
1909 lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1910      __ompt_lw_taskteam_link(&lwt, master_th, 1);
1911 }
1912// don't use lw_taskteam after linking. content was swapped
1913#endif
1914
1915 // we were called from GNU native code
1916 KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1917 return FALSE;
1918 } else {
1919 KMP_ASSERT2(call_context < fork_context_last,
1920 "__kmp_serial_fork_call: unknown fork_context parameter");
1921 }
1922
1923 KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1924 KMP_MB();
1925 return FALSE;
1926}
1927
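// Illustrative sketch (not part of the runtime build): for the Intel/LLVM
// entry point the compiler outlines the parallel body into a microtask and
// calls __kmpc_fork_call(), which forwards here. The outlined function and
// the loc argument below are hypothetical stand-ins for compiler-generated
// entities:
#if 0
// Hypothetical outlined microtask for "#pragma omp parallel shared(x)":
static void outlined_body(kmp_int32 *gtid, kmp_int32 *btid, int *shared_x) {
  /* ... body of the parallel region, with *shared_x shared ... */
}
void user_code(ident_t *loc) { // loc: hypothetical source-location descriptor
  int x = 42;
  // What the compiler emits for the parallel directive is roughly:
  __kmpc_fork_call(loc, /*argc=*/1, (kmpc_micro)outlined_body, &x);
}
#endif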
1928/* most of the work for a fork */
1929/* return true if we really went parallel, false if serialized */
1930int __kmp_fork_call(ident_t *loc, int gtid,
1931 enum fork_context_e call_context, // Intel, GNU, ...
1932 kmp_int32 argc, microtask_t microtask, launch_t invoker,
1933 kmp_va_list ap) {
1934 void **argv;
1935 int i;
1936 int master_tid;
1937 int master_this_cons;
1938 kmp_team_t *team;
1939 kmp_team_t *parent_team;
1940 kmp_info_t *master_th;
1941 kmp_root_t *root;
1942 int nthreads;
1943 int master_active;
1944 int master_set_numthreads;
1945 int task_thread_limit = 0;
1946 int level;
1947 int active_level;
1948 int teams_level;
1949#if KMP_NESTED_HOT_TEAMS
1950 kmp_hot_team_ptr_t **p_hot_teams;
1951#endif
1952 { // KMP_TIME_BLOCK
1953 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1954 KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1955
1956 KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1957 if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1958 /* Some systems prefer the stack for the root thread(s) to start with */
1959 /* some gap from the parent stack to prevent false sharing. */
1960 void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1961 /* These 2 lines below are so this does not get optimized out */
1962 if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1963 __kmp_stkpadding += (short)((kmp_int64)dummy);
1964 }
1965
1966 /* initialize if needed */
1967 KMP_DEBUG_ASSERT(
1968 __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1969 if (!TCR_4(__kmp_init_parallel))
1970 __kmp_parallel_initialize();
1971 __kmp_resume_if_soft_paused();
1972
1973 /* setup current data */
1974 // AC: potentially unsafe, not in sync with library shutdown,
1975 // __kmp_threads can be freed
1976 master_th = __kmp_threads[gtid];
1977
1978 parent_team = master_th->th.th_team;
1979 master_tid = master_th->th.th_info.ds.ds_tid;
1980 master_this_cons = master_th->th.th_local.this_construct;
1981 root = master_th->th.th_root;
1982 master_active = root->r.r_active;
1983 master_set_numthreads = master_th->th.th_set_nproc;
1984 task_thread_limit =
1985 master_th->th.th_current_task->td_icvs.task_thread_limit;
1986
1987#if OMPT_SUPPORT
1988 ompt_data_t ompt_parallel_data = ompt_data_none;
1989 ompt_data_t *parent_task_data = NULL;
1990 ompt_frame_t *ompt_frame = NULL;
1991 void *return_address = NULL;
1992
1993 if (ompt_enabled.enabled) {
1994      __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1995                                    NULL, NULL);
1996 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1997 }
1998#endif
1999
2000 // Assign affinity to root thread if it hasn't happened yet
2001 __kmp_assign_root_init_mask();
2002
2003 // Nested level will be an index in the nested nthreads array
2004 level = parent_team->t.t_level;
2005 // used to launch non-serial teams even if nested is not allowed
2006 active_level = parent_team->t.t_active_level;
2007 // needed to check nesting inside the teams
2008 teams_level = master_th->th.th_teams_level;
2009#if KMP_NESTED_HOT_TEAMS
2010 p_hot_teams = &master_th->th.th_hot_teams;
2011 if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
2012 *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
2013 sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
2014 (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
2015 // it is either actual or not needed (when active_level > 0)
2016 (*p_hot_teams)[0].hot_team_nth = 1;
2017 }
2018#endif
2019
2020#if OMPT_SUPPORT
2021 if (ompt_enabled.enabled) {
2022 if (ompt_enabled.ompt_callback_parallel_begin) {
2023 int team_size = master_set_numthreads
2024 ? master_set_numthreads
2025 : get__nproc_2(parent_team, master_tid);
2026 int flags = OMPT_INVOKER(call_context) |
2027 ((microtask == (microtask_t)__kmp_teams_master)
2028 ? ompt_parallel_league
2029 : ompt_parallel_team);
2030 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
2031 parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
2032 return_address);
2033 }
2034 master_th->th.ompt_thread_info.state = ompt_state_overhead;
2035 }
2036#endif
2037
2038 master_th->th.th_ident = loc;
2039
2040 // Parallel closely nested in teams construct:
2041 if (__kmp_is_fork_in_teams(master_th, microtask, level, teams_level, ap)) {
2042 return __kmp_fork_in_teams(loc, gtid, parent_team, argc, master_th, root,
2043 call_context, microtask, invoker,
2044 master_set_numthreads, level,
2045#if OMPT_SUPPORT
2046 ompt_parallel_data, return_address,
2047#endif
2048 ap);
2049 } // End parallel closely nested in teams construct
2050
2051 // Need this to happen before we determine the number of threads, not while
2052 // we are allocating the team
2053 //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
2054
2055 KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(parent_team, master_th);
2056
2057 // Determine the number of threads
2058 int enter_teams =
2059 __kmp_is_entering_teams(active_level, level, teams_level, ap);
2060 if ((!enter_teams &&
2061 (parent_team->t.t_active_level >=
2062 master_th->th.th_current_task->td_icvs.max_active_levels)) ||
2063 (__kmp_library == library_serial)) {
2064 KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team\n", gtid));
2065 nthreads = 1;
2066 } else {
2067 nthreads = master_set_numthreads
2068 ? master_set_numthreads
2069 // TODO: get nproc directly from current task
2070 : get__nproc_2(parent_team, master_tid);
2071 // Use the thread_limit set for the current target task if exists, else go
2072 // with the deduced nthreads
2073 nthreads = task_thread_limit > 0 && task_thread_limit < nthreads
2074 ? task_thread_limit
2075 : nthreads;
2076      // Check if we need to take the forkjoin lock (no need for a serialized
2077      // parallel outside of a teams construct).
2078 if (nthreads > 1) {
2079 /* determine how many new threads we can use */
2080        __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2081 /* AC: If we execute teams from parallel region (on host), then teams
2082 should be created but each can only have 1 thread if nesting is
2083 disabled. If teams called from serial region, then teams and their
2084 threads should be created regardless of the nesting setting. */
2085        nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
2086                                         nthreads, enter_teams);
2087 if (nthreads == 1) {
2088 // Free lock for single thread execution here; for multi-thread
2089 // execution it will be freed later after team of threads created
2090 // and initialized
2091          __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2092 }
2093 }
2094 }
2095 KMP_DEBUG_ASSERT(nthreads > 0);
2096
2097 // If we temporarily changed the set number of threads then restore it now
2098 master_th->th.th_set_nproc = 0;
2099
2100 if (nthreads == 1) {
2101 return __kmp_serial_fork_call(loc, gtid, call_context, argc, microtask,
2102 invoker, master_th, parent_team,
2103#if OMPT_SUPPORT
2104                                    &ompt_parallel_data, &return_address,
2105                                    &parent_task_data,
2106#endif
2107 ap);
2108 } // if (nthreads == 1)
2109
2110    // GEH: only modify the executing flag in the case when not serialized;
2111    //      the serialized case is handled in __kmpc_serialized_parallel
2112 KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
2113 "curtask=%p, curtask_max_aclevel=%d\n",
2114 parent_team->t.t_active_level, master_th,
2115 master_th->th.th_current_task,
2116 master_th->th.th_current_task->td_icvs.max_active_levels));
2117 // TODO: GEH - cannot do this assertion because root thread not set up as
2118 // executing
2119 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
2120 master_th->th.th_current_task->td_flags.executing = 0;
2121
2122 if (!master_th->th.th_teams_microtask || level > teams_level) {
2123 /* Increment our nested depth level */
2124 KMP_ATOMIC_INC(&root->r.r_in_parallel);
2125 }
2126
2127 // See if we need to make a copy of the ICVs.
2128 int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
2129 kmp_nested_nthreads_t *nested_nth = NULL;
2130 if (!master_th->th.th_set_nested_nth &&
2131 (level + 1 < parent_team->t.t_nested_nth->used) &&
2132 (parent_team->t.t_nested_nth->nth[level + 1] != nthreads_icv)) {
2133 nthreads_icv = parent_team->t.t_nested_nth->nth[level + 1];
2134 } else if (master_th->th.th_set_nested_nth) {
2135      nested_nth = __kmp_override_nested_nth(master_th, level);
2136 if ((level + 1 < nested_nth->used) &&
2137 (nested_nth->nth[level + 1] != nthreads_icv))
2138 nthreads_icv = nested_nth->nth[level + 1];
2139 else
2140 nthreads_icv = 0; // don't update
2141 } else {
2142 nthreads_icv = 0; // don't update
2143 }
2144
2145 // Figure out the proc_bind_policy for the new team.
2146 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2147 // proc_bind_default means don't update
2148 kmp_proc_bind_t proc_bind_icv = proc_bind_default;
2149 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2150 proc_bind = proc_bind_false;
2151 } else {
2152 // No proc_bind clause specified; use current proc-bind-var for this
2153 // parallel region
2154 if (proc_bind == proc_bind_default) {
2155 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2156 }
2157 // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
2158 if (master_th->th.th_teams_microtask &&
2159 microtask == (microtask_t)__kmp_teams_master) {
2160 proc_bind = __kmp_teams_proc_bind;
2161 }
2162 /* else: The proc_bind policy was specified explicitly on parallel clause.
2163 This overrides proc-bind-var for this parallel region, but does not
2164 change proc-bind-var. */
2165 // Figure the value of proc-bind-var for the child threads.
2166 if ((level + 1 < __kmp_nested_proc_bind.used) &&
2167 (__kmp_nested_proc_bind.bind_types[level + 1] !=
2168 master_th->th.th_current_task->td_icvs.proc_bind)) {
2169 // Do not modify the proc bind icv for the two teams construct forks
2170 // They just let the proc bind icv pass through
2171 if (!master_th->th.th_teams_microtask ||
2172 !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
2173 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2174 }
2175 }
2176
2177 // Reset for next parallel region
2178 master_th->th.th_set_proc_bind = proc_bind_default;
2179
2180 if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2181 kmp_internal_control_t new_icvs;
2182      copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2183 new_icvs.next = NULL;
2184 if (nthreads_icv > 0) {
2185 new_icvs.nproc = nthreads_icv;
2186 }
2187 if (proc_bind_icv != proc_bind_default) {
2188 new_icvs.proc_bind = proc_bind_icv;
2189 }
2190
2191 /* allocate a new parallel team */
2192 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2193      team = __kmp_allocate_team(root, nthreads, nthreads,
2194#if OMPT_SUPPORT
2195 ompt_parallel_data,
2196#endif
2197                                 proc_bind, &new_icvs,
2198 argc USE_NESTED_HOT_ARG(master_th));
2199 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2200        copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
2201 } else {
2202 /* allocate a new parallel team */
2203 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2204      team = __kmp_allocate_team(root, nthreads, nthreads,
2205#if OMPT_SUPPORT
2206 ompt_parallel_data,
2207#endif
2208 proc_bind,
2209                                 &master_th->th.th_current_task->td_icvs,
2210 argc USE_NESTED_HOT_ARG(master_th));
2211 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2212        copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
2213                  &master_th->th.th_current_task->td_icvs);
2214 }
2215 KF_TRACE(
2216 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2217
2218 /* setup the new team */
2219 KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2220 KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2221 KMP_CHECK_UPDATE(team->t.t_ident, loc);
2222 KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2223 KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2224#if OMPT_SUPPORT
2225 KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2226 return_address);
2227#endif
2228 KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2229 // TODO: parent_team->t.t_level == INT_MAX ???
2230 if (!master_th->th.th_teams_microtask || level > teams_level) {
2231 int new_level = parent_team->t.t_level + 1;
2232 KMP_CHECK_UPDATE(team->t.t_level, new_level);
2233 new_level = parent_team->t.t_active_level + 1;
2234 KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2235 } else {
2236 // AC: Do not increase parallel level at start of the teams construct
2237 int new_level = parent_team->t.t_level;
2238 KMP_CHECK_UPDATE(team->t.t_level, new_level);
2239 new_level = parent_team->t.t_active_level;
2240 KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2241 }
2242 kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2243 // set primary thread's schedule as new run-time schedule
2244 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2245
2246 KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2247 KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2248
2249 // Check if hot team has potentially outdated list, and if so, free it
2250 if (team->t.t_nested_nth &&
2251 team->t.t_nested_nth != parent_team->t.t_nested_nth) {
2252 KMP_INTERNAL_FREE(team->t.t_nested_nth->nth);
2253 KMP_INTERNAL_FREE(team->t.t_nested_nth);
2254 team->t.t_nested_nth = NULL;
2255 }
2256 team->t.t_nested_nth = parent_team->t.t_nested_nth;
2257 if (master_th->th.th_set_nested_nth) {
2258 if (!nested_nth)
2259        nested_nth = __kmp_override_nested_nth(master_th, level);
2260 team->t.t_nested_nth = nested_nth;
2261 KMP_INTERNAL_FREE(master_th->th.th_set_nested_nth);
2262 master_th->th.th_set_nested_nth = NULL;
2263 master_th->th.th_set_nested_nth_sz = 0;
2264 master_th->th.th_nt_strict = false;
2265 }
2266
2267 // Update the floating point rounding in the team if required.
2268 propagateFPControl(team);
2269#if OMPD_SUPPORT
2270 if (ompd_state & OMPD_ENABLE_BP)
2271 ompd_bp_parallel_begin();
2272#endif
2273
2274 KA_TRACE(
2275 20,
2276 ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2277 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2278 team->t.t_nproc));
2279 KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2280 (team->t.t_master_tid == 0 &&
2281 (team->t.t_parent == root->r.r_root_team ||
2282 team->t.t_parent->t.t_serialized)));
2283 KMP_MB();
2284
2285 /* now, setup the arguments */
2286 argv = (void **)team->t.t_argv;
2287 if (ap) {
2288 for (i = argc - 1; i >= 0; --i) {
2289 void *new_argv = va_arg(kmp_va_deref(ap), void *);
2290 KMP_CHECK_UPDATE(*argv, new_argv);
2291 argv++;
2292 }
2293 } else {
2294 for (i = 0; i < argc; ++i) {
2295 // Get args from parent team for teams construct
2296 KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2297 }
2298 }
2299
2300 /* now actually fork the threads */
2301 KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2302 if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2303 root->r.r_active = TRUE;
2304
2305    __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
2306    __kmp_setup_icv_copy(team, nthreads,
2307                         &master_th->th.th_current_task->td_icvs, loc);
2308
2309#if OMPT_SUPPORT
2310 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2311#endif
2312
2313    __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2314
2315#if USE_ITT_BUILD
2316 if (team->t.t_active_level == 1 // only report frames at level 1
2317 && !master_th->th.th_teams_microtask) { // not in teams construct
2318#if USE_ITT_NOTIFY
2319 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2320 (__kmp_forkjoin_frames_mode == 3 ||
2321 __kmp_forkjoin_frames_mode == 1)) {
2322 kmp_uint64 tmp_time = 0;
2323 if (__itt_get_timestamp_ptr)
2324 tmp_time = __itt_get_timestamp();
2325 // Internal fork - report frame begin
2326 master_th->th.th_frame_time = tmp_time;
2327 if (__kmp_forkjoin_frames_mode == 3)
2328 team->t.t_region_time = tmp_time;
2329 } else
2330// only one notification scheme (either "submit" or "forking/joined", not both)
2331#endif /* USE_ITT_NOTIFY */
2332 if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2333 __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2334 // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2335        __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2336 }
2337 }
2338#endif /* USE_ITT_BUILD */
2339
2340 /* now go on and do the work */
2341 KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2342 KMP_MB();
2343 KF_TRACE(10,
2344 ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2345 root, team, master_th, gtid));
2346
2347#if USE_ITT_BUILD
2348 if (__itt_stack_caller_create_ptr) {
2349 // create new stack stitching id before entering fork barrier
2350 if (!enter_teams) {
2351 KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2352 team->t.t_stack_id = __kmp_itt_stack_caller_create();
2353 } else if (parent_team->t.t_serialized) {
2354 // keep stack stitching id in the serialized parent_team;
2355 // current team will be used for parallel inside the teams;
2356 // if parent_team is active, then it already keeps stack stitching id
2357 // for the league of teams
2358 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2359 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2360 }
2361 }
2362#endif /* USE_ITT_BUILD */
2363
2364 // AC: skip __kmp_internal_fork at teams construct, let only primary
2365 // threads execute
2366 if (ap) {
2367      __kmp_internal_fork(loc, gtid, team);
2368 KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2369 "master_th=%p, gtid=%d\n",
2370 root, team, master_th, gtid));
2371 }
2372
2373 if (call_context == fork_context_gnu) {
2374 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2375 return TRUE;
2376 }
2377
2378 /* Invoke microtask for PRIMARY thread */
2379 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2380 team->t.t_id, team->t.t_pkfn));
2381 } // END of timer KMP_fork_call block
2382
2383#if KMP_STATS_ENABLED
2384 // If beginning a teams construct, then change thread state
2385 stats_state_e previous_state = KMP_GET_THREAD_STATE();
2386 if (!ap) {
2387 KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2388 }
2389#endif
2390
2391 if (!team->t.t_invoke(gtid)) {
2392 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2393 }
2394
2395#if KMP_STATS_ENABLED
2396 // If was beginning of a teams construct, then reset thread state
2397 if (!ap) {
2398 KMP_SET_THREAD_STATE(previous_state);
2399 }
2400#endif
2401
2402 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2403 team->t.t_id, team->t.t_pkfn));
2404 KMP_MB(); /* Flush all pending memory write invalidates. */
2405
2406 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2407#if OMPT_SUPPORT
2408 if (ompt_enabled.enabled) {
2409 master_th->th.ompt_thread_info.state = ompt_state_overhead;
2410 }
2411#endif
2412
2413 return TRUE;
2414}
2415
2416#if OMPT_SUPPORT
2417static inline void __kmp_join_restore_state(kmp_info_t *thread,
2418 kmp_team_t *team) {
2419 // restore state outside the region
2420 thread->th.ompt_thread_info.state =
2421 ((team->t.t_serialized) ? ompt_state_work_serial
2422 : ompt_state_work_parallel);
2423}
2424
2425static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2426 kmp_team_t *team, ompt_data_t *parallel_data,
2427 int flags, void *codeptr) {
2428  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2429 if (ompt_enabled.ompt_callback_parallel_end) {
2430 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2431 parallel_data, &(task_info->task_data), flags, codeptr);
2432 }
2433
2434 task_info->frame.enter_frame = ompt_data_none;
2435 __kmp_join_restore_state(thread, team);
2436}
2437#endif
2438
2439void __kmp_join_call(ident_t *loc, int gtid
2440#if OMPT_SUPPORT
2441 ,
2442 enum fork_context_e fork_context
2443#endif
2444 ,
2445 int exit_teams) {
2446 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2447 kmp_team_t *team;
2448 kmp_team_t *parent_team;
2449 kmp_info_t *master_th;
2450 kmp_root_t *root;
2451 int master_active;
2452
2453 KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2454
2455 /* setup current data */
2456 master_th = __kmp_threads[gtid];
2457 root = master_th->th.th_root;
2458 team = master_th->th.th_team;
2459 parent_team = team->t.t_parent;
2460
2461 master_th->th.th_ident = loc;
2462
2463#if OMPT_SUPPORT
2464 void *team_microtask = (void *)team->t.t_pkfn;
2465 // For GOMP interface with serialized parallel, need the
2466 // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2467 // and end-parallel events.
2468 if (ompt_enabled.enabled &&
2469 !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2470 master_th->th.ompt_thread_info.state = ompt_state_overhead;
2471 }
2472#endif
2473
2474#if KMP_DEBUG
2475 if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2476 KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2477 "th_task_team = %p\n",
2478 __kmp_gtid_from_thread(master_th), team,
2479 team->t.t_task_team[master_th->th.th_task_state],
2480 master_th->th.th_task_team));
2481 KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, master_th);
2482 }
2483#endif
2484
2485 if (team->t.t_serialized) {
2486 if (master_th->th.th_teams_microtask) {
2487 // We are in teams construct
2488 int level = team->t.t_level;
2489 int tlevel = master_th->th.th_teams_level;
2490 if (level == tlevel) {
2491 // AC: we haven't incremented it earlier at start of teams construct,
2492 // so do it here - at the end of teams construct
2493 team->t.t_level++;
2494 } else if (level == tlevel + 1) {
2495 // AC: we are exiting parallel inside teams, need to increment
2496 // serialization in order to restore it in the next call to
2497 // __kmpc_end_serialized_parallel
2498 team->t.t_serialized++;
2499 }
2500 }
2501    __kmpc_end_serialized_parallel(loc, gtid);
2502
2503#if OMPT_SUPPORT
2504 if (ompt_enabled.enabled) {
2505 if (fork_context == fork_context_gnu) {
2506        __ompt_lw_taskteam_unlink(master_th);
2507 }
2508      __kmp_join_restore_state(master_th, parent_team);
2509 }
2510#endif
2511
2512 return;
2513 }
2514
2515 master_active = team->t.t_master_active;
2516
2517 if (!exit_teams) {
2518 // AC: No barrier for internal teams at exit from teams construct.
2519 // But there is barrier for external team (league).
2520    __kmp_internal_join(loc, gtid, team);
2521#if USE_ITT_BUILD
2522 if (__itt_stack_caller_create_ptr) {
2523 KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2524 // destroy the stack stitching id after join barrier
2525 __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2526 team->t.t_stack_id = NULL;
2527 }
2528#endif
2529 } else {
2530 master_th->th.th_task_state =
2531 0; // AC: no tasking in teams (out of any parallel)
2532#if USE_ITT_BUILD
2533 if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2534 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2535 // destroy the stack stitching id on exit from the teams construct
2536 // if parent_team is active, then the id will be destroyed later on
2537 // by master of the league of teams
2538 __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2539 parent_team->t.t_stack_id = NULL;
2540 }
2541#endif
2542 }
2543
2544 KMP_MB();
2545
2546#if OMPT_SUPPORT
2547 ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2548 void *codeptr = team->t.ompt_team_info.master_return_address;
2549#endif
2550
2551#if USE_ITT_BUILD
2552 // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2553 if (team->t.t_active_level == 1 &&
2554 (!master_th->th.th_teams_microtask || /* not in teams construct */
2555 master_th->th.th_teams_size.nteams == 1)) {
2556 master_th->th.th_ident = loc;
2557 // only one notification scheme (either "submit" or "forking/joined", not
2558 // both)
2559 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2560 __kmp_forkjoin_frames_mode == 3)
2561      __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2562                             master_th->th.th_frame_time, 0, loc,
2563                             master_th->th.th_team_nproc, 1);
2564 else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2565 !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2566 __kmp_itt_region_joined(gtid);
2567 } // active_level == 1
2568#endif /* USE_ITT_BUILD */
2569
2570#if KMP_AFFINITY_SUPPORTED
2571 if (!exit_teams) {
2572 // Restore master thread's partition.
2573 master_th->th.th_first_place = team->t.t_first_place;
2574 master_th->th.th_last_place = team->t.t_last_place;
2575 }
2576#endif // KMP_AFFINITY_SUPPORTED
2577
2578 if (master_th->th.th_teams_microtask && !exit_teams &&
2579 team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2580 team->t.t_level == master_th->th.th_teams_level + 1) {
2581// AC: We need to leave the team structure intact at the end of parallel
2582// inside the teams construct, so that at the next parallel same (hot) team
2583// works, only adjust nesting levels
2584#if OMPT_SUPPORT
2585 ompt_data_t ompt_parallel_data = ompt_data_none;
2586 if (ompt_enabled.enabled) {
2587      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2588 if (ompt_enabled.ompt_callback_implicit_task) {
2589 int ompt_team_size = team->t.t_nproc;
2590 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2591 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2592 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2593 }
2594 task_info->frame.exit_frame = ompt_data_none;
2595 task_info->task_data = ompt_data_none;
2596 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2597      __ompt_lw_taskteam_unlink(master_th);
2598 }
2599#endif
2600 /* Decrement our nested depth level */
2601 team->t.t_level--;
2602 team->t.t_active_level--;
2603 KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2604
2605 // Restore number of threads in the team if needed. This code relies on
2606 // the proper adjustment of th_teams_size.nth after the fork in
2607 // __kmp_teams_master on each teams primary thread in the case that
2608 // __kmp_reserve_threads reduced it.
2609 if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2610 int old_num = master_th->th.th_team_nproc;
2611 int new_num = master_th->th.th_teams_size.nth;
2612 kmp_info_t **other_threads = team->t.t_threads;
2613 team->t.t_nproc = new_num;
2614 for (int i = 0; i < old_num; ++i) {
2615 other_threads[i]->th.th_team_nproc = new_num;
2616 }
2617 // Adjust states of non-used threads of the team
2618 for (int i = old_num; i < new_num; ++i) {
2619 // Re-initialize thread's barrier data.
2620 KMP_DEBUG_ASSERT(other_threads[i]);
2621 kmp_balign_t *balign = other_threads[i]->th.th_bar;
2622 for (int b = 0; b < bs_last_barrier; ++b) {
2623 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2624 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2625#if USE_DEBUGGER
2626 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2627#endif
2628 }
2629 if (__kmp_tasking_mode != tskm_immediate_exec) {
2630 // Synchronize thread's task state
2631 other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2632 }
2633 }
2634 }
2635
2636#if OMPT_SUPPORT
2637 if (ompt_enabled.enabled) {
2638      __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2639                      OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2640 }
2641#endif
2642
2643 return;
2644 }
2645
2646 /* do cleanup and restore the parent team */
2647 master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2648 master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2649
2650 master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2651
2652 /* jc: The following lock has instructions with REL and ACQ semantics,
2653 separating the parallel user code called in this parallel region
2654 from the serial user code called after this function returns. */
2655  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2656
2657 if (!master_th->th.th_teams_microtask ||
2658 team->t.t_level > master_th->th.th_teams_level) {
2659 /* Decrement our nested depth level */
2660 KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2661 }
2662 KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2663
2664#if OMPT_SUPPORT
2665 if (ompt_enabled.enabled) {
2666    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2667 if (ompt_enabled.ompt_callback_implicit_task) {
2668 int flags = (team_microtask == (void *)__kmp_teams_master)
2669 ? ompt_task_initial
2670 : ompt_task_implicit;
2671 int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2672 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2673 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2674 OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2675 }
2676 task_info->frame.exit_frame = ompt_data_none;
2677 task_info->task_data = ompt_data_none;
2678 }
2679#endif
2680
2681 KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2682 master_th, team));
2683  __kmp_pop_current_task_from_thread(master_th);
2684
2685 master_th->th.th_def_allocator = team->t.t_def_allocator;
2686
2687#if OMPD_SUPPORT
2688 if (ompd_state & OMPD_ENABLE_BP)
2689 ompd_bp_parallel_end();
2690#endif
2691 updateHWFPControl(team);
2692
2693 if (root->r.r_active != master_active)
2694 root->r.r_active = master_active;
2695
2696 __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2697 master_th)); // this will free worker threads
2698
2699 /* this race was fun to find. make sure the following is in the critical
2700 region otherwise assertions may fail occasionally since the old team may be
2701 reallocated and the hierarchy appears inconsistent. it is actually safe to
2702 run and won't cause any bugs, but will cause those assertion failures. it's
2703 only one deref&assign so might as well put this in the critical region */
2704 master_th->th.th_team = parent_team;
2705 master_th->th.th_team_nproc = parent_team->t.t_nproc;
2706 master_th->th.th_team_master = parent_team->t.t_threads[0];
2707 master_th->th.th_team_serialized = parent_team->t.t_serialized;
2708
2709 /* restore serialized team, if need be */
2710 if (parent_team->t.t_serialized &&
2711 parent_team != master_th->th.th_serial_team &&
2712 parent_team != root->r.r_root_team) {
2713 __kmp_free_team(root,
2714 master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2715 master_th->th.th_serial_team = parent_team;
2716 }
2717
2718 if (__kmp_tasking_mode != tskm_immediate_exec) {
2719 // Restore primary thread's task state from team structure
2720 KMP_DEBUG_ASSERT(team->t.t_primary_task_state == 0 ||
2721 team->t.t_primary_task_state == 1);
2722 master_th->th.th_task_state = (kmp_uint8)team->t.t_primary_task_state;
2723
2724 // Copy the task team from the parent team to the primary thread
2725 master_th->th.th_task_team =
2726 parent_team->t.t_task_team[master_th->th.th_task_state];
2727 KA_TRACE(20,
2728 ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2729 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2730 parent_team));
2731 }
2732
2733 // TODO: GEH - cannot do this assertion because root thread not set up as
2734 // executing
2735 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2736 master_th->th.th_current_task->td_flags.executing = 1;
2737
2738  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2739
2740#if KMP_AFFINITY_SUPPORTED
2741 if (master_th->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) {
2742 __kmp_reset_root_init_mask(gtid);
2743 }
2744#endif
2745#if OMPT_SUPPORT
2746 int flags =
2747 OMPT_INVOKER(fork_context) |
2748 ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2749 : ompt_parallel_team);
2750 if (ompt_enabled.enabled) {
2751    __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2752                    codeptr);
2753 }
2754#endif
2755
2756 KMP_MB();
2757 KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2758}
2759
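// Illustrative sketch (not part of the runtime build): ICV changes made inside
// a nested serialized region (serialization depth > 1) are recorded below so
// they can be rolled back when that region ends. Assuming <omp.h>:
#if 0
#include <omp.h>
void example_nested_serialized_icvs(void) {
  #pragma omp parallel num_threads(1) // serialized, depth 1 on the serial team
  {
    #pragma omp parallel num_threads(1) // serialized again, depth 2
    {
      omp_set_num_threads(8); // pre-change ICVs are pushed (depth > 1)
    }
    // Ending the inner region pops the record, so the nthreads-var seen here
    // is whatever it was before the inner omp_set_num_threads() call.
  }
}
#endif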
2760/* Check whether we should push an internal control record onto the
2761 serial team stack. If so, do it. */
2762void __kmp_save_internal_controls(kmp_info_t *thread) {
2763
2764 if (thread->th.th_team != thread->th.th_serial_team) {
2765 return;
2766 }
2767 if (thread->th.th_team->t.t_serialized > 1) {
2768 int push = 0;
2769
2770 if (thread->th.th_team->t.t_control_stack_top == NULL) {
2771 push = 1;
2772 } else {
2773 if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2774 thread->th.th_team->t.t_serialized) {
2775 push = 1;
2776 }
2777 }
2778 if (push) { /* push a record on the serial team's stack */
2779 kmp_internal_control_t *control =
2780 (kmp_internal_control_t *)__kmp_allocate(
2781 sizeof(kmp_internal_control_t));
2782
2783      copy_icvs(control, &thread->th.th_current_task->td_icvs);
2784
2785 control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2786
2787 control->next = thread->th.th_team->t.t_control_stack_top;
2788 thread->th.th_team->t.t_control_stack_top = control;
2789 }
2790 }
2791}
2792
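// Illustrative sketch (not part of the runtime build): omp_set_num_threads()
// reaches the routine below through the user API. If the root's hot team is
// larger than the new value and no parallel region is active, the extra
// workers are released back to the thread pool right away instead of at the
// next fork. Assuming <omp.h>:
#if 0
#include <omp.h>
void example_shrink_hot_team(void) {
  #pragma omp parallel num_threads(8)
  { /* a hot team of 8 threads is created and kept across the join */ }
  omp_set_num_threads(2); // the now-idle workers are released immediately
  #pragma omp parallel    // the next region starts from the resized hot team
  { /* 2 threads */ }
}
#endif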
2793/* Changes set_nproc */
2794void __kmp_set_num_threads(int new_nth, int gtid) {
2795 kmp_info_t *thread;
2796 kmp_root_t *root;
2797
2798 KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2799 KMP_DEBUG_ASSERT(__kmp_init_serial);
2800
2801 if (new_nth < 1)
2802 new_nth = 1;
2803 else if (new_nth > __kmp_max_nth)
2804 new_nth = __kmp_max_nth;
2805
2806 KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2807 thread = __kmp_threads[gtid];
2808 if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2809 return; // nothing to do
2810
2811 __kmp_save_internal_controls(thread);
2812
2813 set__nproc(thread, new_nth);
2814
2815 // If this omp_set_num_threads() call will cause the hot team size to be
2816 // reduced (in the absence of a num_threads clause), then reduce it now,
2817 // rather than waiting for the next parallel region.
2818 root = thread->th.th_root;
2819 if (__kmp_init_parallel && (!root->r.r_active) &&
2820 (root->r.r_hot_team->t.t_nproc > new_nth)
2821#if KMP_NESTED_HOT_TEAMS
2822 && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2823#endif
2824 ) {
2825 kmp_team_t *hot_team = root->r.r_hot_team;
2826 int f;
2827
2828    __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2829
2830 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2831      __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
2832 }
2833 // Release the extra threads we don't need any more.
2834 for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2835 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2836 if (__kmp_tasking_mode != tskm_immediate_exec) {
2837 // When decreasing team size, threads no longer in the team should unref
2838 // task team.
2839 hot_team->t.t_threads[f]->th.th_task_team = NULL;
2840 }
2841 __kmp_free_thread(hot_team->t.t_threads[f]);
2842 hot_team->t.t_threads[f] = NULL;
2843 }
2844 hot_team->t.t_nproc = new_nth;
2845#if KMP_NESTED_HOT_TEAMS
2846 if (thread->th.th_hot_teams) {
2847 KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2848 thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2849 }
2850#endif
2851
2852 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2853      hot_team->t.b->update_num_threads(new_nth);
2854      __kmp_add_threads_to_team(hot_team, new_nth);
2855 }
2856
2857    __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2858
2859 // Update the t_nproc field in the threads that are still active.
2860 for (f = 0; f < new_nth; f++) {
2861 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2862 hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2863 }
2864 // Special flag in case omp_set_num_threads() call
2865 hot_team->t.t_size_changed = -1;
2866 }
2867}
2868
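// Illustrative sketch (not part of the runtime build): the validation below
// ignores negative values (with a warning) and would clamp values above
// KMP_MAX_ACTIVE_LEVELS_LIMIT. Assuming <omp.h>:
#if 0
#include <omp.h>
void example_max_active_levels(void) {
  omp_set_max_active_levels(4);  // accepted: max-active-levels-var becomes 4
  omp_set_max_active_levels(-1); // ignored with a warning; the value stays 4
  // Values above KMP_MAX_ACTIVE_LEVELS_LIMIT would be clamped, but since the
  // current limit is INT_MAX that branch is effectively unreachable.
  int lvl = omp_get_max_active_levels(); // returns 4
  (void)lvl;
}
#endif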
2869/* Changes max_active_levels */
2870void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2871 kmp_info_t *thread;
2872
2873 KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2874 "%d = (%d)\n",
2875 gtid, max_active_levels));
2876 KMP_DEBUG_ASSERT(__kmp_init_serial);
2877
2878 // validate max_active_levels
2879 if (max_active_levels < 0) {
2880 KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2881 // We ignore this call if the user has specified a negative value.
2882 // The current setting won't be changed. The last valid setting will be
2883 // used. A warning will be issued (if warnings are allowed as controlled by
2884 // the KMP_WARNINGS env var).
2885 KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2886 "max_active_levels for thread %d = (%d)\n",
2887 gtid, max_active_levels));
2888 return;
2889 }
2890 if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2891 // it's OK, the max_active_levels is within the valid range: [ 0;
2892 // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2893 // We allow a zero value. (implementation defined behavior)
2894 } else {
2895 KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2896 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2897 max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2898 // Current upper limit is MAX_INT. (implementation defined behavior)
2899 // If the input exceeds the upper limit, we correct the input to be the
2900 // upper limit. (implementation defined behavior)
2901 // Actually, the flow should never get here until we use MAX_INT limit.
2902 }
2903 KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2904 "max_active_levels for thread %d = (%d)\n",
2905 gtid, max_active_levels));
2906
2907 thread = __kmp_threads[gtid];
2908
2909 __kmp_save_internal_controls(thread);
2910
2911 set__max_active_levels(thread, max_active_levels);
2912}
2913
2914/* Gets max_active_levels */
2915int __kmp_get_max_active_levels(int gtid) {
2916 kmp_info_t *thread;
2917
2918 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2919 KMP_DEBUG_ASSERT(__kmp_init_serial);
2920
2921 thread = __kmp_threads[gtid];
2922 KMP_DEBUG_ASSERT(thread->th.th_current_task);
2923 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2924 "curtask_maxaclevel=%d\n",
2925 gtid, thread->th.th_current_task,
2926 thread->th.th_current_task->td_icvs.max_active_levels));
2927 return thread->th.th_current_task->td_icvs.max_active_levels;
2928}
2929
2930// nteams-var per-device ICV
2931void __kmp_set_num_teams(int num_teams) {
2932 if (num_teams > 0)
2933 __kmp_nteams = num_teams;
2934}
2935int __kmp_get_max_teams(void) { return __kmp_nteams; }
2936// teams-thread-limit-var per-device ICV
2937void __kmp_set_teams_thread_limit(int limit) {
2938 if (limit > 0)
2939 __kmp_teams_thread_limit = limit;
2940}
2941int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
2942
2943KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2944KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2945
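// Illustrative sketch (not part of the runtime build): the two routines below
// back omp_set_schedule()/omp_get_schedule(); a chunk below 1 selects the
// default chunking for the given kind. Assuming <omp.h>:
#if 0
#include <omp.h>
void example_runtime_schedule(void) {
  omp_set_schedule(omp_sched_dynamic, 4); // run-sched-var = dynamic,4
  omp_set_schedule(omp_sched_static, 0);  // static, unchunked (the default)
  omp_sched_t kind;
  int chunk;
  omp_get_schedule(&kind, &chunk); // kind == omp_sched_static, chunk == 0
  #pragma omp parallel for schedule(runtime)
  for (int i = 0; i < 100; ++i) {
    /* iterations are distributed according to run-sched-var set above */
  }
}
#endif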
2946/* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2947void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2948 kmp_info_t *thread;
2949 kmp_sched_t orig_kind;
2950 // kmp_team_t *team;
2951
2952 KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2953 gtid, (int)kind, chunk));
2954 KMP_DEBUG_ASSERT(__kmp_init_serial);
2955
2956 // Check if the kind parameter is valid, correct if needed.
2957 // Valid parameters should fit in one of two intervals - standard or extended:
2958 // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2959 // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2960 orig_kind = kind;
2961 kind = __kmp_sched_without_mods(kind);
2962
2963 if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2964 (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2965 // TODO: Hint needs attention in case we change the default schedule.
2966 __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2967 KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2968 __kmp_msg_null);
2969 kind = kmp_sched_default;
2970 chunk = 0; // ignore chunk value in case of bad kind
2971 }
2972
2973 thread = __kmp_threads[gtid];
2974
2975 __kmp_save_internal_controls(thread);
2976
2977 if (kind < kmp_sched_upper_std) {
2978 if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2979      // distinguish static chunked vs. unchunked: chunk should be invalid to
2980      // indicate an unchunked schedule (which is the default)
2981 thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2982 } else {
2983 thread->th.th_current_task->td_icvs.sched.r_sched_type =
2984 __kmp_sch_map[kind - kmp_sched_lower - 1];
2985 }
2986 } else {
2987 // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2988 // kmp_sched_lower - 2 ];
2989 thread->th.th_current_task->td_icvs.sched.r_sched_type =
2990 __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2991 kmp_sched_lower - 2];
2992 }
2993  __kmp_sched_apply_mods_intkind(
2994      orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2995 if (kind == kmp_sched_auto || chunk < 1) {
2996 // ignore parameter chunk for schedule auto
2997 thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2998 } else {
2999 thread->th.th_current_task->td_icvs.sched.chunk = chunk;
3000 }
3001}
3002
3003/* Gets def_sched_var ICV values */
3004void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
3005 kmp_info_t *thread;
3006 enum sched_type th_type;
3007
3008 KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
3009 KMP_DEBUG_ASSERT(__kmp_init_serial);
3010
3011 thread = __kmp_threads[gtid];
3012
3013 th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
3014 switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
3015 case kmp_sch_static:
3016 case kmp_sch_static_greedy:
3017 case kmp_sch_static_balanced:
3018 *kind = kmp_sched_static;
    __kmp_sched_apply_mods_stdkind(kind, th_type);
3020 *chunk = 0; // chunk was not set, try to show this fact via zero value
3021 return;
3022 case kmp_sch_static_chunked:
3023 *kind = kmp_sched_static;
3024 break;
3025 case kmp_sch_dynamic_chunked:
3026 *kind = kmp_sched_dynamic;
3027 break;
3028 case kmp_sch_guided_chunked:
3029 case kmp_sch_guided_iterative_chunked:
3030 case kmp_sch_guided_analytical_chunked:
3031 *kind = kmp_sched_guided;
3032 break;
3033 case kmp_sch_auto:
3034 *kind = kmp_sched_auto;
3035 break;
3036 case kmp_sch_trapezoidal:
3037 *kind = kmp_sched_trapezoidal;
3038 break;
3039#if KMP_STATIC_STEAL_ENABLED
3040 case kmp_sch_static_steal:
3041 *kind = kmp_sched_static_steal;
3042 break;
3043#endif
3044 default:
3045 KMP_FATAL(UnknownSchedulingType, th_type);
3046 }
3047
  __kmp_sched_apply_mods_stdkind(kind, th_type);
3049 *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
3050}
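
// __kmp_set_schedule()/__kmp_get_schedule() back the standard
// omp_set_schedule()/omp_get_schedule() entry points. A minimal round-trip
// sketch (assumes <omp.h>; guarded out of the build, illustrative only):
#if 0
#include <omp.h>
#include <cstdio>
int main() {
  // Select a dynamic runtime schedule with chunk 2 for schedule(runtime) loops.
  omp_set_schedule(omp_sched_dynamic, 2);
  omp_sched_t kind;
  int chunk;
  omp_get_schedule(&kind, &chunk); // reads def-sched-var back
  printf("kind=%d chunk=%d\n", (int)kind, chunk); // dynamic, 2

  // A chunk value < 1 requests the default chunking; the runtime stores it
  // internally as KMP_DEFAULT_CHUNK and reports 0 for plain static.
  omp_set_schedule(omp_sched_static, 0);
  omp_get_schedule(&kind, &chunk);
  printf("kind=%d chunk=%d\n", (int)kind, chunk); // static, 0
  return 0;
}
#endif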
3051
3052int __kmp_get_ancestor_thread_num(int gtid, int level) {
3053
3054 int ii, dd;
3055 kmp_team_t *team;
3056 kmp_info_t *thr;
3057
3058 KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
3059 KMP_DEBUG_ASSERT(__kmp_init_serial);
3060
3061 // validate level
3062 if (level == 0)
3063 return 0;
3064 if (level < 0)
3065 return -1;
3066 thr = __kmp_threads[gtid];
3067 team = thr->th.th_team;
3068 ii = team->t.t_level;
3069 if (level > ii)
3070 return -1;
3071
3072 if (thr->th.th_teams_microtask) {
3073 // AC: we are in teams region where multiple nested teams have same level
3074 int tlevel = thr->th.th_teams_level; // the level of the teams construct
3075 if (level <=
3076 tlevel) { // otherwise usual algorithm works (will not touch the teams)
3077 KMP_DEBUG_ASSERT(ii >= tlevel);
3078 // AC: As we need to pass by the teams league, we need to artificially
3079 // increase ii
3080 if (ii == tlevel) {
3081 ii += 2; // three teams have same level
3082 } else {
3083 ii++; // two teams have same level
3084 }
3085 }
3086 }
3087
3088 if (ii == level)
3089 return __kmp_tid_from_gtid(gtid);
3090
3091 dd = team->t.t_serialized;
3092 level++;
3093 while (ii > level) {
3094 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3095 }
3096 if ((team->t.t_serialized) && (!dd)) {
3097 team = team->t.t_parent;
3098 continue;
3099 }
3100 if (ii > level) {
3101 team = team->t.t_parent;
3102 dd = team->t.t_serialized;
3103 ii--;
3104 }
3105 }
3106
3107 return (dd > 1) ? (0) : (team->t.t_master_tid);
3108}
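
// The walk above is what omp_get_ancestor_thread_num() relies on. A small
// usage sketch with one level of nesting (assumes <omp.h> and that nested
// parallelism is permitted; guarded out of the build, illustrative only):
#if 0
#include <omp.h>
#include <cstdio>
int main() {
  omp_set_max_active_levels(2); // allow two active levels
#pragma omp parallel num_threads(2)
#pragma omp parallel num_threads(2)
  {
    // Level 0 is the initial thread (always 0); level 1 is the outer team.
    printf("outer tid=%d inner tid=%d level=%d\n",
           omp_get_ancestor_thread_num(1), omp_get_thread_num(),
           omp_get_level());
  }
  return 0;
}
#endif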
3109
3110int __kmp_get_team_size(int gtid, int level) {
3111
3112 int ii, dd;
3113 kmp_team_t *team;
3114 kmp_info_t *thr;
3115
3116 KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
3117 KMP_DEBUG_ASSERT(__kmp_init_serial);
3118
3119 // validate level
3120 if (level == 0)
3121 return 1;
3122 if (level < 0)
3123 return -1;
3124 thr = __kmp_threads[gtid];
3125 team = thr->th.th_team;
3126 ii = team->t.t_level;
3127 if (level > ii)
3128 return -1;
3129
3130 if (thr->th.th_teams_microtask) {
3131 // AC: we are in teams region where multiple nested teams have same level
3132 int tlevel = thr->th.th_teams_level; // the level of the teams construct
3133 if (level <=
3134 tlevel) { // otherwise usual algorithm works (will not touch the teams)
3135 KMP_DEBUG_ASSERT(ii >= tlevel);
3136 // AC: As we need to pass by the teams league, we need to artificially
3137 // increase ii
3138 if (ii == tlevel) {
3139 ii += 2; // three teams have same level
3140 } else {
3141 ii++; // two teams have same level
3142 }
3143 }
3144 }
3145
3146 while (ii > level) {
3147 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3148 }
3149 if (team->t.t_serialized && (!dd)) {
3150 team = team->t.t_parent;
3151 continue;
3152 }
3153 if (ii > level) {
3154 team = team->t.t_parent;
3155 ii--;
3156 }
3157 }
3158
3159 return team->t.t_nproc;
3160}
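
// Likewise, omp_get_team_size() maps onto the walk above. A small sketch
// (assumes <omp.h> and nested parallelism; guarded out of the build,
// illustrative only):
#if 0
#include <omp.h>
#include <cstdio>
int main() {
  omp_set_max_active_levels(2);
#pragma omp parallel num_threads(4)
#pragma omp parallel num_threads(2)
#pragma omp single
  {
    // Level 0 is the initial team of size 1, level 1 the outer team of 4,
    // level 2 the enclosing inner team of 2.
    printf("team sizes: %d %d %d\n", omp_get_team_size(0),
           omp_get_team_size(1), omp_get_team_size(2));
  }
  return 0;
}
#endif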
3161
3162kmp_r_sched_t __kmp_get_schedule_global() {
  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
  // independently, so one can get the updated schedule here.
3166
3167 kmp_r_sched_t r_sched;
3168
3169 // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3170 // __kmp_guided. __kmp_sched should keep original value, so that user can set
3171 // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
3172 // different roots (even in OMP 2.5)
3173 enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3174 enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3175 if (s == kmp_sch_static) {
3176 // replace STATIC with more detailed schedule (balanced or greedy)
3177 r_sched.r_sched_type = __kmp_static;
3178 } else if (s == kmp_sch_guided_chunked) {
3179 // replace GUIDED with more detailed schedule (iterative or analytical)
3180 r_sched.r_sched_type = __kmp_guided;
3181 } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3182 r_sched.r_sched_type = __kmp_sched;
3183 }
3184 SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3185
3186 if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3187 // __kmp_chunk may be wrong here (if it was not ever set)
3188 r_sched.chunk = KMP_DEFAULT_CHUNK;
3189 } else {
3190 r_sched.chunk = __kmp_chunk;
3191 }
3192
3193 return r_sched;
3194}
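
// The routine above replaces the generic STATIC/GUIDED kinds with the more
// detailed variants chosen at initialization and passes everything else
// through. A condensed sketch of that selection rule using hypothetical
// stand-in enumerators (guarded out of the build, illustrative only):
#if 0
#include <cstdio>
enum sketch_sched { // stand-ins; the real kinds are enum sched_type in kmp.h
  S_STATIC, S_STATIC_BALANCED, S_GUIDED, S_GUIDED_ITERATIVE, S_DYNAMIC
};
// These play the role of the __kmp_static / __kmp_guided globals.
static sketch_sched g_static_detail = S_STATIC_BALANCED;
static sketch_sched g_guided_detail = S_GUIDED_ITERATIVE;

static sketch_sched resolve(sketch_sched requested) {
  if (requested == S_STATIC)
    return g_static_detail; // generic STATIC -> balanced or greedy
  if (requested == S_GUIDED)
    return g_guided_detail; // generic GUIDED -> iterative or analytical
  return requested; // chunked/dynamic/etc. are kept as requested
}

int main() {
  printf("%d %d %d\n", resolve(S_STATIC), resolve(S_GUIDED), resolve(S_DYNAMIC));
  return 0;
}
#endif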
3195
/* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
   at least argc *t_argv entries for the requested team. */
3198static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3199
3200 KMP_DEBUG_ASSERT(team);
3201 if (!realloc || argc > team->t.t_max_argc) {
3202
3203 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3204 "current entries=%d\n",
3205 team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3206 /* if previously allocated heap space for args, free them */
3207 if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3208 __kmp_free((void *)team->t.t_argv);
3209
3210 if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3211 /* use unused space in the cache line for arguments */
3212 team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3213 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3214 "argv entries\n",
3215 team->t.t_id, team->t.t_max_argc));
3216 team->t.t_argv = &team->t.t_inline_argv[0];
3217 if (__kmp_storage_map) {
        __kmp_print_storage_map_gtid(
            -1, &team->t.t_inline_argv[0],
            &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
            (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
            team->t.t_id);
3223 }
3224 } else {
3225 /* allocate space for arguments in the heap */
3226 team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3227 ? KMP_MIN_MALLOC_ARGV_ENTRIES
3228 : 2 * argc;
3229 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3230 "argv entries\n",
3231 team->t.t_id, team->t.t_max_argc));
3232 team->t.t_argv =
3233 (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3234 if (__kmp_storage_map) {
        __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
                                     &team->t.t_argv[team->t.t_max_argc],
                                     sizeof(void *) * team->t.t_max_argc,
                                     "team_%d.t_argv", team->t.t_id);
3239 }
3240 }
3241 }
3242}
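
// The allocator above keeps small argument lists in the team's inline
// cache-line buffer and only falls back to heap pages, growing to a floor or
// to twice the request. A standalone sketch of that sizing policy with
// stand-in constants (guarded out of the build, illustrative only):
#if 0
#include <cstdio>
// Stand-in values; the real KMP_INLINE_ARGV_ENTRIES / KMP_MIN_MALLOC_ARGV_ENTRIES
// come from kmp.h and depend on the cache-line layout of kmp_team_t.
static const int INLINE_ENTRIES = 4;
static const int MIN_MALLOC_ENTRIES = 100;

// Returns how many argv slots would be reserved for a request of 'argc'.
static int argv_capacity(int argc) {
  if (argc <= INLINE_ENTRIES)
    return INLINE_ENTRIES; // reuse the spare space in the team structure
  // Heap path: start from a floor, then double the request to amortize growth.
  return (argc <= (MIN_MALLOC_ENTRIES >> 1)) ? MIN_MALLOC_ENTRIES : 2 * argc;
}

int main() {
  for (int argc : {2, 8, 60, 200})
    printf("argc=%d -> capacity=%d\n", argc, argv_capacity(argc));
  return 0;
}
#endif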
3243
3244static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3245 int i;
3246 int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3247 team->t.t_threads =
3248 (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3249 team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3250 sizeof(dispatch_shared_info_t) * num_disp_buff);
3251 team->t.t_dispatch =
3252 (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3253 team->t.t_implicit_task_taskdata =
3254 (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3255 team->t.t_max_nproc = max_nth;
3256
3257 /* setup dispatch buffers */
3258 for (i = 0; i < num_disp_buff; ++i) {
3259 team->t.t_disp_buffer[i].buffer_index = i;
3260 team->t.t_disp_buffer[i].doacross_buf_idx = i;
3261 }
3262}
3263
3264static void __kmp_free_team_arrays(kmp_team_t *team) {
3265 /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3266 int i;
3267 for (i = 0; i < team->t.t_max_nproc; ++i) {
3268 if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3269 __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3270 team->t.t_dispatch[i].th_disp_buffer = NULL;
3271 }
3272 }
3273#if KMP_USE_HIER_SCHED
3274 __kmp_dispatch_free_hierarchies(team);
3275#endif
3276 __kmp_free(team->t.t_threads);
3277 __kmp_free(team->t.t_disp_buffer);
3278 __kmp_free(team->t.t_dispatch);
3279 __kmp_free(team->t.t_implicit_task_taskdata);
3280 team->t.t_threads = NULL;
3281 team->t.t_disp_buffer = NULL;
3282 team->t.t_dispatch = NULL;
3283 team->t.t_implicit_task_taskdata = 0;
3284}
3285
3286static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3287 kmp_info_t **oldThreads = team->t.t_threads;
3288
3289 __kmp_free(team->t.t_disp_buffer);
3290 __kmp_free(team->t.t_dispatch);
3291 __kmp_free(team->t.t_implicit_task_taskdata);
3292 __kmp_allocate_team_arrays(team, max_nth);
3293
  KMP_MEMCPY(team->t.t_threads, oldThreads,
             team->t.t_nproc * sizeof(kmp_info_t *));
3296
3297 __kmp_free(oldThreads);
3298}
3299
3300static kmp_internal_control_t __kmp_get_global_icvs(void) {
3301
3302 kmp_r_sched_t r_sched =
3303 __kmp_get_schedule_global(); // get current state of scheduling globals
3304
3305 KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3306
3307 kmp_internal_control_t g_icvs = {
      0, // int serial_nesting_level; //corresponds to value of th_team_serialized
      (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
      // adjustment of threads (per thread)
      (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
      // whether blocktime is explicitly set
      __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
#if KMP_USE_MONITOR
      __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
// intervals
#endif
      __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
      // next parallel region (per thread)
      // (use a max ub on value if __kmp_parallel_initialize not called yet)
      __kmp_cg_max_nth, // int thread_limit;
      __kmp_task_max_nth, // int task_thread_limit; // to set the thread_limit
      // on task. This is used in the case of target thread_limit
      __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
      // for max_active_levels
      r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
      // {sched,chunk} pair
      __kmp_nested_proc_bind.bind_types[0],
      __kmp_default_device,
      NULL // struct kmp_internal_control *next;
3331 };
3332
3333 return g_icvs;
3334}
3335
3336static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3337
3338 kmp_internal_control_t gx_icvs;
3339 gx_icvs.serial_nesting_level =
3340 0; // probably =team->t.t_serial like in save_inter_controls
  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3342 gx_icvs.next = NULL;
3343
3344 return gx_icvs;
3345}
3346
3347static void __kmp_initialize_root(kmp_root_t *root) {
3348 int f;
3349 kmp_team_t *root_team;
3350 kmp_team_t *hot_team;
3351 int hot_team_max_nth;
3352 kmp_r_sched_t r_sched =
3353 __kmp_get_schedule_global(); // get current state of scheduling globals
3354 kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3355 KMP_DEBUG_ASSERT(root);
3356 KMP_ASSERT(!root->r.r_begin);
3357
3358 /* setup the root state structure */
  __kmp_init_lock(&root->r.r_begin_lock);
3360 root->r.r_begin = FALSE;
3361 root->r.r_active = FALSE;
3362 root->r.r_in_parallel = 0;
3363 root->r.r_blocktime = __kmp_dflt_blocktime;
3364#if KMP_AFFINITY_SUPPORTED
3365 root->r.r_affinity_assigned = FALSE;
3366#endif
3367
3368 /* setup the root team for this task */
3369 /* allocate the root team structure */
3370 KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3371
3372 root_team =
      __kmp_allocate_team(root,
                          1, // new_nproc
                          1, // max_nproc
#if OMPT_SUPPORT
                          ompt_data_none, // root parallel id
#endif
                          __kmp_nested_proc_bind.bind_types[0], &r_icvs,
                          0 // argc
                          USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3382 );
3383#if USE_DEBUGGER
3384 // Non-NULL value should be assigned to make the debugger display the root
3385 // team.
3386 TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3387#endif
3388
3389 KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3390
3391 root->r.r_root_team = root_team;
3392 root_team->t.t_control_stack_top = NULL;
3393
3394 /* initialize root team */
3395 root_team->t.t_threads[0] = NULL;
3396 root_team->t.t_nproc = 1;
3397 root_team->t.t_serialized = 1;
3398 // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3399 root_team->t.t_sched.sched = r_sched.sched;
3400 root_team->t.t_nested_nth = &__kmp_nested_nth;
3401 KA_TRACE(
3402 20,
3403 ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3404 root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3405
3406 /* setup the hot team for this task */
3407 /* allocate the hot team structure */
3408 KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3409
3410 hot_team =
      __kmp_allocate_team(root,
                          1, // new_nproc
                          __kmp_dflt_team_nth_ub * 2, // max_nproc
#if OMPT_SUPPORT
                          ompt_data_none, // root parallel id
#endif
                          __kmp_nested_proc_bind.bind_types[0], &r_icvs,
                          0 // argc
                          USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3420 );
3421 KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3422
3423 root->r.r_hot_team = hot_team;
3424 root_team->t.t_control_stack_top = NULL;
3425
3426 /* first-time initialization */
3427 hot_team->t.t_parent = root_team;
3428
3429 /* initialize hot team */
3430 hot_team_max_nth = hot_team->t.t_max_nproc;
3431 for (f = 0; f < hot_team_max_nth; ++f) {
3432 hot_team->t.t_threads[f] = NULL;
3433 }
3434 hot_team->t.t_nproc = 1;
3435 // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3436 hot_team->t.t_sched.sched = r_sched.sched;
3437 hot_team->t.t_size_changed = 0;
3438 hot_team->t.t_nested_nth = &__kmp_nested_nth;
3439}
3440
3441#ifdef KMP_DEBUG
3442
3443typedef struct kmp_team_list_item {
3444 kmp_team_p const *entry;
3445 struct kmp_team_list_item *next;
3446} kmp_team_list_item_t;
3447typedef kmp_team_list_item_t *kmp_team_list_t;
3448
3449static void __kmp_print_structure_team_accum( // Add team to list of teams.
3450 kmp_team_list_t list, // List of teams.
3451 kmp_team_p const *team // Team to add.
3452) {
3453
3454 // List must terminate with item where both entry and next are NULL.
3455 // Team is added to the list only once.
3456 // List is sorted in ascending order by team id.
3457 // Team id is *not* a key.
3458
3459 kmp_team_list_t l;
3460
3461 KMP_DEBUG_ASSERT(list != NULL);
3462 if (team == NULL) {
3463 return;
3464 }
3465
  __kmp_print_structure_team_accum(list, team->t.t_parent);
  __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3468
3469 // Search list for the team.
3470 l = list;
3471 while (l->next != NULL && l->entry != team) {
3472 l = l->next;
3473 }
3474 if (l->next != NULL) {
3475 return; // Team has been added before, exit.
3476 }
3477
3478 // Team is not found. Search list again for insertion point.
3479 l = list;
3480 while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3481 l = l->next;
3482 }
3483
3484 // Insert team.
3485 {
3486 kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3487 sizeof(kmp_team_list_item_t));
3488 *item = *l;
3489 l->entry = team;
3490 l->next = item;
3491 }
3492}
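
// The accumulator above keeps a singly linked list terminated by a sentinel
// item (entry == NULL, next == NULL) and inserts every team at most once, in
// ascending t_id order, by copying the current node and overwriting it in
// place. A standalone sketch of the same sentinel-insert idiom with plain ids
// (guarded out of the build, illustrative only):
#if 0
#include <cstdio>
#include <cstdlib>
struct item { int id; item *next; }; // the node with next == NULL is the sentinel

static void insert_once(item *list, int id) { // id >= 0 assumed
  item *l = list;
  while (l->next != NULL && l->id != id) // already present?
    l = l->next;
  if (l->next != NULL)
    return;
  l = list;
  while (l->next != NULL && l->id <= id) // find ascending insertion point
    l = l->next;
  item *node = (item *)malloc(sizeof(item)); // copy node, overwrite in place
  *node = *l;
  l->id = id;
  l->next = node;
}

int main() {
  item *list = (item *)malloc(sizeof(item));
  list->id = -1; // sentinel id is never matched against inserted ids
  list->next = NULL;
  int ids[] = {3, 1, 3, 2};
  for (int id : ids)
    insert_once(list, id);
  for (item *l = list; l->next != NULL; l = l->next)
    printf("%d ", l->id); // prints: 1 2 3
  printf("\n");
  while (list != NULL) { item *t = list; list = list->next; free(t); }
  return 0;
}
#endif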
3493
static void __kmp_print_structure_team(char const *title,
                                       kmp_team_p const *team) {
  __kmp_printf("%s", title);
  if (team != NULL) {
    __kmp_printf("%2x %p\n", team->t.t_id, team);
  } else {
    __kmp_printf(" - (nil)\n");
  }
}

static void __kmp_print_structure_thread(char const *title,
                                         kmp_info_p const *thread) {
  __kmp_printf("%s", title);
  if (thread != NULL) {
    __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
  } else {
    __kmp_printf(" - (nil)\n");
  }
}
3514
3515void __kmp_print_structure(void) {
3516
3517 kmp_team_list_t list;
3518
3519 // Initialize list of teams.
3520 list =
3521 (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3522 list->entry = NULL;
3523 list->next = NULL;
3524
  __kmp_printf("\n------------------------------\nGlobal Thread "
               "Table\n------------------------------\n");
  {
    int gtid;
    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
      __kmp_printf("%2d", gtid);
      if (__kmp_threads != NULL) {
        __kmp_printf(" %p", __kmp_threads[gtid]);
      }
      if (__kmp_root != NULL) {
        __kmp_printf(" %p", __kmp_root[gtid]);
      }
      __kmp_printf("\n");
3538 }
3539 }
3540
3541 // Print out __kmp_threads array.
  __kmp_printf("\n------------------------------\nThreads\n--------------------"
               "----------\n");
  if (__kmp_threads != NULL) {
    int gtid;
    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
      kmp_info_t const *thread = __kmp_threads[gtid];
      if (thread != NULL) {
        __kmp_printf("GTID %2d %p:\n", gtid, thread);
        __kmp_printf(" Our Root: %p\n", thread->th.th_root);
        __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
        __kmp_print_structure_team(" Serial Team: ", thread->th.th_serial_team);
        __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
        __kmp_print_structure_thread(" Primary: ", thread->th.th_team_master);
        __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
        __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
        __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
        __kmp_print_structure_thread(" Next in pool: ",
                                     thread->th.th_next_pool);
        __kmp_printf("\n");
        __kmp_print_structure_team_accum(list, thread->th.th_team);
        __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
      }
    }
  } else {
    __kmp_printf("Threads array is not allocated.\n");
3569 }
3570
3571 // Print out __kmp_root array.
  __kmp_printf("\n------------------------------\nUbers\n----------------------"
               "--------\n");
  if (__kmp_root != NULL) {
    int gtid;
    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
      kmp_root_t const *root = __kmp_root[gtid];
      if (root != NULL) {
        __kmp_printf("GTID %2d %p:\n", gtid, root);
        __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
        __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
        __kmp_print_structure_thread(" Uber Thread: ", root->r.r_uber_thread);
        __kmp_printf(" Active?: %2d\n", root->r.r_active);
        __kmp_printf(" In Parallel: %2d\n",
                     KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
        __kmp_printf("\n");
        __kmp_print_structure_team_accum(list, root->r.r_root_team);
        __kmp_print_structure_team_accum(list, root->r.r_hot_team);
      }
    }
  } else {
    __kmp_printf("Ubers array is not allocated.\n");
3594 }
3595
  __kmp_printf("\n------------------------------\nTeams\n----------------------"
               "--------\n");
  while (list->next != NULL) {
    kmp_team_p const *team = list->entry;
    int i;
    __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
    __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
    __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid);
    __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
    __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
    __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
    for (i = 0; i < team->t.t_nproc; ++i) {
      __kmp_printf(" Thread %2d: ", i);
      __kmp_print_structure_thread("", team->t.t_threads[i]);
    }
    __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
    __kmp_printf("\n");
    list = list->next;
  }

  // Print out __kmp_thread_pool and __kmp_team_pool.
  __kmp_printf("\n------------------------------\nPools\n----------------------"
               "--------\n");
  __kmp_print_structure_thread("Thread pool: ",
                               CCAST(kmp_info_t *, __kmp_thread_pool));
  __kmp_print_structure_team("Team pool: ",
                             CCAST(kmp_team_t *, __kmp_team_pool));
  __kmp_printf("\n");
3624
3625 // Free team list.
3626 while (list != NULL) {
3627 kmp_team_list_item_t *item = list;
3628 list = list->next;
3629 KMP_INTERNAL_FREE(item);
3630 }
3631}
3632
3633#endif
3634
3635//---------------------------------------------------------------------------
3636// Stuff for per-thread fast random number generator
3637// Table of primes
3638static const unsigned __kmp_primes[] = {
3639 0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3640 0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3641 0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3642 0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3643 0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3644 0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3645 0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3646 0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3647 0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3648 0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3649 0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3650
3651//---------------------------------------------------------------------------
3652// __kmp_get_random: Get a random number using a linear congruential method.
3653unsigned short __kmp_get_random(kmp_info_t *thread) {
3654 unsigned x = thread->th.th_x;
3655 unsigned short r = (unsigned short)(x >> 16);
3656
3657 thread->th.th_x = x * thread->th.th_a + 1;
3658
3659 KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3660 thread->th.th_info.ds.ds_tid, r));
3661
3662 return r;
3663}
3664//--------------------------------------------------------
3665// __kmp_init_random: Initialize a random number generator
3666void __kmp_init_random(kmp_info_t *thread) {
3667 unsigned seed = thread->th.th_info.ds.ds_tid;
3668
3669 thread->th.th_a =
3670 __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3671 thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3672 KA_TRACE(30,
3673 ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3674}
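
// The generator above is a 32-bit linear congruential sequence
// x_{n+1} = a * x_n + 1 (mod 2^32) with a per-thread multiplier drawn from the
// prime table, returning the high 16 bits of the state. A standalone sketch
// (guarded out of the build, illustrative only):
#if 0
#include <cstdio>
// Mirrors the per-thread state (th_a, th_x) without a kmp_info_t.
struct sketch_rng { unsigned a, x; };

static void sketch_init(sketch_rng *r, unsigned seed, const unsigned *primes,
                        unsigned nprimes) {
  r->a = primes[seed % nprimes]; // per-thread multiplier, like th_a
  r->x = (seed + 1) * r->a + 1; // initial state, like th_x
}

static unsigned short sketch_next(sketch_rng *r) {
  unsigned short out = (unsigned short)(r->x >> 16); // high half is returned
  r->x = r->x * r->a + 1; // advance: x_{n+1} = a * x_n + 1 (mod 2^32)
  return out;
}

int main() {
  static const unsigned primes[] = {0x9e3779b1, 0xffe6cc59, 0x2109f6dd};
  sketch_rng r;
  sketch_init(&r, /*seed=*/0, primes, 3);
  for (int i = 0; i < 4; ++i)
    printf("%u ", (unsigned)sketch_next(&r));
  printf("\n");
  return 0;
}
#endif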
3675
3676#if KMP_OS_WINDOWS
3677/* reclaim array entries for root threads that are already dead, returns number
3678 * reclaimed */
3679static int __kmp_reclaim_dead_roots(void) {
3680 int i, r = 0;
3681
3682 for (i = 0; i < __kmp_threads_capacity; ++i) {
3683 if (KMP_UBER_GTID(i) &&
3684 !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3685 !__kmp_root[i]
3686 ->r.r_active) { // AC: reclaim only roots died in non-active state
3687 r += __kmp_unregister_root_other_thread(i);
3688 }
3689 }
3690 return r;
3691}
3692#endif
3693
3694/* This function attempts to create free entries in __kmp_threads and
3695 __kmp_root, and returns the number of free entries generated.
3696
3697 For Windows* OS static library, the first mechanism used is to reclaim array
3698 entries for root threads that are already dead.
3699
   On all platforms, expansion is attempted on the arrays __kmp_threads and
3701 __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3702 capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3703 threadprivate cache array has been created. Synchronization with
3704 __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3705
3706 After any dead root reclamation, if the clipping value allows array expansion
3707 to result in the generation of a total of nNeed free slots, the function does
3708 that expansion. If not, nothing is done beyond the possible initial root
3709 thread reclamation.
3710
3711 If any argument is negative, the behavior is undefined. */
3712static int __kmp_expand_threads(int nNeed) {
3713 int added = 0;
3714 int minimumRequiredCapacity;
3715 int newCapacity;
3716 kmp_info_t **newThreads;
3717 kmp_root_t **newRoot;
3718
3719 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3720 // resizing __kmp_threads does not need additional protection if foreign
3721 // threads are present
3722
3723#if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3724 /* only for Windows static library */
3725 /* reclaim array entries for root threads that are already dead */
3726 added = __kmp_reclaim_dead_roots();
3727
3728 if (nNeed) {
3729 nNeed -= added;
3730 if (nNeed < 0)
3731 nNeed = 0;
3732 }
3733#endif
3734 if (nNeed <= 0)
3735 return added;
3736
3737 // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3738 // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3739 // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3740 // > __kmp_max_nth in one of two ways:
3741 //
3742 // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3743 // may not be reused by another thread, so we may need to increase
3744 // __kmp_threads_capacity to __kmp_max_nth + 1.
3745 //
3746 // 2) New foreign root(s) are encountered. We always register new foreign
3747 // roots. This may cause a smaller # of threads to be allocated at
3748 // subsequent parallel regions, but the worker threads hang around (and
3749 // eventually go to sleep) and need slots in the __kmp_threads[] array.
3750 //
3751 // Anyway, that is the reason for moving the check to see if
3752 // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3753 // instead of having it performed here. -BB
3754
3755 KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3756
3757 /* compute expansion headroom to check if we can expand */
3758 if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3759 /* possible expansion too small -- give up */
3760 return added;
3761 }
3762 minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3763
3764 newCapacity = __kmp_threads_capacity;
3765 do {
3766 newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3767 : __kmp_sys_max_nth;
3768 } while (newCapacity < minimumRequiredCapacity);
3769 newThreads = (kmp_info_t **)__kmp_allocate(
3770 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3771 newRoot =
3772 (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
  KMP_MEMCPY(newThreads, __kmp_threads,
             __kmp_threads_capacity * sizeof(kmp_info_t *));
  KMP_MEMCPY(newRoot, __kmp_root,
             __kmp_threads_capacity * sizeof(kmp_root_t *));
3777 // Put old __kmp_threads array on a list. Any ongoing references to the old
3778 // list will be valid. This list is cleaned up at library shutdown.
3779 kmp_old_threads_list_t *node =
3780 (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t));
3781 node->threads = __kmp_threads;
3782 node->next = __kmp_old_threads_list;
3783 __kmp_old_threads_list = node;
3784
3785 *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3786 *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3787 added += newCapacity - __kmp_threads_capacity;
3788 *(volatile int *)&__kmp_threads_capacity = newCapacity;
3789
3790 if (newCapacity > __kmp_tp_capacity) {
    __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3792 if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3793 __kmp_threadprivate_resize_cache(newCapacity);
3794 } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3795 *(volatile int *)&__kmp_tp_capacity = newCapacity;
3796 }
    __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3798 }
3799
3800 return added;
3801}
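
// Capacity grows by repeated doubling, clipped to the system maximum, until
// the current capacity plus the newly required slots is covered (the caller
// has already verified there is enough headroom). A standalone sketch of the
// growth computation (guarded out of the build, illustrative only):
#if 0
#include <cstdio>
static int grow_capacity(int current, int need, int sysMax) {
  int required = current + need; // assumed <= sysMax, as checked above
  int cap = current;
  do { // same shape as the do/while loop in __kmp_expand_threads()
    cap = (cap <= (sysMax >> 1)) ? (cap << 1) : sysMax;
  } while (cap < required);
  return cap;
}

int main() {
  printf("%d\n", grow_capacity(64, 3, 4096)); // -> 128
  printf("%d\n", grow_capacity(3000, 500, 4096)); // -> clipped to 4096
  return 0;
}
#endif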
3802
3803/* Register the current thread as a root thread and obtain our gtid. We must
3804 have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3805 thread that calls from __kmp_do_serial_initialize() */
3806int __kmp_register_root(int initial_thread) {
3807 kmp_info_t *root_thread;
3808 kmp_root_t *root;
3809 int gtid;
3810 int capacity;
  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3812 KA_TRACE(20, ("__kmp_register_root: entered\n"));
3813 KMP_MB();
3814
3815 /* 2007-03-02:
3816 If initial thread did not invoke OpenMP RTL yet, and this thread is not an
3817 initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
3818 work as expected -- it may return false (that means there is at least one
3819 empty slot in __kmp_threads array), but it is possible the only free slot
     is #0, which is reserved for the initial thread and so cannot be used for
     this one. The following code works around this bug.

     However, the right solution seems to be not to reserve slot #0 for the
     initial thread because:
     (1) there is no magic in slot #0,
     (2) we cannot detect the initial thread reliably (the first thread that
     performs serial initialization may not be the real initial thread).
3828 */
3829 capacity = __kmp_threads_capacity;
3830 if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3831 --capacity;
3832 }
3833
3834 // If it is not for initializing the hidden helper team, we need to take
3835 // __kmp_hidden_helper_threads_num out of the capacity because it is included
3836 // in __kmp_threads_capacity.
3837 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3838 capacity -= __kmp_hidden_helper_threads_num;
3839 }
3840
3841 /* see if there are too many threads */
  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3843 if (__kmp_tp_cached) {
3844 __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3845 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3846 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3847 } else {
3848 __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3849 __kmp_msg_null);
3850 }
3851 }
3852
3853 // When hidden helper task is enabled, __kmp_threads is organized as follows:
3854 // 0: initial thread, also a regular OpenMP thread.
3855 // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3856 // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3857 // regular OpenMP threads.
3858 if (TCR_4(__kmp_init_hidden_helper_threads)) {
3859 // Find an available thread slot for hidden helper thread. Slots for hidden
3860 // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3861 for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3862 gtid <= __kmp_hidden_helper_threads_num;
3863 gtid++)
3864 ;
3865 KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3866 KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3867 "hidden helper thread: T#%d\n",
3868 gtid));
3869 } else {
3870 /* find an available thread slot */
3871 // Don't reassign the zero slot since we need that to only be used by
3872 // initial thread. Slots for hidden helper threads should also be skipped.
3873 if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3874 gtid = 0;
3875 } else {
3876 for (gtid = __kmp_hidden_helper_threads_num + 1;
3877 TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3878 ;
3879 }
3880 KA_TRACE(
3881 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3882 KMP_ASSERT(gtid < __kmp_threads_capacity);
3883 }
3884
3885 /* update global accounting */
3886 __kmp_all_nth++;
3887 TCW_4(__kmp_nth, __kmp_nth + 1);
3888
3889 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3890 // numbers of procs, and method #2 (keyed API call) for higher numbers.
3891 if (__kmp_adjust_gtid_mode) {
3892 if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3893 if (TCR_4(__kmp_gtid_mode) != 2) {
3894 TCW_4(__kmp_gtid_mode, 2);
3895 }
3896 } else {
3897 if (TCR_4(__kmp_gtid_mode) != 1) {
3898 TCW_4(__kmp_gtid_mode, 1);
3899 }
3900 }
3901 }
3902
3903#ifdef KMP_ADJUST_BLOCKTIME
3904 /* Adjust blocktime to zero if necessary */
3905 /* Middle initialization might not have occurred yet */
3906 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3907 if (__kmp_nth > __kmp_avail_proc) {
3908 __kmp_zero_bt = TRUE;
3909 }
3910 }
3911#endif /* KMP_ADJUST_BLOCKTIME */
3912
3913 /* setup this new hierarchy */
3914 if (!(root = __kmp_root[gtid])) {
3915 root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3916 KMP_DEBUG_ASSERT(!root->r.r_root_team);
3917 }
3918
3919#if KMP_STATS_ENABLED
3920 // Initialize stats as soon as possible (right after gtid assignment).
3921 __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3922 __kmp_stats_thread_ptr->startLife();
3923 KMP_SET_THREAD_STATE(SERIAL_REGION);
3924 KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3925#endif
3926 __kmp_initialize_root(root);
3927
3928 /* setup new root thread structure */
3929 if (root->r.r_uber_thread) {
3930 root_thread = root->r.r_uber_thread;
3931 } else {
3932 root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3933 if (__kmp_storage_map) {
      __kmp_print_thread_storage_map(root_thread, gtid);
3935 }
3936 root_thread->th.th_info.ds.ds_gtid = gtid;
3937#if OMPT_SUPPORT
3938 root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3939#endif
3940 root_thread->th.th_root = root;
3941 if (__kmp_env_consistency_check) {
3942 root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3943 }
3944#if USE_FAST_MEMORY
    __kmp_initialize_fast_memory(root_thread);
3946#endif /* USE_FAST_MEMORY */
3947
3948#if KMP_USE_BGET
3949 KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
    __kmp_initialize_bget(root_thread);
#endif
    __kmp_init_random(root_thread); // Initialize random number generator
3953 }
3954
3955 /* setup the serial team held in reserve by the root thread */
3956 if (!root_thread->th.th_serial_team) {
3957 kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3958 KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
    root_thread->th.th_serial_team = __kmp_allocate_team(
        root, 1, 1,
#if OMPT_SUPPORT
        ompt_data_none, // root parallel id
#endif
        proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3965 }
3966 KMP_ASSERT(root_thread->th.th_serial_team);
3967 KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3968 root_thread->th.th_serial_team));
3969
3970 /* drop root_thread into place */
3971 TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3972
3973 root->r.r_root_team->t.t_threads[0] = root_thread;
3974 root->r.r_hot_team->t.t_threads[0] = root_thread;
3975 root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3976 // AC: the team created in reserve, not for execution (it is unused for now).
3977 root_thread->th.th_serial_team->t.t_serialized = 0;
3978 root->r.r_uber_thread = root_thread;
3979
3980 /* initialize the thread, get it ready to go */
  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3982 TCW_4(__kmp_init_gtid, TRUE);
3983
3984 /* prepare the primary thread for get_gtid() */
3985 __kmp_gtid_set_specific(gtid);
3986
3987#if USE_ITT_BUILD
3988 __kmp_itt_thread_name(gtid);
3989#endif /* USE_ITT_BUILD */
3990
3991#ifdef KMP_TDATA_GTID
3992 __kmp_gtid = gtid;
3993#endif
  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3995 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3996
3997 KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3998 "plain=%u\n",
3999 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
4000 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
4001 KMP_INIT_BARRIER_STATE));
4002 { // Initialize barrier data.
4003 int b;
4004 for (b = 0; b < bs_last_barrier; ++b) {
4005 root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
4006#if USE_DEBUGGER
4007 root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
4008#endif
4009 }
4010 }
4011 KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
4012 KMP_INIT_BARRIER_STATE);
4013
4014#if KMP_AFFINITY_SUPPORTED
4015 root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
4016 root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
4017 root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
4018 root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
4019#endif /* KMP_AFFINITY_SUPPORTED */
4020 root_thread->th.th_def_allocator = __kmp_def_allocator;
4021 root_thread->th.th_prev_level = 0;
4022 root_thread->th.th_prev_num_threads = 1;
4023
4024 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
4025 tmp->cg_root = root_thread;
4026 tmp->cg_thread_limit = __kmp_cg_max_nth;
4027 tmp->cg_nthreads = 1;
4028 KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
4029 " cg_nthreads init to 1\n",
4030 root_thread, tmp));
4031 tmp->up = NULL;
4032 root_thread->th.th_cg_roots = tmp;
4033
4034 __kmp_root_counter++;
4035
4036#if OMPT_SUPPORT
4037 if (ompt_enabled.enabled) {
4038
4039 kmp_info_t *root_thread = ompt_get_thread();
4040
    ompt_set_thread_state(root_thread, ompt_state_overhead);
4042
4043 if (ompt_enabled.ompt_callback_thread_begin) {
4044 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
4045 ompt_thread_initial, __ompt_get_thread_data_internal());
4046 }
4047 ompt_data_t *task_data;
4048 ompt_data_t *parallel_data;
    __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
                                  NULL);
4051 if (ompt_enabled.ompt_callback_implicit_task) {
4052 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4053 ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
4054 }
4055
    ompt_set_thread_state(root_thread, ompt_state_work_serial);
4057 }
4058#endif
4059#if OMPD_SUPPORT
4060 if (ompd_state & OMPD_ENABLE_BP)
4061 ompd_bp_thread_begin();
4062#endif
4063
4064 KMP_MB();
  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4066
4067 return gtid;
4068}
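
// When hidden helper threads are enabled, the gtid space is partitioned as the
// comment inside __kmp_register_root() describes: slot 0 for the initial
// thread, slots 1..N for hidden helpers, and everything above N for regular
// roots and workers. A standalone sketch of that slot search over a plain
// array (guarded out of the build, illustrative only; the real code asserts
// instead of returning -1):
#if 0
#include <cstdio>
#include <cstddef>
static int find_slot(void **slots, int capacity, int num_helpers,
                     bool for_hidden_helper, bool initial_thread) {
  if (for_hidden_helper) { // helpers only use [1, num_helpers]
    for (int gtid = 1; gtid <= num_helpers; ++gtid)
      if (slots[gtid] == NULL)
        return gtid;
    return -1;
  }
  if (initial_thread && slots[0] == NULL) // slot 0 is reserved for it
    return 0;
  for (int gtid = num_helpers + 1; gtid < capacity; ++gtid) // regular range
    if (slots[gtid] == NULL)
      return gtid;
  return -1;
}

int main() {
  void *slots[8] = {};
  slots[0] = (void *)1; // initial thread already registered
  printf("helper slot:  %d\n", find_slot(slots, 8, 3, true, false)); // -> 1
  printf("regular slot: %d\n", find_slot(slots, 8, 3, false, false)); // -> 4
  return 0;
}
#endif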
4069
4070#if KMP_NESTED_HOT_TEAMS
4071static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
4072 const int max_level) {
4073 int i, n, nth;
4074 kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
4075 if (!hot_teams || !hot_teams[level].hot_team) {
4076 return 0;
4077 }
4078 KMP_DEBUG_ASSERT(level < max_level);
4079 kmp_team_t *team = hot_teams[level].hot_team;
4080 nth = hot_teams[level].hot_team_nth;
4081 n = nth - 1; // primary thread is not freed
4082 if (level < max_level - 1) {
4083 for (i = 0; i < nth; ++i) {
4084 kmp_info_t *th = team->t.t_threads[i];
      n += __kmp_free_hot_teams(root, th, level + 1, max_level);
4086 if (i > 0 && th->th.th_hot_teams) {
4087 __kmp_free(th->th.th_hot_teams);
4088 th->th.th_hot_teams = NULL;
4089 }
4090 }
4091 }
4092 __kmp_free_team(root, team, NULL);
4093 return n;
4094}
4095#endif
4096
// Resets a root thread and clears its root and hot teams.
4098// Returns the number of __kmp_threads entries directly and indirectly freed.
4099static int __kmp_reset_root(int gtid, kmp_root_t *root) {
4100 kmp_team_t *root_team = root->r.r_root_team;
4101 kmp_team_t *hot_team = root->r.r_hot_team;
4102 int n = hot_team->t.t_nproc;
4103 int i;
4104
4105 KMP_DEBUG_ASSERT(!root->r.r_active);
4106
4107 root->r.r_root_team = NULL;
4108 root->r.r_hot_team = NULL;
4109 // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
4110 // before call to __kmp_free_team().
4111 __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
4112#if KMP_NESTED_HOT_TEAMS
4113 if (__kmp_hot_teams_max_level >
4114 0) { // need to free nested hot teams and their threads if any
4115 for (i = 0; i < hot_team->t.t_nproc; ++i) {
4116 kmp_info_t *th = hot_team->t.t_threads[i];
4117 if (__kmp_hot_teams_max_level > 1) {
        n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
4119 }
4120 if (th->th.th_hot_teams) {
4121 __kmp_free(th->th.th_hot_teams);
4122 th->th.th_hot_teams = NULL;
4123 }
4124 }
4125 }
4126#endif
4127 __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
4128
4129 // Before we can reap the thread, we need to make certain that all other
4130 // threads in the teams that had this root as ancestor have stopped trying to
4131 // steal tasks.
4132 if (__kmp_tasking_mode != tskm_immediate_exec) {
4133 __kmp_wait_to_unref_task_teams();
4134 }
4135
4136#if KMP_OS_WINDOWS
4137 /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
4138 KA_TRACE(
4139 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
4140 "\n",
4141 (LPVOID) & (root->r.r_uber_thread->th),
4142 root->r.r_uber_thread->th.th_info.ds.ds_thread));
4143 __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
4144#endif /* KMP_OS_WINDOWS */
4145
4146#if OMPD_SUPPORT
4147 if (ompd_state & OMPD_ENABLE_BP)
4148 ompd_bp_thread_end();
4149#endif
4150
4151#if OMPT_SUPPORT
4152 ompt_data_t *task_data;
4153 ompt_data_t *parallel_data;
  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
                                NULL);
4156 if (ompt_enabled.ompt_callback_implicit_task) {
4157 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4158 ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
4159 }
4160 if (ompt_enabled.ompt_callback_thread_end) {
4161 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
4162 &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
4163 }
4164#endif
4165
4166 TCW_4(__kmp_nth,
4167 __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4168 i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
4169 KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
4170 " to %d\n",
4171 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
4172 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
4173 if (i == 1) {
4174 // need to free contention group structure
4175 KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
4176 root->r.r_uber_thread->th.th_cg_roots->cg_root);
4177 KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
4178 __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
4179 root->r.r_uber_thread->th.th_cg_roots = NULL;
4180 }
  __kmp_reap_thread(root->r.r_uber_thread, 1);

  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
  // it instead of freeing it.
4185 root->r.r_uber_thread = NULL;
4186 /* mark root as no longer in use */
4187 root->r.r_begin = FALSE;
4188
4189 return n;
4190}
4191
4192void __kmp_unregister_root_current_thread(int gtid) {
4193 KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
4194 /* this lock should be ok, since unregister_root_current_thread is never
4195 called during an abort, only during a normal close. furthermore, if you
4196 have the forkjoin lock, you should never try to get the initz lock */
  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4198 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4199 KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4200 "exiting T#%d\n",
4201 gtid));
    __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4203 return;
4204 }
4205 kmp_root_t *root = __kmp_root[gtid];
4206
4207 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4208 KMP_ASSERT(KMP_UBER_GTID(gtid));
4209 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4210 KMP_ASSERT(root->r.r_active == FALSE);
4211
4212 KMP_MB();
4213
4214 kmp_info_t *thread = __kmp_threads[gtid];
4215 kmp_team_t *team = thread->th.th_team;
4216 kmp_task_team_t *task_team = thread->th.th_task_team;
4217
4218 // we need to wait for the proxy tasks before finishing the thread
4219 if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
4220 task_team->tt.tt_hidden_helper_task_encountered)) {
4221#if OMPT_SUPPORT
4222 // the runtime is shutting down so we won't report any events
4223 thread->th.ompt_thread_info.state = ompt_state_undefined;
4224#endif
    __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4226 }
4227
4228 __kmp_reset_root(gtid, root);
4229
4230 KMP_MB();
4231 KC_TRACE(10,
4232 ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4233
  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4235}
4236
4237#if KMP_OS_WINDOWS
4238/* __kmp_forkjoin_lock must be already held
4239 Unregisters a root thread that is not the current thread. Returns the number
4240 of __kmp_threads entries freed as a result. */
4241static int __kmp_unregister_root_other_thread(int gtid) {
4242 kmp_root_t *root = __kmp_root[gtid];
4243 int r;
4244
4245 KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4246 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4247 KMP_ASSERT(KMP_UBER_GTID(gtid));
4248 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4249 KMP_ASSERT(root->r.r_active == FALSE);
4250
4251 r = __kmp_reset_root(gtid, root);
4252 KC_TRACE(10,
4253 ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4254 return r;
4255}
4256#endif
4257
4258#if KMP_DEBUG
4259void __kmp_task_info() {
4260
4261 kmp_int32 gtid = __kmp_entry_gtid();
4262 kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4263 kmp_info_t *this_thr = __kmp_threads[gtid];
4264 kmp_team_t *steam = this_thr->th.th_serial_team;
4265 kmp_team_t *team = this_thr->th.th_team;
4266
4267 __kmp_printf(
      "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4269 "ptask=%p\n",
4270 gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4271 team->t.t_implicit_task_taskdata[tid].td_parent);
4272}
4273#endif // KMP_DEBUG
4274
4275/* TODO optimize with one big memclr, take out what isn't needed, split
4276 responsibility to workers as much as possible, and delay initialization of
4277 features as much as possible */
4278static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4279 int tid, int gtid) {
4280 /* this_thr->th.th_info.ds.ds_gtid is setup in
4281 kmp_allocate_thread/create_worker.
4282 this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4283 KMP_DEBUG_ASSERT(this_thr != NULL);
4284 KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4285 KMP_DEBUG_ASSERT(team);
4286 KMP_DEBUG_ASSERT(team->t.t_threads);
4287 KMP_DEBUG_ASSERT(team->t.t_dispatch);
4288 kmp_info_t *master = team->t.t_threads[0];
4289 KMP_DEBUG_ASSERT(master);
4290 KMP_DEBUG_ASSERT(master->th.th_root);
4291
4292 KMP_MB();
4293
4294 TCW_SYNC_PTR(this_thr->th.th_team, team);
4295
4296 this_thr->th.th_info.ds.ds_tid = tid;
4297 this_thr->th.th_set_nproc = 0;
4298 if (__kmp_tasking_mode != tskm_immediate_exec)
4299 // When tasking is possible, threads are not safe to reap until they are
4300 // done tasking; this will be set when tasking code is exited in wait
4301 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4302 else // no tasking --> always safe to reap
4303 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4304 this_thr->th.th_set_proc_bind = proc_bind_default;
4305
4306#if KMP_AFFINITY_SUPPORTED
4307 this_thr->th.th_new_place = this_thr->th.th_current_place;
4308#endif
4309 this_thr->th.th_root = master->th.th_root;
4310
4311 /* setup the thread's cache of the team structure */
4312 this_thr->th.th_team_nproc = team->t.t_nproc;
4313 this_thr->th.th_team_master = master;
4314 this_thr->th.th_team_serialized = team->t.t_serialized;
4315
4316 KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4317
4318 KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4319 tid, gtid, this_thr, this_thr->th.th_current_task));
4320
  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
                           team, tid, TRUE);
4323
4324 KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4325 tid, gtid, this_thr, this_thr->th.th_current_task));
4326 // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4327 // __kmp_initialize_team()?
4328
4329 /* TODO no worksharing in speculative threads */
4330 this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4331
4332 this_thr->th.th_local.this_construct = 0;
4333
4334 if (!this_thr->th.th_pri_common) {
4335 this_thr->th.th_pri_common =
4336 (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4337 if (__kmp_storage_map) {
      __kmp_print_storage_map_gtid(
          gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
          sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4341 }
4342 this_thr->th.th_pri_head = NULL;
4343 }
4344
4345 if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4346 this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4347 // Make new thread's CG root same as primary thread's
4348 KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4349 kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4350 if (tmp) {
4351 // worker changes CG, need to check if old CG should be freed
4352 int i = tmp->cg_nthreads--;
4353 KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4354 " on node %p of thread %p to %d\n",
4355 this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4356 if (i == 1) {
4357 __kmp_free(tmp); // last thread left CG --> free it
4358 }
4359 }
4360 this_thr->th.th_cg_roots = master->th.th_cg_roots;
4361 // Increment new thread's CG root's counter to add the new thread
4362 this_thr->th.th_cg_roots->cg_nthreads++;
4363 KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4364 " node %p of thread %p to %d\n",
4365 this_thr, this_thr->th.th_cg_roots,
4366 this_thr->th.th_cg_roots->cg_root,
4367 this_thr->th.th_cg_roots->cg_nthreads));
4368 this_thr->th.th_current_task->td_icvs.thread_limit =
4369 this_thr->th.th_cg_roots->cg_thread_limit;
4370 }
4371
4372 /* Initialize dynamic dispatch */
4373 {
4374 volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4375 // Use team max_nproc since this will never change for the team.
4376 size_t disp_size =
4377 sizeof(dispatch_private_info_t) *
4378 (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4379 KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4380 team->t.t_max_nproc));
4381 KMP_ASSERT(dispatch);
4382 KMP_DEBUG_ASSERT(team->t.t_dispatch);
4383 KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4384
4385 dispatch->th_disp_index = 0;
4386 dispatch->th_doacross_buf_idx = 0;
4387 if (!dispatch->th_disp_buffer) {
4388 dispatch->th_disp_buffer =
4389 (dispatch_private_info_t *)__kmp_allocate(disp_size);
4390
4391 if (__kmp_storage_map) {
4392 __kmp_print_storage_map_gtid(
            gtid, &dispatch->th_disp_buffer[0],
            &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
                                          ? 1
                                          : __kmp_dispatch_num_buffers],
            disp_size,
            "th_%d.th_dispatch.th_disp_buffer "
            "(team_%d.t_dispatch[%d].th_disp_buffer)",
4400 gtid, team->t.t_id, gtid);
4401 }
4402 } else {
      memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4404 }
4405
4406 dispatch->th_dispatch_pr_current = 0;
4407 dispatch->th_dispatch_sh_current = 0;
4408
4409 dispatch->th_deo_fcn = 0; /* ORDERED */
4410 dispatch->th_dxo_fcn = 0; /* END ORDERED */
4411 }
4412
4413 this_thr->th.th_next_pool = NULL;
4414
4415 KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4416 KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4417
4418 KMP_MB();
4419}
4420
4421/* allocate a new thread for the requesting team. this is only called from
4422 within a forkjoin critical section. we will first try to get an available
4423 thread from the thread pool. if none is available, we will fork a new one
4424 assuming we are able to create a new one. this should be assured, as the
4425 caller should check on this first. */
4426kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4427 int new_tid) {
4428 kmp_team_t *serial_team;
4429 kmp_info_t *new_thr;
4430 int new_gtid;
4431
4432 KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4433 KMP_DEBUG_ASSERT(root && team);
4434#if !KMP_NESTED_HOT_TEAMS
4435 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4436#endif
4437 KMP_MB();
4438
4439 /* first, try to get one from the thread pool unless allocating thread is
4440 * the main hidden helper thread. The hidden helper team should always
4441 * allocate new OS threads. */
4442 if (__kmp_thread_pool && !KMP_HIDDEN_HELPER_TEAM(team)) {
4443 new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4444 __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4445 if (new_thr == __kmp_thread_pool_insert_pt) {
4446 __kmp_thread_pool_insert_pt = NULL;
4447 }
4448 TCW_4(new_thr->th.th_in_pool, FALSE);
    __kmp_suspend_initialize_thread(new_thr);
    __kmp_lock_suspend_mx(new_thr);
4451 if (new_thr->th.th_active_in_pool == TRUE) {
4452 KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4453 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4454 new_thr->th.th_active_in_pool = FALSE;
4455 }
    __kmp_unlock_suspend_mx(new_thr);
4457
4458 KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4459 __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4460 KMP_ASSERT(!new_thr->th.th_team);
4461 KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4462
4463 /* setup the thread structure */
    __kmp_initialize_info(new_thr, team, new_tid,
                          new_thr->th.th_info.ds.ds_gtid);
4466 KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4467
4468 TCW_4(__kmp_nth, __kmp_nth + 1);
4469
4470 new_thr->th.th_task_state = 0;
4471
4472 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
4473 // Make sure pool thread has transitioned to waiting on own thread struct
4474 KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
4475 // Thread activated in __kmp_allocate_team when increasing team size
4476 }
4477
4478#ifdef KMP_ADJUST_BLOCKTIME
4479 /* Adjust blocktime back to zero if necessary */
4480 /* Middle initialization might not have occurred yet */
4481 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4482 if (__kmp_nth > __kmp_avail_proc) {
4483 __kmp_zero_bt = TRUE;
4484 }
4485 }
4486#endif /* KMP_ADJUST_BLOCKTIME */
4487
4488#if KMP_DEBUG
4489 // If thread entered pool via __kmp_free_thread, wait_flag should !=
4490 // KMP_BARRIER_PARENT_FLAG.
4491 int b;
4492 kmp_balign_t *balign = new_thr->th.th_bar;
4493 for (b = 0; b < bs_last_barrier; ++b)
4494 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4495#endif
4496
4497 KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4498 __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4499
4500 KMP_MB();
4501 return new_thr;
4502 }
4503
  /* no, we'll fork a new one */
4505 KMP_ASSERT(KMP_HIDDEN_HELPER_TEAM(team) || __kmp_nth == __kmp_all_nth);
4506 KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4507
4508#if KMP_USE_MONITOR
4509 // If this is the first worker thread the RTL is creating, then also
4510 // launch the monitor thread. We try to do this as early as possible.
4511 if (!TCR_4(__kmp_init_monitor)) {
4512 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4513 if (!TCR_4(__kmp_init_monitor)) {
4514 KF_TRACE(10, ("before __kmp_create_monitor\n"));
4515 TCW_4(__kmp_init_monitor, 1);
4516 __kmp_create_monitor(&__kmp_monitor);
4517 KF_TRACE(10, ("after __kmp_create_monitor\n"));
4518#if KMP_OS_WINDOWS
4519        // AC: wait until the monitor has started. This is a fix for CQ232808.
4520        // The reason is that if the library is loaded/unloaded in a loop with
4521        // small (parallel) work in between, there is a high probability that the
4522        // monitor thread only starts after the library shutdown. At shutdown it
4523        // is too late to cope with the problem, because when the primary thread
4524        // is in DllMain (process detach) the monitor has no chance to start (it
4525        // is blocked), and the primary thread has no means to inform the monitor
4526        // that the library has gone, because all the memory the monitor can
4527        // access is going to be released/reset.
4528 while (TCR_4(__kmp_init_monitor) < 2) {
4529 KMP_YIELD(TRUE);
4530 }
4531 KF_TRACE(10, ("after monitor thread has started\n"));
4532#endif
4533 }
4534 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4535 }
4536#endif
4537
4538 KMP_MB();
4539
4540 {
4541 int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4542 ? 1
4543 : __kmp_hidden_helper_threads_num + 1;
4544
4545 for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4546 ++new_gtid) {
4547 KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4548 }
4549
4550 if (TCR_4(__kmp_init_hidden_helper_threads)) {
4551 KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4552 }
4553 }
4554
4555 /* allocate space for it. */
4556 new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4557
4558 new_thr->th.th_nt_strict = false;
4559 new_thr->th.th_nt_loc = NULL;
4560 new_thr->th.th_nt_sev = severity_fatal;
4561 new_thr->th.th_nt_msg = NULL;
4562
4563 TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4564
4565#if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4566  // suppress race-condition detection on synchronization flags in debug mode;
4567  // this helps analyze library internals by eliminating false positives
4568 __itt_suppress_mark_range(
4569 __itt_suppress_range, __itt_suppress_threading_errors,
4570 &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4571 __itt_suppress_mark_range(
4572 __itt_suppress_range, __itt_suppress_threading_errors,
4573 &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4574#if KMP_OS_WINDOWS
4575 __itt_suppress_mark_range(
4576 __itt_suppress_range, __itt_suppress_threading_errors,
4577 &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4578#else
4579 __itt_suppress_mark_range(__itt_suppress_range,
4580 __itt_suppress_threading_errors,
4581 &new_thr->th.th_suspend_init_count,
4582 sizeof(new_thr->th.th_suspend_init_count));
4583#endif
4584 // TODO: check if we need to also suppress b_arrived flags
4585 __itt_suppress_mark_range(__itt_suppress_range,
4586 __itt_suppress_threading_errors,
4587 CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4588 sizeof(new_thr->th.th_bar[0].bb.b_go));
4589 __itt_suppress_mark_range(__itt_suppress_range,
4590 __itt_suppress_threading_errors,
4591 CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4592 sizeof(new_thr->th.th_bar[1].bb.b_go));
4593 __itt_suppress_mark_range(__itt_suppress_range,
4594 __itt_suppress_threading_errors,
4595 CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4596 sizeof(new_thr->th.th_bar[2].bb.b_go));
4597#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4598 if (__kmp_storage_map) {
4599    __kmp_print_thread_storage_map(new_thr, new_gtid);
4600 }
4601
4602 // add the reserve serialized team, initialized from the team's primary thread
4603 {
4604 kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4605 KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4606 new_thr->th.th_serial_team = serial_team =
4607        (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4608#if OMPT_SUPPORT
4609                                          ompt_data_none, // root parallel id
4610#endif
4611                                          proc_bind_default, &r_icvs,
4612                                          0 USE_NESTED_HOT_ARG(NULL));
4613 }
4614 KMP_ASSERT(serial_team);
4615  serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
4616  // for execution (it is unused for now).
4617 serial_team->t.t_threads[0] = new_thr;
4618 KF_TRACE(10,
4619 ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4620 new_thr));
4621
4622 /* setup the thread structures */
4623  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4624
4625#if USE_FAST_MEMORY
4626  __kmp_initialize_fast_memory(new_thr);
4627#endif /* USE_FAST_MEMORY */
4628
4629#if KMP_USE_BGET
4630 KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4631  __kmp_initialize_bget(new_thr);
4632#endif
4633
4634  __kmp_init_random(new_thr); // Initialize random number generator
4635
4636 /* Initialize these only once when thread is grabbed for a team allocation */
4637 KA_TRACE(20,
4638 ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4639 __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4640
4641 int b;
4642 kmp_balign_t *balign = new_thr->th.th_bar;
4643 for (b = 0; b < bs_last_barrier; ++b) {
4644 balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4645 balign[b].bb.team = NULL;
4646 balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4647 balign[b].bb.use_oncore_barrier = 0;
4648 }
4649
4650 TCW_PTR(new_thr->th.th_sleep_loc, NULL);
4651 new_thr->th.th_sleep_loc_type = flag_unset;
4652
4653 new_thr->th.th_spin_here = FALSE;
4654 new_thr->th.th_next_waiting = 0;
4655#if KMP_OS_UNIX
4656 new_thr->th.th_blocking = false;
4657#endif
4658
4659#if KMP_AFFINITY_SUPPORTED
4660 new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4661 new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4662 new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4663 new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4664#endif
4665 new_thr->th.th_def_allocator = __kmp_def_allocator;
4666 new_thr->th.th_prev_level = 0;
4667 new_thr->th.th_prev_num_threads = 1;
4668
4669 TCW_4(new_thr->th.th_in_pool, FALSE);
4670 new_thr->th.th_active_in_pool = FALSE;
4671 TCW_4(new_thr->th.th_active, TRUE);
4672
4673 new_thr->th.th_set_nested_nth = NULL;
4674 new_thr->th.th_set_nested_nth_sz = 0;
4675
4676 /* adjust the global counters */
4677 __kmp_all_nth++;
4678 __kmp_nth++;
4679
4680 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4681 // numbers of procs, and method #2 (keyed API call) for higher numbers.
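// Method #1 locates the gtid by searching the stack-address map, which is
// cheap for a handful of threads; method #2 reads it from the keyed
// thread-specific storage, which scales better once many threads exist.
// The crossover point is __kmp_tls_gtid_min.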
4682 if (__kmp_adjust_gtid_mode) {
4683 if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4684 if (TCR_4(__kmp_gtid_mode) != 2) {
4685 TCW_4(__kmp_gtid_mode, 2);
4686 }
4687 } else {
4688 if (TCR_4(__kmp_gtid_mode) != 1) {
4689 TCW_4(__kmp_gtid_mode, 1);
4690 }
4691 }
4692 }
4693
4694#ifdef KMP_ADJUST_BLOCKTIME
4695 /* Adjust blocktime back to zero if necessary */
4696 /* Middle initialization might not have occurred yet */
4697 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4698 if (__kmp_nth > __kmp_avail_proc) {
4699 __kmp_zero_bt = TRUE;
4700 }
4701 }
4702#endif /* KMP_ADJUST_BLOCKTIME */
4703
4704#if KMP_AFFINITY_SUPPORTED
4705 // Set the affinity and topology information for new thread
4706  __kmp_affinity_set_init_mask(new_gtid, /*isa_root=*/FALSE);
4707#endif
4708
4709 /* actually fork it and create the new worker thread */
4710 KF_TRACE(
4711 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4712  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4713 KF_TRACE(10,
4714 ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4715
4716 KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4717 new_gtid));
4718 KMP_MB();
4719 return new_thr;
4720}
4721
4722/* Reinitialize team for reuse.
4723   The hot team code calls this routine at every fork barrier, so the EPCC
4724   barrier tests are extremely sensitive to changes in it, especially writes
4725   to the team struct, which cause a cache invalidation in all threads.
4726   IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4727static void __kmp_reinitialize_team(kmp_team_t *team,
4728 kmp_internal_control_t *new_icvs,
4729 ident_t *loc) {
4730 KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4731 team->t.t_threads[0], team));
4732 KMP_DEBUG_ASSERT(team && new_icvs);
4733 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4734 KMP_CHECK_UPDATE(team->t.t_ident, loc);
4735
4736 KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4737 // Copy ICVs to the primary thread's implicit taskdata
4738  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4739  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4740
4741 KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4742 team->t.t_threads[0], team));
4743}
4744
4745/* Initialize the team data structure.
4746 This assumes the t_threads and t_max_nproc are already set.
4747 Also, we don't touch the arguments */
4748static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4749 kmp_internal_control_t *new_icvs,
4750 ident_t *loc) {
4751 KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4752
4753 /* verify */
4754 KMP_DEBUG_ASSERT(team);
4755 KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4756 KMP_DEBUG_ASSERT(team->t.t_threads);
4757 KMP_MB();
4758
4759 team->t.t_master_tid = 0; /* not needed */
4760 /* team->t.t_master_bar; not needed */
4761 team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4762 team->t.t_nproc = new_nproc;
4763
4764 /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4765 team->t.t_next_pool = NULL;
4766 /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4767 * up hot team */
4768
4769 TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4770 team->t.t_invoke = NULL; /* not needed */
4771
4772 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4773 team->t.t_sched.sched = new_icvs->sched.sched;
4774
4775#if KMP_ARCH_X86 || KMP_ARCH_X86_64
4776 team->t.t_fp_control_saved = FALSE; /* not needed */
4777 team->t.t_x87_fpu_control_word = 0; /* not needed */
4778 team->t.t_mxcsr = 0; /* not needed */
4779#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4780
4781 team->t.t_construct = 0;
4782
4783 team->t.t_ordered.dt.t_value = 0;
4784 team->t.t_master_active = FALSE;
4785
4786#ifdef KMP_DEBUG
4787 team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4788#endif
4789#if KMP_OS_WINDOWS
4790 team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4791#endif
4792
4793 team->t.t_control_stack_top = NULL;
4794
4795 __kmp_reinitialize_team(team, new_icvs, loc);
4796
4797 KMP_MB();
4798 KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4799}
4800
4801#if KMP_AFFINITY_SUPPORTED
4802static inline void __kmp_set_thread_place(kmp_team_t *team, kmp_info_t *th,
4803 int first, int last, int newp) {
4804 th->th.th_first_place = first;
4805 th->th.th_last_place = last;
4806 th->th.th_new_place = newp;
4807 if (newp != th->th.th_current_place) {
4808 if (__kmp_display_affinity && team->t.t_display_affinity != 1)
4809 team->t.t_display_affinity = 1;
4810 // Copy topology information associated with the new place
4811 th->th.th_topology_ids = __kmp_affinity.ids[th->th.th_new_place];
4812 th->th.th_topology_attrs = __kmp_affinity.attrs[th->th.th_new_place];
4813 }
4814}
4815
4816// __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4817// It calculates the worker + primary thread's partition based upon the parent
4818// thread's partition, and binds each worker to a thread in their partition.
4819// The primary thread's partition should already include its current binding.
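// For example (illustrative values): with 4 places {0,1,2,3}, the primary
// thread bound to place 1, and a team of 4 threads, proc_bind_primary puts
// every worker on place 1; proc_bind_close assigns places 1,2,3,0 to threads
// 0..3; proc_bind_spread additionally narrows each thread's partition to its
// own sub-range of the primary's partition.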
4820static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4821 // Do not partition places for the hidden helper team
4822 if (KMP_HIDDEN_HELPER_TEAM(team))
4823 return;
4824 // Copy the primary thread's place partition to the team struct
4825 kmp_info_t *master_th = team->t.t_threads[0];
4826 KMP_DEBUG_ASSERT(master_th != NULL);
4827 kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4828 int first_place = master_th->th.th_first_place;
4829 int last_place = master_th->th.th_last_place;
4830 int masters_place = master_th->th.th_current_place;
4831 int num_masks = __kmp_affinity.num_masks;
4832 team->t.t_first_place = first_place;
4833 team->t.t_last_place = last_place;
4834
4835 KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4836 "bound to place %d partition = [%d,%d]\n",
4837 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4838 team->t.t_id, masters_place, first_place, last_place));
4839
4840 switch (proc_bind) {
4841
4842 case proc_bind_default:
4843 // Serial teams might have the proc_bind policy set to proc_bind_default.
4844 // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4845 KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4846 break;
4847
4848 case proc_bind_primary: {
4849 int f;
4850 int n_th = team->t.t_nproc;
4851 for (f = 1; f < n_th; f++) {
4852 kmp_info_t *th = team->t.t_threads[f];
4853 KMP_DEBUG_ASSERT(th != NULL);
4854      __kmp_set_thread_place(team, th, first_place, last_place, masters_place);
4855
4856 KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4857 "partition = [%d,%d]\n",
4858 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4859 f, masters_place, first_place, last_place));
4860 }
4861 } break;
4862
4863 case proc_bind_close: {
4864 int f;
4865 int n_th = team->t.t_nproc;
4866 int n_places;
4867 if (first_place <= last_place) {
4868 n_places = last_place - first_place + 1;
4869 } else {
4870 n_places = num_masks - first_place + last_place + 1;
4871 }
4872 if (n_th <= n_places) {
4873 int place = masters_place;
4874 for (f = 1; f < n_th; f++) {
4875 kmp_info_t *th = team->t.t_threads[f];
4876 KMP_DEBUG_ASSERT(th != NULL);
4877
4878 if (place == last_place) {
4879 place = first_place;
4880 } else if (place == (num_masks - 1)) {
4881 place = 0;
4882 } else {
4883 place++;
4884 }
4885        __kmp_set_thread_place(team, th, first_place, last_place, place);
4886
4887 KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4888 "partition = [%d,%d]\n",
4889 __kmp_gtid_from_thread(team->t.t_threads[f]),
4890 team->t.t_id, f, place, first_place, last_place));
4891 }
4892 } else {
4893 int S, rem, gap, s_count;
4894 S = n_th / n_places;
4895 s_count = 0;
4896 rem = n_th - (S * n_places);
4897 gap = rem > 0 ? n_places / rem : n_places;
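      // Blocked distribution: S threads per place, with the rem leftover
      // threads handed out one extra per place every `gap` places. For
      // example, n_th = 10 over n_places = 4 gives S = 2, rem = 2, gap = 2,
      // i.e. 3,2,3,2 threads per place starting at the primary's place.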
4898 int place = masters_place;
4899 int gap_ct = gap;
4900 for (f = 0; f < n_th; f++) {
4901 kmp_info_t *th = team->t.t_threads[f];
4902 KMP_DEBUG_ASSERT(th != NULL);
4903
4904        __kmp_set_thread_place(team, th, first_place, last_place, place);
4905 s_count++;
4906
4907 if ((s_count == S) && rem && (gap_ct == gap)) {
4908 // do nothing, add an extra thread to place on next iteration
4909 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4910 // we added an extra thread to this place; move to next place
4911 if (place == last_place) {
4912 place = first_place;
4913 } else if (place == (num_masks - 1)) {
4914 place = 0;
4915 } else {
4916 place++;
4917 }
4918 s_count = 0;
4919 gap_ct = 1;
4920 rem--;
4921 } else if (s_count == S) { // place full; don't add extra
4922 if (place == last_place) {
4923 place = first_place;
4924 } else if (place == (num_masks - 1)) {
4925 place = 0;
4926 } else {
4927 place++;
4928 }
4929 gap_ct++;
4930 s_count = 0;
4931 }
4932
4933 KA_TRACE(100,
4934 ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4935 "partition = [%d,%d]\n",
4936 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4937 th->th.th_new_place, first_place, last_place));
4938 }
4939 KMP_DEBUG_ASSERT(place == masters_place);
4940 }
4941 } break;
4942
4943 case proc_bind_spread: {
4944 int f;
4945 int n_th = team->t.t_nproc;
4946 int n_places;
4947 int thidx;
4948 if (first_place <= last_place) {
4949 n_places = last_place - first_place + 1;
4950 } else {
4951 n_places = num_masks - first_place + last_place + 1;
4952 }
4953 if (n_th <= n_places) {
4954 int place = -1;
4955
4956 if (n_places != num_masks) {
4957 int S = n_places / n_th;
4958 int s_count, rem, gap, gap_ct;
4959
4960 place = masters_place;
4961 rem = n_places - n_th * S;
4962 gap = rem ? n_th / rem : 1;
4963 gap_ct = gap;
4964 thidx = n_th;
4965 if (update_master_only == 1)
4966 thidx = 1;
4967 for (f = 0; f < thidx; f++) {
4968 kmp_info_t *th = team->t.t_threads[f];
4969 KMP_DEBUG_ASSERT(th != NULL);
4970
4971 int fplace = place, nplace = place;
4972 s_count = 1;
4973 while (s_count < S) {
4974 if (place == last_place) {
4975 place = first_place;
4976 } else if (place == (num_masks - 1)) {
4977 place = 0;
4978 } else {
4979 place++;
4980 }
4981 s_count++;
4982 }
4983 if (rem && (gap_ct == gap)) {
4984 if (place == last_place) {
4985 place = first_place;
4986 } else if (place == (num_masks - 1)) {
4987 place = 0;
4988 } else {
4989 place++;
4990 }
4991 rem--;
4992 gap_ct = 0;
4993 }
4994          __kmp_set_thread_place(team, th, fplace, place, nplace);
4995 gap_ct++;
4996
4997 if (place == last_place) {
4998 place = first_place;
4999 } else if (place == (num_masks - 1)) {
5000 place = 0;
5001 } else {
5002 place++;
5003 }
5004
5005 KA_TRACE(100,
5006 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5007 "partition = [%d,%d], num_masks: %u\n",
5008 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
5009 f, th->th.th_new_place, th->th.th_first_place,
5010 th->th.th_last_place, num_masks));
5011 }
5012 } else {
5013          /* With a uniform space of available computation places, we can
5014             create T partitions of roughly P/T places each and put each
5015             thread into the first place of its partition. */
5016 double current = static_cast<double>(masters_place);
5017 double spacing =
5018 (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
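          // For example, n_places = 8, n_th = 3, primary on place 0:
          // spacing = 9/3 = 3.0, giving partitions [0,2], [3,5] and [6,7]
          // (the last partition is clamped to the end of the place list).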
5019 int first, last;
5020 kmp_info_t *th;
5021
5022 thidx = n_th + 1;
5023 if (update_master_only == 1)
5024 thidx = 1;
5025 for (f = 0; f < thidx; f++) {
5026 first = static_cast<int>(current);
5027 last = static_cast<int>(current + spacing) - 1;
5028 KMP_DEBUG_ASSERT(last >= first);
5029 if (first >= n_places) {
5030 if (masters_place) {
5031 first -= n_places;
5032 last -= n_places;
5033 if (first == (masters_place + 1)) {
5034 KMP_DEBUG_ASSERT(f == n_th);
5035 first--;
5036 }
5037 if (last == masters_place) {
5038 KMP_DEBUG_ASSERT(f == (n_th - 1));
5039 last--;
5040 }
5041 } else {
5042 KMP_DEBUG_ASSERT(f == n_th);
5043 first = 0;
5044 last = 0;
5045 }
5046 }
5047 if (last >= n_places) {
5048 last = (n_places - 1);
5049 }
5050 place = first;
5051 current += spacing;
5052 if (f < n_th) {
5053 KMP_DEBUG_ASSERT(0 <= first);
5054 KMP_DEBUG_ASSERT(n_places > first);
5055 KMP_DEBUG_ASSERT(0 <= last);
5056 KMP_DEBUG_ASSERT(n_places > last);
5057 KMP_DEBUG_ASSERT(last_place >= first_place);
5058 th = team->t.t_threads[f];
5059 KMP_DEBUG_ASSERT(th);
5060            __kmp_set_thread_place(team, th, first, last, place);
5061 KA_TRACE(100,
5062 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5063 "partition = [%d,%d], spacing = %.4f\n",
5064 __kmp_gtid_from_thread(team->t.t_threads[f]),
5065 team->t.t_id, f, th->th.th_new_place,
5066 th->th.th_first_place, th->th.th_last_place, spacing));
5067 }
5068 }
5069 }
5070 KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5071 } else {
5072 int S, rem, gap, s_count;
5073 S = n_th / n_places;
5074 s_count = 0;
5075 rem = n_th - (S * n_places);
5076 gap = rem > 0 ? n_places / rem : n_places;
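      // More threads than places: reuse the blocked distribution from the
      // proc_bind_close case above, except that each thread's partition is
      // narrowed to the single place it is assigned to.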
5077 int place = masters_place;
5078 int gap_ct = gap;
5079 thidx = n_th;
5080 if (update_master_only == 1)
5081 thidx = 1;
5082 for (f = 0; f < thidx; f++) {
5083 kmp_info_t *th = team->t.t_threads[f];
5084 KMP_DEBUG_ASSERT(th != NULL);
5085
5086        __kmp_set_thread_place(team, th, place, place, place);
5087 s_count++;
5088
5089 if ((s_count == S) && rem && (gap_ct == gap)) {
5090 // do nothing, add an extra thread to place on next iteration
5091 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
5092 // we added an extra thread to this place; move on to next place
5093 if (place == last_place) {
5094 place = first_place;
5095 } else if (place == (num_masks - 1)) {
5096 place = 0;
5097 } else {
5098 place++;
5099 }
5100 s_count = 0;
5101 gap_ct = 1;
5102 rem--;
5103 } else if (s_count == S) { // place is full; don't add extra thread
5104 if (place == last_place) {
5105 place = first_place;
5106 } else if (place == (num_masks - 1)) {
5107 place = 0;
5108 } else {
5109 place++;
5110 }
5111 gap_ct++;
5112 s_count = 0;
5113 }
5114
5115 KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5116 "partition = [%d,%d]\n",
5117 __kmp_gtid_from_thread(team->t.t_threads[f]),
5118 team->t.t_id, f, th->th.th_new_place,
5119 th->th.th_first_place, th->th.th_last_place));
5120 }
5121 KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5122 }
5123 } break;
5124
5125 default:
5126 break;
5127 }
5128
5129 KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
5130}
5131
5132#endif // KMP_AFFINITY_SUPPORTED
5133
5134/* Allocate a new team data structure to use. Take one from the free pool if
5135   available. */
5136kmp_team_t *
5137__kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
5138#if OMPT_SUPPORT
5139 ompt_data_t ompt_parallel_data,
5140#endif
5141 kmp_proc_bind_t new_proc_bind,
5142 kmp_internal_control_t *new_icvs,
5143 int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5144 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
5145 int f;
5146 kmp_team_t *team;
5147 int use_hot_team = !root->r.r_active;
5148 int level = 0;
5149 int do_place_partition = 1;
5150
5151 KA_TRACE(20, ("__kmp_allocate_team: called\n"));
5152 KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
5153 KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
5154 KMP_MB();
5155
5156#if KMP_NESTED_HOT_TEAMS
5157 kmp_hot_team_ptr_t *hot_teams;
5158 if (master) {
5159 team = master->th.th_team;
5160 level = team->t.t_active_level;
5161 if (master->th.th_teams_microtask) { // in teams construct?
5162 if (master->th.th_teams_size.nteams > 1 &&
5163 ( // #teams > 1
5164 team->t.t_pkfn ==
5165 (microtask_t)__kmp_teams_master || // inner fork of the teams
5166 master->th.th_teams_level <
5167 team->t.t_level)) { // or nested parallel inside the teams
5168        ++level; // don't increment if #teams==1 or for the outer fork of the
5169        // teams; increment otherwise
5170 }
5171 // Do not perform the place partition if inner fork of the teams
5172 // Wait until nested parallel region encountered inside teams construct
5173 if ((master->th.th_teams_size.nteams == 1 &&
5174 master->th.th_teams_level >= team->t.t_level) ||
5175 (team->t.t_pkfn == (microtask_t)__kmp_teams_master))
5176 do_place_partition = 0;
5177 }
5178 hot_teams = master->th.th_hot_teams;
5179 if (level < __kmp_hot_teams_max_level && hot_teams &&
5180 hot_teams[level].hot_team) {
5181 // hot team has already been allocated for given level
5182 use_hot_team = 1;
5183 } else {
5184 use_hot_team = 0;
5185 }
5186 } else {
5187 // check we won't access uninitialized hot_teams, just in case
5188 KMP_DEBUG_ASSERT(new_nproc == 1);
5189 }
5190#endif
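  // A "hot" team is the team (threads included) kept alive between parallel
  // regions so it can be reused without re-creating threads; the code below
  // tries to recycle it, resizing it if the requested size changed.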
5191 // Optimization to use a "hot" team
5192 if (use_hot_team && new_nproc > 1) {
5193 KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5194#if KMP_NESTED_HOT_TEAMS
5195 team = hot_teams[level].hot_team;
5196#else
5197 team = root->r.r_hot_team;
5198#endif
5199#if KMP_DEBUG
5200 if (__kmp_tasking_mode != tskm_immediate_exec) {
5201 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5202 "task_team[1] = %p before reinit\n",
5203 team->t.t_task_team[0], team->t.t_task_team[1]));
5204 }
5205#endif
5206
5207 if (team->t.t_nproc != new_nproc &&
5208 __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5209 // Distributed barrier may need a resize
5210 int old_nthr = team->t.t_nproc;
5211      __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
5212 }
5213
5214 // If not doing the place partition, then reset the team's proc bind
5215 // to indicate that partitioning of all threads still needs to take place
5216 if (do_place_partition == 0)
5217 team->t.t_proc_bind = proc_bind_default;
5218 // Has the number of threads changed?
5219 /* Let's assume the most common case is that the number of threads is
5220 unchanged, and put that case first. */
5221 if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5222 KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5223 // This case can mean that omp_set_num_threads() was called and the hot
5224 // team size was already reduced, so we check the special flag
5225 if (team->t.t_size_changed == -1) {
5226 team->t.t_size_changed = 1;
5227 } else {
5228 KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5229 }
5230
5231 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5232 kmp_r_sched_t new_sched = new_icvs->sched;
5233 // set primary thread's schedule as new run-time schedule
5234 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5235
5236 __kmp_reinitialize_team(team, new_icvs,
5237                              root->r.r_uber_thread->th.th_ident);
5238
5239 KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5240 team->t.t_threads[0], team));
5241      __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5242
5243#if KMP_AFFINITY_SUPPORTED
5244 if ((team->t.t_size_changed == 0) &&
5245 (team->t.t_proc_bind == new_proc_bind)) {
5246 if (new_proc_bind == proc_bind_spread) {
5247 if (do_place_partition) {
5248 // add flag to update only master for spread
5249            __kmp_partition_places(team, 1);
5250 }
5251 }
5252 KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5253 "proc_bind = %d, partition = [%d,%d]\n",
5254 team->t.t_id, new_proc_bind, team->t.t_first_place,
5255 team->t.t_last_place));
5256 } else {
5257 if (do_place_partition) {
5258 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5259 __kmp_partition_places(team);
5260 }
5261 }
5262#else
5263 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5264#endif /* KMP_AFFINITY_SUPPORTED */
5265 } else if (team->t.t_nproc > new_nproc) {
5266 KA_TRACE(20,
5267 ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5268 new_nproc));
5269
5270 team->t.t_size_changed = 1;
5271 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5272 // Barrier size already reduced earlier in this function
5273 // Activate team threads via th_used_in_team
5274        __kmp_add_threads_to_team(team, new_nproc);
5275 }
5276 // When decreasing team size, threads no longer in the team should
5277 // unref task team.
5278 if (__kmp_tasking_mode != tskm_immediate_exec) {
5279 for (f = new_nproc; f < team->t.t_nproc; f++) {
5280 kmp_info_t *th = team->t.t_threads[f];
5281 KMP_DEBUG_ASSERT(th);
5282 th->th.th_task_team = NULL;
5283 }
5284 }
5285#if KMP_NESTED_HOT_TEAMS
5286 if (__kmp_hot_teams_mode == 0) {
5287        // AC: the saved number of threads should match the team's value in this
5288        // mode; it can be bigger in mode 1, when the hot team keeps threads in reserve
5289 KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5290 hot_teams[level].hot_team_nth = new_nproc;
5291#endif // KMP_NESTED_HOT_TEAMS
5292 /* release the extra threads we don't need any more */
5293 for (f = new_nproc; f < team->t.t_nproc; f++) {
5294 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5295 __kmp_free_thread(team->t.t_threads[f]);
5296 team->t.t_threads[f] = NULL;
5297 }
5298#if KMP_NESTED_HOT_TEAMS
5299 } // (__kmp_hot_teams_mode == 0)
5300 else {
5301 // When keeping extra threads in team, switch threads to wait on own
5302 // b_go flag
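      // (a thread parked on its own b_go flag sleeps independently of the
      // team's barrier machinery, so it can be woken individually when the
      // hot team grows back)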
5303 for (f = new_nproc; f < team->t.t_nproc; ++f) {
5304 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5305 kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5306 for (int b = 0; b < bs_last_barrier; ++b) {
5307 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5308 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5309 }
5310 KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5311 }
5312 }
5313 }
5314#endif // KMP_NESTED_HOT_TEAMS
5315 team->t.t_nproc = new_nproc;
5316 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5317 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5318 __kmp_reinitialize_team(team, new_icvs,
5319                              root->r.r_uber_thread->th.th_ident);
5320
5321 // Update remaining threads
5322 for (f = 0; f < new_nproc; ++f) {
5323 team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5324 }
5325
5326 // restore the current task state of the primary thread: should be the
5327 // implicit task
5328 KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5329 team->t.t_threads[0], team));
5330
5331      __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5332
5333#ifdef KMP_DEBUG
5334 for (f = 0; f < team->t.t_nproc; f++) {
5335 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5336 team->t.t_threads[f]->th.th_team_nproc ==
5337 team->t.t_nproc);
5338 }
5339#endif
5340
5341 if (do_place_partition) {
5342 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5343#if KMP_AFFINITY_SUPPORTED
5344 __kmp_partition_places(team);
5345#endif
5346 }
5347 } else { // team->t.t_nproc < new_nproc
5348
5349 KA_TRACE(20,
5350 ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5351 new_nproc));
5352      int old_nproc = team->t.t_nproc; // save the old value; used later to update only the new threads
5353 team->t.t_size_changed = 1;
5354
5355#if KMP_NESTED_HOT_TEAMS
5356 int avail_threads = hot_teams[level].hot_team_nth;
5357 if (new_nproc < avail_threads)
5358 avail_threads = new_nproc;
5359 kmp_info_t **other_threads = team->t.t_threads;
5360 for (f = team->t.t_nproc; f < avail_threads; ++f) {
5361 // Adjust barrier data of reserved threads (if any) of the team
5362 // Other data will be set in __kmp_initialize_info() below.
5363 int b;
5364 kmp_balign_t *balign = other_threads[f]->th.th_bar;
5365 for (b = 0; b < bs_last_barrier; ++b) {
5366 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5367 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5368#if USE_DEBUGGER
5369 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5370#endif
5371 }
5372 }
5373 if (hot_teams[level].hot_team_nth >= new_nproc) {
5374 // we have all needed threads in reserve, no need to allocate any
5375      // this is only possible in mode 1; there cannot be reserved threads in mode 0
5376 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5377 team->t.t_nproc = new_nproc; // just get reserved threads involved
5378 } else {
5379 // We may have some threads in reserve, but not enough;
5380 // get reserved threads involved if any.
5381 team->t.t_nproc = hot_teams[level].hot_team_nth;
5382 hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5383#endif // KMP_NESTED_HOT_TEAMS
5384 if (team->t.t_max_nproc < new_nproc) {
5385 /* reallocate larger arrays */
5386        __kmp_reallocate_team_arrays(team, new_nproc);
5387 __kmp_reinitialize_team(team, new_icvs, NULL);
5388 }
5389
5390#if (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY) && \
5391 KMP_AFFINITY_SUPPORTED
5392    /* Temporarily set the full mask for the primary thread before creating
5393       the workers. The reason is that workers inherit the affinity from the
5394       primary thread, so if many workers are created quickly on a single
5395       core, they don't get a chance to set their own affinity for a long
5396       time. */
5397 kmp_affinity_raii_t new_temp_affinity{__kmp_affin_fullMask};
5398#endif
5399
5400 /* allocate new threads for the hot team */
5401 for (f = team->t.t_nproc; f < new_nproc; f++) {
5402        kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5403 KMP_DEBUG_ASSERT(new_worker);
5404 team->t.t_threads[f] = new_worker;
5405
5406 KA_TRACE(20,
5407 ("__kmp_allocate_team: team %d init T#%d arrived: "
5408 "join=%llu, plain=%llu\n",
5409 team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5410 team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5411 team->t.t_bar[bs_plain_barrier].b_arrived));
5412
5413 { // Initialize barrier data for new threads.
5414 int b;
5415 kmp_balign_t *balign = new_worker->th.th_bar;
5416 for (b = 0; b < bs_last_barrier; ++b) {
5417 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5418 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5419 KMP_BARRIER_PARENT_FLAG);
5420#if USE_DEBUGGER
5421 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5422#endif
5423 }
5424 }
5425 }
5426
5427#if (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY) && \
5428 KMP_AFFINITY_SUPPORTED
5429 /* Restore initial primary thread's affinity mask */
5430 new_temp_affinity.restore();
5431#endif
5432#if KMP_NESTED_HOT_TEAMS
5433 } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5434#endif // KMP_NESTED_HOT_TEAMS
5435 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5436 // Barrier size already increased earlier in this function
5437 // Activate team threads via th_used_in_team
5438        __kmp_add_threads_to_team(team, new_nproc);
5439 }
5440      /* make sure everyone is synchronized */
5442 __kmp_initialize_team(team, new_nproc, new_icvs,
5443                            root->r.r_uber_thread->th.th_ident);
5444
5445 /* reinitialize the threads */
5446 KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5447 for (f = 0; f < team->t.t_nproc; ++f)
5448        __kmp_initialize_info(team->t.t_threads[f], team, f,
5449                              __kmp_gtid_from_tid(f, team));
5450
5451 // set th_task_state for new threads in hot team with older thread's state
5452 kmp_uint8 old_state = team->t.t_threads[old_nproc - 1]->th.th_task_state;
5453 for (f = old_nproc; f < team->t.t_nproc; ++f)
5454 team->t.t_threads[f]->th.th_task_state = old_state;
5455
5456#ifdef KMP_DEBUG
5457 for (f = 0; f < team->t.t_nproc; ++f) {
5458 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5459 team->t.t_threads[f]->th.th_team_nproc ==
5460 team->t.t_nproc);
5461 }
5462#endif
5463
5464 if (do_place_partition) {
5465 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5466#if KMP_AFFINITY_SUPPORTED
5467 __kmp_partition_places(team);
5468#endif
5469 }
5470 } // Check changes in number of threads
5471
5472 if (master->th.th_teams_microtask) {
5473 for (f = 1; f < new_nproc; ++f) {
5474 // propagate teams construct specific info to workers
5475 kmp_info_t *thr = team->t.t_threads[f];
5476 thr->th.th_teams_microtask = master->th.th_teams_microtask;
5477 thr->th.th_teams_level = master->th.th_teams_level;
5478 thr->th.th_teams_size = master->th.th_teams_size;
5479 }
5480 }
5481#if KMP_NESTED_HOT_TEAMS
5482 if (level) {
5483 // Sync barrier state for nested hot teams, not needed for outermost hot
5484 // team.
5485 for (f = 1; f < new_nproc; ++f) {
5486 kmp_info_t *thr = team->t.t_threads[f];
5487 int b;
5488 kmp_balign_t *balign = thr->th.th_bar;
5489 for (b = 0; b < bs_last_barrier; ++b) {
5490 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5491 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5492#if USE_DEBUGGER
5493 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5494#endif
5495 }
5496 }
5497 }
5498#endif // KMP_NESTED_HOT_TEAMS
5499
5500 /* reallocate space for arguments if necessary */
5501 __kmp_alloc_argv_entries(argc, team, TRUE);
5502 KMP_CHECK_UPDATE(team->t.t_argc, argc);
5503 // The hot team re-uses the previous task team,
5504 // if untouched during the previous release->gather phase.
5505
5506 KF_TRACE(10, (" hot_team = %p\n", team));
5507
5508#if KMP_DEBUG
5509 if (__kmp_tasking_mode != tskm_immediate_exec) {
5510 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5511 "task_team[1] = %p after reinit\n",
5512 team->t.t_task_team[0], team->t.t_task_team[1]));
5513 }
5514#endif
5515
5516#if OMPT_SUPPORT
5517    __ompt_team_assign_id(team, ompt_parallel_data);
5518#endif
5519
5520 KMP_MB();
5521
5522 return team;
5523 }
5524
5525 /* next, let's try to take one from the team pool */
5526 KMP_MB();
5527 for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5528 /* TODO: consider resizing undersized teams instead of reaping them, now
5529 that we have a resizing mechanism */
5530 if (team->t.t_max_nproc >= max_nproc) {
5531 /* take this team from the team pool */
5532 __kmp_team_pool = team->t.t_next_pool;
5533
5534 if (max_nproc > 1 &&
5535 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5536 if (!team->t.b) { // Allocate barrier structure
5537          team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5538 }
5539 }
5540
5541 /* setup the team for fresh use */
5542 __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5543
5544 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5545 "task_team[1] %p to NULL\n",
5546 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5547 team->t.t_task_team[0] = NULL;
5548 team->t.t_task_team[1] = NULL;
5549
5550 /* reallocate space for arguments if necessary */
5551 __kmp_alloc_argv_entries(argc, team, TRUE);
5552 KMP_CHECK_UPDATE(team->t.t_argc, argc);
5553
5554 KA_TRACE(
5555 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5556 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5557 { // Initialize barrier data.
5558 int b;
5559 for (b = 0; b < bs_last_barrier; ++b) {
5560 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5561#if USE_DEBUGGER
5562 team->t.t_bar[b].b_master_arrived = 0;
5563 team->t.t_bar[b].b_team_arrived = 0;
5564#endif
5565 }
5566 }
5567
5568 team->t.t_proc_bind = new_proc_bind;
5569
5570 KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5571 team->t.t_id));
5572
5573#if OMPT_SUPPORT
5574      __ompt_team_assign_id(team, ompt_parallel_data);
5575#endif
5576
5577 team->t.t_nested_nth = NULL;
5578
5579 KMP_MB();
5580
5581 return team;
5582 }
5583
5584 /* reap team if it is too small, then loop back and check the next one */
5585    // not sure if this is wise, but it will be redone during the hot-teams
5586    // rewrite.
5587 /* TODO: Use technique to find the right size hot-team, don't reap them */
5588 team = __kmp_reap_team(team);
5589 __kmp_team_pool = team;
5590 }
5591
5592 /* nothing available in the pool, no matter, make a new team! */
5593 KMP_MB();
5594 team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5595
5596 /* and set it up */
5597 team->t.t_max_nproc = max_nproc;
5598 if (max_nproc > 1 &&
5599 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5600 // Allocate barrier structure
5601    team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5602 }
5603
5604  /* NOTE: for some reason, allocating one big buffer and dividing it up
5605     seems to really hurt performance on the P4, so let's not use this approach */
5606  __kmp_allocate_team_arrays(team, max_nproc);
5607
5608 KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5609 __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5610
5611 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5612 "%p to NULL\n",
5613 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5614 team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5615 // memory, no need to duplicate
5616 team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5617 // memory, no need to duplicate
5618
5619 if (__kmp_storage_map) {
5620    __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5621 }
5622
5623 /* allocate space for arguments */
5624 __kmp_alloc_argv_entries(argc, team, FALSE);
5625 team->t.t_argc = argc;
5626
5627 KA_TRACE(20,
5628 ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5629 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5630 { // Initialize barrier data.
5631 int b;
5632 for (b = 0; b < bs_last_barrier; ++b) {
5633 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5634#if USE_DEBUGGER
5635 team->t.t_bar[b].b_master_arrived = 0;
5636 team->t.t_bar[b].b_team_arrived = 0;
5637#endif
5638 }
5639 }
5640
5641 team->t.t_proc_bind = new_proc_bind;
5642
5643#if OMPT_SUPPORT
5644  __ompt_team_assign_id(team, ompt_parallel_data);
5645 team->t.ompt_serialized_team_info = NULL;
5646#endif
5647
5648 KMP_MB();
5649
5650 team->t.t_nested_nth = NULL;
5651
5652 KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5653 team->t.t_id));
5654
5655 return team;
5656}
5657
5658/* TODO implement hot-teams at all levels */
5659/* TODO implement lazy thread release on demand (disband request) */
5660
5661/* free the team. return it to the team pool. release all the threads
5662 * associated with it */
5663void __kmp_free_team(kmp_root_t *root,
5664 kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5665 int f;
5666 KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5667 team->t.t_id));
5668
5669 /* verify state */
5670 KMP_DEBUG_ASSERT(root);
5671 KMP_DEBUG_ASSERT(team);
5672 KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5673 KMP_DEBUG_ASSERT(team->t.t_threads);
5674
5675 int use_hot_team = team == root->r.r_hot_team;
5676#if KMP_NESTED_HOT_TEAMS
5677 int level;
5678 if (master) {
5679 level = team->t.t_active_level - 1;
5680 if (master->th.th_teams_microtask) { // in teams construct?
5681 if (master->th.th_teams_size.nteams > 1) {
5682 ++level; // level was not increased in teams construct for
5683 // team_of_masters
5684 }
5685 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5686 master->th.th_teams_level == team->t.t_level) {
5687 ++level; // level was not increased in teams construct for
5688 // team_of_workers before the parallel
5689 } // team->t.t_level will be increased inside parallel
5690 }
5691#if KMP_DEBUG
5692 kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
5693#endif
5694 if (level < __kmp_hot_teams_max_level) {
5695 KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5696 use_hot_team = 1;
5697 }
5698 }
5699#endif // KMP_NESTED_HOT_TEAMS
5700
5701 /* team is done working */
5702 TCW_SYNC_PTR(team->t.t_pkfn,
5703 NULL); // Important for Debugging Support Library.
5704#if KMP_OS_WINDOWS
5705 team->t.t_copyin_counter = 0; // init counter for possible reuse
5706#endif
5707 // Do not reset pointer to parent team to NULL for hot teams.
5708
5709 /* if we are non-hot team, release our threads */
5710 if (!use_hot_team) {
5711 if (__kmp_tasking_mode != tskm_immediate_exec) {
5712 // Wait for threads to reach reapable state
5713 for (f = 1; f < team->t.t_nproc; ++f) {
5714 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5715 kmp_info_t *th = team->t.t_threads[f];
5716 volatile kmp_uint32 *state = &th->th.th_reap_state;
5717 while (*state != KMP_SAFE_TO_REAP) {
5718#if KMP_OS_WINDOWS
5719 // On Windows a thread can be killed at any time, check this
5720 DWORD ecode;
5721 if (!__kmp_is_thread_alive(th, &ecode)) {
5722 *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5723 break;
5724 }
5725#endif
5726 // first check if thread is sleeping
5727 if (th->th.th_sleep_loc)
5728          __kmp_null_resume_wrapper(th);
5729 KMP_CPU_PAUSE();
5730 }
5731 }
5732
5733 // Delete task teams
5734 int tt_idx;
5735 for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5736 kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5737 if (task_team != NULL) {
5738 for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5739 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5740 team->t.t_threads[f]->th.th_task_team = NULL;
5741 }
5742 KA_TRACE(
5743 20,
5744 ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5745 __kmp_get_gtid(), task_team, team->t.t_id));
5746#if KMP_NESTED_HOT_TEAMS
5747          __kmp_free_task_team(master, task_team);
5748#endif
5749 team->t.t_task_team[tt_idx] = NULL;
5750 }
5751 }
5752 }
5753
5754 // Before clearing parent pointer, check if nested_nth list should be freed
5755 if (team->t.t_nested_nth && team->t.t_nested_nth != &__kmp_nested_nth &&
5756 team->t.t_nested_nth != team->t.t_parent->t.t_nested_nth) {
5757 KMP_INTERNAL_FREE(team->t.t_nested_nth->nth);
5758 KMP_INTERNAL_FREE(team->t.t_nested_nth);
5759 }
5760 team->t.t_nested_nth = NULL;
5761
5762 // Reset pointer to parent team only for non-hot teams.
5763 team->t.t_parent = NULL;
5764 team->t.t_level = 0;
5765 team->t.t_active_level = 0;
5766
5767 /* free the worker threads */
5768 for (f = 1; f < team->t.t_nproc; ++f) {
5769 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5770 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5771 (void)KMP_COMPARE_AND_STORE_ACQ32(
5772 &(team->t.t_threads[f]->th.th_used_in_team), 1, 2);
5773 }
5774 __kmp_free_thread(team->t.t_threads[f]);
5775 }
5776
5777 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5778 if (team->t.b) {
5779 // wake up thread at old location
5780 team->t.b->go_release();
5781 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5782 for (f = 1; f < team->t.t_nproc; ++f) {
5783 if (team->t.b->sleep[f].sleep) {
5784 __kmp_atomic_resume_64(
5785                team->t.t_threads[f]->th.th_info.ds.ds_gtid,
5786                (kmp_atomic_flag_64<> *)NULL);
5787 }
5788 }
5789 }
5790 // Wait for threads to be removed from team
5791 for (int f = 1; f < team->t.t_nproc; ++f) {
5792 while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
5793 KMP_CPU_PAUSE();
5794 }
5795 }
5796 }
5797
5798 for (f = 1; f < team->t.t_nproc; ++f) {
5799 team->t.t_threads[f] = NULL;
5800 }
5801
5802 if (team->t.t_max_nproc > 1 &&
5803 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5804      distributedBarrier::deallocate(team->t.b);
5805 team->t.b = NULL;
5806 }
5807 /* put the team back in the team pool */
5808 /* TODO limit size of team pool, call reap_team if pool too large */
5809 team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5810 __kmp_team_pool = (volatile kmp_team_t *)team;
5811 } else { // Check if team was created for primary threads in teams construct
5812 // See if first worker is a CG root
5813 KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5814 team->t.t_threads[1]->th.th_cg_roots);
5815 if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5816 // Clean up the CG root nodes on workers so that this team can be re-used
5817 for (f = 1; f < team->t.t_nproc; ++f) {
5818 kmp_info_t *thr = team->t.t_threads[f];
5819 KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5820 thr->th.th_cg_roots->cg_root == thr);
5821 // Pop current CG root off list
5822 kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5823 thr->th.th_cg_roots = tmp->up;
5824 KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5825 " up to node %p. cg_nthreads was %d\n",
5826 thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5827 int i = tmp->cg_nthreads--;
5828 if (i == 1) {
5829 __kmp_free(tmp); // free CG if we are the last thread in it
5830 }
5831 // Restore current task's thread_limit from CG root
5832 if (thr->th.th_cg_roots)
5833 thr->th.th_current_task->td_icvs.thread_limit =
5834 thr->th.th_cg_roots->cg_thread_limit;
5835 }
5836 }
5837 }
5838
5839 KMP_MB();
5840}
5841
5842/* reap the team. destroy it, reclaim all its resources and free its memory */
5843kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5844 kmp_team_t *next_pool = team->t.t_next_pool;
5845
5846 KMP_DEBUG_ASSERT(team);
5847 KMP_DEBUG_ASSERT(team->t.t_dispatch);
5848 KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5849 KMP_DEBUG_ASSERT(team->t.t_threads);
5850 KMP_DEBUG_ASSERT(team->t.t_argv);
5851
5852 /* TODO clean the threads that are a part of this? */
5853
5854 /* free stuff */
5855 __kmp_free_team_arrays(team);
5856 if (team->t.t_argv != &team->t.t_inline_argv[0])
5857 __kmp_free((void *)team->t.t_argv);
5858 __kmp_free(team);
5859
5860 KMP_MB();
5861 return next_pool;
5862}
5863
5864// Free the thread. Don't reap it, just place it on the pool of available
5865// threads.
5866//
5867// Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5868// binding for the affinity mechanism to be useful.
5869//
5870// Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5871// However, we want to avoid a potential performance problem by always
5872// scanning through the list to find the correct point at which to insert
5873// the thread (potential N**2 behavior). To do this we keep track of the
5874// last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5875// With single-level parallelism, threads will always be added to the tail
5876// of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5877// parallelism, all bets are off and we may need to scan through the entire
5878// free list.
5879//
5880// This change also has a potentially large performance benefit, for some
5881// applications. Previously, as threads were freed from the hot team, they
5882// would be placed back on the free list in inverse order. If the hot team
5883// grew back to its original size, then the freed threads would be placed
5884// back on the hot team in reverse order. This could cause bad cache
5885// locality problems in programs where the size of the hot team regularly
5886// grew and shrank.
5887//
5888// Now, for single-level parallelism, the OMP tid is always == gtid.
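// For example, if the pool holds threads with gtids {3, 5, 9} and gtid 6 is
// freed, the scan below starts at the insert point (or at the list head if
// the insert point is already past gtid 6) and links the thread in between
// 5 and 9, keeping the pool sorted as {3, 5, 6, 9}.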
5889void __kmp_free_thread(kmp_info_t *this_th) {
5890 int gtid;
5891 kmp_info_t **scan;
5892
5893 KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5894 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5895
5896 KMP_DEBUG_ASSERT(this_th);
5897
5898  // When moving the thread to the pool, switch it to waiting on its own b_go
5899  // flag, with an uninitialized (NULL) team.
5900 int b;
5901 kmp_balign_t *balign = this_th->th.th_bar;
5902 for (b = 0; b < bs_last_barrier; ++b) {
5903 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5904 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5905 balign[b].bb.team = NULL;
5906 balign[b].bb.leaf_kids = 0;
5907 }
5908 this_th->th.th_task_state = 0;
5909 this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5910
5911 /* put thread back on the free pool */
5912 TCW_PTR(this_th->th.th_team, NULL);
5913 TCW_PTR(this_th->th.th_root, NULL);
5914 TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5915
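  /* Unwind this thread's contention-group (CG) root chain: each node counts
     the threads in a contention group (used to enforce the thread-limit ICV);
     drop this thread's reference and free any node left with no users. */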
5916 while (this_th->th.th_cg_roots) {
5917 this_th->th.th_cg_roots->cg_nthreads--;
5918 KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5919 " %p of thread %p to %d\n",
5920 this_th, this_th->th.th_cg_roots,
5921 this_th->th.th_cg_roots->cg_root,
5922 this_th->th.th_cg_roots->cg_nthreads));
5923 kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5924 if (tmp->cg_root == this_th) { // Thread is a cg_root
5925 KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5926 KA_TRACE(
5927 5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5928 this_th->th.th_cg_roots = tmp->up;
5929 __kmp_free(tmp);
5930 } else { // Worker thread
5931 if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5932 __kmp_free(tmp);
5933 }
5934 this_th->th.th_cg_roots = NULL;
5935 break;
5936 }
5937 }
5938
5939  /* If the implicit task assigned to this thread can be used by other threads,
5940   * multiple threads may share the data and try to free the task in
5941   * __kmp_reap_thread at exit. This duplicate use of the task data is more
5942   * likely when the hot team is disabled, but it can occur even when the hot
5943   * team is enabled. */
5944  __kmp_free_implicit_task(this_th);
5945 this_th->th.th_current_task = NULL;
5946
5947 // If the __kmp_thread_pool_insert_pt is already past the new insert
5948 // point, then we need to re-scan the entire list.
5949 gtid = this_th->th.th_info.ds.ds_gtid;
5950 if (__kmp_thread_pool_insert_pt != NULL) {
5951 KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5952 if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5953 __kmp_thread_pool_insert_pt = NULL;
5954 }
5955 }
5956
5957 // Scan down the list to find the place to insert the thread.
5958 // scan is the address of a link in the list, possibly the address of
5959 // __kmp_thread_pool itself.
5960 //
5961 // In the absence of nested parallelism, the for loop will have 0 iterations.
5962 if (__kmp_thread_pool_insert_pt != NULL) {
5963 scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5964 } else {
5965 scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5966 }
5967 for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5968 scan = &((*scan)->th.th_next_pool))
5969 ;
5970
5971 // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5972 // to its address.
5973 TCW_PTR(this_th->th.th_next_pool, *scan);
5974 __kmp_thread_pool_insert_pt = *scan = this_th;
5975 KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5976 (this_th->th.th_info.ds.ds_gtid <
5977 this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5978 TCW_4(this_th->th.th_in_pool, TRUE);
5979  __kmp_suspend_initialize_thread(this_th);
5980  __kmp_lock_suspend_mx(this_th);
5981 if (this_th->th.th_active == TRUE) {
5982 KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5983 this_th->th.th_active_in_pool = TRUE;
5984 }
5985#if KMP_DEBUG
5986 else {
5987 KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5988 }
5989#endif
5990  __kmp_unlock_suspend_mx(this_th);
5991
5992 TCW_4(__kmp_nth, __kmp_nth - 1);
5993
5994#ifdef KMP_ADJUST_BLOCKTIME
5995 /* Adjust blocktime back to user setting or default if necessary */
5996 /* Middle initialization might never have occurred */
5997 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5998 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5999 if (__kmp_nth <= __kmp_avail_proc) {
6000 __kmp_zero_bt = FALSE;
6001 }
6002 }
6003#endif /* KMP_ADJUST_BLOCKTIME */
6004
6005 KMP_MB();
6006}
6007
6008/* ------------------------------------------------------------------------ */
6009
6010void *__kmp_launch_thread(kmp_info_t *this_thr) {
6011#if OMP_PROFILING_SUPPORT
6012 ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
6013 // TODO: add a configuration option for time granularity
6014 if (ProfileTraceFile)
6015 llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
6016#endif
6017
6018 int gtid = this_thr->th.th_info.ds.ds_gtid;
6019 /* void *stack_data;*/
6020 kmp_team_t **volatile pteam;
6021
6022 KMP_MB();
6023 KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
6024
6025 if (__kmp_env_consistency_check) {
6026 this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
6027 }
6028
6029#if OMPD_SUPPORT
6030 if (ompd_state & OMPD_ENABLE_BP)
6031 ompd_bp_thread_begin();
6032#endif
6033
6034#if OMPT_SUPPORT
6035 ompt_data_t *thread_data = nullptr;
6036 if (ompt_enabled.enabled) {
6037 thread_data = &(this_thr->th.ompt_thread_info.thread_data);
6038 *thread_data = ompt_data_none;
6039
6040 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6041 this_thr->th.ompt_thread_info.wait_id = 0;
6042 this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
6043 this_thr->th.ompt_thread_info.parallel_flags = 0;
6044 if (ompt_enabled.ompt_callback_thread_begin) {
6045 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
6046 ompt_thread_worker, thread_data);
6047 }
6048 this_thr->th.ompt_thread_info.state = ompt_state_idle;
6049 }
6050#endif
6051
6052 /* This is the place where threads wait for work */
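  /* Worker main loop: park at the fork barrier until a team is assigned, run
     the team's microtask via t_invoke, pass the join barrier, then go back to
     waiting; the loop exits once the library signals g_done. */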
6053 while (!TCR_4(__kmp_global.g.g_done)) {
6054 KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
6055 KMP_MB();
6056
6057 /* wait for work to do */
6058 KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
6059
6060 /* No tid yet since not part of a team */
6061 __kmp_fork_barrier(gtid, KMP_GTID_DNE);
6062
6063#if OMPT_SUPPORT
6064 if (ompt_enabled.enabled) {
6065 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6066 }
6067#endif
6068
6069 pteam = &this_thr->th.th_team;
6070
6071 /* have we been allocated? */
6072 if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
6073 /* we were just woken up, so run our new task */
6074 if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
6075 int rc;
6076 KA_TRACE(20,
6077 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
6078 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6079 (*pteam)->t.t_pkfn));
6080
6081        updateHWFPControl(*pteam);
6082
6083#if OMPT_SUPPORT
6084 if (ompt_enabled.enabled) {
6085 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
6086 }
6087#endif
6088
6089 rc = (*pteam)->t.t_invoke(gtid);
6090 KMP_ASSERT(rc);
6091
6092 KMP_MB();
6093 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
6094 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6095 (*pteam)->t.t_pkfn));
6096 }
6097#if OMPT_SUPPORT
6098 if (ompt_enabled.enabled) {
6099 /* no frame set while outside task */
6100 __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
6101
6102 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6103 }
6104#endif
6105 /* join barrier after parallel region */
6106 __kmp_join_barrier(gtid);
6107 }
6108 }
6109
6110#if OMPD_SUPPORT
6111 if (ompd_state & OMPD_ENABLE_BP)
6112 ompd_bp_thread_end();
6113#endif
6114
6115#if OMPT_SUPPORT
6116 if (ompt_enabled.ompt_callback_thread_end) {
6117 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
6118 }
6119#endif
6120
6121 this_thr->th.th_task_team = NULL;
6122 /* run the destructors for the threadprivate data for this thread */
6123 __kmp_common_destroy_gtid(gtid);
6124
6125 KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
6126 KMP_MB();
6127
6128#if OMP_PROFILING_SUPPORT
6129 llvm::timeTraceProfilerFinishThread();
6130#endif
6131 return this_thr;
6132}
6133
6134/* ------------------------------------------------------------------------ */
6135
6136void __kmp_internal_end_dest(void *specific_gtid) {
6137 // Make sure no significant bits are lost
6138 int gtid;
6139 __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
6140
6141 KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
6142 /* NOTE: the gtid is stored as gtid+1 in the thread-local-storage
6143 * this is because 0 is reserved for the nothing-stored case */
6144
6145 __kmp_internal_end_thread(gtid);
6146}
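// Illustrative sketch (not compiled): the gtid+1 bias described above, shown
// as a round trip through thread-specific storage. The helper name is
// hypothetical; it only demonstrates the convention.
#if 0
static void example_gtid_tls_bias(int gtid) {
  // 0 in TLS means "nothing stored", so the gtid is stored biased by +1.
  void *stored = (void *)(kmp_intptr_t)(gtid + 1);
  int decoded;
  __kmp_type_convert((kmp_intptr_t)stored - 1, &decoded);
  KMP_DEBUG_ASSERT(decoded == gtid);
}
#endif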
6147
6148#if KMP_OS_UNIX && KMP_DYNAMIC_LIB
6149
6150__attribute__((destructor)) void __kmp_internal_end_dtor(void) {
6151 __kmp_internal_end_atexit();
6152}
6153
6154#endif
6155
6156/* [Windows] josh: when the atexit handler is called, there may still be more
6157 than one thread alive */
6158void __kmp_internal_end_atexit(void) {
6159 KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
6160 /* [Windows]
6161 josh: ideally, we want to completely shutdown the library in this atexit
6162 handler, but stat code that depends on thread specific data for gtid fails
6163 because that data becomes unavailable at some point during the shutdown, so
6164 we call __kmp_internal_end_thread instead. We should eventually remove the
6165 dependency on __kmp_get_specific_gtid in the stat code and use
6166 __kmp_internal_end_library to cleanly shutdown the library.
6167
6168 // TODO: Can some of this comment about GVS be removed?
6169 I suspect that the offending stat code is executed when the calling thread
6170 tries to clean up a dead root thread's data structures, resulting in GVS
6171 code trying to close the GVS structures for that thread, but since the stat
6172 code uses __kmp_get_specific_gtid to get the gtid with the assumption that
6173 the calling thread is cleaning up itself instead of another thread, it gets
6174 confused. This happens because allowing a thread to unregister and clean up
6175 another thread is a recent modification for addressing an issue.
6176 Based on the current design (20050722), a thread may end up
6177 trying to unregister another thread only if thread death does not trigger
6178 the calling of __kmp_internal_end_thread. For Linux* OS, there is the
6179 thread specific data destructor function to detect thread death. For
6180 Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
6181 is nothing. Thus, the workaround is applicable only for Windows static
6182 stat library. */
6183 __kmp_internal_end_library(-1);
6184#if KMP_OS_WINDOWS
6185 __kmp_close_console();
6186#endif
6187}
6188
6189static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
6190 // It is assumed __kmp_forkjoin_lock is acquired.
6191
6192 int gtid;
6193
6194 KMP_DEBUG_ASSERT(thread != NULL);
6195
6196 gtid = thread->th.th_info.ds.ds_gtid;
6197
6198 if (!is_root) {
6199 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
6200 /* Assume the threads are at the fork barrier here */
6201 KA_TRACE(
6202 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
6203 gtid));
6204 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
6205 while (
6206 !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
6207 KMP_CPU_PAUSE();
6208 __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
6209 } else {
6210 /* Need release fence here to prevent seg faults for tree forkjoin
6211 barrier (GEH) */
6212 kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
6213 thread);
6214 __kmp_release_64(&flag);
6215 }
6216 }
6217
6218 // Terminate OS thread.
6219 __kmp_reap_worker(thread);
6220
6221 // The thread was killed asynchronously. If it was actively
6222 // spinning in the thread pool, decrement the global count.
6223 //
6224 // There is a small timing hole here - if the worker thread was just waking
6225 // up after sleeping in the pool, had reset its th_active_in_pool flag but
6226 // not decremented the global counter __kmp_thread_pool_active_nth yet, then
6227 // the global counter might not get updated.
6228 //
6229 // Currently, this can only happen as the library is unloaded,
6230 // so there are no harmful side effects.
6231 if (thread->th.th_active_in_pool) {
6232 thread->th.th_active_in_pool = FALSE;
6233 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6234 KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
6235 }
6236 }
6237
6238 __kmp_free_implicit_task(thread);
6239
6240// Free the fast memory for tasking
6241#if USE_FAST_MEMORY
6242 __kmp_free_fast_memory(thread);
6243#endif /* USE_FAST_MEMORY */
6244
6245 __kmp_suspend_uninitialize_thread(thread);
6246
6247 KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6248 TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6249
6250 --__kmp_all_nth;
6251 // __kmp_nth was decremented when thread is added to the pool.
6252
6253#ifdef KMP_ADJUST_BLOCKTIME
6254 /* Adjust blocktime back to user setting or default if necessary */
6255 /* Middle initialization might never have occurred */
6256 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6257 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6258 if (__kmp_nth <= __kmp_avail_proc) {
6259 __kmp_zero_bt = FALSE;
6260 }
6261 }
6262#endif /* KMP_ADJUST_BLOCKTIME */
6263
6264 /* free the memory being used */
6265 if (__kmp_env_consistency_check) {
6266 if (thread->th.th_cons) {
6267 __kmp_free_cons_stack(thread->th.th_cons);
6268 thread->th.th_cons = NULL;
6269 }
6270 }
6271
6272 if (thread->th.th_pri_common != NULL) {
6273 __kmp_free(thread->th.th_pri_common);
6274 thread->th.th_pri_common = NULL;
6275 }
6276
6277#if KMP_USE_BGET
6278 if (thread->th.th_local.bget_data != NULL) {
6279 __kmp_finalize_bget(thread);
6280 }
6281#endif
6282
6283#if KMP_AFFINITY_SUPPORTED
6284 if (thread->th.th_affin_mask != NULL) {
6285 KMP_CPU_FREE(thread->th.th_affin_mask);
6286 thread->th.th_affin_mask = NULL;
6287 }
6288#endif /* KMP_AFFINITY_SUPPORTED */
6289
6290#if KMP_USE_HIER_SCHED
6291 if (thread->th.th_hier_bar_data != NULL) {
6292 __kmp_free(thread->th.th_hier_bar_data);
6293 thread->th.th_hier_bar_data = NULL;
6294 }
6295#endif
6296
6297 __kmp_reap_team(thread->th.th_serial_team);
6298 thread->th.th_serial_team = NULL;
6299 __kmp_free(thread);
6300
6301 KMP_MB();
6302
6303} // __kmp_reap_thread
6304
6305static void __kmp_itthash_clean(kmp_info_t *th) {
6306#if USE_ITT_NOTIFY
6307 if (__kmp_itt_region_domains.count > 0) {
6308 for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6309 kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
6310 while (bucket) {
6311 kmp_itthash_entry_t *next = bucket->next_in_bucket;
6312 __kmp_thread_free(th, bucket);
6313 bucket = next;
6314 }
6315 }
6316 }
6317 if (__kmp_itt_barrier_domains.count > 0) {
6318 for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6319 kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
6320 while (bucket) {
6321 kmp_itthash_entry_t *next = bucket->next_in_bucket;
6322 __kmp_thread_free(th, bucket);
6323 bucket = next;
6324 }
6325 }
6326 }
6327#endif
6328}
6329
6330static void __kmp_internal_end(void) {
6331 int i;
6332
6333 /* First, unregister the library */
6334 __kmp_unregister_library();
6335
6336#if KMP_OS_WINDOWS
6337 /* In Win static library, we can't tell when a root actually dies, so we
6338 reclaim the data structures for any root threads that have died but not
6339 unregistered themselves, in order to shut down cleanly.
6340 In Win dynamic library we also can't tell when a thread dies. */
6341 __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6342// dead roots
6343#endif
6344
6345 for (i = 0; i < __kmp_threads_capacity; i++)
6346 if (__kmp_root[i])
6347 if (__kmp_root[i]->r.r_active)
6348 break;
6349 KMP_MB(); /* Flush all pending memory write invalidates. */
6350 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6351
6352 if (i < __kmp_threads_capacity) {
6353#if KMP_USE_MONITOR
6354 // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6355 KMP_MB(); /* Flush all pending memory write invalidates. */
6356
6357 // Need to check that monitor was initialized before reaping it. If we are
6358 // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6359 // __kmp_monitor will appear to contain valid data, but it is only valid in
6360 // the parent process, not the child.
6361 // New behavior (201008): instead of keying off of the flag
6362 // __kmp_init_parallel, the monitor thread creation is keyed off
6363 // of the new flag __kmp_init_monitor.
6364 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6365 if (TCR_4(__kmp_init_monitor)) {
6366 __kmp_reap_monitor(&__kmp_monitor);
6367 TCW_4(__kmp_init_monitor, 0);
6368 }
6369 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6370 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6371#endif // KMP_USE_MONITOR
6372 } else {
6373/* TODO move this to cleanup code */
6374#ifdef KMP_DEBUG
6375 /* make sure that everything has properly ended */
6376 for (i = 0; i < __kmp_threads_capacity; i++) {
6377 if (__kmp_root[i]) {
6378 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6379 // there can be uber threads alive here
6380 KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6381 }
6382 }
6383#endif
6384
6385 KMP_MB();
6386
6387 // Reap the worker threads.
6388 // This is valid for now, but be careful if threads are reaped sooner.
6389 while (__kmp_thread_pool != NULL) { // Loop thru all the threads in the pool.
6390 // Get the next thread from the pool.
6391 kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6392 __kmp_thread_pool = thread->th.th_next_pool;
6393 // Reap it.
6394 KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6395 thread->th.th_next_pool = NULL;
6396 thread->th.th_in_pool = FALSE;
6397 __kmp_reap_thread(thread, 0);
6398 }
6399 __kmp_thread_pool_insert_pt = NULL;
6400
6401 // Reap teams.
6402 while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6403 // Get the next team from the pool.
6404 kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6405 __kmp_team_pool = team->t.t_next_pool;
6406 // Reap it.
6407 team->t.t_next_pool = NULL;
6408 __kmp_reap_team(team);
6409 }
6410
6411 __kmp_reap_task_teams();
6412
6413#if KMP_OS_UNIX
6414 // Threads that are not reaped should not access any resources since they
6415 // are going to be deallocated soon, so the shutdown sequence should wait
6416 // until all threads either exit the final spin-waiting loop or begin
6417 // sleeping after the given blocktime.
6418 for (i = 0; i < __kmp_threads_capacity; i++) {
6419 kmp_info_t *thr = __kmp_threads[i];
6420 while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6421 KMP_CPU_PAUSE();
6422 }
6423#endif
6424
6425 for (i = 0; i < __kmp_threads_capacity; ++i) {
6426 // TBD: Add some checking...
6427 // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6428 }
6429
6430 /* Make sure all threadprivate destructors get run by joining with all
6431 worker threads before resetting this flag */
6432 TCW_SYNC_4(__kmp_init_common, FALSE);
6433
6434 KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6435 KMP_MB();
6436
6437#if KMP_USE_MONITOR
6438 // See note above: One of the possible fixes for CQ138434 / CQ140126
6439 //
6440 // FIXME: push both code fragments down and CSE them?
6441 // push them into __kmp_cleanup() ?
6442 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6443 if (TCR_4(__kmp_init_monitor)) {
6444 __kmp_reap_monitor(&__kmp_monitor);
6445 TCW_4(__kmp_init_monitor, 0);
6446 }
6447 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6448 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6449#endif
6450 } /* else !__kmp_global.t_active */
6451 TCW_4(__kmp_init_gtid, FALSE);
6452 KMP_MB(); /* Flush all pending memory write invalidates. */
6453
6454 __kmp_cleanup();
6455#if OMPT_SUPPORT
6456 ompt_fini();
6457#endif
6458}
6459
6460void __kmp_internal_end_library(int gtid_req) {
6461 /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6462 /* this shouldn't be a race condition because __kmp_internal_end() is the
6463 only place to clear __kmp_serial_init */
6464 /* we'll check this later too, after we get the lock */
6465 // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6466 // redundant, because the next check will work in any case.
6467 if (__kmp_global.g.g_abort) {
6468 KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6469 /* TODO abort? */
6470 return;
6471 }
6472 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6473 KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6474 return;
6475 }
6476
6477 // If hidden helper team has been initialized, we need to deinit it
6478 if (TCR_4(__kmp_init_hidden_helper) &&
6479 !TCR_4(__kmp_hidden_helper_team_done)) {
6480 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6481 // First release the main thread to let it continue its work
6482 __kmp_hidden_helper_main_thread_release();
6483 // Wait until the hidden helper team has been destroyed
6484 __kmp_hidden_helper_threads_deinitz_wait();
6485 }
6486
6487 KMP_MB(); /* Flush all pending memory write invalidates. */
6488 /* find out who we are and what we should do */
6489 {
6490 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6491 KA_TRACE(
6492 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6493 if (gtid == KMP_GTID_SHUTDOWN) {
6494 KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6495 "already shutdown\n"));
6496 return;
6497 } else if (gtid == KMP_GTID_MONITOR) {
6498 KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6499 "registered, or system shutdown\n"));
6500 return;
6501 } else if (gtid == KMP_GTID_DNE) {
6502 KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6503 "shutdown\n"));
6504 /* we don't know who we are, but we may still shutdown the library */
6505 } else if (KMP_UBER_GTID(gtid)) {
6506 /* unregister ourselves as an uber thread. gtid is no longer valid */
6507 if (__kmp_root[gtid]->r.r_active) {
6508 __kmp_global.g.g_abort = -1;
6509 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6510 __kmp_unregister_library();
6511 KA_TRACE(10,
6512 ("__kmp_internal_end_library: root still active, abort T#%d\n",
6513 gtid));
6514 return;
6515 } else {
6516 __kmp_itthash_clean(__kmp_threads[gtid]);
6517 KA_TRACE(
6518 10,
6519 ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6520 __kmp_unregister_root_current_thread(gtid);
6521 }
6522 } else {
6523/* worker threads may call this function through the atexit handler, if they
6524 * call exit() */
6525/* For now, skip the usual subsequent processing and just dump the debug buffer.
6526 TODO: do a thorough shutdown instead */
6527#ifdef DUMP_DEBUG_ON_EXIT
6528 if (__kmp_debug_buf)
6529 __kmp_dump_debug_buffer();
6530#endif
6531 // We added an unregister-library call here when we switched to shared
6532 // memory on Linux; without it, stale files would be left in /dev/shm.
6533 // Clean up the shared memory file before exiting.
6534 __kmp_unregister_library();
6535 return;
6536 }
6537 }
6538 /* synchronize the termination process */
6539 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6540
6541 /* have we already finished */
6542 if (__kmp_global.g.g_abort) {
6543 KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6544 /* TODO abort? */
6545 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6546 return;
6547 }
6548 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6549 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6550 return;
6551 }
6552
6553 /* We need this lock to enforce mutex between this reading of
6554 __kmp_threads_capacity and the writing by __kmp_register_root.
6555 Alternatively, we can use a counter of roots that is atomically updated by
6556 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6557 __kmp_internal_end_*. */
6558 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6559
6560 /* now we can safely conduct the actual termination */
6561 __kmp_internal_end();
6562
6563 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6564 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6565
6566 KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6567
6568#ifdef DUMP_DEBUG_ON_EXIT
6569 if (__kmp_debug_buf)
6570 __kmp_dump_debug_buffer();
6571#endif
6572
6573#if KMP_OS_WINDOWS
6574 __kmp_close_console();
6575#endif
6576
6577 __kmp_fini_allocator();
6578
6579} // __kmp_internal_end_library
6580
6581void __kmp_internal_end_thread(int gtid_req) {
6582 int i;
6583
6584 /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6585 /* this shouldn't be a race condition because __kmp_internal_end() is the
6586 * only place to clear __kmp_serial_init */
6587 /* we'll check this later too, after we get the lock */
6588 // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6589 // redundant, because the next check will work in any case.
6590 if (__kmp_global.g.g_abort) {
6591 KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6592 /* TODO abort? */
6593 return;
6594 }
6595 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6596 KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6597 return;
6598 }
6599
6600 // If hidden helper team has been initialized, we need to deinit it
6601 if (TCR_4(__kmp_init_hidden_helper) &&
6602 !TCR_4(__kmp_hidden_helper_team_done)) {
6603 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6604 // First release the main thread to let it continue its work
6605 __kmp_hidden_helper_main_thread_release();
6606 // Wait until the hidden helper team has been destroyed
6607 __kmp_hidden_helper_threads_deinitz_wait();
6608 }
6609
6610 KMP_MB(); /* Flush all pending memory write invalidates. */
6611
6612 /* find out who we are and what we should do */
6613 {
6614 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6615 KA_TRACE(10,
6616 ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6617 if (gtid == KMP_GTID_SHUTDOWN) {
6618 KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6619 "already shutdown\n"));
6620 return;
6621 } else if (gtid == KMP_GTID_MONITOR) {
6622 KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6623 "registered, or system shutdown\n"));
6624 return;
6625 } else if (gtid == KMP_GTID_DNE) {
6626 KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6627 "shutdown\n"));
6628 return;
6629 /* we don't know who we are */
6630 } else if (KMP_UBER_GTID(gtid)) {
6631 /* unregister ourselves as an uber thread. gtid is no longer valid */
6632 if (__kmp_root[gtid]->r.r_active) {
6633 __kmp_global.g.g_abort = -1;
6634 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6635 KA_TRACE(10,
6636 ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6637 gtid));
6638 return;
6639 } else {
6640 KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6641 gtid));
6642 __kmp_unregister_root_current_thread(gtid);
6643 }
6644 } else {
6645 /* just a worker thread, let's leave */
6646 KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6647
6648 if (gtid >= 0) {
6649 __kmp_threads[gtid]->th.th_task_team = NULL;
6650 }
6651
6652 KA_TRACE(10,
6653 ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6654 gtid));
6655 return;
6656 }
6657 }
6658#if KMP_DYNAMIC_LIB
6659 if (__kmp_pause_status != kmp_hard_paused)
6660 // AC: lets not shutdown the dynamic library at the exit of uber thread,
6661 // because we will better shutdown later in the library destructor.
6662 {
6663 KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6664 return;
6665 }
6666#endif
6667 /* synchronize the termination process */
6668 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6669
6670 /* have we already finished */
6671 if (__kmp_global.g.g_abort) {
6672 KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6673 /* TODO abort? */
6674 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6675 return;
6676 }
6677 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6678 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6679 return;
6680 }
6681
6682 /* We need this lock to enforce mutex between this reading of
6683 __kmp_threads_capacity and the writing by __kmp_register_root.
6684 Alternatively, we can use a counter of roots that is atomically updated by
6685 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6686 __kmp_internal_end_*. */
6687
6688 /* should we finish the run-time? are all siblings done? */
6689 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6690
6691 for (i = 0; i < __kmp_threads_capacity; ++i) {
6692 if (KMP_UBER_GTID(i)) {
6693 KA_TRACE(
6694 10,
6695 ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6696 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6697 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6698 return;
6699 }
6700 }
6701
6702 /* now we can safely conduct the actual termination */
6703
6704 __kmp_internal_end();
6705
6706 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6707 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6708
6709 KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6710
6711#ifdef DUMP_DEBUG_ON_EXIT
6712 if (__kmp_debug_buf)
6713 __kmp_dump_debug_buffer();
6714#endif
6715} // __kmp_internal_end_thread
6716
6717// -----------------------------------------------------------------------------
6718// Library registration stuff.
6719
6720static long __kmp_registration_flag = 0;
6721// Random value used to indicate library initialization.
6722static char *__kmp_registration_str = NULL;
6723// Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6724
6725static inline char *__kmp_reg_status_name() {
6726/* On RHEL 3u5 if linked statically, getpid() returns different values in
6727 each thread. If registration and unregistration go in different threads
6728 (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6729 env var cannot be found, because the name will contain a different pid. */
6730// macOS* complains that the name is too long when getuid() is appended.
6731#if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6732 return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6733 (int)getuid());
6734#else
6735 return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6736#endif
6737} // __kmp_reg_status_name
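// For illustration only: with a hypothetical pid of 12345 and uid of 1000,
// __kmp_reg_status_name() above yields "__KMP_REGISTERED_LIB_12345_1000" for a
// dynamic library on non-Darwin Unix, and "__KMP_REGISTERED_LIB_12345"
// otherwise.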
6738
6739#if defined(KMP_USE_SHM)
6740bool __kmp_shm_available = false;
6741bool __kmp_tmp_available = false;
6742// If /dev/shm is not accessible, we will create a temporary file under /tmp.
6743char *temp_reg_status_file_name = nullptr;
6744#endif
6745
6746void __kmp_register_library_startup(void) {
6747
6748 char *name = __kmp_reg_status_name(); // Name of the environment variable.
6749 int done = 0;
6750 union {
6751 double dtime;
6752 long ltime;
6753 } time;
6754#if KMP_ARCH_X86 || KMP_ARCH_X86_64
6755 __kmp_initialize_system_tick();
6756#endif
6757 __kmp_read_system_time(&time.dtime);
6758 __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6759 __kmp_registration_str =
6760 __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6761 __kmp_registration_flag, KMP_LIBRARY_FILE);
6762
6763 KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6764 __kmp_registration_str));
6765
6766 while (!done) {
6767
6768 char *value = NULL; // Actual value of the environment variable.
6769
6770#if defined(KMP_USE_SHM)
6771 char *shm_name = nullptr;
6772 char *data1 = nullptr;
6773 __kmp_shm_available = __kmp_detect_shm();
6774 if (__kmp_shm_available) {
6775 int fd1 = -1;
6776 shm_name = __kmp_str_format("/%s", name);
6777 int shm_preexist = 0;
6778 fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0600);
6779 if ((fd1 == -1) && (errno == EEXIST)) {
6780 // file didn't open because it already exists.
6781 // try opening existing file
6782 fd1 = shm_open(shm_name, O_RDWR, 0600);
6783 if (fd1 == -1) { // file didn't open
6784 KMP_WARNING(FunctionError, "Can't open SHM");
6785 __kmp_shm_available = false;
6786 } else { // able to open existing file
6787 shm_preexist = 1;
6788 }
6789 }
6790 if (__kmp_shm_available && shm_preexist == 0) { // SHM created, set size
6791 if (ftruncate(fd1, SHM_SIZE) == -1) { // error occurred setting size
6792 KMP_WARNING(FunctionError, "Can't set size of SHM");
6793 __kmp_shm_available = false;
6794 }
6795 }
6796 if (__kmp_shm_available) { // SHM exists, now map it
6797 data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
6798 fd1, 0);
6799 if (data1 == MAP_FAILED) { // failed to map shared memory
6800 KMP_WARNING(FunctionError, "Can't map SHM");
6801 __kmp_shm_available = false;
6802 }
6803 }
6804 if (__kmp_shm_available) { // SHM mapped
6805 if (shm_preexist == 0) { // set data to SHM, set value
6806 KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6807 }
6808 // Read value from either what we just wrote or existing file.
6809 value = __kmp_str_format("%s", data1); // read value from SHM
6810 munmap(data1, SHM_SIZE);
6811 }
6812 if (fd1 != -1)
6813 close(fd1);
6814 }
6815 if (!__kmp_shm_available)
6816 __kmp_tmp_available = __kmp_detect_tmp();
6817 if (!__kmp_shm_available && __kmp_tmp_available) {
6818 // SHM failed to work due to an error other than that the file already
6819 // exists. Try to create a temp file under /tmp.
6820 // If /tmp isn't accessible, fall back to using environment variable.
6821 // TODO: /tmp might not always be the temporary directory. For now we will
6822 // not consider TMPDIR.
6823 int fd1 = -1;
6824 temp_reg_status_file_name = __kmp_str_format("/tmp/%s", name);
6825 int tmp_preexist = 0;
6826 fd1 = open(temp_reg_status_file_name, O_CREAT | O_EXCL | O_RDWR, 0600);
6827 if ((fd1 == -1) && (errno == EEXIST)) {
6828 // file didn't open because it already exists.
6829 // try opening existing file
6830 fd1 = open(temp_reg_status_file_name, O_RDWR, 0600);
6831 if (fd1 == -1) { // file didn't open
6832 KMP_WARNING(FunctionError, "Can't open TEMP");
6833 __kmp_tmp_available = false;
6834 } else {
6835 tmp_preexist = 1;
6836 }
6837 }
6838 if (__kmp_tmp_available && tmp_preexist == 0) {
6839 // we created /tmp file now set size
6840 if (ftruncate(fd1, SHM_SIZE) == -1) { // error occurred setting size
6841 KMP_WARNING(FunctionError, "Can't set size of /tmp file");
6842 __kmp_tmp_available = false;
6843 }
6844 }
6845 if (__kmp_tmp_available) {
6846 data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
6847 fd1, 0);
6848 if (data1 == MAP_FAILED) { // failed to map /tmp
6849 KMP_WARNING(FunctionError, "Can't map /tmp");
6850 __kmp_tmp_available = false;
6851 }
6852 }
6853 if (__kmp_tmp_available) {
6854 if (tmp_preexist == 0) { // set data to TMP, set value
6855 KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6856 }
6857 // Read value from either what we just wrote or existing file.
6858 value = __kmp_str_format("%s", data1); // read value from temp file
6859 munmap(data1, SHM_SIZE);
6860 }
6861 if (fd1 != -1)
6862 close(fd1);
6863 }
6864 if (!__kmp_shm_available && !__kmp_tmp_available) {
6865 // no /dev/shm and no /tmp -- fall back to environment variable
6866 // Set environment variable, but do not overwrite if it exists.
6867 __kmp_env_set(name, __kmp_registration_str, 0);
6868 // read value to see if it got set
6869 value = __kmp_env_get(name);
6870 }
6871#else // Windows and unix with static library
6872 // Set environment variable, but do not overwrite if it exists.
6873 __kmp_env_set(name, __kmp_registration_str, 0);
6874 // read value to see if it got set
6875 value = __kmp_env_get(name);
6876#endif
6877
6878 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6879 done = 1; // Ok, environment variable set successfully, exit the loop.
6880 } else {
6881 // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6882 // Check whether it is alive or dead.
6883 int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6884 char *tail = value;
6885 char *flag_addr_str = NULL;
6886 char *flag_val_str = NULL;
6887 char const *file_name = NULL;
6888 __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6889 __kmp_str_split(tail, '-', &flag_val_str, &tail);
6890 file_name = tail;
6891 if (tail != NULL) {
6892 unsigned long *flag_addr = 0;
6893 unsigned long flag_val = 0;
6894 KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6895 KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6896 if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6897 // First, check whether environment-encoded address is mapped into
6898 // addr space.
6899 // If so, dereference it to see if it still has the right value.
6900 if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6901 neighbor = 1;
6902 } else {
6903 // If not, then we know the other copy of the library is no longer
6904 // running.
6905 neighbor = 2;
6906 }
6907 }
6908 }
6909 switch (neighbor) {
6910 case 0: // Cannot parse environment variable -- neighbor status unknown.
6911 // Assume it is the incompatible format of a future version of the
6912 // library. Assume the other library is alive.
6913 // WARN( ... ); // TODO: Issue a warning.
6914 file_name = "unknown library";
6915 KMP_FALLTHROUGH();
6916 // Attention! Falling to the next case. That's intentional.
6917 case 1: { // Neighbor is alive.
6918 // Check it is allowed.
6919 char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6920 if (!__kmp_str_match_true(duplicate_ok)) {
6921 // That's not allowed. Issue fatal error.
6922 __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6923 KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6924 }
6925 KMP_INTERNAL_FREE(duplicate_ok);
6926 __kmp_duplicate_library_ok = 1;
6927 done = 1; // Exit the loop.
6928 } break;
6929 case 2: { // Neighbor is dead.
6930
6931#if defined(KMP_USE_SHM)
6932 if (__kmp_shm_available) { // close shared memory.
6933 shm_unlink(shm_name); // this removes file in /dev/shm
6934 } else if (__kmp_tmp_available) {
6935 unlink(temp_reg_status_file_name); // this removes the temp file
6936 } else {
6937 // Clear the variable and try to register library again.
6938 __kmp_env_unset(name);
6939 }
6940#else
6941 // Clear the variable and try to register library again.
6942 __kmp_env_unset(name);
6943#endif
6944 } break;
6945 default: {
6946 KMP_DEBUG_ASSERT(0);
6947 } break;
6948 }
6949 }
6950 KMP_INTERNAL_FREE((void *)value);
6951#if defined(KMP_USE_SHM)
6952 if (shm_name)
6953 KMP_INTERNAL_FREE((void *)shm_name);
6954#endif
6955 } // while
6956 KMP_INTERNAL_FREE((void *)name);
6957
6958} // func __kmp_register_library_startup
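// Illustrative sketch (not compiled): the registration value written above has
// the form "%p-%lx-%s", e.g. "0x7f00dead0000-cafe1234-libomp.so" (the address,
// flag value and file name here are made up). Another copy of the runtime is
// treated as alive only if the encoded flag address is still mapped in this
// process and still holds the encoded value, which is what the parsing code
// above checks.
#if 0
static int example_is_registration_alive(char *value) {
  char *tail = value, *addr_str = NULL, *val_str = NULL;
  __kmp_str_split(tail, '-', &addr_str, &tail); // "%p" part
  __kmp_str_split(tail, '-', &val_str, &tail); // "%lx" part; tail = file name
  unsigned long *flag_addr = 0;
  unsigned long flag_val = 0;
  KMP_SSCANF(addr_str, "%p", RCAST(void **, &flag_addr));
  KMP_SSCANF(val_str, "%lx", &flag_val);
  return flag_addr != 0 && flag_val != 0 &&
         __kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val;
}
#endif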
6959
6960void __kmp_unregister_library(void) {
6961
6962 char *name = __kmp_reg_status_name();
6963 char *value = NULL;
6964
6965#if defined(KMP_USE_SHM)
6966 char *shm_name = nullptr;
6967 int fd1;
6968 if (__kmp_shm_available) {
6969 shm_name = __kmp_str_format("/%s", name);
6970 fd1 = shm_open(shm_name, O_RDONLY, 0600);
6971 if (fd1 != -1) { // File opened successfully
6972 char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6973 if (data1 != MAP_FAILED) {
6974 value = __kmp_str_format("%s", data1); // read value from SHM
6975 munmap(data1, SHM_SIZE);
6976 }
6977 close(fd1);
6978 }
6979 } else if (__kmp_tmp_available) { // try /tmp
6980 fd1 = open(temp_reg_status_file_name, O_RDONLY);
6981 if (fd1 != -1) { // File opened successfully
6982 char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6983 if (data1 != MAP_FAILED) {
6984 value = __kmp_str_format("%s", data1); // read value from /tmp
6985 munmap(data1, SHM_SIZE);
6986 }
6987 close(fd1);
6988 }
6989 } else { // fall back to the environment variable
6990 value = __kmp_env_get(name);
6991 }
6992#else
6993 value = __kmp_env_get(name);
6994#endif
6995
6996 KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6997 KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6998 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6999// Ok, this is our variable. Delete it.
7000#if defined(KMP_USE_SHM)
7001 if (__kmp_shm_available) {
7002 shm_unlink(shm_name); // this removes file in /dev/shm
7003 } else if (__kmp_tmp_available) {
7004 unlink(temp_reg_status_file_name); // this removes the temp file
7005 } else {
7006 __kmp_env_unset(name);
7007 }
7008#else
7009 __kmp_env_unset(name);
7010#endif
7011 }
7012
7013#if defined(KMP_USE_SHM)
7014 if (shm_name)
7015 KMP_INTERNAL_FREE(shm_name);
7016 if (temp_reg_status_file_name)
7017 KMP_INTERNAL_FREE(temp_reg_status_file_name);
7018#endif
7019
7020 KMP_INTERNAL_FREE(__kmp_registration_str);
7021 KMP_INTERNAL_FREE(value);
7022 KMP_INTERNAL_FREE(name);
7023
7024 __kmp_registration_flag = 0;
7025 __kmp_registration_str = NULL;
7026
7027} // __kmp_unregister_library
7028
7029// End of Library registration stuff.
7030// -----------------------------------------------------------------------------
7031
7032#if KMP_MIC_SUPPORTED
7033
7034static void __kmp_check_mic_type() {
7035 kmp_cpuid_t cpuid_state = {0};
7036 kmp_cpuid_t *cs_p = &cpuid_state;
7037 __kmp_x86_cpuid(1, 0, cs_p);
7038 // We don't support mic1 at the moment
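 // The masks below pick the family/model fields out of CPUID leaf 1 EAX:
 // 0xff0 covers model (bits 4-7) and family (bits 8-11); 0xf0ff0 additionally
 // covers the extended-model field (bits 16-19). 0xB10 matches family 0x0B,
 // model 1 (KNC); 0x50670 matches family 6, extended model 5, model 7,
 // i.e. display model 0x57 (KNL).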
7039 if ((cs_p->eax & 0xff0) == 0xB10) {
7040 __kmp_mic_type = mic2;
7041 } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
7042 __kmp_mic_type = mic3;
7043 } else {
7044 __kmp_mic_type = non_mic;
7045 }
7046}
7047
7048#endif /* KMP_MIC_SUPPORTED */
7049
7050#if KMP_HAVE_UMWAIT
7051static void __kmp_user_level_mwait_init() {
7052 struct kmp_cpuid buf;
7053 __kmp_x86_cpuid(7, 0, &buf);
7054 __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1);
7055 __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
7056 __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
7057 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
7058 __kmp_umwait_enabled));
7059}
7060#elif KMP_HAVE_MWAIT
7061#ifndef AT_INTELPHIUSERMWAIT
7062// Spurious, non-existent value that should always fail to return anything.
7063// Will be replaced with the correct value when we know that.
7064#define AT_INTELPHIUSERMWAIT 10000
7065#endif
7066// getauxval() function is available in RHEL7 and SLES12. If a system with an
7067// earlier OS is used to build the RTL, we'll use the following internal
7068// function when the entry is not found.
7069unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
7070unsigned long getauxval(unsigned long) { return 0; }
7071
7072static void __kmp_user_level_mwait_init() {
7073 // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
7074 // use them to find if the user-level mwait is enabled. Otherwise, forcibly
7075 // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
7076 // KMP_USER_LEVEL_MWAIT was set to TRUE.
7077 if (__kmp_mic_type == mic3) {
7078 unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
7079 if ((res & 0x1) || __kmp_user_level_mwait) {
7080 __kmp_mwait_enabled = TRUE;
7081 if (__kmp_user_level_mwait) {
7082 KMP_INFORM(EnvMwaitWarn);
7083 }
7084 } else {
7085 __kmp_mwait_enabled = FALSE;
7086 }
7087 }
7088 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
7089 "__kmp_mwait_enabled = %d\n",
7090 __kmp_mic_type, __kmp_mwait_enabled));
7091}
7092#endif /* KMP_HAVE_UMWAIT */
7093
7094static void __kmp_do_serial_initialize(void) {
7095 int i, gtid;
7096 size_t size;
7097
7098 KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
7099
7100 KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
7101 KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
7102 KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
7103 KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
7104 KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
7105
7106#if OMPT_SUPPORT
7107 ompt_pre_init();
7108#endif
7109#if OMPD_SUPPORT
7110 __kmp_env_dump();
7111 ompd_init();
7112#endif
7113
7114 __kmp_validate_locks();
7115
7116#if ENABLE_LIBOMPTARGET
7117 /* Initialize functions from libomptarget */
7118 __kmp_init_omptarget();
7119#endif
7120
7121 /* Initialize internal memory allocator */
7122 __kmp_init_allocator();
7123
7124 /* Register the library startup via an environment variable or via mapped
7125 shared memory file and check to see whether another copy of the library is
7126 already registered. Since a forked child process is often terminated, we
7127 postpone the registration until middle initialization in the child */
7128 if (__kmp_need_register_serial)
7129 __kmp_register_library_startup();
7130
7131 /* TODO reinitialization of library */
7132 if (TCR_4(__kmp_global.g.g_done)) {
7133 KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
7134 }
7135
7136 __kmp_global.g.g_abort = 0;
7137 TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
7138
7139/* initialize the locks */
7140#if KMP_USE_ADAPTIVE_LOCKS
7141#if KMP_DEBUG_ADAPTIVE_LOCKS
7142 __kmp_init_speculative_stats();
7143#endif
7144#endif
7145#if KMP_STATS_ENABLED
7146 __kmp_stats_init();
7147#endif
7148 __kmp_init_lock(&__kmp_global_lock);
7149 __kmp_init_atomic_lock(&__kmp_atomic_lock);
7150 __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
7151 __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
7152 __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
7153 __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
7154 __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
7155 __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
7156 __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
7157 __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
7158 __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
7159 __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
7160 __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
7161 __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
7162 __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
7163 __kmp_init_bootstrap_lock(&__kmp_exit_lock);
7164#if KMP_USE_MONITOR
7165 __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
7166#endif
7167 __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
7168
7169 /* conduct initialization and initial setup of configuration */
7170
7171 __kmp_runtime_initialize();
7172
7173#if KMP_MIC_SUPPORTED
7174 __kmp_check_mic_type();
7175#endif
7176
7177// Some global variable initialization moved here from kmp_env_initialize()
7178#ifdef KMP_DEBUG
7179 kmp_diag = 0;
7180#endif
7181 __kmp_abort_delay = 0;
7182
7183 // From __kmp_init_dflt_team_nth()
7184 /* assume the entire machine will be used */
7185 __kmp_dflt_team_nth_ub = __kmp_xproc;
7186 if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
7187 __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
7188 }
7189 if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
7190 __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
7191 }
7192 __kmp_max_nth = __kmp_sys_max_nth;
7193 __kmp_cg_max_nth = __kmp_sys_max_nth;
7194 __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
7195 if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
7196 __kmp_teams_max_nth = __kmp_sys_max_nth;
7197 }
7198
7199 // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
7200 // part
7201 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
7202#if KMP_USE_MONITOR
7203 __kmp_monitor_wakeups =
7204 KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7205 __kmp_bt_intervals =
7206 KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7207#endif
7208 // From "KMP_LIBRARY" part of __kmp_env_initialize()
7209 __kmp_library = library_throughput;
7210 // From KMP_SCHEDULE initialization
7211 __kmp_static = kmp_sch_static_balanced;
7212// AC: do not use analytical here, because it is non-monotonous
7213//__kmp_guided = kmp_sch_guided_iterative_chunked;
7214//__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
7215// need to repeat assignment
7216// Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
7217// bit control and barrier method control parts
7218#if KMP_FAST_REDUCTION_BARRIER
7219#define kmp_reduction_barrier_gather_bb ((int)1)
7220#define kmp_reduction_barrier_release_bb ((int)1)
7221#define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
7222#define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
7223#endif // KMP_FAST_REDUCTION_BARRIER
7224 for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
7225 __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
7226 __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
7227 __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
7228 __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
7229#if KMP_FAST_REDUCTION_BARRIER
7230 if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
7231 // lin_64 ): hyper,1
7232 __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
7233 __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
7234 __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
7235 __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
7236 }
7237#endif // KMP_FAST_REDUCTION_BARRIER
7238 }
7239#if KMP_FAST_REDUCTION_BARRIER
7240#undef kmp_reduction_barrier_release_pat
7241#undef kmp_reduction_barrier_gather_pat
7242#undef kmp_reduction_barrier_release_bb
7243#undef kmp_reduction_barrier_gather_bb
7244#endif // KMP_FAST_REDUCTION_BARRIER
7245#if KMP_MIC_SUPPORTED
7246 if (__kmp_mic_type == mic2) { // KNC
7247 // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
7248 __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
7249 __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
7250 1; // forkjoin release
7251 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7252 __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7253 }
7254#if KMP_FAST_REDUCTION_BARRIER
7255 if (__kmp_mic_type == mic2) { // KNC
7256 __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7257 __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7258 }
7259#endif // KMP_FAST_REDUCTION_BARRIER
7260#endif // KMP_MIC_SUPPORTED
7261
7262// From KMP_CHECKS initialization
7263#ifdef KMP_DEBUG
7264 __kmp_env_checks = TRUE; /* development versions have the extra checks */
7265#else
7266 __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
7267#endif
7268
7269 // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
7270 __kmp_foreign_tp = TRUE;
7271
7272 __kmp_global.g.g_dynamic = FALSE;
7273 __kmp_global.g.g_dynamic_mode = dynamic_default;
7274
7275 __kmp_init_nesting_mode();
7276
7277 __kmp_env_initialize(NULL);
7278
7279#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
7280 __kmp_user_level_mwait_init();
7281#endif
7282// Print all messages in message catalog for testing purposes.
7283#ifdef KMP_DEBUG
7284 char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
7285 if (__kmp_str_match_true(val)) {
7286 kmp_str_buf_t buffer;
7287 __kmp_str_buf_init(&buffer);
7288 __kmp_i18n_dump_catalog(&buffer);
7289 __kmp_printf("%s", buffer.str);
7290 __kmp_str_buf_free(&buffer);
7291 }
7292 __kmp_env_free(&val);
7293#endif
7294
7295 __kmp_threads_capacity =
7296 __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
7297 // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
7298 __kmp_tp_capacity = __kmp_default_tp_capacity(
7299 __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
7300
7301 // If the library is shut down properly, both pools must be NULL. Just in
7302 // case, set them to NULL -- some memory may leak, but subsequent code will
7303 // work even if pools are not freed.
7304 KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
7305 KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
7306 KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
7307 __kmp_thread_pool = NULL;
7308 __kmp_thread_pool_insert_pt = NULL;
7309 __kmp_team_pool = NULL;
7310
7311 /* Allocate all of the variable sized records */
7312 /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
7313 * expandable */
7314 /* Since allocation is cache-aligned, just add extra padding at the end */
7315 size =
7316 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
7317 CACHE_LINE;
7318 __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
7319 __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
7320 sizeof(kmp_info_t *) * __kmp_threads_capacity);
7321
7322 /* init thread counts */
7323 KMP_DEBUG_ASSERT(__kmp_all_nth ==
7324 0); // Asserts fail if the library is reinitializing and
7325 KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
7326 __kmp_all_nth = 0;
7327 __kmp_nth = 0;
7328
7329 /* setup the uber master thread and hierarchy */
7330 gtid = __kmp_register_root(TRUE);
7331 KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
7332 KMP_ASSERT(KMP_UBER_GTID(gtid));
7333 KMP_ASSERT(KMP_INITIAL_GTID(gtid));
7334
7335 KMP_MB(); /* Flush all pending memory write invalidates. */
7336
7337 __kmp_common_initialize();
7338
7339#if KMP_OS_UNIX
7340 /* invoke the child fork handler */
7341 __kmp_register_atfork();
7342#endif
7343
7344#if !KMP_DYNAMIC_LIB || \
7345 ((KMP_COMPILER_ICC || KMP_COMPILER_ICX) && KMP_OS_DARWIN)
7346 {
7347 /* Invoke the exit handler when the program finishes, only for static
7348 library and macOS* dynamic. For other dynamic libraries, we already
7349 have _fini and DllMain. */
7350 int rc = atexit(__kmp_internal_end_atexit);
7351 if (rc != 0) {
7352 __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
7353 __kmp_msg_null);
7354 }
7355 }
7356#endif
7357
7358#if KMP_HANDLE_SIGNALS
7359#if KMP_OS_UNIX
7360 /* NOTE: make sure that this is called before the user installs their own
7361 signal handlers so that the user handlers are called first. this way they
7362 can return false, not call our handler, avoid terminating the library, and
7363 continue execution where they left off. */
7364 __kmp_install_signals(FALSE);
7365#endif /* KMP_OS_UNIX */
7366#if KMP_OS_WINDOWS
7367 __kmp_install_signals(TRUE);
7368#endif /* KMP_OS_WINDOWS */
7369#endif
7370
7371 /* we have finished the serial initialization */
7372 __kmp_init_counter++;
7373
7374 __kmp_init_serial = TRUE;
7375
7376 if (__kmp_version) {
7377 __kmp_print_version_1();
7378 }
7379
7380 if (__kmp_settings) {
7381 __kmp_env_print();
7382 }
7383
7384 if (__kmp_display_env || __kmp_display_env_verbose) {
7385 __kmp_env_print_2();
7386 }
7387
7388#if OMPT_SUPPORT
7389 ompt_post_init();
7390#endif
7391
7392 KMP_MB();
7393
7394 KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7395}
7396
7397void __kmp_serial_initialize(void) {
7398 if (__kmp_init_serial) {
7399 return;
7400 }
7401 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7402 if (__kmp_init_serial) {
7403 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7404 return;
7405 }
7406 __kmp_do_serial_initialize();
7407 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7408}
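// The *_initialize entry points (serial, middle, parallel, hidden helper)
// share a double-checked pattern: an unsynchronized fast-path read of the
// init flag, then the same check again under __kmp_initz_lock before doing
// the work. A minimal sketch of that pattern (not compiled; the
// example_init_flag / example_do_init names are hypothetical):
#if 0
static void example_double_checked_init(void) {
  if (example_init_flag) // fast path: already initialized, no lock taken
    return;
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
  if (!example_init_flag) // re-check under the lock
    example_do_init(); // sets example_init_flag when it completes
  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
}
#endif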
7409
7410static void __kmp_do_middle_initialize(void) {
7411 int i, j;
7412 int prev_dflt_team_nth;
7413
7414 if (!__kmp_init_serial) {
7415 __kmp_do_serial_initialize();
7416 }
7417
7418 KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7419
7420 if (UNLIKELY(!__kmp_need_register_serial)) {
7421 // We are in a forked child process. The registration was skipped during
7422 // serial initialization in __kmp_atfork_child handler. Do it here.
7423 __kmp_register_library_startup();
7424 }
7425
7426 // Save the previous value for the __kmp_dflt_team_nth so that
7427 // we can avoid some reinitialization if it hasn't changed.
7428 prev_dflt_team_nth = __kmp_dflt_team_nth;
7429
7430#if KMP_AFFINITY_SUPPORTED
7431 // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7432 // number of cores on the machine.
7433 __kmp_affinity_initialize(__kmp_affinity);
7434
7435#endif /* KMP_AFFINITY_SUPPORTED */
7436
7437 KMP_ASSERT(__kmp_xproc > 0);
7438 if (__kmp_avail_proc == 0) {
7439 __kmp_avail_proc = __kmp_xproc;
7440 }
7441
7442 // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7443 // correct them now
7444 j = 0;
7445 while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7446 __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7447 __kmp_avail_proc;
7448 j++;
7449 }
7450
7451 if (__kmp_dflt_team_nth == 0) {
7452#ifdef KMP_DFLT_NTH_CORES
7453 // Default #threads = #cores
7454 __kmp_dflt_team_nth = __kmp_ncores;
7455 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7456 "__kmp_ncores (%d)\n",
7457 __kmp_dflt_team_nth));
7458#else
7459 // Default #threads = #available OS procs
7460 __kmp_dflt_team_nth = __kmp_avail_proc;
7461 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7462 "__kmp_avail_proc(%d)\n",
7463 __kmp_dflt_team_nth));
7464#endif /* KMP_DFLT_NTH_CORES */
7465 }
7466
7467 if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7468 __kmp_dflt_team_nth = KMP_MIN_NTH;
7469 }
7470 if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7471 __kmp_dflt_team_nth = __kmp_sys_max_nth;
7472 }
7473
7474 if (__kmp_nesting_mode > 0)
7475 __kmp_set_nesting_mode_threads();
7476
7477 // There's no harm in continuing if the following check fails,
7478 // but it indicates an error in the previous logic.
7479 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7480
7481 if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7482 // Run through the __kmp_threads array and set the num threads icv for each
7483 // root thread that is currently registered with the RTL (which has not
7484 // already explicitly set its nthreads-var with a call to
7485 // omp_set_num_threads()).
7486 for (i = 0; i < __kmp_threads_capacity; i++) {
7487 kmp_info_t *thread = __kmp_threads[i];
7488 if (thread == NULL)
7489 continue;
7490 if (thread->th.th_current_task->td_icvs.nproc != 0)
7491 continue;
7492
7493 set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7494 }
7495 }
7496 KA_TRACE(
7497 20,
7498 ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7499 __kmp_dflt_team_nth));
7500
7501#ifdef KMP_ADJUST_BLOCKTIME
7502 /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
7503 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7504 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7505 if (__kmp_nth > __kmp_avail_proc) {
7506 __kmp_zero_bt = TRUE;
7507 }
7508 }
7509#endif /* KMP_ADJUST_BLOCKTIME */
7510
7511 /* we have finished middle initialization */
7512 TCW_SYNC_4(__kmp_init_middle, TRUE);
7513
7514 KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7515}
7516
7517void __kmp_middle_initialize(void) {
7518 if (__kmp_init_middle) {
7519 return;
7520 }
7521 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7522 if (__kmp_init_middle) {
7523 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7524 return;
7525 }
7526 __kmp_do_middle_initialize();
7527 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7528}
7529
7530void __kmp_parallel_initialize(void) {
7531 int gtid = __kmp_entry_gtid(); // this might be a new root
7532
7533 /* synchronize parallel initialization (for sibling) */
7534 if (TCR_4(__kmp_init_parallel))
7535 return;
7536 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7537 if (TCR_4(__kmp_init_parallel)) {
7538 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7539 return;
7540 }
7541
7542 /* TODO reinitialization after we have already shut down */
7543 if (TCR_4(__kmp_global.g.g_done)) {
7544 KA_TRACE(
7545 10,
7546 ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7547 __kmp_infinite_loop();
7548 }
7549
7550 /* jc: The lock __kmp_initz_lock is already held, so calling
7551 __kmp_serial_initialize would cause a deadlock. So we call
7552 __kmp_do_serial_initialize directly. */
7553 if (!__kmp_init_middle) {
7554 __kmp_do_middle_initialize();
7555 }
7556 __kmp_assign_root_init_mask();
7557 __kmp_resume_if_hard_paused();
7558
7559 /* begin initialization */
7560 KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7561 KMP_ASSERT(KMP_UBER_GTID(gtid));
7562
7563#if KMP_ARCH_X86 || KMP_ARCH_X86_64
7564 // Save the FP control regs.
7565 // Worker threads will set theirs to these values at thread startup.
7566 __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7567 __kmp_store_mxcsr(&__kmp_init_mxcsr);
7568 __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7569#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7570
7571#if KMP_OS_UNIX
7572#if KMP_HANDLE_SIGNALS
7573 /* must be after __kmp_serial_initialize */
7574 __kmp_install_signals(TRUE);
7575#endif
7576#endif
7577
7578 __kmp_suspend_initialize();
7579
7580#if defined(USE_LOAD_BALANCE)
7581 if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7582 __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7583 }
7584#else
7585 if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7586 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7587 }
7588#endif
7589
7590 if (__kmp_version) {
7591 __kmp_print_version_2();
7592 }
7593
7594 /* we have finished parallel initialization */
7595 TCW_SYNC_4(__kmp_init_parallel, TRUE);
7596
7597 KMP_MB();
7598 KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7599
7600 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7601}
7602
7603void __kmp_hidden_helper_initialize() {
7604 if (TCR_4(__kmp_init_hidden_helper))
7605 return;
7606
7607 // __kmp_parallel_initialize is required before we initialize hidden helper
7608 if (!TCR_4(__kmp_init_parallel))
7609 __kmp_parallel_initialize();
7610
7611 // Double check. Note that this double check should not be placed before
7612 // __kmp_parallel_initialize as it will cause deadlock.
7613 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7614 if (TCR_4(__kmp_init_hidden_helper)) {
7615 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7616 return;
7617 }
7618
7619#if KMP_AFFINITY_SUPPORTED
7620 // Initialize hidden helper affinity settings.
7621 // The above __kmp_parallel_initialize() will initialize
7622 // regular affinity (and topology) if not already done.
7623 if (!__kmp_hh_affinity.flags.initialized)
7624 __kmp_affinity_initialize(__kmp_hh_affinity);
7625#endif
7626
7627 // Set the count of hidden helper tasks to be executed to zero
7628 KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7629
7630 // Set the global variable indicating that we're initializing hidden helper
7631 // team/threads
7632 TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7633
7634 // Platform independent initialization
7635 __kmp_do_initialize_hidden_helper_threads();
7636
7637 // Wait here for the finish of initialization of hidden helper teams
7638 __kmp_hidden_helper_threads_initz_wait();
7639
7640 // We have finished hidden helper initialization
7641 TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7642
7643 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7644}
7645
7646/* ------------------------------------------------------------------------ */
7647
7648void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7649 kmp_team_t *team) {
7650 kmp_disp_t *dispatch;
7651
7652 KMP_MB();
7653
7654 /* none of the threads have encountered any constructs, yet. */
7655 this_thr->th.th_local.this_construct = 0;
7656#if KMP_CACHE_MANAGE
7657 KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7658#endif /* KMP_CACHE_MANAGE */
7659 dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7660 KMP_DEBUG_ASSERT(dispatch);
7661 KMP_DEBUG_ASSERT(team->t.t_dispatch);
7662 // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7663 // this_thr->th.th_info.ds.ds_tid ] );
7664
7665 dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7666 dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7667 if (__kmp_env_consistency_check)
7668 __kmp_push_parallel(gtid, team->t.t_ident);
7669
7670 KMP_MB(); /* Flush all pending memory write invalidates. */
7671}
7672
7673void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7674 kmp_team_t *team) {
7675 if (__kmp_env_consistency_check)
7676 __kmp_pop_parallel(gtid, team->t.t_ident);
7677
7678 __kmp_finish_implicit_task(this_thr);
7679}
7680
7681int __kmp_invoke_task_func(int gtid) {
7682 int rc;
7683 int tid = __kmp_tid_from_gtid(gtid);
7684 kmp_info_t *this_thr = __kmp_threads[gtid];
7685 kmp_team_t *team = this_thr->th.th_team;
7686
7687 __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7688#if USE_ITT_BUILD
7689 if (__itt_stack_caller_create_ptr) {
7690 // inform ittnotify about entering user's code
7691 if (team->t.t_stack_id != NULL) {
7692 __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7693 } else {
7694 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7695 __kmp_itt_stack_callee_enter(
7696 (__itt_caller)team->t.t_parent->t.t_stack_id);
7697 }
7698 }
7699#endif /* USE_ITT_BUILD */
7700#if INCLUDE_SSC_MARKS
7701 SSC_MARK_INVOKING();
7702#endif
7703
7704#if OMPT_SUPPORT
7705 void *dummy;
7706 void **exit_frame_p;
7707 ompt_data_t *my_task_data;
7708 ompt_data_t *my_parallel_data;
7709 int ompt_team_size;
7710
7711 if (ompt_enabled.enabled) {
7712 exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7713 .ompt_task_info.frame.exit_frame.ptr);
7714 } else {
7715 exit_frame_p = &dummy;
7716 }
7717
7718 my_task_data =
7719 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7720 my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7721 if (ompt_enabled.ompt_callback_implicit_task) {
7722 ompt_team_size = team->t.t_nproc;
7723 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7724 ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7725 __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7726 OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7727 }
7728#endif
7729
7730#if KMP_STATS_ENABLED
7731 stats_state_e previous_state = KMP_GET_THREAD_STATE();
7732 if (previous_state == stats_state_e::TEAMS_REGION) {
7733 KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7734 } else {
7735 KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7736 }
7737 KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7738#endif
7739
7740 rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7741 tid, (int)team->t.t_argc, (void **)team->t.t_argv
7742#if OMPT_SUPPORT
7743 ,
7744 exit_frame_p
7745#endif
7746 );
7747#if OMPT_SUPPORT
7748 *exit_frame_p = NULL;
7749 this_thr->th.ompt_thread_info.parallel_flags = ompt_parallel_team;
7750#endif
7751
7752#if KMP_STATS_ENABLED
7753 if (previous_state == stats_state_e::TEAMS_REGION) {
7754 KMP_SET_THREAD_STATE(previous_state);
7755 }
7756 KMP_POP_PARTITIONED_TIMER();
7757#endif
7758
7759#if USE_ITT_BUILD
7760 if (__itt_stack_caller_create_ptr) {
7761 // inform ittnotify about leaving user's code
7762 if (team->t.t_stack_id != NULL) {
7763 __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7764 } else {
7765 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7766 __kmp_itt_stack_callee_leave(
7767 (__itt_caller)team->t.t_parent->t.t_stack_id);
7768 }
7769 }
7770#endif /* USE_ITT_BUILD */
7771 __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7772
7773 return rc;
7774}
7775
7776void __kmp_teams_master(int gtid) {
7777 // This routine is called by all primary threads in teams construct
7778 kmp_info_t *thr = __kmp_threads[gtid];
7779 kmp_team_t *team = thr->th.th_team;
7780 ident_t *loc = team->t.t_ident;
7781 thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7782 KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7783 KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7784 KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7785 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7786
7787 // This thread is a new CG root. Set up the proper variables.
7788 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7789 tmp->cg_root = thr; // Make thr the CG root
7790 // Init to thread limit stored when league primary threads were forked
7791 tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7792 tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7793 KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7794 " cg_nthreads to 1\n",
7795 thr, tmp));
7796 tmp->up = thr->th.th_cg_roots;
7797 thr->th.th_cg_roots = tmp;
7798
7799// Launch the league of teams now, but do not let workers execute
7800// (they hang on fork barrier until next parallel)
7801#if INCLUDE_SSC_MARKS
7802 SSC_MARK_FORKING();
7803#endif
7804 __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7805 (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7806 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7807#if INCLUDE_SSC_MARKS
7808 SSC_MARK_JOINING();
7809#endif
7810 // If the team size was reduced from the limit, set it to the new size
7811 if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7812 thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7813 // AC: last parameter "1" eliminates join barrier which won't work because
7814 // worker threads are in a fork barrier waiting for more parallel regions
7815 __kmp_join_call(loc, gtid
7816#if OMPT_SUPPORT
7817 ,
7818 fork_context_intel
7819#endif
7820 ,
7821 1);
7822}
7823
7824int __kmp_invoke_teams_master(int gtid) {
7825 kmp_info_t *this_thr = __kmp_threads[gtid];
7826 kmp_team_t *team = this_thr->th.th_team;
7827#if KMP_DEBUG
7828 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7829 KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7830 (void *)__kmp_teams_master);
7831#endif
7832 __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7833#if OMPT_SUPPORT
7834 int tid = __kmp_tid_from_gtid(gtid);
7835 ompt_data_t *task_data =
7836 &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7837 ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7838 if (ompt_enabled.ompt_callback_implicit_task) {
7839 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7840 ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7841 ompt_task_initial);
7842 OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7843 }
7844#endif
7845 __kmp_teams_master(gtid);
7846#if OMPT_SUPPORT
7847 this_thr->th.ompt_thread_info.parallel_flags = ompt_parallel_league;
7848#endif
7849 __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7850 return 1;
7851}
7852
7853/* This sets the requested number of threads for the next parallel region
7854 encountered by this team. Since this should be enclosed in the fork/join
7855 critical section, it should avoid race conditions with asymmetrical nested
7856 parallelism. */
7857void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7858 kmp_info_t *thr = __kmp_threads[gtid];
7859
7860 if (num_threads > 0)
7861 thr->th.th_set_nproc = num_threads;
7862}
7863
7864void __kmp_push_num_threads_list(ident_t *id, int gtid, kmp_uint32 list_length,
7865 int *num_threads_list) {
7866 kmp_info_t *thr = __kmp_threads[gtid];
7867
7868 KMP_DEBUG_ASSERT(list_length > 1);
7869
7870 if (num_threads_list[0] > 0)
7871 thr->th.th_set_nproc = num_threads_list[0];
7872 thr->th.th_set_nested_nth =
7873 (int *)KMP_INTERNAL_MALLOC(list_length * sizeof(int));
7874 for (kmp_uint32 i = 0; i < list_length; ++i)
7875 thr->th.th_set_nested_nth[i] = num_threads_list[i];
7876 thr->th.th_set_nested_nth_sz = list_length;
7877}
7878
7879void __kmp_set_strict_num_threads(ident_t *loc, int gtid, int sev,
7880 const char *msg) {
7881 kmp_info_t *thr = __kmp_threads[gtid];
7882 thr->th.th_nt_strict = true;
7883 thr->th.th_nt_loc = loc;
7884 // if sev is unset make fatal
7885 if (sev == severity_warning)
7886 thr->th.th_nt_sev = sev;
7887 else
7888 thr->th.th_nt_sev = severity_fatal;
7889 // if msg is unset, use an appropriate message
7890 if (msg)
7891 thr->th.th_nt_msg = msg;
7892 else
7893 thr->th.th_nt_msg = "Cannot form team with number of threads specified by "
7894 "strict num_threads clause.";
7895}
7896
7897static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7898 int num_threads) {
7899 KMP_DEBUG_ASSERT(thr);
7900 // Remember the number of threads for inner parallel regions
7901 if (!TCR_4(__kmp_init_middle))
7902 __kmp_middle_initialize(); // get internal globals calculated
7903 __kmp_assign_root_init_mask();
7904 KMP_DEBUG_ASSERT(__kmp_avail_proc);
7905 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7906
7907 if (num_threads == 0) {
7908 if (__kmp_teams_thread_limit > 0) {
7909 num_threads = __kmp_teams_thread_limit;
7910 } else {
7911 num_threads = __kmp_avail_proc / num_teams;
7912 }
7913 // adjust num_threads without a warning, as it is not a user setting
7914 // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7915 // no thread_limit clause specified - do not change thread-limit-var ICV
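// Illustrative example (hypothetical values, not taken from any real system):
// with KMP_TEAMS_THREAD_LIMIT unset, __kmp_avail_proc == 16 and num_teams == 4,
// the starting point is num_threads = 16 / 4 = 4, which the checks below may
// further clamp by nthreads-var, thread-limit-var and __kmp_teams_max_nth.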
7916 if (num_threads > __kmp_dflt_team_nth) {
7917 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7918 }
7919 if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7920 num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7921 } // prevent team size from exceeding thread-limit-var
7922 if (num_teams * num_threads > __kmp_teams_max_nth) {
7923 num_threads = __kmp_teams_max_nth / num_teams;
7924 }
7925 if (num_threads == 0) {
7926 num_threads = 1;
7927 }
7928 } else {
7929 if (num_threads < 0) {
7930 __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
7931 __kmp_msg_null);
7932 num_threads = 1;
7933 }
7934 // This thread will be the primary thread of the league's primary threads
7935 // Store new thread limit; old limit is saved in th_cg_roots list
7936 thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7937 // num_threads = min(num_threads, nthreads-var)
7938 if (num_threads > __kmp_dflt_team_nth) {
7939 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7940 }
7941 if (num_teams * num_threads > __kmp_teams_max_nth) {
7942 int new_threads = __kmp_teams_max_nth / num_teams;
7943 if (new_threads == 0) {
7944 new_threads = 1;
7945 }
7946 if (new_threads != num_threads) {
7947 if (!__kmp_reserve_warn) { // user asked for too many threads
7948 __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7949 __kmp_msg(kmp_ms_warning,
7950 KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7951 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7952 }
7953 }
7954 num_threads = new_threads;
7955 }
7956 }
7957 thr->th.th_teams_size.nth = num_threads;
7958}
7959
7960/* this sets the requested number of teams for the teams region and/or
7961 the number of threads for the next parallel region encountered */
7962void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7963 int num_threads) {
7964 kmp_info_t *thr = __kmp_threads[gtid];
7965 if (num_teams < 0) {
7966 // OpenMP specification requires requested values to be positive,
7967 // but people can send us any value, so we'd better check
7968 __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
7969 __kmp_msg_null);
7970 num_teams = 1;
7971 }
7972 if (num_teams == 0) {
7973 if (__kmp_nteams > 0) {
7974 num_teams = __kmp_nteams;
7975 } else {
7976 num_teams = 1; // default number of teams is 1.
7977 }
7978 }
7979 if (num_teams > __kmp_teams_max_nth) { // too many teams requested?
7980 if (!__kmp_reserve_warn) {
7981 __kmp_reserve_warn = 1;
7982 __kmp_msg(kmp_ms_warning,
7983 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7984 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7985 }
7986 num_teams = __kmp_teams_max_nth;
7987 }
7988 // Set number of teams (number of threads in the outer "parallel" of the
7989 // teams)
7990 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7991
7992 __kmp_push_thread_limit(thr, num_teams, num_threads);
7993}
7994
7995/* This sets the requested number of teams for the teams region and/or
7996 the number of threads for the next parallel region encountered */
7997void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7998 int num_teams_ub, int num_threads) {
7999 kmp_info_t *thr = __kmp_threads[gtid];
8000 KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
8001 KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
8002 KMP_DEBUG_ASSERT(num_threads >= 0);
8003
8004 if (num_teams_lb > num_teams_ub) {
8005 __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
8006 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
8007 }
8008
8009 int num_teams = 1; // default number of teams is 1.
8010
8011 if (num_teams_lb == 0 && num_teams_ub > 0)
8012 num_teams_lb = num_teams_ub;
8013
8014 if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
8015 num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
8016 if (num_teams > __kmp_teams_max_nth) {
8017 if (!__kmp_reserve_warn) {
8018 __kmp_reserve_warn = 1;
8019 __kmp_msg(kmp_ms_warning,
8020 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
8021 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
8022 }
8023 num_teams = __kmp_teams_max_nth;
8024 }
8025 } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
8026 num_teams = num_teams_ub;
8027 } else { // num_teams_lb <= num_teams <= num_teams_ub
8028 if (num_threads <= 0) {
8029 if (num_teams_ub > __kmp_teams_max_nth) {
8030 num_teams = num_teams_lb;
8031 } else {
8032 num_teams = num_teams_ub;
8033 }
8034 } else {
8035 num_teams = (num_threads > __kmp_teams_max_nth)
8036 ? num_teams
8037 : __kmp_teams_max_nth / num_threads;
8038 if (num_teams < num_teams_lb) {
8039 num_teams = num_teams_lb;
8040 } else if (num_teams > num_teams_ub) {
8041 num_teams = num_teams_ub;
8042 }
8043 }
8044 }
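// Worked example (hypothetical values): with num_teams_lb = 2, num_teams_ub = 8,
// num_threads = 4 and __kmp_teams_max_nth = 16, the branch above selects
// num_teams = 16 / 4 = 4, which already lies within [2, 8].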
8045 // Set number of teams (number of threads in the outer "parallel" of the
8046 // teams)
8047 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
8048
8049 __kmp_push_thread_limit(thr, num_teams, num_threads);
8050}
8051
8052// Set the proc_bind var to use in the following parallel region.
8053void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
8054 kmp_info_t *thr = __kmp_threads[gtid];
8055 thr->th.th_set_proc_bind = proc_bind;
8056}
8057
8058/* Launch the worker threads into the microtask. */
8059
8060void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
8061 kmp_info_t *this_thr = __kmp_threads[gtid];
8062
8063#ifdef KMP_DEBUG
8064 int f;
8065#endif /* KMP_DEBUG */
8066
8067 KMP_DEBUG_ASSERT(team);
8068 KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8069 KMP_ASSERT(KMP_MASTER_GTID(gtid));
8070 KMP_MB(); /* Flush all pending memory write invalidates. */
8071
8072 team->t.t_construct = 0; /* no single directives seen yet */
8073 team->t.t_ordered.dt.t_value =
8074 0; /* thread 0 enters the ordered section first */
8075
8076 /* Reset the identifiers on the dispatch buffer */
8077 KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
8078 if (team->t.t_max_nproc > 1) {
8079 int i;
8080 for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
8081 team->t.t_disp_buffer[i].buffer_index = i;
8082 team->t.t_disp_buffer[i].doacross_buf_idx = i;
8083 }
8084 } else {
8085 team->t.t_disp_buffer[0].buffer_index = 0;
8086 team->t.t_disp_buffer[0].doacross_buf_idx = 0;
8087 }
8088
8089 KMP_MB(); /* Flush all pending memory write invalidates. */
8090 KMP_ASSERT(this_thr->th.th_team == team);
8091
8092#ifdef KMP_DEBUG
8093 for (f = 0; f < team->t.t_nproc; f++) {
8094 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
8095 team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
8096 }
8097#endif /* KMP_DEBUG */
8098
8099 /* release the worker threads so they may begin working */
8100 __kmp_fork_barrier(gtid, 0);
8101}
8102
8103void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
8104 kmp_info_t *this_thr = __kmp_threads[gtid];
8105
8106 KMP_DEBUG_ASSERT(team);
8107 KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8108 KMP_ASSERT(KMP_MASTER_GTID(gtid));
8109 KMP_MB(); /* Flush all pending memory write invalidates. */
8110
8111 /* Join barrier after fork */
8112
8113#ifdef KMP_DEBUG
8114 if (__kmp_threads[gtid] &&
8115 __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
8116 __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
8117 __kmp_threads[gtid]);
8118 __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
8119 "team->t.t_nproc=%d\n",
8120 gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
8121 team->t.t_nproc);
8122 __kmp_print_structure();
8123 }
8124 KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
8125 __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
8126#endif /* KMP_DEBUG */
8127
8128 __kmp_join_barrier(gtid); /* wait for everyone */
8129#if OMPT_SUPPORT
8130 ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state;
8131 if (ompt_enabled.enabled &&
8132 (ompt_state == ompt_state_wait_barrier_teams ||
8133 ompt_state == ompt_state_wait_barrier_implicit_parallel)) {
8134 int ds_tid = this_thr->th.th_info.ds.ds_tid;
8135 ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
8136 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
8137#if OMPT_OPTIONAL
8138 void *codeptr = NULL;
8139 if (KMP_MASTER_TID(ds_tid) &&
8140 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
8141 ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
8142 codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
8143
8144 ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
8145 if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league)
8146 sync_kind = ompt_sync_region_barrier_teams;
8147 if (ompt_enabled.ompt_callback_sync_region_wait) {
8148 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
8149 sync_kind, ompt_scope_end, NULL, task_data, codeptr);
8150 }
8151 if (ompt_enabled.ompt_callback_sync_region) {
8152 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
8153 sync_kind, ompt_scope_end, NULL, task_data, codeptr);
8154 }
8155#endif
8156 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
8157 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
8158 ompt_scope_end, NULL, task_data, 0, ds_tid,
8159 ompt_task_implicit); // TODO: Can this be ompt_task_initial?
8160 }
8161 }
8162#endif
8163
8164 KMP_MB(); /* Flush all pending memory write invalidates. */
8165 KMP_ASSERT(this_thr->th.th_team == team);
8166}
8167
8168/* ------------------------------------------------------------------------ */
8169
8170#ifdef USE_LOAD_BALANCE
8171
8172// Return the number of worker threads actively spinning in the hot team, if
8173// we are at the outermost level of parallelism. Otherwise, return 0.
8174static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
8175 int i;
8176 int retval;
8177 kmp_team_t *hot_team;
8178
8179 if (root->r.r_active) {
8180 return 0;
8181 }
8182 hot_team = root->r.r_hot_team;
8183 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
8184 return hot_team->t.t_nproc - 1; // Don't count primary thread
8185 }
8186
8187 // Skip the primary thread - it is accounted for elsewhere.
8188 retval = 0;
8189 for (i = 1; i < hot_team->t.t_nproc; i++) {
8190 if (hot_team->t.t_threads[i]->th.th_active) {
8191 retval++;
8192 }
8193 }
8194 return retval;
8195}
8196
8197// Perform an automatic adjustment to the number of
8198// threads used by the next parallel region.
8199static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
8200 int retval;
8201 int pool_active;
8202 int hot_team_active;
8203 int team_curr_active;
8204 int system_active;
8205
8206 KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
8207 set_nproc));
8208 KMP_DEBUG_ASSERT(root);
8209 KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
8210 ->th.th_current_task->td_icvs.dynamic == TRUE);
8211 KMP_DEBUG_ASSERT(set_nproc > 1);
8212
8213 if (set_nproc == 1) {
8214 KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
8215 return 1;
8216 }
8217
8218 // Threads that are active in the thread pool, active in the hot team for this
8219 // particular root (if we are at the outer par level), and the currently
8220 // executing thread (to become the primary thread) are available to add to the
8221 // new team, but are currently contributing to the system load, and must be
8222 // accounted for.
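// Worked example (hypothetical values): with __kmp_avail_proc = 8,
// pool_active = 2 and hot_team_active = 3, team_curr_active = 6; if the system
// currently reports 10 active threads, the code below yields
// retval = 8 - 10 + 6 = 4, subsequently clamped to [KMP_MIN_NTH, set_nproc].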
8223 pool_active = __kmp_thread_pool_active_nth;
8224 hot_team_active = __kmp_active_hot_team_nproc(root);
8225 team_curr_active = pool_active + hot_team_active + 1;
8226
8227 // Check the system load.
8228 system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
8229 KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
8230 "hot team active = %d\n",
8231 system_active, pool_active, hot_team_active));
8232
8233 if (system_active < 0) {
8234 // There was an error reading the necessary info from /proc, so use the
8235 // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
8236 // = dynamic_thread_limit, we shouldn't wind up getting back here.
8237 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
8238 KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
8239
8240 // Make this call behave like the thread limit algorithm.
8241 retval = __kmp_avail_proc - __kmp_nth +
8242 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
8243 if (retval > set_nproc) {
8244 retval = set_nproc;
8245 }
8246 if (retval < KMP_MIN_NTH) {
8247 retval = KMP_MIN_NTH;
8248 }
8249
8250 KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
8251 retval));
8252 return retval;
8253 }
8254
8255 // There is a slight delay in the load balance algorithm in detecting new
8256 // running procs. The real system load at this instant should be at least as
8257 // large as the number of active OpenMP threads available to add to the team.
8258 if (system_active < team_curr_active) {
8259 system_active = team_curr_active;
8260 }
8261 retval = __kmp_avail_proc - system_active + team_curr_active;
8262 if (retval > set_nproc) {
8263 retval = set_nproc;
8264 }
8265 if (retval < KMP_MIN_NTH) {
8266 retval = KMP_MIN_NTH;
8267 }
8268
8269 KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
8270 return retval;
8271} // __kmp_load_balance_nproc()
8272
8273#endif /* USE_LOAD_BALANCE */
8274
8275/* ------------------------------------------------------------------------ */
8276
8277/* NOTE: this is called with the __kmp_init_lock held */
8278void __kmp_cleanup(void) {
8279 int f;
8280
8281 KA_TRACE(10, ("__kmp_cleanup: enter\n"));
8282
8283 if (TCR_4(__kmp_init_parallel)) {
8284#if KMP_HANDLE_SIGNALS
8285 __kmp_remove_signals();
8286#endif
8287 TCW_4(__kmp_init_parallel, FALSE);
8288 }
8289
8290 if (TCR_4(__kmp_init_middle)) {
8291#if KMP_AFFINITY_SUPPORTED
8292 __kmp_affinity_uninitialize();
8293#endif /* KMP_AFFINITY_SUPPORTED */
8294 __kmp_cleanup_hierarchy();
8295 TCW_4(__kmp_init_middle, FALSE);
8296 }
8297
8298 KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
8299
8300 if (__kmp_init_serial) {
8301 __kmp_runtime_destroy();
8302 __kmp_init_serial = FALSE;
8303 }
8304
8305 __kmp_cleanup_threadprivate_caches();
8306
8307 for (f = 0; f < __kmp_threads_capacity; f++) {
8308 if (__kmp_root[f] != NULL) {
8309 __kmp_free(__kmp_root[f]);
8310 __kmp_root[f] = NULL;
8311 }
8312 }
8313 __kmp_free(__kmp_threads);
8314 // __kmp_threads and __kmp_root were allocated at once as a single block, so
8315 // there is no need to free __kmp_root separately.
8316 __kmp_threads = NULL;
8317 __kmp_root = NULL;
8318 __kmp_threads_capacity = 0;
8319
8320 // Free old __kmp_threads arrays if they exist.
8321 kmp_old_threads_list_t *ptr = __kmp_old_threads_list;
8322 while (ptr) {
8323 kmp_old_threads_list_t *next = ptr->next;
8324 __kmp_free(ptr->threads);
8325 __kmp_free(ptr);
8326 ptr = next;
8327 }
8328
8329#if KMP_USE_DYNAMIC_LOCK
8330 __kmp_cleanup_indirect_user_locks();
8331#else
8332 __kmp_cleanup_user_locks();
8333#endif
8334#if OMPD_SUPPORT
8335 if (ompd_state) {
8336 __kmp_free(ompd_env_block);
8337 ompd_env_block = NULL;
8338 ompd_env_block_size = 0;
8339 }
8340#endif
8341
8342#if KMP_AFFINITY_SUPPORTED
8343 KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
8344 __kmp_cpuinfo_file = NULL;
8345#endif /* KMP_AFFINITY_SUPPORTED */
8346
8347#if KMP_USE_ADAPTIVE_LOCKS
8348#if KMP_DEBUG_ADAPTIVE_LOCKS
8349 __kmp_print_speculative_stats();
8350#endif
8351#endif
8352 KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
8353 __kmp_nested_nth.nth = NULL;
8354 __kmp_nested_nth.size = 0;
8355 __kmp_nested_nth.used = 0;
8356
8357 KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
8358 __kmp_nested_proc_bind.bind_types = NULL;
8359 __kmp_nested_proc_bind.size = 0;
8360 __kmp_nested_proc_bind.used = 0;
8361 if (__kmp_affinity_format) {
8362 KMP_INTERNAL_FREE(__kmp_affinity_format);
8363 __kmp_affinity_format = NULL;
8364 }
8365
8366 __kmp_i18n_catclose();
8367
8368#if KMP_USE_HIER_SCHED
8369 __kmp_hier_scheds.deallocate();
8370#endif
8371
8372#if KMP_STATS_ENABLED
8373 __kmp_stats_fini();
8374#endif
8375
8376 KA_TRACE(10, ("__kmp_cleanup: exit\n"));
8377}
8378
8379/* ------------------------------------------------------------------------ */
8380
8381int __kmp_ignore_mppbeg(void) {
8382 char *env;
8383
8384 if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8385 if (__kmp_str_match_false(env))
8386 return FALSE;
8387 }
8388 // By default __kmpc_begin() is no-op.
8389 return TRUE;
8390}
8391
8392int __kmp_ignore_mppend(void) {
8393 char *env;
8394
8395 if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8396 if (__kmp_str_match_false(env))
8397 return FALSE;
8398 }
8399 // By default __kmpc_end() is no-op.
8400 return TRUE;
8401}
8402
8403void __kmp_internal_begin(void) {
8404 int gtid;
8405 kmp_root_t *root;
8406
8407 /* this is a very important step as it will register new sibling threads
8408 and assign these new uber threads a new gtid */
8409 gtid = __kmp_entry_gtid();
8410 root = __kmp_threads[gtid]->th.th_root;
8411 KMP_ASSERT(KMP_UBER_GTID(gtid));
8412
8413 if (root->r.r_begin)
8414 return;
8415 __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
8416 if (root->r.r_begin) {
8417 __kmp_release_lock(&root->r.r_begin_lock, gtid);
8418 return;
8419 }
8420
8421 root->r.r_begin = TRUE;
8422
8423 __kmp_release_lock(&root->r.r_begin_lock, gtid);
8424}
8425
8426/* ------------------------------------------------------------------------ */
8427
8428void __kmp_user_set_library(enum library_type arg) {
8429 int gtid;
8430 kmp_root_t *root;
8431 kmp_info_t *thread;
8432
8433 /* first, make sure we are initialized so we can get our gtid */
8434
8435 gtid = __kmp_entry_gtid();
8436 thread = __kmp_threads[gtid];
8437
8438 root = thread->th.th_root;
8439
8440 KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8441 library_serial));
8442 if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8443 thread */
8444 KMP_WARNING(SetLibraryIncorrectCall);
8445 return;
8446 }
8447
8448 switch (arg) {
8449 case library_serial:
8450 thread->th.th_set_nproc = 0;
8451 set__nproc(thread, 1);
8452 break;
8453 case library_turnaround:
8454 thread->th.th_set_nproc = 0;
8455 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8456 : __kmp_dflt_team_nth_ub);
8457 break;
8458 case library_throughput:
8459 thread->th.th_set_nproc = 0;
8460 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8461 : __kmp_dflt_team_nth_ub);
8462 break;
8463 default:
8464 KMP_FATAL(UnknownLibraryType, arg);
8465 }
8466
8467 __kmp_aux_set_library(arg);
8468}
8469
8470void __kmp_aux_set_stacksize(size_t arg) {
8471 if (!__kmp_init_serial)
8472 __kmp_serial_initialize();
8473
8474#if KMP_OS_DARWIN
8475 if (arg & (0x1000 - 1)) {
8476 arg &= ~(0x1000 - 1);
8477 if (arg + 0x1000) /* check for overflow if we round up */
8478 arg += 0x1000;
8479 }
8480#endif
8481 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8482
8483 /* only change the default stacksize before the first parallel region */
8484 if (!TCR_4(__kmp_init_parallel)) {
8485 size_t value = arg; /* argument is in bytes */
8486
8487 if (value < __kmp_sys_min_stksize)
8488 value = __kmp_sys_min_stksize;
8489 else if (value > KMP_MAX_STKSIZE)
8490 value = KMP_MAX_STKSIZE;
8491
8492 __kmp_stksize = value;
8493
8494 __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8495 }
8496
8497 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8498}
8499
8500/* set the behaviour of the runtime library */
8501/* TODO this can cause some odd behaviour with sibling parallelism... */
8502void __kmp_aux_set_library(enum library_type arg) {
8503 __kmp_library = arg;
8504
8505 switch (__kmp_library) {
8506 case library_serial: {
8507 KMP_INFORM(LibraryIsSerial);
8508 } break;
8509 case library_turnaround:
8510 if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8511 __kmp_use_yield = 2; // only yield when oversubscribed
8512 break;
8513 case library_throughput:
8514 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8515 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
8516 break;
8517 default:
8518 KMP_FATAL(UnknownLibraryType, arg);
8519 }
8520}
8521
8522/* Getting team information common for all team API */
8523// Returns NULL if not in teams construct
8524static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8525 kmp_info_t *thr = __kmp_entry_thread();
8526 teams_serialized = 0;
8527 if (thr->th.th_teams_microtask) {
8528 kmp_team_t *team = thr->th.th_team;
8529 int tlevel = thr->th.th_teams_level; // the level of the teams construct
8530 int ii = team->t.t_level;
8531 teams_serialized = team->t.t_serialized;
8532 int level = tlevel + 1;
8533 KMP_DEBUG_ASSERT(ii >= tlevel);
8534 while (ii > level) {
8535 for (teams_serialized = team->t.t_serialized;
8536 (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8537 }
8538 if (team->t.t_serialized && (!teams_serialized)) {
8539 team = team->t.t_parent;
8540 continue;
8541 }
8542 if (ii > level) {
8543 team = team->t.t_parent;
8544 ii--;
8545 }
8546 }
8547 return team;
8548 }
8549 return NULL;
8550}
8551
8552int __kmp_aux_get_team_num() {
8553 int serialized;
8554 kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8555 if (team) {
8556 if (serialized > 1) {
8557 return 0; // teams region is serialized ( 1 team of 1 thread ).
8558 } else {
8559 return team->t.t_master_tid;
8560 }
8561 }
8562 return 0;
8563}
8564
8565int __kmp_aux_get_num_teams() {
8566 int serialized;
8567 kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8568 if (team) {
8569 if (serialized > 1) {
8570 return 1;
8571 } else {
8572 return team->t.t_parent->t.t_nproc;
8573 }
8574 }
8575 return 1;
8576}
8577
8578/* ------------------------------------------------------------------------ */
8579
8580/*
8581 * Affinity Format Parser
8582 *
8583 * Field is in form of: %[[[0].]size]type
8584 * % and type are required (%% means print a literal '%')
8585 * type is either single char or long name surrounded by {},
8586 * e.g., N or {num_threads}
8587 * 0 => leading zeros
8588 * . => right justified when size is specified
8589 * by default output is left justified
8590 * size is the *minimum* field length
8591 * All other characters are printed as is
8592 *
8593 * Available field types:
8594 * L {thread_level} - omp_get_level()
8595 * n {thread_num} - omp_get_thread_num()
8596 * H {host} - name of host machine
8597 * P {process_id} - process id (integer)
8598 * i {native_thread_id} - native thread identifier (integer)
8599 * N {num_threads} - omp_get_num_threads()
8600 * a {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1)
8601 * A {thread_affinity} - comma separated list of integers or integer ranges
8602 * (values of affinity mask)
8603 *
8604 * Implementation-specific field types can be added
8605 * If a type is unknown, print "undefined"
8606 */
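/* Example (illustrative only, using the table below): the format string
 * "OMP: host=%H pid=%P thread %0.4n of %N"
 * could expand for thread 2 of 8 on host "node01" with pid 1234 to
 * "OMP: host=node01 pid=1234 thread 0002 of 8"
 * ("%0.4n" is turned into the snprintf format "%04d"; a bare field such as
 * "%N" is left-justified with no minimum width). */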
8607
8608// Structure holding the short name, long name, and corresponding data type
8609// for snprintf. A table of these will represent the entire valid keyword
8610// field types.
8611typedef struct kmp_affinity_format_field_t {
8612 char short_name; // from spec e.g., L -> thread level
8613 const char *long_name; // from spec thread_level -> thread level
8614 char field_format; // data type for snprintf (typically 'd' or 's'
8615 // for integer or string)
8616} kmp_affinity_format_field_t;
8617
8618static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8619#if KMP_AFFINITY_SUPPORTED
8620 {'A', "thread_affinity", 's'},
8621#endif
8622 {'t', "team_num", 'd'},
8623 {'T', "num_teams", 'd'},
8624 {'L', "nesting_level", 'd'},
8625 {'n', "thread_num", 'd'},
8626 {'N', "num_threads", 'd'},
8627 {'a', "ancestor_tnum", 'd'},
8628 {'H', "host", 's'},
8629 {'P', "process_id", 'd'},
8630 {'i', "native_thread_id", 'd'}};
8631
8632// Return the number of characters it takes to hold field
8633static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8634 const char **ptr,
8635 kmp_str_buf_t *field_buffer) {
8636 int rc, format_index, field_value;
8637 const char *width_left, *width_right;
8638 bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8639 static const int FORMAT_SIZE = 20;
8640 char format[FORMAT_SIZE] = {0};
8641 char absolute_short_name = 0;
8642
8643 KMP_DEBUG_ASSERT(gtid >= 0);
8644 KMP_DEBUG_ASSERT(th);
8645 KMP_DEBUG_ASSERT(**ptr == '%');
8646 KMP_DEBUG_ASSERT(field_buffer);
8647
8648 __kmp_str_buf_clear(field_buffer);
8649
8650 // Skip the initial %
8651 (*ptr)++;
8652
8653 // Check for %% first
8654 if (**ptr == '%') {
8655 __kmp_str_buf_cat(field_buffer, "%", 1);
8656 (*ptr)++; // skip over the second %
8657 return 1;
8658 }
8659
8660 // Parse field modifiers if they are present
8661 pad_zeros = false;
8662 if (**ptr == '0') {
8663 pad_zeros = true;
8664 (*ptr)++; // skip over 0
8665 }
8666 right_justify = false;
8667 if (**ptr == '.') {
8668 right_justify = true;
8669 (*ptr)++; // skip over .
8670 }
8671 // Parse width of field: [width_left, width_right)
8672 width_left = width_right = NULL;
8673 if (**ptr >= '0' && **ptr <= '9') {
8674 width_left = *ptr;
8675 SKIP_DIGITS(*ptr);
8676 width_right = *ptr;
8677 }
8678
8679 // Create the format for KMP_SNPRINTF based on flags parsed above
8680 format_index = 0;
8681 format[format_index++] = '%';
8682 if (!right_justify)
8683 format[format_index++] = '-';
8684 if (pad_zeros)
8685 format[format_index++] = '0';
8686 if (width_left && width_right) {
8687 int i = 0;
8688 // Only allow 8 digit number widths.
8689 // This also prevents overflowing format variable
8690 while (i < 8 && width_left < width_right) {
8691 format[format_index++] = *width_left;
8692 width_left++;
8693 i++;
8694 }
8695 }
8696
8697 // Parse a name (long or short)
8698 // Canonicalize the name into absolute_short_name
8699 found_valid_name = false;
8700 parse_long_name = (**ptr == '{');
8701 if (parse_long_name)
8702 (*ptr)++; // skip initial left brace
8703 for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8704 sizeof(__kmp_affinity_format_table[0]);
8705 ++i) {
8706 char short_name = __kmp_affinity_format_table[i].short_name;
8707 const char *long_name = __kmp_affinity_format_table[i].long_name;
8708 char field_format = __kmp_affinity_format_table[i].field_format;
8709 if (parse_long_name) {
8710 size_t length = KMP_STRLEN(long_name);
8711 if (strncmp(*ptr, long_name, length) == 0) {
8712 found_valid_name = true;
8713 (*ptr) += length; // skip the long name
8714 }
8715 } else if (**ptr == short_name) {
8716 found_valid_name = true;
8717 (*ptr)++; // skip the short name
8718 }
8719 if (found_valid_name) {
8720 format[format_index++] = field_format;
8721 format[format_index++] = '\0';
8722 absolute_short_name = short_name;
8723 break;
8724 }
8725 }
8726 if (parse_long_name) {
8727 if (**ptr != '}') {
8728 absolute_short_name = 0;
8729 } else {
8730 (*ptr)++; // skip over the right brace
8731 }
8732 }
8733
8734 // Attempt to fill the buffer with the requested
8735 // value using snprintf within __kmp_str_buf_print()
8736 switch (absolute_short_name) {
8737 case 't':
8738 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8739 break;
8740 case 'T':
8741 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8742 break;
8743 case 'L':
8744 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8745 break;
8746 case 'n':
8747 rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8748 break;
8749 case 'H': {
8750 static const int BUFFER_SIZE = 256;
8751 char buf[BUFFER_SIZE];
8752 __kmp_expand_host_name(buf, BUFFER_SIZE);
8753 rc = __kmp_str_buf_print(field_buffer, format, buf);
8754 } break;
8755 case 'P':
8756 rc = __kmp_str_buf_print(field_buffer, format, getpid());
8757 break;
8758 case 'i':
8759 rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8760 break;
8761 case 'N':
8762 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8763 break;
8764 case 'a':
8765 field_value =
8766 __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8767 rc = __kmp_str_buf_print(field_buffer, format, field_value);
8768 break;
8769#if KMP_AFFINITY_SUPPORTED
8770 case 'A': {
8771 kmp_str_buf_t buf;
8772 __kmp_str_buf_init(&buf);
8773 __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8774 rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8775 __kmp_str_buf_free(&buf);
8776 } break;
8777#endif
8778 default:
8779 // According to the spec, if an implementation does not have info for the
8780 // field type, then "undefined" is printed
8781 rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8782 // Skip the field
8783 if (parse_long_name) {
8784 SKIP_TOKEN(*ptr);
8785 if (**ptr == '}')
8786 (*ptr)++;
8787 } else {
8788 (*ptr)++;
8789 }
8790 }
8791
8792 KMP_ASSERT(format_index <= FORMAT_SIZE);
8793 return rc;
8794}
8795
8796/*
8797 * Return number of characters needed to hold the affinity string
8798 * (not including null byte character)
8799 * The resultant string is printed to buffer, which the caller can then
8800 * handle afterwards
8801 */
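/* Usage sketch (mirrors __kmp_aux_display_affinity below; illustrative only):
 *   kmp_str_buf_t buf;
 *   __kmp_str_buf_init(&buf);
 *   size_t n = __kmp_aux_capture_affinity(gtid, "thread %n bound to %A", &buf);
 *   // buf.str now holds the expanded string; n is its length
 *   __kmp_str_buf_free(&buf);
 */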
8802size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8803 kmp_str_buf_t *buffer) {
8804 const char *parse_ptr;
8805 size_t retval;
8806 const kmp_info_t *th;
8807 kmp_str_buf_t field;
8808
8809 KMP_DEBUG_ASSERT(buffer);
8810 KMP_DEBUG_ASSERT(gtid >= 0);
8811
8812 __kmp_str_buf_init(&field);
8813 __kmp_str_buf_clear(buffer);
8814
8815 th = __kmp_threads[gtid];
8816 retval = 0;
8817
8818 // If format is NULL or zero-length string, then we use
8819 // affinity-format-var ICV
8820 parse_ptr = format;
8821 if (parse_ptr == NULL || *parse_ptr == '\0') {
8822 parse_ptr = __kmp_affinity_format;
8823 }
8824 KMP_DEBUG_ASSERT(parse_ptr);
8825
8826 while (*parse_ptr != '\0') {
8827 // Parse a field
8828 if (*parse_ptr == '%') {
8829 // Put field in the buffer
8830 int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8831 __kmp_str_buf_catbuf(buffer, &field);
8832 retval += rc;
8833 } else {
8834 // Put literal character in buffer
8835 __kmp_str_buf_cat(buffer, parse_ptr, 1);
8836 retval++;
8837 parse_ptr++;
8838 }
8839 }
8840 __kmp_str_buf_free(&field);
8841 return retval;
8842}
8843
8844// Displays the affinity string to stdout
8845void __kmp_aux_display_affinity(int gtid, const char *format) {
8846 kmp_str_buf_t buf;
8847 __kmp_str_buf_init(&buf);
8848 __kmp_aux_capture_affinity(gtid, format, &buf);
8849 __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8850 __kmp_str_buf_free(&buf);
8851}
8852
8853/* ------------------------------------------------------------------------ */
8854void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8855 int blocktime = arg; /* argument is in microseconds */
8856#if KMP_USE_MONITOR
8857 int bt_intervals;
8858#endif
8859 kmp_int8 bt_set;
8860
8861 __kmp_save_internal_controls(thread);
8862
8863 /* Normalize and set blocktime for the teams */
8864 if (blocktime < KMP_MIN_BLOCKTIME)
8865 blocktime = KMP_MIN_BLOCKTIME;
8866 else if (blocktime > KMP_MAX_BLOCKTIME)
8867 blocktime = KMP_MAX_BLOCKTIME;
8868
8869 set__blocktime_team(thread->th.th_team, tid, blocktime);
8870 set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8871
8872#if KMP_USE_MONITOR
8873 /* Calculate and set blocktime intervals for the teams */
8874 bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8875
8876 set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8877 set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8878#endif
8879
8880 /* Set whether blocktime has been set to "TRUE" */
8881 bt_set = TRUE;
8882
8883 set__bt_set_team(thread->th.th_team, tid, bt_set);
8884 set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8885#if KMP_USE_MONITOR
8886 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8887 "bt_intervals=%d, monitor_updates=%d\n",
8888 __kmp_gtid_from_tid(tid, thread->th.th_team),
8889 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8890 __kmp_monitor_wakeups));
8891#else
8892 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8893 __kmp_gtid_from_tid(tid, thread->th.th_team),
8894 thread->th.th_team->t.t_id, tid, blocktime));
8895#endif
8896}
8897
8898void __kmp_aux_set_defaults(char const *str, size_t len) {
8899 if (!__kmp_init_serial) {
8900 __kmp_serial_initialize();
8901 }
8902 __kmp_env_initialize(str);
8903
8904 if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8905 __kmp_env_print();
8906 }
8907} // __kmp_aux_set_defaults
8908
8909/* ------------------------------------------------------------------------ */
8910/* internal fast reduction routines */
8911
8912PACKED_REDUCTION_METHOD_T
8913__kmp_determine_reduction_method(
8914 ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8915 void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8916 kmp_critical_name *lck) {
8917
8918 // Default reduction method: critical construct ( lck != NULL, like in current
8919 // PAROPT )
8920 // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8921 // can be selected by RTL
8922 // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8923 // can be selected by RTL
8924 // Finally, it's up to OpenMP RTL to make a decision on which method to select
8925 // among generated by PAROPT.
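// For example (illustrative, following the selection logic below): on a
// non-MIC x86_64 Linux host, a team of 16 threads with reduce_data/reduce_func
// available selects TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER, whereas a team
// of 4 threads whose loc carries KMP_IDENT_ATOMIC_REDUCE selects
// atomic_reduce_block.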
8926
8927 PACKED_REDUCTION_METHOD_T retval;
8928
8929 int team_size;
8930
8931 KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8932
8933#define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8934 (loc && \
8935 ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)))
8936#define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8937
8938 retval = critical_reduce_block;
8939
8940 // another choice of getting the team size (with 1 dynamic dereference) is slower
8941 team_size = __kmp_get_team_num_threads(global_tid);
8942 if (team_size == 1) {
8943
8944 retval = empty_reduce_block;
8945
8946 } else {
8947
8948 int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8949
8950#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
8951 KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || \
8952 KMP_ARCH_VE || KMP_ARCH_S390X || KMP_ARCH_WASM
8953
8954#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8955 KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HAIKU || \
8956 KMP_OS_HURD || KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
8957
8958 int teamsize_cutoff = 4;
8959
8960#if KMP_MIC_SUPPORTED
8961 if (__kmp_mic_type != non_mic) {
8962 teamsize_cutoff = 8;
8963 }
8964#endif
8965 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8966 if (tree_available) {
8967 if (team_size <= teamsize_cutoff) {
8968 if (atomic_available) {
8969 retval = atomic_reduce_block;
8970 }
8971 } else {
8972 retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8973 }
8974 } else if (atomic_available) {
8975 retval = atomic_reduce_block;
8976 }
8977#else
8978#error "Unknown or unsupported OS"
8979#endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8980 // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HAIKU ||
8981 // KMP_OS_HURD || KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
8982
8983#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS || \
8984 KMP_ARCH_WASM || KMP_ARCH_PPC || KMP_ARCH_AARCH64_32 || KMP_ARCH_SPARC
8985
8986#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8987 KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_HAIKU || KMP_OS_HURD || \
8988 KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
8989
8990 // basic tuning
8991
8992 if (atomic_available) {
8993 if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8994 retval = atomic_reduce_block;
8995 }
8996 } // otherwise: use critical section
8997
8998#elif KMP_OS_DARWIN
8999
9000 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
9001 if (atomic_available && (num_vars <= 3)) {
9002 retval = atomic_reduce_block;
9003 } else if (tree_available) {
9004 if ((reduce_size > (9 * sizeof(kmp_real64))) &&
9005 (reduce_size < (2000 * sizeof(kmp_real64)))) {
9006 retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
9007 }
9008 } // otherwise: use critical section
9009
9010#else
9011#error "Unknown or unsupported OS"
9012#endif
9013
9014#else
9015#error "Unknown or unsupported architecture"
9016#endif
9017 }
9018
9019 // KMP_FORCE_REDUCTION
9020
9021 // If the team is serialized (team_size == 1), ignore the forced reduction
9022 // method and stay with the unsynchronized method (empty_reduce_block)
9023 if (__kmp_force_reduction_method != reduction_method_not_defined &&
9024 team_size != 1) {
9025
9026 PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
9027
9028 int atomic_available, tree_available;
9029
9030 switch ((forced_retval = __kmp_force_reduction_method)) {
9031 case critical_reduce_block:
9032 KMP_ASSERT(lck); // lck should be != 0
9033 break;
9034
9035 case atomic_reduce_block:
9036 atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
9037 if (!atomic_available) {
9038 KMP_WARNING(RedMethodNotSupported, "atomic");
9039 forced_retval = critical_reduce_block;
9040 }
9041 break;
9042
9043 case tree_reduce_block:
9044 tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
9045 if (!tree_available) {
9046 KMP_WARNING(RedMethodNotSupported, "tree");
9047 forced_retval = critical_reduce_block;
9048 } else {
9049#if KMP_FAST_REDUCTION_BARRIER
9050 forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
9051#endif
9052 }
9053 break;
9054
9055 default:
9056 KMP_ASSERT(0); // "unsupported method specified"
9057 }
9058
9059 retval = forced_retval;
9060 }
9061
9062 KA_TRACE(10, ("reduction method selected=%08x\n", retval));
9063
9064#undef FAST_REDUCTION_TREE_METHOD_GENERATED
9065#undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
9066
9067 return (retval);
9068}
9069// this function is for testing set/get/determine reduce method
9070kmp_int32 __kmp_get_reduce_method(void) {
9071 return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
9072}
9073
9074// Soft pause sets up threads to ignore blocktime and just go to sleep.
9075// Spin-wait code checks __kmp_pause_status and reacts accordingly.
9076void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
9077
9078// Hard pause shuts down the runtime completely. Resume happens naturally when
9079// OpenMP is used subsequently.
9080void __kmp_hard_pause() {
9081 __kmp_pause_status = kmp_hard_paused;
9082 __kmp_internal_end_thread(-1);
9083}
9084
9085// Soft resume sets __kmp_pause_status, and wakes up all threads.
9086void __kmp_resume_if_soft_paused() {
9087 if (__kmp_pause_status == kmp_soft_paused) {
9088 __kmp_pause_status = kmp_not_paused;
9089
9090 for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
9091 kmp_info_t *thread = __kmp_threads[gtid];
9092 if (thread) { // Wake it if sleeping
9093 kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
9094 thread);
9095 if (fl.is_sleeping())
9096 fl.resume(gtid);
9097 else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
9098 __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
9099 } else { // thread holds the lock and may sleep soon
9100 do { // until either the thread sleeps, or we can get the lock
9101 if (fl.is_sleeping()) {
9102 fl.resume(gtid);
9103 break;
9104 } else if (__kmp_try_suspend_mx(thread)) {
9105 __kmp_unlock_suspend_mx(thread);
9106 break;
9107 }
9108 } while (1);
9109 }
9110 }
9111 }
9112 }
9113}
9114
9115// This function is called via __kmpc_pause_resource. Returns 0 if successful.
9116// TODO: add warning messages
9117int __kmp_pause_resource(kmp_pause_status_t level) {
9118 if (level == kmp_not_paused) { // requesting resume
9119 if (__kmp_pause_status == kmp_not_paused) {
9120 // error message about runtime not being paused, so can't resume
9121 return 1;
9122 } else {
9123 KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
9124 __kmp_pause_status == kmp_hard_paused);
9125 __kmp_pause_status = kmp_not_paused;
9126 return 0;
9127 }
9128 } else if (level == kmp_soft_paused) { // requesting soft pause
9129 if (__kmp_pause_status != kmp_not_paused) {
9130 // error message about already being paused
9131 return 1;
9132 } else {
9133 __kmp_soft_pause();
9134 return 0;
9135 }
9136 } else if (level == kmp_hard_paused || level == kmp_stop_tool_paused) {
9137 // requesting hard pause or stop_tool pause
9138 if (__kmp_pause_status != kmp_not_paused) {
9139 // error message about already being paused
9140 return 1;
9141 } else {
9142 __kmp_hard_pause();
9143 return 0;
9144 }
9145 } else {
9146 // error message about invalid level
9147 return 1;
9148 }
9149}
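// Illustrative mapping to the user-level API (an assumption about the typical
// call path, not something defined in this file): a call such as
// omp_pause_resource_all(omp_pause_soft) reaches this routine via
// __kmpc_pause_resource with level == kmp_soft_paused; requesting
// kmp_not_paused later resumes a previously paused runtime.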
9150
9151void __kmp_omp_display_env(int verbose) {
9152 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
9153 if (__kmp_init_serial == 0)
9154 __kmp_do_serial_initialize();
9155 __kmp_display_env_impl(!verbose, verbose);
9156 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
9157}
9158
9159// The team size is changing, so distributed barrier must be modified
9160void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
9161 int new_nthreads) {
9162 KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
9163 bp_dist_bar);
9164 kmp_info_t **other_threads = team->t.t_threads;
9165
9166 // We want all the workers to stop waiting on the barrier while we adjust the
9167 // size of the team.
9168 for (int f = 1; f < old_nthreads; ++f) {
9169 KMP_DEBUG_ASSERT(other_threads[f] != NULL);
9170 // Ignore threads that are already inactive or not present in the team
9171 if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
9172 // teams construct causes thread_limit to get passed in, and some of
9173 // those could be inactive; just ignore them
9174 continue;
9175 }
9176 // If thread is transitioning still to in_use state, wait for it
9177 if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
9178 while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
9179 KMP_CPU_PAUSE();
9180 }
9181 // The thread should be in_use now
9182 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
9183 // Transition to unused state
9184 team->t.t_threads[f]->th.th_used_in_team.store(2);
9185 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
9186 }
9187 // Release all the workers
9188 team->t.b->go_release();
9189
9190 KMP_MFENCE();
9191
9192 // Workers should see transition status 2 and move to 0; but may need to be
9193 // woken up first
9194 int count = old_nthreads - 1;
9195 while (count > 0) {
9196 count = old_nthreads - 1;
9197 for (int f = 1; f < old_nthreads; ++f) {
9198 if (other_threads[f]->th.th_used_in_team.load() != 0) {
9199 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
9200 kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
9201 void *, other_threads[f]->th.th_sleep_loc);
9202 __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
9203 }
9204 } else {
9205 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
9206 count--;
9207 }
9208 }
9209 }
9210 // Now update the barrier size
9211 team->t.b->update_num_threads(new_nthreads);
9212 team->t.b->go_reset();
9213}
9214
9215void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
9216 // Add the threads back to the team
9217 KMP_DEBUG_ASSERT(team);
9218 // Threads were paused and pointed at th_used_in_team temporarily during a
9219 // resize of the team. We're going to set th_used_in_team to 3 to indicate to
9220 // the thread that it should transition itself back into the team. Then, if
9221 // blocktime isn't infinite, the thread could be sleeping, so we send a resume
9222 // to wake it up.
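// Summary of the th_used_in_team protocol used here and in
// __kmp_resize_dist_barrier above: 0 = not part of the team, 1 = in the team,
// 2 = leaving the team (the worker moves 2 -> 0), 3 = joining the team (the
// worker moves 3 -> 1).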
9223 for (int f = 1; f < new_nthreads; ++f) {
9224 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
9225 (void)KMP_COMPARE_AND_STORE_ACQ32(
9226 &(team->t.t_threads[f]->th.th_used_in_team), 0, 3);
9227 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
9228 __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
9229 (kmp_flag_32<false, false> *)NULL);
9230 }
9231 }
9232 // The threads should be transitioning to the team; when they are done, they
9233 // should have set th_used_in_team to 1. This loop forces the primary thread to
9234 // wait until all threads have moved into the team and are waiting in the barrier.
9235 int count = new_nthreads - 1;
9236 while (count > 0) {
9237 count = new_nthreads - 1;
9238 for (int f = 1; f < new_nthreads; ++f) {
9239 if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
9240 count--;
9241 }
9242 }
9243 }
9244}
9245
9246// Globals and functions for hidden helper task
9247kmp_info_t **__kmp_hidden_helper_threads;
9248kmp_info_t *__kmp_hidden_helper_main_thread;
9249std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
9250#if KMP_OS_LINUX
9251kmp_int32 __kmp_hidden_helper_threads_num = 8;
9252kmp_int32 __kmp_enable_hidden_helper = TRUE;
9253#else
9254kmp_int32 __kmp_hidden_helper_threads_num = 0;
9255kmp_int32 __kmp_enable_hidden_helper = FALSE;
9256#endif
9257
9258namespace {
9259std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
9260
9261void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
9262 // This is an explicit synchronization of all hidden helper threads, in case
9263 // a regular thread pushes a hidden helper task to a hidden helper thread
9264 // that has not yet been awakened since being released by the main thread
9265 // after creating the team.
9266 KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
9267 while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
9268 __kmp_hidden_helper_threads_num)
9269 ;
9270
9271 // If main thread, then wait for signal
9272 if (__kmpc_master(nullptr, *gtid)) {
9273 // First, unset the initial state and release the initial thread
9274 TCW_4(__kmp_init_hidden_helper_threads, FALSE);
9275 __kmp_hidden_helper_initz_release();
9276 __kmp_hidden_helper_main_thread_wait();
9277 // Now wake up all worker threads
9278 for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
9279 __kmp_hidden_helper_worker_thread_signal();
9280 }
9281 }
9282}
9283} // namespace
9284
9285void __kmp_hidden_helper_threads_initz_routine() {
9286 // Create a new root for hidden helper team/threads
9287 const int gtid = __kmp_register_root(TRUE);
9288 __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
9289 __kmp_hidden_helper_threads = &__kmp_threads[gtid];
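  // __kmp_hidden_helper_threads aliases this root's slot in the global
  // __kmp_threads array, so the helper team can be indexed relative to its
  // main thread (assuming the helper workers occupy the consecutive slots
  // that follow).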
  __kmp_hidden_helper_main_thread->th.th_set_nproc =
      __kmp_hidden_helper_threads_num;

  KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);

  __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);

  // Set the initialization flag to FALSE
  TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);

  __kmp_hidden_helper_threads_deinitz_release();
}

/* Nesting Mode:
   Set via KMP_NESTING_MODE, which takes an integer.
   Note: we skip duplicate topology levels, and skip levels with only
   one entity.
   KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
   KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
   in the topology, and initializes the number of threads at each of those
   levels to the number of entities at each level, respectively, below the
   entity at the parent level.
   KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
   but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
   the user to turn nesting on explicitly. This is an even more experimental
   option to this experimental feature, and may change or go away in the
   future.
*/
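// Illustrative example (hypothetical machine, not derived from this file):
// on 2 sockets x 8 cores x 2 hardware threads, KMP_NESTING_MODE=1 would keep
// three topology levels and initialize the per-level thread counts to
// 2, 8, and 2; a level reporting only one entity would be skipped entirely.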

// Allocate space to store nesting levels
void __kmp_init_nesting_mode() {
  int levels = KMP_HW_LAST;
  __kmp_nesting_mode_nlevels = levels;
  __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
  for (int i = 0; i < levels; ++i)
    __kmp_nesting_nth_level[i] = 0;
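  // Grow the nested-nthreads array, if needed, so it can hold one entry per
  // possible topology level.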
  if (__kmp_nested_nth.size < levels) {
    __kmp_nested_nth.nth =
        (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
    __kmp_nested_nth.size = levels;
  }
}

// Set # threads for top levels of nesting; must be called after topology set
void __kmp_set_nesting_mode_threads() {
  kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];

  if (__kmp_nesting_mode == 1)
    __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
  else if (__kmp_nesting_mode > 1)
    __kmp_nesting_mode_nlevels = __kmp_nesting_mode;

  if (__kmp_topology) { // use topology info
    int loc, hw_level;
    for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
                                loc < __kmp_nesting_mode_nlevels;
         loc++, hw_level++) {
      __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
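      // A level with only one entity adds nothing to the nesting; back loc up
      // so the next hardware level overwrites this slot.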
      if (__kmp_nesting_nth_level[loc] == 1)
        loc--;
    }
    // Make sure all cores are used
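    // (If the product of thread counts across the levels would leave some
    // cores unused, bump the innermost level's count so that all cores can
    // be covered.)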
    if (__kmp_nesting_mode > 1 && loc > 1) {
      int core_level = __kmp_topology->get_level(KMP_HW_CORE);
      int num_cores = __kmp_topology->get_count(core_level);
      int upper_levels = 1;
      for (int level = 0; level < loc - 1; ++level)
        upper_levels *= __kmp_nesting_nth_level[level];
      if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
        __kmp_nesting_nth_level[loc - 1] =
            num_cores / __kmp_nesting_nth_level[loc - 2];
    }
    __kmp_nesting_mode_nlevels = loc;
    __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
  } else { // no topology info available; provide a reasonable guess
    if (__kmp_avail_proc >= 4) {
      __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
      __kmp_nesting_nth_level[1] = 2;
      __kmp_nesting_mode_nlevels = 2;
    } else {
      __kmp_nesting_nth_level[0] = __kmp_avail_proc;
      __kmp_nesting_mode_nlevels = 1;
    }
    __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
  }
  for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
    __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
  }
  set__nproc(thread, __kmp_nesting_nth_level[0]);
  if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
    __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
  if (get__max_active_levels(thread) > 1) {
    // If max-active-levels was explicitly set, match the nesting mode levels
    // to it.
    __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
  }
  if (__kmp_nesting_mode == 1) // turn on nesting for this case only
    set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
}

// Empty symbols to export (see exports_so.txt) when feature is disabled
extern "C" {
#if !KMP_STATS_ENABLED
void __kmp_reset_stats() {}
#endif
#if !USE_DEBUGGER
int __kmp_omp_debug_struct_info = FALSE;
int __kmp_debugging = FALSE;
#endif
#if !USE_ITT_BUILD || !USE_ITT_NOTIFY
void __kmp_itt_fini_ittlib() {}
void __kmp_itt_init_ittlib() {}
#endif
}

// end of file