1/*
2 * kmp_runtime.cpp -- KPTS runtime support library
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13#include "kmp.h"
14#include "kmp_affinity.h"
15#include "kmp_atomic.h"
16#include "kmp_environment.h"
17#include "kmp_error.h"
18#include "kmp_i18n.h"
19#include "kmp_io.h"
20#include "kmp_itt.h"
21#include "kmp_settings.h"
22#include "kmp_stats.h"
23#include "kmp_str.h"
24#include "kmp_wait_release.h"
25#include "kmp_wrapper_getpid.h"
26#include "kmp_dispatch.h"
27#include "kmp_utils.h"
28#if KMP_USE_HIER_SCHED
29#include "kmp_dispatch_hier.h"
30#endif
31
32#if OMPT_SUPPORT
33#include "ompt-specific.h"
34#endif
35#if OMPD_SUPPORT
36#include "ompd-specific.h"
37#endif
38
39#if OMP_PROFILING_SUPPORT
40#include "llvm/Support/TimeProfiler.h"
41static char *ProfileTraceFile = nullptr;
42#endif
43
44/* these are temporary issues to be dealt with */
45#define KMP_USE_PRCTL 0
46
47#if KMP_OS_WINDOWS
48#include <process.h>
49#endif
50
51#ifndef KMP_USE_SHM
52// Windows and WASI do not need these include files as they don't use shared
53// memory.
54#else
55#include <sys/mman.h>
56#include <sys/stat.h>
57#include <fcntl.h>
58#define SHM_SIZE 1024
59#endif
60
61#if defined(KMP_GOMP_COMPAT)
62char const __kmp_version_alt_comp[] =
63 KMP_VERSION_PREFIX "alternative compiler support: yes";
64#endif /* defined(KMP_GOMP_COMPAT) */
65
66char const __kmp_version_omp_api[] =
67 KMP_VERSION_PREFIX "API version: 5.0 (201611)";
68
69#ifdef KMP_DEBUG
70char const __kmp_version_lock[] =
71 KMP_VERSION_PREFIX "lock type: run time selectable";
72#endif /* KMP_DEBUG */
73
74#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
75
76/* ------------------------------------------------------------------------ */
77
78#if KMP_USE_MONITOR
79kmp_info_t __kmp_monitor;
80#endif
81
82/* Forward declarations */
83
84void __kmp_cleanup(void);
85
86static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
87 int gtid);
88static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
89 kmp_internal_control_t *new_icvs,
90 ident_t *loc);
91#if KMP_AFFINITY_SUPPORTED
92static void __kmp_partition_places(kmp_team_t *team,
93 int update_master_only = 0);
94#endif
95static void __kmp_do_serial_initialize(void);
96void __kmp_fork_barrier(int gtid, int tid);
97void __kmp_join_barrier(int gtid);
98void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
99 kmp_internal_control_t *new_icvs, ident_t *loc);
100
101#ifdef USE_LOAD_BALANCE
102static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
103#endif
104
105static int __kmp_expand_threads(int nNeed);
106#if KMP_OS_WINDOWS
107static int __kmp_unregister_root_other_thread(int gtid);
108#endif
109static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
110kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
111
112void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
113 int new_nthreads);
114void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
115
116static kmp_nested_nthreads_t *__kmp_override_nested_nth(kmp_info_t *thr,
117 int level) {
118 kmp_nested_nthreads_t *new_nested_nth =
119 (kmp_nested_nthreads_t *)KMP_INTERNAL_MALLOC(
120 sizeof(kmp_nested_nthreads_t));
121 int new_size = level + thr->th.th_set_nested_nth_sz;
122 new_nested_nth->nth = (int *)KMP_INTERNAL_MALLOC(new_size * sizeof(int));
123 for (int i = 0; i < level + 1; ++i)
124 new_nested_nth->nth[i] = 0;
125 for (int i = level + 1, j = 1; i < new_size; ++i, ++j)
126 new_nested_nth->nth[i] = thr->th.th_set_nested_nth[j];
127 new_nested_nth->size = new_nested_nth->used = new_size;
128 return new_nested_nth;
129}
130
131/* Calculate the identifier of the current thread */
132/* fast (and somewhat portable) way to get unique identifier of executing
133 thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
134int __kmp_get_global_thread_id() {
135 int i;
136 kmp_info_t **other_threads;
137 size_t stack_data;
138 char *stack_addr;
139 size_t stack_size;
140 char *stack_base;
141
142 KA_TRACE(
143 1000,
144 ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
145 __kmp_nth, __kmp_all_nth));
146
147 /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
148 a parallel region, made it return KMP_GTID_DNE to force serial_initialize
149 by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
150 __kmp_init_gtid for this to work. */
151
152 if (!TCR_4(__kmp_init_gtid))
153 return KMP_GTID_DNE;
154
155#ifdef KMP_TDATA_GTID
156 if (TCR_4(__kmp_gtid_mode) >= 3) {
157 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
158 return __kmp_gtid;
159 }
160#endif
161 if (TCR_4(__kmp_gtid_mode) >= 2) {
162 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
163 return __kmp_gtid_get_specific();
164 }
165 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
166
167 stack_addr = (char *)&stack_data;
168 other_threads = __kmp_threads;
169
170 /* ATT: The code below is a source of potential bugs due to unsynchronized
171 access to __kmp_threads array. For example:
172 1. Current thread loads other_threads[i] to thr and checks it, it is
173 non-NULL.
174 2. Current thread is suspended by OS.
175 3. Another thread unregisters and finishes (debug versions of free()
176 may fill memory with something like 0xEF).
177 4. Current thread is resumed.
178 5. Current thread reads junk from *thr.
179 TODO: Fix it. --ln */
180
181 for (i = 0; i < __kmp_threads_capacity; i++) {
182
183 kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
184 if (!thr)
185 continue;
186
187 stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
188 stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
189
190 /* stack grows down -- search through all of the active threads */
191
192 if (stack_addr <= stack_base) {
193 size_t stack_diff = stack_base - stack_addr;
194
195 if (stack_diff <= stack_size) {
196 /* The only way we can be closer than the allocated */
197 /* stack size is if we are running on this thread. */
198 // __kmp_gtid_get_specific can return negative value because this
199 // function can be called by thread destructor. However, before the
200 // thread destructor is called, the value of the corresponding
201 // thread-specific data will be reset to NULL.
202 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() < 0 ||
203 __kmp_gtid_get_specific() == i);
204 return i;
205 }
206 }
207 }
208
209 /* get specific to try and determine our gtid */
210 KA_TRACE(1000,
211 ("*** __kmp_get_global_thread_id: internal alg. failed to find "
212 "thread, using TLS\n"));
213 i = __kmp_gtid_get_specific();
214
215 /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
216
217 /* if we havn't been assigned a gtid, then return code */
218 if (i < 0)
219 return i;
220
221 // other_threads[i] can be nullptr at this point because the corresponding
222 // thread could have already been destructed. It can happen when this function
223 // is called in end library routine.
224 if (!TCR_SYNC_PTR(other_threads[i]))
225 return i;
226
227 /* dynamically updated stack window for uber threads to avoid get_specific
228 call */
229 if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
230 KMP_FATAL(StackOverflow, i);
231 }
232
233 stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
234 if (stack_addr > stack_base) {
235 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
236 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
237 other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
238 stack_base);
239 } else {
240 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
241 stack_base - stack_addr);
242 }
243
244 /* Reprint stack bounds for ubermaster since they have been refined */
245 if (__kmp_storage_map) {
246 char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
247 char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
248 __kmp_print_storage_map_gtid(gtid: i, p1: stack_beg, p2: stack_end,
249 size: other_threads[i]->th.th_info.ds.ds_stacksize,
250 format: "th_%d stack (refinement)", i);
251 }
252 return i;
253}
254
255int __kmp_get_global_thread_id_reg() {
256 int gtid;
257
258 if (!__kmp_init_serial) {
259 gtid = KMP_GTID_DNE;
260 } else
261#ifdef KMP_TDATA_GTID
262 if (TCR_4(__kmp_gtid_mode) >= 3) {
263 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
264 gtid = __kmp_gtid;
265 } else
266#endif
267 if (TCR_4(__kmp_gtid_mode) >= 2) {
268 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
269 gtid = __kmp_gtid_get_specific();
270 } else {
271 KA_TRACE(1000,
272 ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
273 gtid = __kmp_get_global_thread_id();
274 }
275
276 /* we must be a new uber master sibling thread */
277 if (gtid == KMP_GTID_DNE) {
278 KA_TRACE(10,
279 ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
280 "Registering a new gtid.\n"));
281 __kmp_acquire_bootstrap_lock(lck: &__kmp_initz_lock);
282 if (!__kmp_init_serial) {
283 __kmp_do_serial_initialize();
284 gtid = __kmp_gtid_get_specific();
285 } else {
286 gtid = __kmp_register_root(FALSE);
287 }
288 __kmp_release_bootstrap_lock(lck: &__kmp_initz_lock);
289 /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
290 }
291
292 KMP_DEBUG_ASSERT(gtid >= 0);
293
294 return gtid;
295}
296
297/* caller must hold forkjoin_lock */
298void __kmp_check_stack_overlap(kmp_info_t *th) {
299 int f;
300 char *stack_beg = NULL;
301 char *stack_end = NULL;
302 int gtid;
303
304 KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
305 if (__kmp_storage_map) {
306 stack_end = (char *)th->th.th_info.ds.ds_stackbase;
307 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
308
309 gtid = __kmp_gtid_from_thread(thr: th);
310
311 if (gtid == KMP_GTID_MONITOR) {
312 __kmp_print_storage_map_gtid(
313 gtid, p1: stack_beg, p2: stack_end, size: th->th.th_info.ds.ds_stacksize,
314 format: "th_%s stack (%s)", "mon",
315 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
316 } else {
317 __kmp_print_storage_map_gtid(
318 gtid, p1: stack_beg, p2: stack_end, size: th->th.th_info.ds.ds_stacksize,
319 format: "th_%d stack (%s)", gtid,
320 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
321 }
322 }
323
324 /* No point in checking ubermaster threads since they use refinement and
325 * cannot overlap */
326 gtid = __kmp_gtid_from_thread(thr: th);
327 if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
328 KA_TRACE(10,
329 ("__kmp_check_stack_overlap: performing extensive checking\n"));
330 if (stack_beg == NULL) {
331 stack_end = (char *)th->th.th_info.ds.ds_stackbase;
332 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
333 }
334
335 for (f = 0; f < __kmp_threads_capacity; f++) {
336 kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
337
338 if (f_th && f_th != th) {
339 char *other_stack_end =
340 (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
341 char *other_stack_beg =
342 other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
343 if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
344 (stack_end > other_stack_beg && stack_end < other_stack_end)) {
345
346 /* Print the other stack values before the abort */
347 if (__kmp_storage_map)
348 __kmp_print_storage_map_gtid(
349 gtid: -1, p1: other_stack_beg, p2: other_stack_end,
350 size: (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
351 format: "th_%d stack (overlapped)", __kmp_gtid_from_thread(thr: f_th));
352
353 __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
354 __kmp_msg_null);
355 }
356 }
357 }
358 }
359 KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
360}
361
362/* ------------------------------------------------------------------------ */
363
364void __kmp_infinite_loop(void) {
365 static int done = FALSE;
366
367 while (!done) {
368 KMP_YIELD(TRUE);
369 }
370}
371
372#define MAX_MESSAGE 512
373
374void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
375 char const *format, ...) {
376 char buffer[MAX_MESSAGE];
377 va_list ap;
378
379 va_start(ap, format);
380 KMP_SNPRINTF(s: buffer, maxlen: sizeof(buffer), format: "OMP storage map: %p %p%8lu %s\n", p1,
381 p2, (unsigned long)size, format);
382 __kmp_acquire_bootstrap_lock(lck: &__kmp_stdio_lock);
383 __kmp_vprintf(stream: kmp_err, format: buffer, ap);
384#if KMP_PRINT_DATA_PLACEMENT
385 int node;
386 if (gtid >= 0) {
387 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
388 if (__kmp_storage_map_verbose) {
389 node = __kmp_get_host_node(p1);
390 if (node < 0) /* doesn't work, so don't try this next time */
391 __kmp_storage_map_verbose = FALSE;
392 else {
393 char *last;
394 int lastNode;
395 int localProc = __kmp_get_cpu_from_gtid(gtid);
396
397 const int page_size = KMP_GET_PAGE_SIZE();
398
399 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
400 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
401 if (localProc >= 0)
402 __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
403 localProc >> 1);
404 else
405 __kmp_printf_no_lock(" GTID %d\n", gtid);
406#if KMP_USE_PRCTL
407 /* The more elaborate format is disabled for now because of the prctl
408 * hanging bug. */
409 do {
410 last = p1;
411 lastNode = node;
412 /* This loop collates adjacent pages with the same host node. */
413 do {
414 (char *)p1 += page_size;
415 } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
416 __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
417 lastNode);
418 } while (p1 <= p2);
419#else
420 __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
421 (char *)p1 + (page_size - 1),
422 __kmp_get_host_node(p1));
423 if (p1 < p2) {
424 __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
425 (char *)p2 + (page_size - 1),
426 __kmp_get_host_node(p2));
427 }
428#endif
429 }
430 }
431 } else
432 __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
433 }
434#endif /* KMP_PRINT_DATA_PLACEMENT */
435 __kmp_release_bootstrap_lock(lck: &__kmp_stdio_lock);
436
437 va_end(ap);
438}
439
440void __kmp_warn(char const *format, ...) {
441 char buffer[MAX_MESSAGE];
442 va_list ap;
443
444 if (__kmp_generate_warnings == kmp_warnings_off) {
445 return;
446 }
447
448 va_start(ap, format);
449
450 KMP_SNPRINTF(s: buffer, maxlen: sizeof(buffer), format: "OMP warning: %s\n", format);
451 __kmp_acquire_bootstrap_lock(lck: &__kmp_stdio_lock);
452 __kmp_vprintf(stream: kmp_err, format: buffer, ap);
453 __kmp_release_bootstrap_lock(lck: &__kmp_stdio_lock);
454
455 va_end(ap);
456}
457
458void __kmp_abort_process() {
459 // Later threads may stall here, but that's ok because abort() will kill them.
460 __kmp_acquire_bootstrap_lock(lck: &__kmp_exit_lock);
461
462 if (__kmp_debug_buf) {
463 __kmp_dump_debug_buffer();
464 }
465
466#if KMP_OS_WINDOWS
467 // Let other threads know of abnormal termination and prevent deadlock
468 // if abort happened during library initialization or shutdown
469 __kmp_global.g.g_abort = SIGABRT;
470
471 /* On Windows* OS by default abort() causes pop-up error box, which stalls
472 nightly testing. Unfortunately, we cannot reliably suppress pop-up error
473 boxes. _set_abort_behavior() works well, but this function is not
474 available in VS7 (this is not problem for DLL, but it is a problem for
475 static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
476 help, at least in some versions of MS C RTL.
477
478 It seems following sequence is the only way to simulate abort() and
479 avoid pop-up error box. */
480 raise(SIGABRT);
481 _exit(3); // Just in case, if signal ignored, exit anyway.
482#else
483 __kmp_unregister_library();
484 abort();
485#endif
486
487 __kmp_infinite_loop();
488 __kmp_release_bootstrap_lock(lck: &__kmp_exit_lock);
489
490} // __kmp_abort_process
491
/* Park the current thread forever instead of aborting it; the process-wide
   abort path (__kmp_abort_process) handles actual termination. */
void __kmp_abort_thread(void) {
  // TODO: Eliminate g_abort global variable and this function.
  // In case of abort just call abort(), it will kill all the threads.
  __kmp_infinite_loop();
} // __kmp_abort_thread
497
498/* Print out the storage map for the major kmp_info_t thread data structures
499 that are allocated together. */
500
501static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
502 __kmp_print_storage_map_gtid(gtid, p1: thr, p2: thr + 1, size: sizeof(kmp_info_t), format: "th_%d",
503 gtid);
504
505 __kmp_print_storage_map_gtid(gtid, p1: &thr->th.th_info, p2: &thr->th.th_team,
506 size: sizeof(kmp_desc_t), format: "th_%d.th_info", gtid);
507
508 __kmp_print_storage_map_gtid(gtid, p1: &thr->th.th_local, p2: &thr->th.th_pri_head,
509 size: sizeof(kmp_local_t), format: "th_%d.th_local", gtid);
510
511 __kmp_print_storage_map_gtid(
512 gtid, p1: &thr->th.th_bar[0], p2: &thr->th.th_bar[bs_last_barrier],
513 size: sizeof(kmp_balign_t) * bs_last_barrier, format: "th_%d.th_bar", gtid);
514
515 __kmp_print_storage_map_gtid(gtid, p1: &thr->th.th_bar[bs_plain_barrier],
516 p2: &thr->th.th_bar[bs_plain_barrier + 1],
517 size: sizeof(kmp_balign_t), format: "th_%d.th_bar[plain]",
518 gtid);
519
520 __kmp_print_storage_map_gtid(gtid, p1: &thr->th.th_bar[bs_forkjoin_barrier],
521 p2: &thr->th.th_bar[bs_forkjoin_barrier + 1],
522 size: sizeof(kmp_balign_t), format: "th_%d.th_bar[forkjoin]",
523 gtid);
524
525#if KMP_FAST_REDUCTION_BARRIER
526 __kmp_print_storage_map_gtid(gtid, p1: &thr->th.th_bar[bs_reduction_barrier],
527 p2: &thr->th.th_bar[bs_reduction_barrier + 1],
528 size: sizeof(kmp_balign_t), format: "th_%d.th_bar[reduction]",
529 gtid);
530#endif // KMP_FAST_REDUCTION_BARRIER
531}
532
533/* Print out the storage map for the major kmp_team_t team data structures
534 that are allocated together. */
535
536static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
537 int team_id, int num_thr) {
538 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
539 __kmp_print_storage_map_gtid(gtid: -1, p1: team, p2: team + 1, size: sizeof(kmp_team_t), format: "%s_%d",
540 header, team_id);
541
542 __kmp_print_storage_map_gtid(gtid: -1, p1: &team->t.t_bar[0],
543 p2: &team->t.t_bar[bs_last_barrier],
544 size: sizeof(kmp_balign_team_t) * bs_last_barrier,
545 format: "%s_%d.t_bar", header, team_id);
546
547 __kmp_print_storage_map_gtid(gtid: -1, p1: &team->t.t_bar[bs_plain_barrier],
548 p2: &team->t.t_bar[bs_plain_barrier + 1],
549 size: sizeof(kmp_balign_team_t), format: "%s_%d.t_bar[plain]",
550 header, team_id);
551
552 __kmp_print_storage_map_gtid(gtid: -1, p1: &team->t.t_bar[bs_forkjoin_barrier],
553 p2: &team->t.t_bar[bs_forkjoin_barrier + 1],
554 size: sizeof(kmp_balign_team_t),
555 format: "%s_%d.t_bar[forkjoin]", header, team_id);
556
557#if KMP_FAST_REDUCTION_BARRIER
558 __kmp_print_storage_map_gtid(gtid: -1, p1: &team->t.t_bar[bs_reduction_barrier],
559 p2: &team->t.t_bar[bs_reduction_barrier + 1],
560 size: sizeof(kmp_balign_team_t),
561 format: "%s_%d.t_bar[reduction]", header, team_id);
562#endif // KMP_FAST_REDUCTION_BARRIER
563
564 __kmp_print_storage_map_gtid(
565 gtid: -1, p1: &team->t.t_dispatch[0], p2: &team->t.t_dispatch[num_thr],
566 size: sizeof(kmp_disp_t) * num_thr, format: "%s_%d.t_dispatch", header, team_id);
567
568 __kmp_print_storage_map_gtid(
569 gtid: -1, p1: &team->t.t_threads[0], p2: &team->t.t_threads[num_thr],
570 size: sizeof(kmp_info_t *) * num_thr, format: "%s_%d.t_threads", header, team_id);
571
572 __kmp_print_storage_map_gtid(gtid: -1, p1: &team->t.t_disp_buffer[0],
573 p2: &team->t.t_disp_buffer[num_disp_buff],
574 size: sizeof(dispatch_shared_info_t) * num_disp_buff,
575 format: "%s_%d.t_disp_buffer", header, team_id);
576}
577
578static void __kmp_init_allocator() {
579 __kmp_init_memkind();
580 __kmp_init_target_mem();
581}
582static void __kmp_fini_allocator() {
583 __kmp_fini_target_mem();
584 __kmp_fini_memkind();
585}
586
587/* ------------------------------------------------------------------------ */
588
589#if ENABLE_LIBOMPTARGET
590static void __kmp_init_omptarget() {
591 __kmp_init_target_task();
592}
593#endif
594
595/* ------------------------------------------------------------------------ */
596
597#if KMP_DYNAMIC_LIB
598#if KMP_OS_WINDOWS
599
/* Windows DLL entry point: routes process/thread attach and detach
   notifications into the runtime's shutdown hooks. Always returns TRUE. */
BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );

  switch (fdwReason) {

  case DLL_PROCESS_ATTACH:
    KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));

    return TRUE;

  case DLL_PROCESS_DETACH:
    KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));

    // According to Windows* documentation for DllMain entry point:
    // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
    //   lpReserved == NULL when FreeLibrary() is called,
    //   lpReserved != NULL when the process is terminated.
    // When FreeLibrary() is called, worker threads remain alive. So the
    // runtime's state is consistent and executing proper shutdown is OK.
    // When the process is terminated, worker threads have exited or been
    // forcefully terminated by the OS and only the shutdown thread remains.
    // This can leave the runtime in an inconsistent state.
    // Hence, only attempt proper cleanup when FreeLibrary() is called.
    // Otherwise, rely on OS to reclaim resources.
    if (lpReserved == NULL)
      __kmp_internal_end_library(__kmp_gtid_get_specific());

    return TRUE;

  case DLL_THREAD_ATTACH:
    KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));

    /* if we want to register new siblings all the time here call
     * __kmp_get_gtid(); */
    return TRUE;

  case DLL_THREAD_DETACH:
    KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));

    __kmp_internal_end_thread(__kmp_gtid_get_specific());
    return TRUE;
  }

  return TRUE;
}
645
646#endif /* KMP_OS_WINDOWS */
647#endif /* KMP_DYNAMIC_LIB */
648
649/* __kmp_parallel_deo -- Wait until it's our turn. */
650void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
651 int gtid = *gtid_ref;
652#ifdef BUILD_PARALLEL_ORDERED
653 kmp_team_t *team = __kmp_team_from_gtid(gtid);
654#endif /* BUILD_PARALLEL_ORDERED */
655
656 if (__kmp_env_consistency_check) {
657 if (__kmp_threads[gtid]->th.th_root->r.r_active)
658#if KMP_USE_DYNAMIC_LOCK
659 __kmp_push_sync(gtid, ct: ct_ordered_in_parallel, ident: loc_ref, NULL, 0);
660#else
661 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
662#endif
663 }
664#ifdef BUILD_PARALLEL_ORDERED
665 if (!team->t.t_serialized) {
666 KMP_MB();
667 KMP_WAIT(spinner: &team->t.t_ordered.dt.t_value, checker: __kmp_tid_from_gtid(gtid), KMP_EQ,
668 NULL);
669 KMP_MB();
670 }
671#endif /* BUILD_PARALLEL_ORDERED */
672}
673
674/* __kmp_parallel_dxo -- Signal the next task. */
675void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
676 int gtid = *gtid_ref;
677#ifdef BUILD_PARALLEL_ORDERED
678 int tid = __kmp_tid_from_gtid(gtid);
679 kmp_team_t *team = __kmp_team_from_gtid(gtid);
680#endif /* BUILD_PARALLEL_ORDERED */
681
682 if (__kmp_env_consistency_check) {
683 if (__kmp_threads[gtid]->th.th_root->r.r_active)
684 __kmp_pop_sync(gtid, ct: ct_ordered_in_parallel, ident: loc_ref);
685 }
686#ifdef BUILD_PARALLEL_ORDERED
687 if (!team->t.t_serialized) {
688 KMP_MB(); /* Flush all pending memory write invalidates. */
689
690 /* use the tid of the next thread in this team */
691 /* TODO replace with general release procedure */
692 team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
693
694 KMP_MB(); /* Flush all pending memory write invalidates. */
695 }
696#endif /* BUILD_PARALLEL_ORDERED */
697}
698
699/* ------------------------------------------------------------------------ */
700/* The BARRIER for a SINGLE process section is always explicit */
701
702int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
703 int status;
704 kmp_info_t *th;
705 kmp_team_t *team;
706
707 if (!TCR_4(__kmp_init_parallel))
708 __kmp_parallel_initialize();
709 __kmp_resume_if_soft_paused();
710
711 th = __kmp_threads[gtid];
712 team = th->th.th_team;
713 status = 0;
714
715 th->th.th_ident = id_ref;
716
717 if (team->t.t_serialized) {
718 status = 1;
719 } else {
720 kmp_int32 old_this = th->th.th_local.this_construct;
721
722 ++th->th.th_local.this_construct;
723 /* try to set team count to thread count--success means thread got the
724 single block */
725 /* TODO: Should this be acquire or release? */
726 if (team->t.t_construct == old_this) {
727 status = __kmp_atomic_compare_store_acq(p: &team->t.t_construct, expected: old_this,
728 desired: th->th.th_local.this_construct);
729 }
730#if USE_ITT_BUILD
731 if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
732 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
733 team->t.t_active_level == 1) {
734 // Only report metadata by primary thread of active team at level 1
735 __kmp_itt_metadata_single(loc: id_ref);
736 }
737#endif /* USE_ITT_BUILD */
738 }
739
740 if (__kmp_env_consistency_check) {
741 if (status && push_ws) {
742 __kmp_push_workshare(gtid, ct: ct_psingle, ident: id_ref);
743 } else {
744 __kmp_check_workshare(gtid, ct: ct_psingle, ident: id_ref);
745 }
746 }
747#if USE_ITT_BUILD
748 if (status) {
749 __kmp_itt_single_start(gtid);
750 }
751#endif /* USE_ITT_BUILD */
752 return status;
753}
754
755void __kmp_exit_single(int gtid) {
756#if USE_ITT_BUILD
757 __kmp_itt_single_end(gtid);
758#endif /* USE_ITT_BUILD */
759 if (__kmp_env_consistency_check)
760 __kmp_pop_workshare(gtid, ct: ct_psingle, NULL);
761}
762
763/* determine if we can go parallel or must use a serialized parallel region and
764 * how many threads we can use
765 * set_nproc is the number of threads requested for the team
766 * returns 0 if we should serialize or only use one thread,
767 * otherwise the number of threads to use
768 * The forkjoin lock is held by the caller. */
769static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
770 int master_tid, int set_nthreads,
771 int enter_teams) {
772 int capacity;
773 int new_nthreads;
774 KMP_DEBUG_ASSERT(__kmp_init_serial);
775 KMP_DEBUG_ASSERT(root && parent_team);
776 kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
777
778 // If dyn-var is set, dynamically adjust the number of desired threads,
779 // according to the method specified by dynamic_mode.
780 new_nthreads = set_nthreads;
781 if (!get__dynamic_2(parent_team, master_tid)) {
782 ;
783 }
784#ifdef USE_LOAD_BALANCE
785 else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
786 new_nthreads = __kmp_load_balance_nproc(root, set_nproc: set_nthreads);
787 if (new_nthreads == 1) {
788 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
789 "reservation to 1 thread\n",
790 master_tid));
791 return 1;
792 }
793 if (new_nthreads < set_nthreads) {
794 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
795 "reservation to %d threads\n",
796 master_tid, new_nthreads));
797 }
798 }
799#endif /* USE_LOAD_BALANCE */
800 else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
801 new_nthreads = __kmp_avail_proc - __kmp_nth +
802 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
803 if (new_nthreads <= 1) {
804 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
805 "reservation to 1 thread\n",
806 master_tid));
807 return 1;
808 }
809 if (new_nthreads < set_nthreads) {
810 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
811 "reservation to %d threads\n",
812 master_tid, new_nthreads));
813 } else {
814 new_nthreads = set_nthreads;
815 }
816 } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
817 if (set_nthreads > 2) {
818 new_nthreads = __kmp_get_random(thread: parent_team->t.t_threads[master_tid]);
819 new_nthreads = (new_nthreads % set_nthreads) + 1;
820 if (new_nthreads == 1) {
821 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
822 "reservation to 1 thread\n",
823 master_tid));
824 return 1;
825 }
826 if (new_nthreads < set_nthreads) {
827 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
828 "reservation to %d threads\n",
829 master_tid, new_nthreads));
830 }
831 }
832 } else {
833 KMP_ASSERT(0);
834 }
835
836 // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
837 if (__kmp_nth + new_nthreads -
838 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
839 __kmp_max_nth) {
840 int tl_nthreads = __kmp_max_nth - __kmp_nth +
841 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
842 if (tl_nthreads <= 0) {
843 tl_nthreads = 1;
844 }
845
846 // If dyn-var is false, emit a 1-time warning.
847 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
848 __kmp_reserve_warn = 1;
849 __kmp_msg(severity: kmp_ms_warning,
850 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
851 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
852 }
853 if (tl_nthreads == 1) {
854 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
855 "reduced reservation to 1 thread\n",
856 master_tid));
857 return 1;
858 }
859 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
860 "reservation to %d threads\n",
861 master_tid, tl_nthreads));
862 new_nthreads = tl_nthreads;
863 }
864
865 // Respect OMP_THREAD_LIMIT
866 int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
867 int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
868 if (cg_nthreads + new_nthreads -
869 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
870 max_cg_threads) {
871 int tl_nthreads = max_cg_threads - cg_nthreads +
872 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
873 if (tl_nthreads <= 0) {
874 tl_nthreads = 1;
875 }
876
877 // If dyn-var is false, emit a 1-time warning.
878 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
879 __kmp_reserve_warn = 1;
880 __kmp_msg(severity: kmp_ms_warning,
881 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
882 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
883 }
884 if (tl_nthreads == 1) {
885 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
886 "reduced reservation to 1 thread\n",
887 master_tid));
888 return 1;
889 }
890 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
891 "reservation to %d threads\n",
892 master_tid, tl_nthreads));
893 new_nthreads = tl_nthreads;
894 }
895
896 // Check if the threads array is large enough, or needs expanding.
897 // See comment in __kmp_register_root() about the adjustment if
898 // __kmp_threads[0] == NULL.
899 capacity = __kmp_threads_capacity;
900 if (TCR_PTR(__kmp_threads[0]) == NULL) {
901 --capacity;
902 }
903 // If it is not for initializing the hidden helper team, we need to take
904 // __kmp_hidden_helper_threads_num out of the capacity because it is included
905 // in __kmp_threads_capacity.
906 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
907 capacity -= __kmp_hidden_helper_threads_num;
908 }
909 if (__kmp_nth + new_nthreads -
910 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
911 capacity) {
912 // Expand the threads array.
913 int slotsRequired = __kmp_nth + new_nthreads -
914 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
915 capacity;
916 int slotsAdded = __kmp_expand_threads(nNeed: slotsRequired);
917 if (slotsAdded < slotsRequired) {
918 // The threads array was not expanded enough.
919 new_nthreads -= (slotsRequired - slotsAdded);
920 KMP_ASSERT(new_nthreads >= 1);
921
922 // If dyn-var is false, emit a 1-time warning.
923 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
924 __kmp_reserve_warn = 1;
925 if (__kmp_tp_cached) {
926 __kmp_msg(severity: kmp_ms_warning,
927 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
928 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
929 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
930 } else {
931 __kmp_msg(severity: kmp_ms_warning,
932 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
933 KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
934 }
935 }
936 }
937 }
938
939#ifdef KMP_DEBUG
940 if (new_nthreads == 1) {
941 KC_TRACE(10,
942 ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
943 "dead roots and rechecking; requested %d threads\n",
944 __kmp_get_gtid(), set_nthreads));
945 } else {
946 KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
947 " %d threads\n",
948 __kmp_get_gtid(), new_nthreads, set_nthreads));
949 }
950#endif // KMP_DEBUG
951
952 if (this_thr->th.th_nt_strict && new_nthreads < set_nthreads) {
953 __kmpc_error(loc: this_thr->th.th_nt_loc, severity: this_thr->th.th_nt_sev,
954 message: this_thr->th.th_nt_msg);
955 }
956 return new_nthreads;
957}
958
959/* Allocate threads from the thread pool and assign them to the new team. We are
960 assured that there are enough threads available, because we checked on that
961 earlier within critical section forkjoin */
962static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
963 kmp_info_t *master_th, int master_gtid,
964 int fork_teams_workers) {
965 int i;
966 int use_hot_team;
967
968 KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
969 KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
970 KMP_MB();
971
972 /* first, let's setup the primary thread */
973 master_th->th.th_info.ds.ds_tid = 0;
974 master_th->th.th_team = team;
975 master_th->th.th_team_nproc = team->t.t_nproc;
976 master_th->th.th_team_master = master_th;
977 master_th->th.th_team_serialized = FALSE;
978 master_th->th.th_dispatch = &team->t.t_dispatch[0];
979
980/* make sure we are not the optimized hot team */
981#if KMP_NESTED_HOT_TEAMS
982 use_hot_team = 0;
983 kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
984 if (hot_teams) { // hot teams array is not allocated if
985 // KMP_HOT_TEAMS_MAX_LEVEL=0
986 int level = team->t.t_active_level - 1; // index in array of hot teams
987 if (master_th->th.th_teams_microtask) { // are we inside the teams?
988 if (master_th->th.th_teams_size.nteams > 1) {
989 ++level; // level was not increased in teams construct for
990 // team_of_masters
991 }
992 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
993 master_th->th.th_teams_level == team->t.t_level) {
994 ++level; // level was not increased in teams construct for
995 // team_of_workers before the parallel
996 } // team->t.t_level will be increased inside parallel
997 }
998 if (level < __kmp_hot_teams_max_level) {
999 if (hot_teams[level].hot_team) {
1000 // hot team has already been allocated for given level
1001 KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1002 use_hot_team = 1; // the team is ready to use
1003 } else {
1004 use_hot_team = 0; // AC: threads are not allocated yet
1005 hot_teams[level].hot_team = team; // remember new hot team
1006 hot_teams[level].hot_team_nth = team->t.t_nproc;
1007 }
1008 } else {
1009 use_hot_team = 0;
1010 }
1011 }
1012#else
1013 use_hot_team = team == root->r.r_hot_team;
1014#endif
1015 if (!use_hot_team) {
1016
1017 /* install the primary thread */
1018 team->t.t_threads[0] = master_th;
1019 __kmp_initialize_info(master_th, team, tid: 0, gtid: master_gtid);
1020
1021 /* now, install the worker threads */
1022 for (i = 1; i < team->t.t_nproc; i++) {
1023
1024 /* fork or reallocate a new thread and install it in team */
1025 kmp_info_t *thr = __kmp_allocate_thread(root, team, tid: i);
1026 team->t.t_threads[i] = thr;
1027 KMP_DEBUG_ASSERT(thr);
1028 KMP_DEBUG_ASSERT(thr->th.th_team == team);
1029 /* align team and thread arrived states */
1030 KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1031 "T#%d(%d:%d) join =%llu, plain=%llu\n",
1032 __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1033 __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1034 team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1035 team->t.t_bar[bs_plain_barrier].b_arrived));
1036 thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1037 thr->th.th_teams_level = master_th->th.th_teams_level;
1038 thr->th.th_teams_size = master_th->th.th_teams_size;
1039 { // Initialize threads' barrier data.
1040 int b;
1041 kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1042 for (b = 0; b < bs_last_barrier; ++b) {
1043 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1044 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1045#if USE_DEBUGGER
1046 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1047#endif
1048 }
1049 }
1050 }
1051
1052#if KMP_AFFINITY_SUPPORTED
1053 // Do not partition the places list for teams construct workers who
1054 // haven't actually been forked to do real work yet. This partitioning
1055 // will take place in the parallel region nested within the teams construct.
1056 if (!fork_teams_workers) {
1057 __kmp_partition_places(team);
1058 }
1059#endif
1060
1061 if (team->t.t_nproc > 1 &&
1062 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1063 team->t.b->update_num_threads(nthr: team->t.t_nproc);
1064 __kmp_add_threads_to_team(team, new_nthreads: team->t.t_nproc);
1065 }
1066 }
1067
1068 // Take care of primary thread's task state
1069 if (__kmp_tasking_mode != tskm_immediate_exec) {
1070 if (use_hot_team) {
1071 KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team->t.t_parent, master_th);
1072 KA_TRACE(
1073 20,
1074 ("__kmp_fork_team_threads: Primary T#%d pushing task_team %p / team "
1075 "%p, new task_team %p / team %p\n",
1076 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
1077 team->t.t_parent, team->t.t_task_team[master_th->th.th_task_state],
1078 team));
1079
1080 // Store primary thread's current task state on new team
1081 KMP_CHECK_UPDATE(team->t.t_primary_task_state,
1082 master_th->th.th_task_state);
1083
1084 // Restore primary thread's task state to hot team's state
1085 // by using thread 1's task state
1086 if (team->t.t_nproc > 1) {
1087 KMP_DEBUG_ASSERT(team->t.t_threads[1]->th.th_task_state == 0 ||
1088 team->t.t_threads[1]->th.th_task_state == 1);
1089 KMP_CHECK_UPDATE(master_th->th.th_task_state,
1090 team->t.t_threads[1]->th.th_task_state);
1091 } else {
1092 master_th->th.th_task_state = 0;
1093 }
1094 } else {
1095 // Store primary thread's current task_state on new team
1096 KMP_CHECK_UPDATE(team->t.t_primary_task_state,
1097 master_th->th.th_task_state);
1098 // Are not using hot team, so set task state to 0.
1099 master_th->th.th_task_state = 0;
1100 }
1101 }
1102
1103 if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1104 for (i = 0; i < team->t.t_nproc; i++) {
1105 kmp_info_t *thr = team->t.t_threads[i];
1106 if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1107 thr->th.th_prev_level != team->t.t_level) {
1108 team->t.t_display_affinity = 1;
1109 break;
1110 }
1111 }
1112 }
1113
1114 KMP_MB();
1115}
1116
1117#if KMP_ARCH_X86 || KMP_ARCH_X86_64
1118// Propagate any changes to the floating point control registers out to the team
1119// We try to avoid unnecessary writes to the relevant cache line in the team
1120// structure, so we don't make changes unless they are needed.
1121inline static void propagateFPControl(kmp_team_t *team) {
1122 if (__kmp_inherit_fp_control) {
1123 kmp_int16 x87_fpu_control_word;
1124 kmp_uint32 mxcsr;
1125
1126 // Get primary thread's values of FPU control flags (both X87 and vector)
1127 __kmp_store_x87_fpu_control_word(p: &x87_fpu_control_word);
1128 __kmp_store_mxcsr(p: &mxcsr);
1129 mxcsr &= KMP_X86_MXCSR_MASK;
1130
1131 // There is no point looking at t_fp_control_saved here.
1132 // If it is TRUE, we still have to update the values if they are different
1133 // from those we now have. If it is FALSE we didn't save anything yet, but
1134 // our objective is the same. We have to ensure that the values in the team
1135 // are the same as those we have.
1136 // So, this code achieves what we need whether or not t_fp_control_saved is
1137 // true. By checking whether the value needs updating we avoid unnecessary
1138 // writes that would put the cache-line into a written state, causing all
1139 // threads in the team to have to read it again.
1140 KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1141 KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1142 // Although we don't use this value, other code in the runtime wants to know
1143 // whether it should restore them. So we must ensure it is correct.
1144 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1145 } else {
1146 // Similarly here. Don't write to this cache-line in the team structure
1147 // unless we have to.
1148 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1149 }
1150}
1151
1152// Do the opposite, setting the hardware registers to the updated values from
1153// the team.
1154inline static void updateHWFPControl(kmp_team_t *team) {
1155 if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1156 // Only reset the fp control regs if they have been changed in the team.
1157 // the parallel region that we are exiting.
1158 kmp_int16 x87_fpu_control_word;
1159 kmp_uint32 mxcsr;
1160 __kmp_store_x87_fpu_control_word(p: &x87_fpu_control_word);
1161 __kmp_store_mxcsr(p: &mxcsr);
1162 mxcsr &= KMP_X86_MXCSR_MASK;
1163
1164 if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1165 __kmp_clear_x87_fpu_status_word();
1166 __kmp_load_x87_fpu_control_word(p: &team->t.t_x87_fpu_control_word);
1167 }
1168
1169 if (team->t.t_mxcsr != mxcsr) {
1170 __kmp_load_mxcsr(p: &team->t.t_mxcsr);
1171 }
1172 }
1173}
1174#else
1175#define propagateFPControl(x) ((void)0)
1176#define updateHWFPControl(x) ((void)0)
1177#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1178
1179static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1180 int realloc); // forward declaration
1181
1182/* Run a parallel region that has been serialized, so runs only in a team of the
1183 single primary thread. */
1184void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1185 kmp_info_t *this_thr;
1186 kmp_team_t *serial_team;
1187
1188 KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1189
1190 /* Skip all this code for autopar serialized loops since it results in
1191 unacceptable overhead */
1192 if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1193 return;
1194
1195 if (!TCR_4(__kmp_init_parallel))
1196 __kmp_parallel_initialize();
1197 __kmp_resume_if_soft_paused();
1198
1199 this_thr = __kmp_threads[global_tid];
1200 serial_team = this_thr->th.th_serial_team;
1201
1202 /* utilize the serialized team held by this thread */
1203 KMP_DEBUG_ASSERT(serial_team);
1204 KMP_MB();
1205
1206 kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1207 if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1208 proc_bind = proc_bind_false;
1209 } else if (proc_bind == proc_bind_default) {
1210 // No proc_bind clause was specified, so use the current value
1211 // of proc-bind-var for this parallel region.
1212 proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1213 }
1214 // Reset for next parallel region
1215 this_thr->th.th_set_proc_bind = proc_bind_default;
1216
1217 // OpenMP 6.0 12.1.2 requires the num_threads 'strict' modifier to also have
1218 // effect when parallel execution is disabled by a corresponding if clause
1219 // attached to the parallel directive.
1220 if (this_thr->th.th_nt_strict && this_thr->th.th_set_nproc > 1)
1221 __kmpc_error(loc: this_thr->th.th_nt_loc, severity: this_thr->th.th_nt_sev,
1222 message: this_thr->th.th_nt_msg);
1223 // Reset num_threads for next parallel region
1224 this_thr->th.th_set_nproc = 0;
1225
1226#if OMPT_SUPPORT
1227 ompt_data_t ompt_parallel_data = ompt_data_none;
1228 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1229 if (ompt_enabled.enabled &&
1230 this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1231
1232 ompt_task_info_t *parent_task_info;
1233 parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1234
1235 parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1236 if (ompt_enabled.ompt_callback_parallel_begin) {
1237 int team_size = 1;
1238
1239 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1240 &(parent_task_info->task_data), &(parent_task_info->frame),
1241 &ompt_parallel_data, team_size,
1242 ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1243 }
1244 }
1245#endif // OMPT_SUPPORT
1246
1247 if (this_thr->th.th_team != serial_team) {
1248 // Nested level will be an index in the nested nthreads array
1249 int level = this_thr->th.th_team->t.t_level;
1250
1251 if (serial_team->t.t_serialized) {
1252 /* this serial team was already used
1253 TODO increase performance by making this locks more specific */
1254 kmp_team_t *new_team;
1255
1256 __kmp_acquire_bootstrap_lock(lck: &__kmp_forkjoin_lock);
1257
1258 new_team =
1259 __kmp_allocate_team(root: this_thr->th.th_root, new_nproc: 1, max_nproc: 1,
1260#if OMPT_SUPPORT
1261 ompt_parallel_data,
1262#endif
1263 proc_bind, new_icvs: &this_thr->th.th_current_task->td_icvs,
1264 argc: 0 USE_NESTED_HOT_ARG(NULL));
1265 __kmp_release_bootstrap_lock(lck: &__kmp_forkjoin_lock);
1266 KMP_ASSERT(new_team);
1267
1268 /* setup new serialized team and install it */
1269 new_team->t.t_threads[0] = this_thr;
1270 new_team->t.t_parent = this_thr->th.th_team;
1271 serial_team = new_team;
1272 this_thr->th.th_serial_team = serial_team;
1273
1274 KF_TRACE(
1275 10,
1276 ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1277 global_tid, serial_team));
1278
1279 /* TODO the above breaks the requirement that if we run out of resources,
1280 then we can still guarantee that serialized teams are ok, since we may
1281 need to allocate a new one */
1282 } else {
1283 KF_TRACE(
1284 10,
1285 ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1286 global_tid, serial_team));
1287 }
1288
1289 /* we have to initialize this serial team */
1290 KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1291 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1292 KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1293 serial_team->t.t_ident = loc;
1294 serial_team->t.t_serialized = 1;
1295 serial_team->t.t_nproc = 1;
1296 serial_team->t.t_parent = this_thr->th.th_team;
1297 if (this_thr->th.th_team->t.t_nested_nth)
1298 serial_team->t.t_nested_nth = this_thr->th.th_team->t.t_nested_nth;
1299 else
1300 serial_team->t.t_nested_nth = &__kmp_nested_nth;
1301 // Save previous team's task state on serial team structure
1302 serial_team->t.t_primary_task_state = this_thr->th.th_task_state;
1303 serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1304 this_thr->th.th_team = serial_team;
1305 serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1306
1307 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1308 this_thr->th.th_current_task));
1309 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1310 this_thr->th.th_current_task->td_flags.executing = 0;
1311
1312 __kmp_push_current_task_to_thread(this_thr, team: serial_team, tid: 0);
1313
1314 /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1315 implicit task for each serialized task represented by
1316 team->t.t_serialized? */
1317 copy_icvs(dst: &this_thr->th.th_current_task->td_icvs,
1318 src: &this_thr->th.th_current_task->td_parent->td_icvs);
1319
1320 // Thread value exists in the nested nthreads array for the next nested
1321 // level
1322 kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth;
1323 if (this_thr->th.th_team->t.t_nested_nth)
1324 nested_nth = this_thr->th.th_team->t.t_nested_nth;
1325 if (nested_nth->used && (level + 1 < nested_nth->used)) {
1326 this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1];
1327 }
1328
1329 if (__kmp_nested_proc_bind.used &&
1330 (level + 1 < __kmp_nested_proc_bind.used)) {
1331 this_thr->th.th_current_task->td_icvs.proc_bind =
1332 __kmp_nested_proc_bind.bind_types[level + 1];
1333 }
1334
1335#if USE_DEBUGGER
1336 serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1337#endif
1338 this_thr->th.th_info.ds.ds_tid = 0;
1339
1340 /* set thread cache values */
1341 this_thr->th.th_team_nproc = 1;
1342 this_thr->th.th_team_master = this_thr;
1343 this_thr->th.th_team_serialized = 1;
1344 this_thr->th.th_task_team = NULL;
1345 this_thr->th.th_task_state = 0;
1346
1347 serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1348 serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1349 serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1350
1351 propagateFPControl(team: serial_team);
1352
1353 /* check if we need to allocate dispatch buffers stack */
1354 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1355 if (!serial_team->t.t_dispatch->th_disp_buffer) {
1356 serial_team->t.t_dispatch->th_disp_buffer =
1357 (dispatch_private_info_t *)__kmp_allocate(
1358 sizeof(dispatch_private_info_t));
1359 }
1360 this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1361
1362 KMP_MB();
1363
1364 } else {
1365 /* this serialized team is already being used,
1366 * that's fine, just add another nested level */
1367 KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1368 KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1369 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1370 ++serial_team->t.t_serialized;
1371 this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1372
1373 // Nested level will be an index in the nested nthreads array
1374 int level = this_thr->th.th_team->t.t_level;
1375 // Thread value exists in the nested nthreads array for the next nested
1376 // level
1377
1378 kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth;
1379 if (serial_team->t.t_nested_nth)
1380 nested_nth = serial_team->t.t_nested_nth;
1381 if (nested_nth->used && (level + 1 < nested_nth->used)) {
1382 this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1];
1383 }
1384
1385 serial_team->t.t_level++;
1386 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1387 "of serial team %p to %d\n",
1388 global_tid, serial_team, serial_team->t.t_level));
1389
1390 /* allocate/push dispatch buffers stack */
1391 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1392 {
1393 dispatch_private_info_t *disp_buffer =
1394 (dispatch_private_info_t *)__kmp_allocate(
1395 sizeof(dispatch_private_info_t));
1396 disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1397 serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1398 }
1399 this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1400
1401 /* allocate/push task team stack */
1402 __kmp_push_task_team_node(thread: this_thr, team: serial_team);
1403
1404 KMP_MB();
1405 }
1406 KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1407
1408 // Perform the display affinity functionality for
1409 // serialized parallel regions
1410 if (__kmp_display_affinity) {
1411 if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1412 this_thr->th.th_prev_num_threads != 1) {
1413 // NULL means use the affinity-format-var ICV
1414 __kmp_aux_display_affinity(gtid: global_tid, NULL);
1415 this_thr->th.th_prev_level = serial_team->t.t_level;
1416 this_thr->th.th_prev_num_threads = 1;
1417 }
1418 }
1419
1420 if (__kmp_env_consistency_check)
1421 __kmp_push_parallel(gtid: global_tid, NULL);
1422#if OMPT_SUPPORT
1423 serial_team->t.ompt_team_info.master_return_address = codeptr;
1424 if (ompt_enabled.enabled &&
1425 this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1426 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1427 OMPT_GET_FRAME_ADDRESS(0);
1428
1429 ompt_lw_taskteam_t lw_taskteam;
1430 __ompt_lw_taskteam_init(lwt: &lw_taskteam, thr: this_thr, gtid: global_tid,
1431 ompt_pid: &ompt_parallel_data, codeptr);
1432
1433 __ompt_lw_taskteam_link(lwt: &lw_taskteam, thr: this_thr, on_heap: 1);
1434 // don't use lw_taskteam after linking. content was swaped
1435
1436 /* OMPT implicit task begin */
1437 if (ompt_enabled.ompt_callback_implicit_task) {
1438 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1439 ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1440 OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(gtid: global_tid),
1441 ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1442 OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1443 __kmp_tid_from_gtid(gtid: global_tid);
1444 }
1445
1446 /* OMPT state */
1447 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1448 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1449 OMPT_GET_FRAME_ADDRESS(0);
1450 }
1451#endif
1452}
1453
1454// Test if this fork is for a team closely nested in a teams construct
1455static inline bool __kmp_is_fork_in_teams(kmp_info_t *master_th,
1456 microtask_t microtask, int level,
1457 int teams_level, kmp_va_list ap) {
1458 return (master_th->th.th_teams_microtask && ap &&
1459 microtask != (microtask_t)__kmp_teams_master && level == teams_level);
1460}
1461
1462// Test if this fork is for the teams construct, i.e. to form the outer league
1463// of teams
1464static inline bool __kmp_is_entering_teams(int active_level, int level,
1465 int teams_level, kmp_va_list ap) {
1466 return ((ap == NULL && active_level == 0) ||
1467 (ap && teams_level > 0 && teams_level == level));
1468}
1469
1470// AC: This is start of parallel that is nested inside teams construct.
1471// The team is actual (hot), all workers are ready at the fork barrier.
1472// No lock needed to initialize the team a bit, then free workers.
1473static inline int
1474__kmp_fork_in_teams(ident_t *loc, int gtid, kmp_team_t *parent_team,
1475 kmp_int32 argc, kmp_info_t *master_th, kmp_root_t *root,
1476 enum fork_context_e call_context, microtask_t microtask,
1477 launch_t invoker, int master_set_numthreads, int level,
1478#if OMPT_SUPPORT
1479 ompt_data_t ompt_parallel_data, void *return_address,
1480#endif
1481 kmp_va_list ap) {
1482 void **argv;
1483 int i;
1484
1485 parent_team->t.t_ident = loc;
1486 __kmp_alloc_argv_entries(argc, team: parent_team, TRUE);
1487 parent_team->t.t_argc = argc;
1488 argv = (void **)parent_team->t.t_argv;
1489 for (i = argc - 1; i >= 0; --i) {
1490 *argv++ = va_arg(kmp_va_deref(ap), void *);
1491 }
1492 // Increment our nested depth levels, but not increase the serialization
1493 if (parent_team == master_th->th.th_serial_team) {
1494 // AC: we are in serialized parallel
1495 __kmpc_serialized_parallel(loc, global_tid: gtid);
1496 KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1497
1498 if (call_context == fork_context_gnu) {
1499 // AC: need to decrement t_serialized for enquiry functions to work
1500 // correctly, will restore at join time
1501 parent_team->t.t_serialized--;
1502 return TRUE;
1503 }
1504
1505#if OMPD_SUPPORT
1506 parent_team->t.t_pkfn = microtask;
1507#endif
1508
1509#if OMPT_SUPPORT
1510 void *dummy;
1511 void **exit_frame_p;
1512 ompt_data_t *implicit_task_data;
1513 ompt_lw_taskteam_t lw_taskteam;
1514
1515 if (ompt_enabled.enabled) {
1516 __ompt_lw_taskteam_init(lwt: &lw_taskteam, thr: master_th, gtid,
1517 ompt_pid: &ompt_parallel_data, codeptr: return_address);
1518 exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1519
1520 __ompt_lw_taskteam_link(lwt: &lw_taskteam, thr: master_th, on_heap: 0);
1521 // Don't use lw_taskteam after linking. Content was swapped.
1522
1523 /* OMPT implicit task begin */
1524 implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1525 if (ompt_enabled.ompt_callback_implicit_task) {
1526 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1527 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1528 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), implicit_task_data,
1529 1, OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1530 }
1531
1532 /* OMPT state */
1533 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1534 } else {
1535 exit_frame_p = &dummy;
1536 }
1537#endif
1538
1539 // AC: need to decrement t_serialized for enquiry functions to work
1540 // correctly, will restore at join time
1541 parent_team->t.t_serialized--;
1542
1543 {
1544 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1545 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1546 __kmp_invoke_microtask(pkfn: microtask, gtid, npr: 0, argc, argv: parent_team->t.t_argv
1547#if OMPT_SUPPORT
1548 ,
1549 exit_frame_ptr: exit_frame_p
1550#endif
1551 );
1552 }
1553
1554#if OMPT_SUPPORT
1555 if (ompt_enabled.enabled) {
1556 *exit_frame_p = NULL;
1557 OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1558 if (ompt_enabled.ompt_callback_implicit_task) {
1559 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1560 ompt_scope_end, NULL, implicit_task_data, 1,
1561 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1562 }
1563 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1564 __ompt_lw_taskteam_unlink(thr: master_th);
1565 if (ompt_enabled.ompt_callback_parallel_end) {
1566 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1567 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1568 OMPT_INVOKER(call_context) | ompt_parallel_team, return_address);
1569 }
1570 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1571 }
1572#endif
1573 return TRUE;
1574 }
1575
1576 parent_team->t.t_pkfn = microtask;
1577 parent_team->t.t_invoke = invoker;
1578 KMP_ATOMIC_INC(&root->r.r_in_parallel);
1579 parent_team->t.t_active_level++;
1580 parent_team->t.t_level++;
1581 parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1582
1583 // If the threads allocated to the team are less than the thread limit, update
1584 // the thread limit here. th_teams_size.nth is specific to this team nested
1585 // in a teams construct, the team is fully created, and we're about to do
1586 // the actual fork. Best to do this here so that the subsequent uses below
1587 // and in the join have the correct value.
1588 master_th->th.th_teams_size.nth = parent_team->t.t_nproc;
1589
1590#if OMPT_SUPPORT
1591 if (ompt_enabled.enabled) {
1592 ompt_lw_taskteam_t lw_taskteam;
1593 __ompt_lw_taskteam_init(lwt: &lw_taskteam, thr: master_th, gtid, ompt_pid: &ompt_parallel_data,
1594 codeptr: return_address);
1595 __ompt_lw_taskteam_link(lwt: &lw_taskteam, thr: master_th, on_heap: 1, always: true);
1596 }
1597#endif
1598
1599 /* Change number of threads in the team if requested */
1600 if (master_set_numthreads) { // The parallel has num_threads clause
1601 if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1602 // AC: only can reduce number of threads dynamically, can't increase
1603 kmp_info_t **other_threads = parent_team->t.t_threads;
1604 // NOTE: if using distributed barrier, we need to run this code block
1605 // even when the team size appears not to have changed from the max.
1606 int old_proc = master_th->th.th_teams_size.nth;
1607 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1608 __kmp_resize_dist_barrier(team: parent_team, old_nthreads: old_proc, new_nthreads: master_set_numthreads);
1609 __kmp_add_threads_to_team(team: parent_team, new_nthreads: master_set_numthreads);
1610 }
1611 parent_team->t.t_nproc = master_set_numthreads;
1612 for (i = 0; i < master_set_numthreads; ++i) {
1613 other_threads[i]->th.th_team_nproc = master_set_numthreads;
1614 }
1615 }
1616 // Keep extra threads hot in the team for possible next parallels
1617 master_th->th.th_set_nproc = 0;
1618 }
1619
1620#if USE_DEBUGGER
1621 if (__kmp_debugging) { // Let debugger override number of threads.
1622 int nth = __kmp_omp_num_threads(loc);
1623 if (nth > 0) { // 0 means debugger doesn't want to change num threads
1624 master_set_numthreads = nth;
1625 }
1626 }
1627#endif
1628
1629 // Figure out the proc_bind policy for the nested parallel within teams
1630 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1631 // proc_bind_default means don't update
1632 kmp_proc_bind_t proc_bind_icv = proc_bind_default;
1633 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1634 proc_bind = proc_bind_false;
1635 } else {
1636 // No proc_bind clause specified; use current proc-bind-var
1637 if (proc_bind == proc_bind_default) {
1638 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1639 }
1640 /* else: The proc_bind policy was specified explicitly on parallel clause.
1641 This overrides proc-bind-var for this parallel region, but does not
1642 change proc-bind-var. */
1643 // Figure the value of proc-bind-var for the child threads.
1644 if ((level + 1 < __kmp_nested_proc_bind.used) &&
1645 (__kmp_nested_proc_bind.bind_types[level + 1] !=
1646 master_th->th.th_current_task->td_icvs.proc_bind)) {
1647 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1648 }
1649 }
1650 KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
1651 // Need to change the bind-var ICV to correct value for each implicit task
1652 if (proc_bind_icv != proc_bind_default &&
1653 master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
1654 kmp_info_t **other_threads = parent_team->t.t_threads;
1655 for (i = 0; i < master_th->th.th_team_nproc; ++i) {
1656 other_threads[i]->th.th_current_task->td_icvs.proc_bind = proc_bind_icv;
1657 }
1658 }
1659 // Reset for next parallel region
1660 master_th->th.th_set_proc_bind = proc_bind_default;
1661
1662#if USE_ITT_BUILD && USE_ITT_NOTIFY
1663 if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1664 KMP_ITT_DEBUG) &&
1665 __kmp_forkjoin_frames_mode == 3 &&
1666 parent_team->t.t_active_level == 1 // only report frames at level 1
1667 && master_th->th.th_teams_size.nteams == 1) {
1668 kmp_uint64 tmp_time = __itt_get_timestamp();
1669 master_th->th.th_frame_time = tmp_time;
1670 parent_team->t.t_region_time = tmp_time;
1671 }
1672 if (__itt_stack_caller_create_ptr) {
1673 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1674 // create new stack stitching id before entering fork barrier
1675 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1676 }
1677#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1678#if KMP_AFFINITY_SUPPORTED
1679 __kmp_partition_places(team: parent_team);
1680#endif
1681
1682 KF_TRACE(10, ("__kmp_fork_in_teams: before internal fork: root=%p, team=%p, "
1683 "master_th=%p, gtid=%d\n",
1684 root, parent_team, master_th, gtid));
1685 __kmp_internal_fork(id: loc, gtid, team: parent_team);
1686 KF_TRACE(10, ("__kmp_fork_in_teams: after internal fork: root=%p, team=%p, "
1687 "master_th=%p, gtid=%d\n",
1688 root, parent_team, master_th, gtid));
1689
1690 if (call_context == fork_context_gnu)
1691 return TRUE;
1692
1693 /* Invoke microtask for PRIMARY thread */
1694 KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) invoke microtask = %p\n", gtid,
1695 parent_team->t.t_id, parent_team->t.t_pkfn));
1696
1697 if (!parent_team->t.t_invoke(gtid)) {
1698 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1699 }
1700 KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) done microtask = %p\n", gtid,
1701 parent_team->t.t_id, parent_team->t.t_pkfn));
1702 KMP_MB(); /* Flush all pending memory write invalidates. */
1703
1704 KA_TRACE(20, ("__kmp_fork_in_teams: parallel exit T#%d\n", gtid));
1705
1706 return TRUE;
1707}
1708
1709// Create a serialized parallel region
1710static inline int
1711__kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context,
1712 kmp_int32 argc, microtask_t microtask, launch_t invoker,
1713 kmp_info_t *master_th, kmp_team_t *parent_team,
1714#if OMPT_SUPPORT
1715 ompt_data_t *ompt_parallel_data, void **return_address,
1716 ompt_data_t **parent_task_data,
1717#endif
1718 kmp_va_list ap) {
1719 kmp_team_t *team;
1720 int i;
1721 void **argv;
1722
1723/* josh todo: hypothetical question: what do we do for OS X*? */
1724#if KMP_OS_LINUX && \
1725 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1726 SimpleVLA<void *> args(argc);
1727#else
1728 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1729#endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1730 KMP_ARCH_AARCH64) */
1731
1732 KA_TRACE(
1733 20, ("__kmp_serial_fork_call: T#%d serializing parallel region\n", gtid));
1734
1735 __kmpc_serialized_parallel(loc, global_tid: gtid);
1736
1737#if OMPD_SUPPORT
1738 master_th->th.th_serial_team->t.t_pkfn = microtask;
1739#endif
1740
1741 if (call_context == fork_context_intel) {
1742 /* TODO this sucks, use the compiler itself to pass args! :) */
1743 master_th->th.th_serial_team->t.t_ident = loc;
1744 if (!ap) {
1745 // revert change made in __kmpc_serialized_parallel()
1746 master_th->th.th_serial_team->t.t_level--;
1747// Get args from parent team for teams construct
1748
1749#if OMPT_SUPPORT
1750 void *dummy;
1751 void **exit_frame_p;
1752 ompt_task_info_t *task_info;
1753 ompt_lw_taskteam_t lw_taskteam;
1754
1755 if (ompt_enabled.enabled) {
1756 __ompt_lw_taskteam_init(lwt: &lw_taskteam, thr: master_th, gtid,
1757 ompt_pid: ompt_parallel_data, codeptr: *return_address);
1758
1759 __ompt_lw_taskteam_link(lwt: &lw_taskteam, thr: master_th, on_heap: 0);
1760 // don't use lw_taskteam after linking. content was swaped
1761 task_info = OMPT_CUR_TASK_INFO(master_th);
1762 exit_frame_p = &(task_info->frame.exit_frame.ptr);
1763 if (ompt_enabled.ompt_callback_implicit_task) {
1764 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1765 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1766 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1767 &(task_info->task_data), 1,
1768 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1769 }
1770
1771 /* OMPT state */
1772 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1773 } else {
1774 exit_frame_p = &dummy;
1775 }
1776#endif
1777
1778 {
1779 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1780 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1781 __kmp_invoke_microtask(pkfn: microtask, gtid, npr: 0, argc, argv: parent_team->t.t_argv
1782#if OMPT_SUPPORT
1783 ,
1784 exit_frame_ptr: exit_frame_p
1785#endif
1786 );
1787 }
1788
1789#if OMPT_SUPPORT
1790 if (ompt_enabled.enabled) {
1791 *exit_frame_p = NULL;
1792 if (ompt_enabled.ompt_callback_implicit_task) {
1793 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1794 ompt_scope_end, NULL, &(task_info->task_data), 1,
1795 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1796 }
1797 *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1798 __ompt_lw_taskteam_unlink(thr: master_th);
1799 if (ompt_enabled.ompt_callback_parallel_end) {
1800 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1801 ompt_parallel_data, *parent_task_data,
1802 OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1803 }
1804 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1805 }
1806#endif
1807 } else if (microtask == (microtask_t)__kmp_teams_master) {
1808 KMP_DEBUG_ASSERT(master_th->th.th_team == master_th->th.th_serial_team);
1809 team = master_th->th.th_team;
1810 // team->t.t_pkfn = microtask;
1811 team->t.t_invoke = invoker;
1812 __kmp_alloc_argv_entries(argc, team, TRUE);
1813 team->t.t_argc = argc;
1814 argv = (void **)team->t.t_argv;
1815 for (i = argc - 1; i >= 0; --i)
1816 *argv++ = va_arg(kmp_va_deref(ap), void *);
1817 // AC: revert change made in __kmpc_serialized_parallel()
1818 // because initial code in teams should have level=0
1819 team->t.t_level--;
1820 // AC: call special invoker for outer "parallel" of teams construct
1821 invoker(gtid);
1822#if OMPT_SUPPORT
1823 if (ompt_enabled.enabled) {
1824 ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1825 if (ompt_enabled.ompt_callback_implicit_task) {
1826 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1827 ompt_scope_end, NULL, &(task_info->task_data), 0,
1828 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1829 }
1830 if (ompt_enabled.ompt_callback_parallel_end) {
1831 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1832 ompt_parallel_data, *parent_task_data,
1833 OMPT_INVOKER(call_context) | ompt_parallel_league,
1834 *return_address);
1835 }
1836 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1837 }
1838#endif
1839 } else {
1840 argv = args;
1841 for (i = argc - 1; i >= 0; --i)
1842 *argv++ = va_arg(kmp_va_deref(ap), void *);
1843 KMP_MB();
1844
1845#if OMPT_SUPPORT
1846 void *dummy;
1847 void **exit_frame_p;
1848 ompt_task_info_t *task_info;
1849 ompt_lw_taskteam_t lw_taskteam;
1850 ompt_data_t *implicit_task_data;
1851
1852 if (ompt_enabled.enabled) {
1853 __ompt_lw_taskteam_init(lwt: &lw_taskteam, thr: master_th, gtid,
1854 ompt_pid: ompt_parallel_data, codeptr: *return_address);
1855 __ompt_lw_taskteam_link(lwt: &lw_taskteam, thr: master_th, on_heap: 0);
1856 // don't use lw_taskteam after linking. content was swaped
1857 task_info = OMPT_CUR_TASK_INFO(master_th);
1858 exit_frame_p = &(task_info->frame.exit_frame.ptr);
1859
1860 /* OMPT implicit task begin */
1861 implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1862 if (ompt_enabled.ompt_callback_implicit_task) {
1863 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1864 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1865 implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1866 ompt_task_implicit);
1867 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1868 }
1869
1870 /* OMPT state */
1871 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1872 } else {
1873 exit_frame_p = &dummy;
1874 }
1875#endif
1876
1877 {
1878 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1879 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1880 __kmp_invoke_microtask(pkfn: microtask, gtid, npr: 0, argc, argv: args
1881#if OMPT_SUPPORT
1882 ,
1883 exit_frame_ptr: exit_frame_p
1884#endif
1885 );
1886 }
1887
1888#if OMPT_SUPPORT
1889 if (ompt_enabled.enabled) {
1890 *exit_frame_p = NULL;
1891 if (ompt_enabled.ompt_callback_implicit_task) {
1892 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1893 ompt_scope_end, NULL, &(task_info->task_data), 1,
1894 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1895 }
1896
1897 *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1898 __ompt_lw_taskteam_unlink(thr: master_th);
1899 if (ompt_enabled.ompt_callback_parallel_end) {
1900 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1901 ompt_parallel_data, *parent_task_data,
1902 OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1903 }
1904 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1905 }
1906#endif
1907 }
1908 } else if (call_context == fork_context_gnu) {
1909#if OMPT_SUPPORT
1910 if (ompt_enabled.enabled) {
1911 ompt_lw_taskteam_t lwt;
1912 __ompt_lw_taskteam_init(lwt: &lwt, thr: master_th, gtid, ompt_pid: ompt_parallel_data,
1913 codeptr: *return_address);
1914
1915 lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1916 __ompt_lw_taskteam_link(lwt: &lwt, thr: master_th, on_heap: 1);
1917 }
1918// don't use lw_taskteam after linking. content was swaped
1919#endif
1920
1921 // we were called from GNU native code
1922 KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1923 return FALSE;
1924 } else {
1925 KMP_ASSERT2(call_context < fork_context_last,
1926 "__kmp_serial_fork_call: unknown fork_context parameter");
1927 }
1928
1929 KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1930 KMP_MB();
1931 return FALSE;
1932}
1933
1934/* most of the work for a fork */
1935/* return true if we really went parallel, false if serialized */
1936int __kmp_fork_call(ident_t *loc, int gtid,
1937 enum fork_context_e call_context, // Intel, GNU, ...
1938 kmp_int32 argc, microtask_t microtask, launch_t invoker,
1939 kmp_va_list ap) {
1940 void **argv;
1941 int i;
1942 int master_tid;
1943 int master_this_cons;
1944 kmp_team_t *team;
1945 kmp_team_t *parent_team;
1946 kmp_info_t *master_th;
1947 kmp_root_t *root;
1948 int nthreads;
1949 int master_active;
1950 int master_set_numthreads;
1951 int task_thread_limit = 0;
1952 int level;
1953 int active_level;
1954 int teams_level;
1955#if KMP_NESTED_HOT_TEAMS
1956 kmp_hot_team_ptr_t **p_hot_teams;
1957#endif
1958 { // KMP_TIME_BLOCK
1959 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1960 KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1961
1962 KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1963 if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1964 /* Some systems prefer the stack for the root thread(s) to start with */
1965 /* some gap from the parent stack to prevent false sharing. */
1966 void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1967 /* These 2 lines below are so this does not get optimized out */
1968 if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1969 __kmp_stkpadding += (short)((kmp_int64)dummy);
1970 }
1971
1972 /* initialize if needed */
1973 KMP_DEBUG_ASSERT(
1974 __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1975 if (!TCR_4(__kmp_init_parallel))
1976 __kmp_parallel_initialize();
1977 __kmp_resume_if_soft_paused();
1978
1979 /* setup current data */
1980 // AC: potentially unsafe, not in sync with library shutdown,
1981 // __kmp_threads can be freed
1982 master_th = __kmp_threads[gtid];
1983
1984 parent_team = master_th->th.th_team;
1985 master_tid = master_th->th.th_info.ds.ds_tid;
1986 master_this_cons = master_th->th.th_local.this_construct;
1987 root = master_th->th.th_root;
1988 master_active = root->r.r_active;
1989 master_set_numthreads = master_th->th.th_set_nproc;
1990 task_thread_limit =
1991 master_th->th.th_current_task->td_icvs.task_thread_limit;
1992
1993#if OMPT_SUPPORT
1994 ompt_data_t ompt_parallel_data = ompt_data_none;
1995 ompt_data_t *parent_task_data = NULL;
1996 ompt_frame_t *ompt_frame = NULL;
1997 void *return_address = NULL;
1998
1999 if (ompt_enabled.enabled) {
2000 __ompt_get_task_info_internal(ancestor_level: 0, NULL, task_data: &parent_task_data, task_frame: &ompt_frame,
2001 NULL, NULL);
2002 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
2003 }
2004#endif
2005
2006 // Assign affinity to root thread if it hasn't happened yet
2007 __kmp_assign_root_init_mask();
2008
2009 // Nested level will be an index in the nested nthreads array
2010 level = parent_team->t.t_level;
2011 // used to launch non-serial teams even if nested is not allowed
2012 active_level = parent_team->t.t_active_level;
2013 // needed to check nesting inside the teams
2014 teams_level = master_th->th.th_teams_level;
2015#if KMP_NESTED_HOT_TEAMS
2016 p_hot_teams = &master_th->th.th_hot_teams;
2017 if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
2018 *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
2019 sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
2020 (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
2021 // it is either actual or not needed (when active_level > 0)
2022 (*p_hot_teams)[0].hot_team_nth = 1;
2023 }
2024#endif
2025
2026#if OMPT_SUPPORT
2027 if (ompt_enabled.enabled) {
2028 if (ompt_enabled.ompt_callback_parallel_begin) {
2029 int team_size = master_set_numthreads
2030 ? master_set_numthreads
2031 : get__nproc_2(parent_team, master_tid);
2032 int flags = OMPT_INVOKER(call_context) |
2033 ((microtask == (microtask_t)__kmp_teams_master)
2034 ? ompt_parallel_league
2035 : ompt_parallel_team);
2036 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
2037 parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
2038 return_address);
2039 }
2040 master_th->th.ompt_thread_info.state = ompt_state_overhead;
2041 }
2042#endif
2043
2044 master_th->th.th_ident = loc;
2045
2046 // Parallel closely nested in teams construct:
2047 if (__kmp_is_fork_in_teams(master_th, microtask, level, teams_level, ap)) {
2048 return __kmp_fork_in_teams(loc, gtid, parent_team, argc, master_th, root,
2049 call_context, microtask, invoker,
2050 master_set_numthreads, level,
2051#if OMPT_SUPPORT
2052 ompt_parallel_data, return_address,
2053#endif
2054 ap);
2055 } // End parallel closely nested in teams construct
2056
2057 // Need this to happen before we determine the number of threads, not while
2058 // we are allocating the team
2059 //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
2060
2061 KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(parent_team, master_th);
2062
2063 // Determine the number of threads
2064 int enter_teams =
2065 __kmp_is_entering_teams(active_level, level, teams_level, ap);
2066 if ((!enter_teams &&
2067 (parent_team->t.t_active_level >=
2068 master_th->th.th_current_task->td_icvs.max_active_levels)) ||
2069 (__kmp_library == library_serial)) {
2070 KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team\n", gtid));
2071 nthreads = 1;
2072 } else {
2073 nthreads = master_set_numthreads
2074 ? master_set_numthreads
2075 // TODO: get nproc directly from current task
2076 : get__nproc_2(parent_team, master_tid);
2077 // Use the thread_limit set for the current target task if exists, else go
2078 // with the deduced nthreads
2079 nthreads = task_thread_limit > 0 && task_thread_limit < nthreads
2080 ? task_thread_limit
2081 : nthreads;
2082 // Check if we need to take forkjoin lock? (no need for serialized
2083 // parallel out of teams construct).
2084 if (nthreads > 1) {
2085 /* determine how many new threads we can use */
2086 __kmp_acquire_bootstrap_lock(lck: &__kmp_forkjoin_lock);
2087 /* AC: If we execute teams from parallel region (on host), then teams
2088 should be created but each can only have 1 thread if nesting is
2089 disabled. If teams called from serial region, then teams and their
2090 threads should be created regardless of the nesting setting. */
2091 nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
2092 set_nthreads: nthreads, enter_teams);
2093 if (nthreads == 1) {
2094 // Free lock for single thread execution here; for multi-thread
2095 // execution it will be freed later after team of threads created
2096 // and initialized
2097 __kmp_release_bootstrap_lock(lck: &__kmp_forkjoin_lock);
2098 }
2099 }
2100 }
2101 KMP_DEBUG_ASSERT(nthreads > 0);
2102
2103 // If we temporarily changed the set number of threads then restore it now
2104 master_th->th.th_set_nproc = 0;
2105
2106 if (nthreads == 1) {
2107 return __kmp_serial_fork_call(loc, gtid, call_context, argc, microtask,
2108 invoker, master_th, parent_team,
2109#if OMPT_SUPPORT
2110 ompt_parallel_data: &ompt_parallel_data, return_address: &return_address,
2111 parent_task_data: &parent_task_data,
2112#endif
2113 ap);
2114 } // if (nthreads == 1)
2115
2116 // GEH: only modify the executing flag in the case when not serialized
2117 // serialized case is handled in kmpc_serialized_parallel
2118 KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
2119 "curtask=%p, curtask_max_aclevel=%d\n",
2120 parent_team->t.t_active_level, master_th,
2121 master_th->th.th_current_task,
2122 master_th->th.th_current_task->td_icvs.max_active_levels));
2123 // TODO: GEH - cannot do this assertion because root thread not set up as
2124 // executing
2125 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
2126 master_th->th.th_current_task->td_flags.executing = 0;
2127
2128 if (!master_th->th.th_teams_microtask || level > teams_level) {
2129 /* Increment our nested depth level */
2130 KMP_ATOMIC_INC(&root->r.r_in_parallel);
2131 }
2132
2133 // See if we need to make a copy of the ICVs.
2134 int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
2135 kmp_nested_nthreads_t *nested_nth = NULL;
2136 if (!master_th->th.th_set_nested_nth &&
2137 (level + 1 < parent_team->t.t_nested_nth->used) &&
2138 (parent_team->t.t_nested_nth->nth[level + 1] != nthreads_icv)) {
2139 nthreads_icv = parent_team->t.t_nested_nth->nth[level + 1];
2140 } else if (master_th->th.th_set_nested_nth) {
2141 nested_nth = __kmp_override_nested_nth(thr: master_th, level);
2142 if ((level + 1 < nested_nth->used) &&
2143 (nested_nth->nth[level + 1] != nthreads_icv))
2144 nthreads_icv = nested_nth->nth[level + 1];
2145 else
2146 nthreads_icv = 0; // don't update
2147 } else {
2148 nthreads_icv = 0; // don't update
2149 }
2150
2151 // Figure out the proc_bind_policy for the new team.
2152 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2153 // proc_bind_default means don't update
2154 kmp_proc_bind_t proc_bind_icv = proc_bind_default;
2155 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2156 proc_bind = proc_bind_false;
2157 } else {
2158 // No proc_bind clause specified; use current proc-bind-var for this
2159 // parallel region
2160 if (proc_bind == proc_bind_default) {
2161 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2162 }
2163 // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
2164 if (master_th->th.th_teams_microtask &&
2165 microtask == (microtask_t)__kmp_teams_master) {
2166 proc_bind = __kmp_teams_proc_bind;
2167 }
2168 /* else: The proc_bind policy was specified explicitly on parallel clause.
2169 This overrides proc-bind-var for this parallel region, but does not
2170 change proc-bind-var. */
2171 // Figure the value of proc-bind-var for the child threads.
2172 if ((level + 1 < __kmp_nested_proc_bind.used) &&
2173 (__kmp_nested_proc_bind.bind_types[level + 1] !=
2174 master_th->th.th_current_task->td_icvs.proc_bind)) {
2175 // Do not modify the proc bind icv for the two teams construct forks
2176 // They just let the proc bind icv pass through
2177 if (!master_th->th.th_teams_microtask ||
2178 !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
2179 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2180 }
2181 }
2182
2183 // Reset for next parallel region
2184 master_th->th.th_set_proc_bind = proc_bind_default;
2185
2186 if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2187 kmp_internal_control_t new_icvs;
2188 copy_icvs(dst: &new_icvs, src: &master_th->th.th_current_task->td_icvs);
2189 new_icvs.next = NULL;
2190 if (nthreads_icv > 0) {
2191 new_icvs.nproc = nthreads_icv;
2192 }
2193 if (proc_bind_icv != proc_bind_default) {
2194 new_icvs.proc_bind = proc_bind_icv;
2195 }
2196
2197 /* allocate a new parallel team */
2198 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2199 team = __kmp_allocate_team(root, new_nproc: nthreads, max_nproc: nthreads,
2200#if OMPT_SUPPORT
2201 ompt_parallel_data,
2202#endif
2203 proc_bind, new_icvs: &new_icvs,
2204 argc USE_NESTED_HOT_ARG(master_th));
2205 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2206 copy_icvs(dst: (kmp_internal_control_t *)team->t.b->team_icvs, src: &new_icvs);
2207 } else {
2208 /* allocate a new parallel team */
2209 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2210 team = __kmp_allocate_team(root, new_nproc: nthreads, max_nproc: nthreads,
2211#if OMPT_SUPPORT
2212 ompt_parallel_data,
2213#endif
2214 proc_bind,
2215 new_icvs: &master_th->th.th_current_task->td_icvs,
2216 argc USE_NESTED_HOT_ARG(master_th));
2217 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2218 copy_icvs(dst: (kmp_internal_control_t *)team->t.b->team_icvs,
2219 src: &master_th->th.th_current_task->td_icvs);
2220 }
2221 KF_TRACE(
2222 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2223
2224 /* setup the new team */
2225 KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2226 KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2227 KMP_CHECK_UPDATE(team->t.t_ident, loc);
2228 KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2229 KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2230#if OMPT_SUPPORT
2231 KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2232 return_address);
2233#endif
2234 KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2235 // TODO: parent_team->t.t_level == INT_MAX ???
2236 if (!master_th->th.th_teams_microtask || level > teams_level) {
2237 int new_level = parent_team->t.t_level + 1;
2238 KMP_CHECK_UPDATE(team->t.t_level, new_level);
2239 new_level = parent_team->t.t_active_level + 1;
2240 KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2241 } else {
2242 // AC: Do not increase parallel level at start of the teams construct
2243 int new_level = parent_team->t.t_level;
2244 KMP_CHECK_UPDATE(team->t.t_level, new_level);
2245 new_level = parent_team->t.t_active_level;
2246 KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2247 }
2248 kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2249 // set primary thread's schedule as new run-time schedule
2250 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2251
2252 KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2253 KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2254
2255 // Check if hot team has potentially outdated list, and if so, free it
2256 if (team->t.t_nested_nth &&
2257 team->t.t_nested_nth != parent_team->t.t_nested_nth) {
2258 KMP_INTERNAL_FREE(team->t.t_nested_nth->nth);
2259 KMP_INTERNAL_FREE(team->t.t_nested_nth);
2260 team->t.t_nested_nth = NULL;
2261 }
2262 team->t.t_nested_nth = parent_team->t.t_nested_nth;
2263 if (master_th->th.th_set_nested_nth) {
2264 if (!nested_nth)
2265 nested_nth = __kmp_override_nested_nth(thr: master_th, level);
2266 team->t.t_nested_nth = nested_nth;
2267 KMP_INTERNAL_FREE(master_th->th.th_set_nested_nth);
2268 master_th->th.th_set_nested_nth = NULL;
2269 master_th->th.th_set_nested_nth_sz = 0;
2270 master_th->th.th_nt_strict = false;
2271 }
2272
2273 // Update the floating point rounding in the team if required.
2274 propagateFPControl(team);
2275#if OMPD_SUPPORT
2276 if (ompd_state & OMPD_ENABLE_BP)
2277 ompd_bp_parallel_begin();
2278#endif
2279
2280 KA_TRACE(
2281 20,
2282 ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2283 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2284 team->t.t_nproc));
2285 KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2286 (team->t.t_master_tid == 0 &&
2287 (team->t.t_parent == root->r.r_root_team ||
2288 team->t.t_parent->t.t_serialized)));
2289 KMP_MB();
2290
2291 /* now, setup the arguments */
2292 argv = (void **)team->t.t_argv;
2293 if (ap) {
2294 for (i = argc - 1; i >= 0; --i) {
2295 void *new_argv = va_arg(kmp_va_deref(ap), void *);
2296 KMP_CHECK_UPDATE(*argv, new_argv);
2297 argv++;
2298 }
2299 } else {
2300 for (i = 0; i < argc; ++i) {
2301 // Get args from parent team for teams construct
2302 KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2303 }
2304 }
2305
2306 /* now actually fork the threads */
2307 KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2308 if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2309 root->r.r_active = TRUE;
2310
2311 __kmp_fork_team_threads(root, team, master_th, master_gtid: gtid, fork_teams_workers: !ap);
2312 __kmp_setup_icv_copy(team, new_nproc: nthreads,
2313 new_icvs: &master_th->th.th_current_task->td_icvs, loc);
2314
2315#if OMPT_SUPPORT
2316 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2317#endif
2318
2319 __kmp_release_bootstrap_lock(lck: &__kmp_forkjoin_lock);
2320
2321#if USE_ITT_BUILD
2322 if (team->t.t_active_level == 1 // only report frames at level 1
2323 && !master_th->th.th_teams_microtask) { // not in teams construct
2324#if USE_ITT_NOTIFY
2325 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2326 (__kmp_forkjoin_frames_mode == 3 ||
2327 __kmp_forkjoin_frames_mode == 1)) {
2328 kmp_uint64 tmp_time = 0;
2329 if (__itt_get_timestamp_ptr)
2330 tmp_time = __itt_get_timestamp();
2331 // Internal fork - report frame begin
2332 master_th->th.th_frame_time = tmp_time;
2333 if (__kmp_forkjoin_frames_mode == 3)
2334 team->t.t_region_time = tmp_time;
2335 } else
2336// only one notification scheme (either "submit" or "forking/joined", not both)
2337#endif /* USE_ITT_NOTIFY */
2338 if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2339 __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2340 // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2341 __kmp_itt_region_forking(gtid, team_size: team->t.t_nproc, barriers: 0);
2342 }
2343 }
2344#endif /* USE_ITT_BUILD */
2345
2346 /* now go on and do the work */
2347 KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2348 KMP_MB();
2349 KF_TRACE(10,
2350 ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2351 root, team, master_th, gtid));
2352
2353#if USE_ITT_BUILD
2354 if (__itt_stack_caller_create_ptr) {
2355 // create new stack stitching id before entering fork barrier
2356 if (!enter_teams) {
2357 KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2358 team->t.t_stack_id = __kmp_itt_stack_caller_create();
2359 } else if (parent_team->t.t_serialized) {
2360 // keep stack stitching id in the serialized parent_team;
2361 // current team will be used for parallel inside the teams;
2362 // if parent_team is active, then it already keeps stack stitching id
2363 // for the league of teams
2364 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2365 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2366 }
2367 }
2368#endif /* USE_ITT_BUILD */
2369
2370 // AC: skip __kmp_internal_fork at teams construct, let only primary
2371 // threads execute
2372 if (ap) {
2373 __kmp_internal_fork(id: loc, gtid, team);
2374 KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2375 "master_th=%p, gtid=%d\n",
2376 root, team, master_th, gtid));
2377 }
2378
2379 if (call_context == fork_context_gnu) {
2380 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2381 return TRUE;
2382 }
2383
2384 /* Invoke microtask for PRIMARY thread */
2385 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2386 team->t.t_id, team->t.t_pkfn));
2387 } // END of timer KMP_fork_call block
2388
2389#if KMP_STATS_ENABLED
2390 // If beginning a teams construct, then change thread state
2391 stats_state_e previous_state = KMP_GET_THREAD_STATE();
2392 if (!ap) {
2393 KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2394 }
2395#endif
2396
2397 if (!team->t.t_invoke(gtid)) {
2398 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2399 }
2400
2401#if KMP_STATS_ENABLED
2402 // If was beginning of a teams construct, then reset thread state
2403 if (!ap) {
2404 KMP_SET_THREAD_STATE(previous_state);
2405 }
2406#endif
2407
2408 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2409 team->t.t_id, team->t.t_pkfn));
2410 KMP_MB(); /* Flush all pending memory write invalidates. */
2411
2412 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2413#if OMPT_SUPPORT
2414 if (ompt_enabled.enabled) {
2415 master_th->th.ompt_thread_info.state = ompt_state_overhead;
2416 }
2417#endif
2418
2419 return TRUE;
2420}
2421
2422#if OMPT_SUPPORT
2423static inline void __kmp_join_restore_state(kmp_info_t *thread,
2424 kmp_team_t *team) {
2425 // restore state outside the region
2426 thread->th.ompt_thread_info.state =
2427 ((team->t.t_serialized) ? ompt_state_work_serial
2428 : ompt_state_work_parallel);
2429}
2430
2431static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2432 kmp_team_t *team, ompt_data_t *parallel_data,
2433 int flags, void *codeptr) {
2434 ompt_task_info_t *task_info = __ompt_get_task_info_object(depth: 0);
2435 if (ompt_enabled.ompt_callback_parallel_end) {
2436 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2437 parallel_data, &(task_info->task_data), flags, codeptr);
2438 }
2439
2440 task_info->frame.enter_frame = ompt_data_none;
2441 __kmp_join_restore_state(thread, team);
2442}
2443#endif
2444
2445void __kmp_join_call(ident_t *loc, int gtid
2446#if OMPT_SUPPORT
2447 ,
2448 enum fork_context_e fork_context
2449#endif
2450 ,
2451 int exit_teams) {
2452 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2453 kmp_team_t *team;
2454 kmp_team_t *parent_team;
2455 kmp_info_t *master_th;
2456 kmp_root_t *root;
2457 int master_active;
2458
2459 KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2460
2461 /* setup current data */
2462 master_th = __kmp_threads[gtid];
2463 root = master_th->th.th_root;
2464 team = master_th->th.th_team;
2465 parent_team = team->t.t_parent;
2466
2467 master_th->th.th_ident = loc;
2468
2469#if OMPT_SUPPORT
2470 void *team_microtask = (void *)team->t.t_pkfn;
2471 // For GOMP interface with serialized parallel, need the
2472 // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2473 // and end-parallel events.
2474 if (ompt_enabled.enabled &&
2475 !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2476 master_th->th.ompt_thread_info.state = ompt_state_overhead;
2477 }
2478#endif
2479
2480#if KMP_DEBUG
2481 if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2482 KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2483 "th_task_team = %p\n",
2484 __kmp_gtid_from_thread(master_th), team,
2485 team->t.t_task_team[master_th->th.th_task_state],
2486 master_th->th.th_task_team));
2487 KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, master_th);
2488 }
2489#endif
2490
2491 if (team->t.t_serialized) {
2492 if (master_th->th.th_teams_microtask) {
2493 // We are in teams construct
2494 int level = team->t.t_level;
2495 int tlevel = master_th->th.th_teams_level;
2496 if (level == tlevel) {
2497 // AC: we haven't incremented it earlier at start of teams construct,
2498 // so do it here - at the end of teams construct
2499 team->t.t_level++;
2500 } else if (level == tlevel + 1) {
2501 // AC: we are exiting parallel inside teams, need to increment
2502 // serialization in order to restore it in the next call to
2503 // __kmpc_end_serialized_parallel
2504 team->t.t_serialized++;
2505 }
2506 }
2507 __kmpc_end_serialized_parallel(loc, global_tid: gtid);
2508
2509#if OMPT_SUPPORT
2510 if (ompt_enabled.enabled) {
2511 if (fork_context == fork_context_gnu) {
2512 __ompt_lw_taskteam_unlink(thr: master_th);
2513 }
2514 __kmp_join_restore_state(thread: master_th, team: parent_team);
2515 }
2516#endif
2517
2518 return;
2519 }
2520
2521 master_active = team->t.t_master_active;
2522
2523 if (!exit_teams) {
2524 // AC: No barrier for internal teams at exit from teams construct.
2525 // But there is barrier for external team (league).
2526 __kmp_internal_join(id: loc, gtid, team);
2527#if USE_ITT_BUILD
2528 if (__itt_stack_caller_create_ptr) {
2529 KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2530 // destroy the stack stitching id after join barrier
2531 __kmp_itt_stack_caller_destroy(id: (__itt_caller)team->t.t_stack_id);
2532 team->t.t_stack_id = NULL;
2533 }
2534#endif
2535 } else {
2536 master_th->th.th_task_state =
2537 0; // AC: no tasking in teams (out of any parallel)
2538#if USE_ITT_BUILD
2539 if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2540 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2541 // destroy the stack stitching id on exit from the teams construct
2542 // if parent_team is active, then the id will be destroyed later on
2543 // by master of the league of teams
2544 __kmp_itt_stack_caller_destroy(id: (__itt_caller)parent_team->t.t_stack_id);
2545 parent_team->t.t_stack_id = NULL;
2546 }
2547#endif
2548 }
2549
2550 KMP_MB();
2551
2552#if OMPT_SUPPORT
2553 ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2554 void *codeptr = team->t.ompt_team_info.master_return_address;
2555#endif
2556
2557#if USE_ITT_BUILD
2558 // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2559 if (team->t.t_active_level == 1 &&
2560 (!master_th->th.th_teams_microtask || /* not in teams construct */
2561 master_th->th.th_teams_size.nteams == 1)) {
2562 master_th->th.th_ident = loc;
2563 // only one notification scheme (either "submit" or "forking/joined", not
2564 // both)
2565 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2566 __kmp_forkjoin_frames_mode == 3)
2567 __kmp_itt_frame_submit(gtid, begin: team->t.t_region_time,
2568 end: master_th->th.th_frame_time, imbalance: 0, loc,
2569 team_size: master_th->th.th_team_nproc, region: 1);
2570 else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2571 !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2572 __kmp_itt_region_joined(gtid);
2573 } // active_level == 1
2574#endif /* USE_ITT_BUILD */
2575
2576#if KMP_AFFINITY_SUPPORTED
2577 if (!exit_teams) {
2578 // Restore master thread's partition.
2579 master_th->th.th_first_place = team->t.t_first_place;
2580 master_th->th.th_last_place = team->t.t_last_place;
2581 }
2582#endif // KMP_AFFINITY_SUPPORTED
2583
2584 if (master_th->th.th_teams_microtask && !exit_teams &&
2585 team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2586 team->t.t_level == master_th->th.th_teams_level + 1) {
2587// AC: We need to leave the team structure intact at the end of parallel
2588// inside the teams construct, so that at the next parallel same (hot) team
2589// works, only adjust nesting levels
2590#if OMPT_SUPPORT
2591 ompt_data_t ompt_parallel_data = ompt_data_none;
2592 if (ompt_enabled.enabled) {
2593 ompt_task_info_t *task_info = __ompt_get_task_info_object(depth: 0);
2594 if (ompt_enabled.ompt_callback_implicit_task) {
2595 int ompt_team_size = team->t.t_nproc;
2596 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2597 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2598 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2599 }
2600 task_info->frame.exit_frame = ompt_data_none;
2601 task_info->task_data = ompt_data_none;
2602 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2603 __ompt_lw_taskteam_unlink(thr: master_th);
2604 }
2605#endif
2606 /* Decrement our nested depth level */
2607 team->t.t_level--;
2608 team->t.t_active_level--;
2609 KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2610
2611 // Restore number of threads in the team if needed. This code relies on
2612 // the proper adjustment of th_teams_size.nth after the fork in
2613 // __kmp_teams_master on each teams primary thread in the case that
2614 // __kmp_reserve_threads reduced it.
2615 if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2616 int old_num = master_th->th.th_team_nproc;
2617 int new_num = master_th->th.th_teams_size.nth;
2618 kmp_info_t **other_threads = team->t.t_threads;
2619 team->t.t_nproc = new_num;
2620 for (int i = 0; i < old_num; ++i) {
2621 other_threads[i]->th.th_team_nproc = new_num;
2622 }
2623 // Adjust states of non-used threads of the team
2624 for (int i = old_num; i < new_num; ++i) {
2625 // Re-initialize thread's barrier data.
2626 KMP_DEBUG_ASSERT(other_threads[i]);
2627 kmp_balign_t *balign = other_threads[i]->th.th_bar;
2628 for (int b = 0; b < bs_last_barrier; ++b) {
2629 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2630 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2631#if USE_DEBUGGER
2632 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2633#endif
2634 }
2635 if (__kmp_tasking_mode != tskm_immediate_exec) {
2636 // Synchronize thread's task state
2637 other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2638 }
2639 }
2640 }
2641
2642#if OMPT_SUPPORT
2643 if (ompt_enabled.enabled) {
2644 __kmp_join_ompt(gtid, thread: master_th, team: parent_team, parallel_data: &ompt_parallel_data,
2645 OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2646 }
2647#endif
2648
2649 return;
2650 }
2651
2652 /* do cleanup and restore the parent team */
2653 master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2654 master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2655
2656 master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2657
2658 /* jc: The following lock has instructions with REL and ACQ semantics,
2659 separating the parallel user code called in this parallel region
2660 from the serial user code called after this function returns. */
2661 __kmp_acquire_bootstrap_lock(lck: &__kmp_forkjoin_lock);
2662
2663 if (!master_th->th.th_teams_microtask ||
2664 team->t.t_level > master_th->th.th_teams_level) {
2665 /* Decrement our nested depth level */
2666 KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2667 }
2668 KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2669
2670#if OMPT_SUPPORT
2671 if (ompt_enabled.enabled) {
2672 ompt_task_info_t *task_info = __ompt_get_task_info_object(depth: 0);
2673 if (ompt_enabled.ompt_callback_implicit_task) {
2674 int flags = (team_microtask == (void *)__kmp_teams_master)
2675 ? ompt_task_initial
2676 : ompt_task_implicit;
2677 int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2678 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2679 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2680 OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2681 }
2682 task_info->frame.exit_frame = ompt_data_none;
2683 task_info->task_data = ompt_data_none;
2684 }
2685#endif
2686
2687 KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2688 master_th, team));
2689 __kmp_pop_current_task_from_thread(this_thr: master_th);
2690
2691 master_th->th.th_def_allocator = team->t.t_def_allocator;
2692
2693#if OMPD_SUPPORT
2694 if (ompd_state & OMPD_ENABLE_BP)
2695 ompd_bp_parallel_end();
2696#endif
2697 updateHWFPControl(team);
2698
2699 if (root->r.r_active != master_active)
2700 root->r.r_active = master_active;
2701
2702 __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2703 master_th)); // this will free worker threads
2704
2705 /* this race was fun to find. make sure the following is in the critical
2706 region otherwise assertions may fail occasionally since the old team may be
2707 reallocated and the hierarchy appears inconsistent. it is actually safe to
2708 run and won't cause any bugs, but will cause those assertion failures. it's
2709 only one deref&assign so might as well put this in the critical region */
2710 master_th->th.th_team = parent_team;
2711 master_th->th.th_team_nproc = parent_team->t.t_nproc;
2712 master_th->th.th_team_master = parent_team->t.t_threads[0];
2713 master_th->th.th_team_serialized = parent_team->t.t_serialized;
2714
2715 /* restore serialized team, if need be */
2716 if (parent_team->t.t_serialized &&
2717 parent_team != master_th->th.th_serial_team &&
2718 parent_team != root->r.r_root_team) {
2719 __kmp_free_team(root,
2720 master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2721 master_th->th.th_serial_team = parent_team;
2722 }
2723
2724 if (__kmp_tasking_mode != tskm_immediate_exec) {
2725 // Restore primary thread's task state from team structure
2726 KMP_DEBUG_ASSERT(team->t.t_primary_task_state == 0 ||
2727 team->t.t_primary_task_state == 1);
2728 master_th->th.th_task_state = (kmp_uint8)team->t.t_primary_task_state;
2729
2730 // Copy the task team from the parent team to the primary thread
2731 master_th->th.th_task_team =
2732 parent_team->t.t_task_team[master_th->th.th_task_state];
2733 KA_TRACE(20,
2734 ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2735 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2736 parent_team));
2737 }
2738
2739 // TODO: GEH - cannot do this assertion because root thread not set up as
2740 // executing
2741 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2742 master_th->th.th_current_task->td_flags.executing = 1;
2743
2744 __kmp_release_bootstrap_lock(lck: &__kmp_forkjoin_lock);
2745
2746#if KMP_AFFINITY_SUPPORTED
2747 if (master_th->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) {
2748 __kmp_reset_root_init_mask(gtid);
2749 }
2750#endif
2751#if OMPT_SUPPORT
2752 int flags =
2753 OMPT_INVOKER(fork_context) |
2754 ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2755 : ompt_parallel_team);
2756 if (ompt_enabled.enabled) {
2757 __kmp_join_ompt(gtid, thread: master_th, team: parent_team, parallel_data, flags,
2758 codeptr);
2759 }
2760#endif
2761
2762 KMP_MB();
2763 KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2764}
2765
2766/* Check whether we should push an internal control record onto the
2767 serial team stack. If so, do it. */
2768void __kmp_save_internal_controls(kmp_info_t *thread) {
2769
2770 if (thread->th.th_team != thread->th.th_serial_team) {
2771 return;
2772 }
2773 if (thread->th.th_team->t.t_serialized > 1) {
2774 int push = 0;
2775
2776 if (thread->th.th_team->t.t_control_stack_top == NULL) {
2777 push = 1;
2778 } else {
2779 if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2780 thread->th.th_team->t.t_serialized) {
2781 push = 1;
2782 }
2783 }
2784 if (push) { /* push a record on the serial team's stack */
2785 kmp_internal_control_t *control =
2786 (kmp_internal_control_t *)__kmp_allocate(
2787 sizeof(kmp_internal_control_t));
2788
2789 copy_icvs(dst: control, src: &thread->th.th_current_task->td_icvs);
2790
2791 control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2792
2793 control->next = thread->th.th_team->t.t_control_stack_top;
2794 thread->th.th_team->t.t_control_stack_top = control;
2795 }
2796 }
2797}
2798
/* Changes the nproc ICV (omp_set_num_threads). Clamps new_nth to
   [1, __kmp_max_nth], saves internal controls if needed, and may shrink the
   root's hot team immediately when the new value is smaller. */
void __kmp_set_num_threads(int new_nth, int gtid) {
  kmp_info_t *thread;
  kmp_root_t *root;

  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  // Clamp the requested value into the supported range.
  if (new_nth < 1)
    new_nth = 1;
  else if (new_nth > __kmp_max_nth)
    new_nth = __kmp_max_nth;

  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
  thread = __kmp_threads[gtid];
  if (thread->th.th_current_task->td_icvs.nproc == new_nth)
    return; // nothing to do

  // Record the current ICVs if we are inside a nested serialized region.
  __kmp_save_internal_controls(thread);

  set__nproc(thread, new_nth);

  // If this omp_set_num_threads() call will cause the hot team size to be
  // reduced (in the absence of a num_threads clause), then reduce it now,
  // rather than waiting for the next parallel region.
  root = thread->th.th_root;
  if (__kmp_init_parallel && (!root->r.r_active) &&
      (root->r.r_hot_team->t.t_nproc > new_nth)
#if KMP_NESTED_HOT_TEAMS
      && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
#endif
  ) {
    kmp_team_t *hot_team = root->r.r_hot_team;
    int f;

    // The whole resize happens under the fork/join lock so no new parallel
    // region can observe a half-resized hot team.
    __kmp_acquire_bootstrap_lock(lck: &__kmp_forkjoin_lock);

    // Distributed barrier must be resized before threads are released.
    if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
      __kmp_resize_dist_barrier(team: hot_team, old_nthreads: hot_team->t.t_nproc, new_nthreads: new_nth);
    }
    // Release the extra threads we don't need any more.
    for (f = new_nth; f < hot_team->t.t_nproc; f++) {
      KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
      if (__kmp_tasking_mode != tskm_immediate_exec) {
        // When decreasing team size, threads no longer in the team should unref
        // task team.
        hot_team->t.t_threads[f]->th.th_task_team = NULL;
      }
      __kmp_free_thread(hot_team->t.t_threads[f]);
      hot_team->t.t_threads[f] = NULL;
    }
    hot_team->t.t_nproc = new_nth;
#if KMP_NESTED_HOT_TEAMS
    // Keep the level-0 nested-hot-team bookkeeping in sync.
    if (thread->th.th_hot_teams) {
      KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
      thread->th.th_hot_teams[0].hot_team_nth = new_nth;
    }
#endif

    if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
      hot_team->t.b->update_num_threads(nthr: new_nth);
      __kmp_add_threads_to_team(team: hot_team, new_nthreads: new_nth);
    }

    __kmp_release_bootstrap_lock(lck: &__kmp_forkjoin_lock);

    // Update the t_nproc field in the threads that are still active.
    for (f = 0; f < new_nth; f++) {
      KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
      hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
    }
    // Special flag in case omp_set_num_threads() call
    hot_team->t.t_size_changed = -1;
  }
}
2874
2875/* Changes max_active_levels */
2876void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2877 kmp_info_t *thread;
2878
2879 KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2880 "%d = (%d)\n",
2881 gtid, max_active_levels));
2882 KMP_DEBUG_ASSERT(__kmp_init_serial);
2883
2884 // validate max_active_levels
2885 if (max_active_levels < 0) {
2886 KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2887 // We ignore this call if the user has specified a negative value.
2888 // The current setting won't be changed. The last valid setting will be
2889 // used. A warning will be issued (if warnings are allowed as controlled by
2890 // the KMP_WARNINGS env var).
2891 KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2892 "max_active_levels for thread %d = (%d)\n",
2893 gtid, max_active_levels));
2894 return;
2895 }
2896 if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2897 // it's OK, the max_active_levels is within the valid range: [ 0;
2898 // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2899 // We allow a zero value. (implementation defined behavior)
2900 } else {
2901 KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2902 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2903 max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2904 // Current upper limit is MAX_INT. (implementation defined behavior)
2905 // If the input exceeds the upper limit, we correct the input to be the
2906 // upper limit. (implementation defined behavior)
2907 // Actually, the flow should never get here until we use MAX_INT limit.
2908 }
2909 KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2910 "max_active_levels for thread %d = (%d)\n",
2911 gtid, max_active_levels));
2912
2913 thread = __kmp_threads[gtid];
2914
2915 __kmp_save_internal_controls(thread);
2916
2917 set__max_active_levels(thread, max_active_levels);
2918}
2919
2920/* Gets max_active_levels */
2921int __kmp_get_max_active_levels(int gtid) {
2922 kmp_info_t *thread;
2923
2924 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2925 KMP_DEBUG_ASSERT(__kmp_init_serial);
2926
2927 thread = __kmp_threads[gtid];
2928 KMP_DEBUG_ASSERT(thread->th.th_current_task);
2929 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2930 "curtask_maxaclevel=%d\n",
2931 gtid, thread->th.th_current_task,
2932 thread->th.th_current_task->td_icvs.max_active_levels));
2933 return thread->th.th_current_task->td_icvs.max_active_levels;
2934}
2935
2936// nteams-var per-device ICV
2937void __kmp_set_num_teams(int num_teams) {
2938 if (num_teams > 0)
2939 __kmp_nteams = num_teams;
2940}
2941int __kmp_get_max_teams(void) { return __kmp_nteams; }
2942// teams-thread-limit-var per-device ICV
2943void __kmp_set_teams_thread_limit(int limit) {
2944 if (limit > 0)
2945 __kmp_teams_thread_limit = limit;
2946}
2947int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
2948
2949KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2950KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2951
/* Changes def_sched_var ICV values (run-time schedule kind and chunk).
   Invalid kinds fall back to kmp_sched_default with no chunk; valid kinds are
   mapped through __kmp_sch_map into the internal sched_type representation. */
void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
  kmp_info_t *thread;
  kmp_sched_t orig_kind;
  // kmp_team_t *team;

  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
                gtid, (int)kind, chunk));
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  // Check if the kind parameter is valid, correct if needed.
  // Valid parameters should fit in one of two intervals - standard or extended:
  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
  // Strip monotonic/nonmonotonic modifier bits before range-checking; they are
  // re-applied below via __kmp_sched_apply_mods_intkind using orig_kind.
  orig_kind = kind;
  kind = __kmp_sched_without_mods(kind);

  if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
      (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
    // TODO: Hint needs attention in case we change the default schedule.
    __kmp_msg(severity: kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
              KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
              __kmp_msg_null);
    kind = kmp_sched_default;
    chunk = 0; // ignore chunk value in case of bad kind
  }

  thread = __kmp_threads[gtid];

  // Record current ICVs before modifying them, if needed.
  __kmp_save_internal_controls(thread);

  if (kind < kmp_sched_upper_std) {
    if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
      // differ static chunked vs. unchunked: chunk should be invalid to
      // indicate unchunked schedule (which is the default)
      thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
    } else {
      // Standard kinds map directly through __kmp_sch_map.
      thread->th.th_current_task->td_icvs.sched.r_sched_type =
          __kmp_sch_map[kind - kmp_sched_lower - 1];
    }
  } else {
    // Extended kinds continue after the standard ones in __kmp_sch_map:
    // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
    // kmp_sched_lower - 2 ];
    thread->th.th_current_task->td_icvs.sched.r_sched_type =
        __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
                      kmp_sched_lower - 2];
  }
  // Re-apply any schedule modifiers carried by the caller's original kind.
  __kmp_sched_apply_mods_intkind(
      kind: orig_kind, internal_kind: &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
  if (kind == kmp_sched_auto || chunk < 1) {
    // ignore parameter chunk for schedule auto
    thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
  } else {
    thread->th.th_current_task->td_icvs.sched.chunk = chunk;
  }
}
3008
3009/* Gets def_sched_var ICV values */
3010void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
3011 kmp_info_t *thread;
3012 enum sched_type th_type;
3013
3014 KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
3015 KMP_DEBUG_ASSERT(__kmp_init_serial);
3016
3017 thread = __kmp_threads[gtid];
3018
3019 th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
3020 switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
3021 case kmp_sch_static:
3022 case kmp_sch_static_greedy:
3023 case kmp_sch_static_balanced:
3024 *kind = kmp_sched_static;
3025 __kmp_sched_apply_mods_stdkind(kind, internal_kind: th_type);
3026 *chunk = 0; // chunk was not set, try to show this fact via zero value
3027 return;
3028 case kmp_sch_static_chunked:
3029 *kind = kmp_sched_static;
3030 break;
3031 case kmp_sch_dynamic_chunked:
3032 *kind = kmp_sched_dynamic;
3033 break;
3034 case kmp_sch_guided_chunked:
3035 case kmp_sch_guided_iterative_chunked:
3036 case kmp_sch_guided_analytical_chunked:
3037 *kind = kmp_sched_guided;
3038 break;
3039 case kmp_sch_auto:
3040 *kind = kmp_sched_auto;
3041 break;
3042 case kmp_sch_trapezoidal:
3043 *kind = kmp_sched_trapezoidal;
3044 break;
3045#if KMP_STATIC_STEAL_ENABLED
3046 case kmp_sch_static_steal:
3047 *kind = kmp_sched_static_steal;
3048 break;
3049#endif
3050 default:
3051 KMP_FATAL(UnknownSchedulingType, th_type);
3052 }
3053
3054 __kmp_sched_apply_mods_stdkind(kind, internal_kind: th_type);
3055 *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
3056}
3057
/* Return the thread number of this thread's ancestor at the given nesting
   level (0 = outermost, initial thread). Returns -1 when the level is
   negative or deeper than the current nesting. */
int __kmp_get_ancestor_thread_num(int gtid, int level) {

  int ii, dd;
  kmp_team_t *team;
  kmp_info_t *thr;

  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  // validate level
  if (level == 0)
    return 0; // the outermost ancestor is always thread 0
  if (level < 0)
    return -1;
  thr = __kmp_threads[gtid];
  team = thr->th.th_team;
  ii = team->t.t_level; // nesting level of the thread's current team
  if (level > ii)
    return -1; // requested level is deeper than where we are

  if (thr->th.th_teams_microtask) {
    // AC: we are in teams region where multiple nested teams have same level
    int tlevel = thr->th.th_teams_level; // the level of the teams construct
    if (level <=
        tlevel) { // otherwise usual algorithm works (will not touch the teams)
      KMP_DEBUG_ASSERT(ii >= tlevel);
      // AC: As we need to pass by the teams league, we need to artificially
      // increase ii
      if (ii == tlevel) {
        ii += 2; // three teams have same level
      } else {
        ii++; // two teams have same level
      }
    }
  }

  if (ii == level)
    return __kmp_tid_from_gtid(gtid); // asking about our own team

  // Walk up the team hierarchy toward the requested level. 'dd' counts the
  // serialized levels the current team accounts for; each exhausted team
  // moves us to its parent.
  dd = team->t.t_serialized;
  level++;
  while (ii > level) {
    for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
    }
    if ((team->t.t_serialized) && (!dd)) {
      team = team->t.t_parent;
      continue;
    }
    if (ii > level) {
      team = team->t.t_parent;
      dd = team->t.t_serialized;
      ii--;
    }
  }

  // Inside a serialized region (dd > 1) the ancestor thread number is 0;
  // otherwise report the master tid recorded in the team we stopped at.
  return (dd > 1) ? (0) : (team->t.t_master_tid);
}
3115
/* Return the size of the team at the given nesting level (0 = outermost,
   size 1). Returns -1 when the level is negative or deeper than the
   current nesting. */
int __kmp_get_team_size(int gtid, int level) {

  int ii, dd;
  kmp_team_t *team;
  kmp_info_t *thr;

  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  // validate level
  if (level == 0)
    return 1; // the outermost "team" is the single initial thread
  if (level < 0)
    return -1;
  thr = __kmp_threads[gtid];
  team = thr->th.th_team;
  ii = team->t.t_level; // nesting level of the thread's current team
  if (level > ii)
    return -1; // requested level is deeper than where we are

  if (thr->th.th_teams_microtask) {
    // AC: we are in teams region where multiple nested teams have same level
    int tlevel = thr->th.th_teams_level; // the level of the teams construct
    if (level <=
        tlevel) { // otherwise usual algorithm works (will not touch the teams)
      KMP_DEBUG_ASSERT(ii >= tlevel);
      // AC: As we need to pass by the teams league, we need to artificially
      // increase ii
      if (ii == tlevel) {
        ii += 2; // three teams have same level
      } else {
        ii++; // two teams have same level
      }
    }
  }

  // Walk up the team hierarchy toward the requested level, consuming
  // serialized levels ('dd') within each team before moving to its parent.
  while (ii > level) {
    for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
    }
    if (team->t.t_serialized && (!dd)) {
      team = team->t.t_parent;
      continue;
    }
    if (ii > level) {
      team = team->t.t_parent;
      ii--;
    }
  }

  return team->t.t_nproc;
}
3167
3168kmp_r_sched_t __kmp_get_schedule_global() {
3169 // This routine created because pairs (__kmp_sched, __kmp_chunk) and
3170 // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3171 // independently. So one can get the updated schedule here.
3172
3173 kmp_r_sched_t r_sched;
3174
3175 // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3176 // __kmp_guided. __kmp_sched should keep original value, so that user can set
3177 // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
3178 // different roots (even in OMP 2.5)
3179 enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3180 enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3181 if (s == kmp_sch_static) {
3182 // replace STATIC with more detailed schedule (balanced or greedy)
3183 r_sched.r_sched_type = __kmp_static;
3184 } else if (s == kmp_sch_guided_chunked) {
3185 // replace GUIDED with more detailed schedule (iterative or analytical)
3186 r_sched.r_sched_type = __kmp_guided;
3187 } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3188 r_sched.r_sched_type = __kmp_sched;
3189 }
3190 SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3191
3192 if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3193 // __kmp_chunk may be wrong here (if it was not ever set)
3194 r_sched.chunk = KMP_DEFAULT_CHUNK;
3195 } else {
3196 r_sched.chunk = __kmp_chunk;
3197 }
3198
3199 return r_sched;
3200}
3201
/* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE)
   at least argc number of *t_argv entries for the requested team.
   Small argument lists reuse the inline t_inline_argv storage; larger ones
   get a heap allocation sized with headroom (at least 2*argc). */
static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {

  KMP_DEBUG_ASSERT(team);
  // Only (re)allocate when there is no usable allocation yet or the current
  // one is too small.
  if (!realloc || argc > team->t.t_max_argc) {

    KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
                   "current entries=%d\n",
                   team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
    /* if previously allocated heap space for args, free them */
    if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
      __kmp_free((void *)team->t.t_argv);

    if (argc <= KMP_INLINE_ARGV_ENTRIES) {
      /* use unused space in the cache line for arguments */
      team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
      KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
                     "argv entries\n",
                     team->t.t_id, team->t.t_max_argc));
      team->t.t_argv = &team->t.t_inline_argv[0];
      if (__kmp_storage_map) {
        __kmp_print_storage_map_gtid(
            gtid: -1, p1: &team->t.t_inline_argv[0],
            p2: &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
            size: (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), format: "team_%d.t_inline_argv",
            team->t.t_id);
      }
    } else {
      /* allocate space for arguments in the heap */
      // Grow with headroom so repeated small increases don't re-allocate:
      // at least KMP_MIN_MALLOC_ARGV_ENTRIES, otherwise double the request.
      team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
                               ? KMP_MIN_MALLOC_ARGV_ENTRIES
                               : 2 * argc;
      KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
                     "argv entries\n",
                     team->t.t_id, team->t.t_max_argc));
      team->t.t_argv =
          (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
      if (__kmp_storage_map) {
        __kmp_print_storage_map_gtid(gtid: -1, p1: &team->t.t_argv[0],
                                     p2: &team->t.t_argv[team->t.t_max_argc],
                                     size: sizeof(void *) * team->t.t_max_argc,
                                     format: "team_%d.t_argv", team->t.t_id);
      }
    }
  }
}
3249
3250static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3251 int i;
3252 int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3253 team->t.t_threads =
3254 (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3255 team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3256 sizeof(dispatch_shared_info_t) * num_disp_buff);
3257 team->t.t_dispatch =
3258 (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3259 team->t.t_implicit_task_taskdata =
3260 (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3261 team->t.t_max_nproc = max_nth;
3262
3263 /* setup dispatch buffers */
3264 for (i = 0; i < num_disp_buff; ++i) {
3265 team->t.t_disp_buffer[i].buffer_index = i;
3266 team->t.t_disp_buffer[i].doacross_buf_idx = i;
3267 }
3268}
3269
3270static void __kmp_free_team_arrays(kmp_team_t *team) {
3271 /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3272 int i;
3273 for (i = 0; i < team->t.t_max_nproc; ++i) {
3274 if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3275 __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3276 team->t.t_dispatch[i].th_disp_buffer = NULL;
3277 }
3278 }
3279#if KMP_USE_HIER_SCHED
3280 __kmp_dispatch_free_hierarchies(team);
3281#endif
3282 __kmp_free(team->t.t_threads);
3283 __kmp_free(team->t.t_disp_buffer);
3284 __kmp_free(team->t.t_dispatch);
3285 __kmp_free(team->t.t_implicit_task_taskdata);
3286 team->t.t_threads = NULL;
3287 team->t.t_disp_buffer = NULL;
3288 team->t.t_dispatch = NULL;
3289 team->t.t_implicit_task_taskdata = 0;
3290}
3291
/* Grow the per-team arrays to hold max_nth threads. The existing t_threads
   pointers for the current t_nproc threads are preserved; the dispatch,
   disp_buffer and implicit-task arrays are discarded and re-created (their
   contents are not carried over). */
static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
  kmp_info_t **oldThreads = team->t.t_threads;

  // NOTE(review): unlike __kmp_free_team_arrays, t_dispatch is freed here
  // without first freeing each slot's th_disp_buffer — presumably those
  // buffers are not yet allocated / are owned elsewhere at this point;
  // confirm against the callers.
  __kmp_free(team->t.t_disp_buffer);
  __kmp_free(team->t.t_dispatch);
  __kmp_free(team->t.t_implicit_task_taskdata);
  __kmp_allocate_team_arrays(team, max_nth);

  // Copy only the currently-used thread pointers into the new array.
  KMP_MEMCPY(dest: team->t.t_threads, src: oldThreads,
             n: team->t.t_nproc * sizeof(kmp_info_t *));

  __kmp_free(oldThreads);
}
3305
/* Build the initial set of internal control variables (ICVs) for a new root
   from the current global settings (env-derived defaults plus the effective
   run-time schedule from __kmp_get_schedule_global). */
static kmp_internal_control_t __kmp_get_global_icvs(void) {

  kmp_r_sched_t r_sched =
      __kmp_get_schedule_global(); // get current state of scheduling globals

  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);

  kmp_internal_control_t g_icvs = {
    .serial_nesting_level: 0, // int serial_nesting_level; //corresponds to value of th_team_serialized
    .dynamic: (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
    // adjustment of threads (per thread)
    .bt_set: (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
    // whether blocktime is explicitly set
    .blocktime: __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
#if KMP_USE_MONITOR
    __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
// intervals
#endif
    .nproc: __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
    // next parallel region (per thread)
    // (use a max ub on value if __kmp_parallel_initialize not called yet)
    .thread_limit: __kmp_cg_max_nth, // int thread_limit;
    .task_thread_limit: __kmp_task_max_nth, // int task_thread_limit; // to set the thread_limit
    // on task. This is used in the case of target thread_limit
    .max_active_levels: __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
    // for max_active_levels
    .sched: r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
    // {sched,chunk} pair
    .proc_bind: __kmp_nested_proc_bind.bind_types[0],
    .default_device: __kmp_default_device,
    NULL // struct kmp_internal_control *next;
  };

  return g_icvs;
}
3341
/* Build an ICV snapshot from the given team's primary thread's current task.
   Note the statement order: serial_nesting_level is set before copy_icvs —
   if copy_icvs copies the whole struct this assignment is overwritten, so
   the effective nesting level comes from the source ICVs (see the inline
   comment below, kept from the original author). */
static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {

  kmp_internal_control_t gx_icvs;
  gx_icvs.serial_nesting_level =
      0; // probably =team->t.t_serial like in save_inter_controls
  copy_icvs(dst: &gx_icvs, src: &team->t.t_threads[0]->th.th_current_task->td_icvs);
  gx_icvs.next = NULL;

  return gx_icvs;
}
3352
/* One-time initialization of a root: set up its state fields, then allocate
   and initialize both the (serialized, single-thread) root team and the hot
   team that will run this root's parallel regions. */
static void __kmp_initialize_root(kmp_root_t *root) {
  int f;
  kmp_team_t *root_team;
  kmp_team_t *hot_team;
  int hot_team_max_nth;
  kmp_r_sched_t r_sched =
      __kmp_get_schedule_global(); // get current state of scheduling globals
  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
  KMP_DEBUG_ASSERT(root);
  KMP_ASSERT(!root->r.r_begin); // must not be initialized twice

  /* setup the root state structure */
  __kmp_init_lock(lck: &root->r.r_begin_lock);
  root->r.r_begin = FALSE;
  root->r.r_active = FALSE;
  root->r.r_in_parallel = 0;
  root->r.r_blocktime = __kmp_dflt_blocktime;
#if KMP_AFFINITY_SUPPORTED
  root->r.r_affinity_assigned = FALSE;
#endif

  /* setup the root team for this task */
  /* allocate the root team structure */
  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));

  // The root team is a serialized team of exactly one thread.
  root_team =
      __kmp_allocate_team(root,
                          new_nproc: 1, // new_nproc
                          max_nproc: 1, // max_nproc
#if OMPT_SUPPORT
                          ompt_data_none, // root parallel id
#endif
                          proc_bind: __kmp_nested_proc_bind.bind_types[0], new_icvs: &r_icvs,
                          argc: 0 // argc
                          USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
      );
#if USE_DEBUGGER
  // Non-NULL value should be assigned to make the debugger display the root
  // team.
  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
#endif

  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));

  root->r.r_root_team = root_team;
  root_team->t.t_control_stack_top = NULL;

  /* initialize root team */
  root_team->t.t_threads[0] = NULL;
  root_team->t.t_nproc = 1;
  root_team->t.t_serialized = 1;
  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
  root_team->t.t_sched.sched = r_sched.sched;
  root_team->t.t_nested_nth = &__kmp_nested_nth;
  KA_TRACE(
      20,
      ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
       root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));

  /* setup the hot team for this task */
  /* allocate the hot team structure */
  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));

  // The hot team is sized with headroom (2x the default upper bound) so it
  // can grow without reallocation.
  hot_team =
      __kmp_allocate_team(root,
                          new_nproc: 1, // new_nproc
                          max_nproc: __kmp_dflt_team_nth_ub * 2, // max_nproc
#if OMPT_SUPPORT
                          ompt_data_none, // root parallel id
#endif
                          proc_bind: __kmp_nested_proc_bind.bind_types[0], new_icvs: &r_icvs,
                          argc: 0 // argc
                          USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
      );
  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));

  root->r.r_hot_team = hot_team;
  // NOTE(review): this resets root_team's control stack a second time;
  // hot_team->t.t_control_stack_top was presumably intended here — confirm.
  root_team->t.t_control_stack_top = NULL;

  /* first-time initialization */
  hot_team->t.t_parent = root_team;

  /* initialize hot team */
  hot_team_max_nth = hot_team->t.t_max_nproc;
  for (f = 0; f < hot_team_max_nth; ++f) {
    hot_team->t.t_threads[f] = NULL;
  }
  hot_team->t.t_nproc = 1;
  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
  hot_team->t.t_sched.sched = r_sched.sched;
  hot_team->t.t_size_changed = 0;
  hot_team->t.t_nested_nth = &__kmp_nested_nth;
}
3446
3447#ifdef KMP_DEBUG
3448
// Singly-linked list node used by __kmp_print_structure (debug builds only)
// to accumulate the set of teams reachable from threads and roots.
// The list terminator is the item whose entry and next are both NULL.
typedef struct kmp_team_list_item {
  kmp_team_p const *entry; // Team recorded in this node (NULL in terminator).
  struct kmp_team_list_item *next; // Next node; NULL marks the terminator.
} kmp_team_list_item_t;
typedef kmp_team_list_item_t *kmp_team_list_t;
3454
3455static void __kmp_print_structure_team_accum( // Add team to list of teams.
3456 kmp_team_list_t list, // List of teams.
3457 kmp_team_p const *team // Team to add.
3458) {
3459
3460 // List must terminate with item where both entry and next are NULL.
3461 // Team is added to the list only once.
3462 // List is sorted in ascending order by team id.
3463 // Team id is *not* a key.
3464
3465 kmp_team_list_t l;
3466
3467 KMP_DEBUG_ASSERT(list != NULL);
3468 if (team == NULL) {
3469 return;
3470 }
3471
3472 __kmp_print_structure_team_accum(list, team->t.t_parent);
3473 __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3474
3475 // Search list for the team.
3476 l = list;
3477 while (l->next != NULL && l->entry != team) {
3478 l = l->next;
3479 }
3480 if (l->next != NULL) {
3481 return; // Team has been added before, exit.
3482 }
3483
3484 // Team is not found. Search list again for insertion point.
3485 l = list;
3486 while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3487 l = l->next;
3488 }
3489
3490 // Insert team.
3491 {
3492 kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3493 sizeof(kmp_team_list_item_t));
3494 *item = *l;
3495 l->entry = team;
3496 l->next = item;
3497 }
3498}
3499
3500static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3501
3502) {
3503 __kmp_printf("%s", title);
3504 if (team != NULL) {
3505 __kmp_printf("%2x %p\n", team->t.t_id, team);
3506 } else {
3507 __kmp_printf(" - (nil)\n");
3508 }
3509}
3510
3511static void __kmp_print_structure_thread(char const *title,
3512 kmp_info_p const *thread) {
3513 __kmp_printf("%s", title);
3514 if (thread != NULL) {
3515 __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3516 } else {
3517 __kmp_printf(" - (nil)\n");
3518 }
3519}
3520
// Dump the runtime's global structures — the gtid table, __kmp_threads,
// __kmp_root, all reachable teams, and the thread/team pools — via
// __kmp_printf. Debug builds only; intended for use from a debugger.
void __kmp_print_structure(void) {

  kmp_team_list_t list;

  // Initialize list of teams. Starts as a single terminator item
  // (entry == NULL, next == NULL); teams are accumulated into it below.
  list =
      (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
  list->entry = NULL;
  list->next = NULL;

  __kmp_printf("\n------------------------------\nGlobal Thread "
               "Table\n------------------------------\n");
  {
    int gtid;
    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
      __kmp_printf("%2d", gtid);
      if (__kmp_threads != NULL) {
        __kmp_printf(" %p", __kmp_threads[gtid]);
      }
      if (__kmp_root != NULL) {
        __kmp_printf(" %p", __kmp_root[gtid]);
      }
      __kmp_printf("\n");
    }
  }

  // Print out __kmp_threads array. While walking it, accumulate each
  // thread's team and serial team into `list` for the Teams section below.
  __kmp_printf("\n------------------------------\nThreads\n--------------------"
               "----------\n");
  if (__kmp_threads != NULL) {
    int gtid;
    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
      kmp_info_t const *thread = __kmp_threads[gtid];
      if (thread != NULL) {
        __kmp_printf("GTID %2d %p:\n", gtid, thread);
        __kmp_printf(" Our Root: %p\n", thread->th.th_root);
        __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
        __kmp_print_structure_team(" Serial Team: ",
                                   thread->th.th_serial_team);
        __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
        __kmp_print_structure_thread(" Primary: ",
                                     thread->th.th_team_master);
        __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
        __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
        __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
        __kmp_print_structure_thread(" Next in pool: ",
                                     thread->th.th_next_pool);
        __kmp_printf("\n");
        __kmp_print_structure_team_accum(list, thread->th.th_team);
        __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
      }
    }
  } else {
    __kmp_printf("Threads array is not allocated.\n");
  }

  // Print out __kmp_root array; also accumulate each root's root team and
  // hot team into `list`.
  __kmp_printf("\n------------------------------\nUbers\n----------------------"
               "--------\n");
  if (__kmp_root != NULL) {
    int gtid;
    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
      kmp_root_t const *root = __kmp_root[gtid];
      if (root != NULL) {
        __kmp_printf("GTID %2d %p:\n", gtid, root);
        __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
        __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
        __kmp_print_structure_thread(" Uber Thread: ",
                                     root->r.r_uber_thread);
        __kmp_printf(" Active?: %2d\n", root->r.r_active);
        __kmp_printf(" In Parallel: %2d\n",
                     KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
        __kmp_printf("\n");
        __kmp_print_structure_team_accum(list, root->r.r_root_team);
        __kmp_print_structure_team_accum(list, root->r.r_hot_team);
      }
    }
  } else {
    __kmp_printf("Ubers array is not allocated.\n");
  }

  // Walk the accumulated team list (the terminator has next == NULL and is
  // never printed) and dump each team's members.
  __kmp_printf("\n------------------------------\nTeams\n----------------------"
               "--------\n");
  while (list->next != NULL) {
    kmp_team_p const *team = list->entry;
    int i;
    __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
    __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
    __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid);
    __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
    __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
    __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
    for (i = 0; i < team->t.t_nproc; ++i) {
      __kmp_printf(" Thread %2d: ", i);
      __kmp_print_structure_thread("", team->t.t_threads[i]);
    }
    __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
    __kmp_printf("\n");
    list = list->next;
  }

  // Print out __kmp_thread_pool and __kmp_team_pool.
  __kmp_printf("\n------------------------------\nPools\n----------------------"
               "--------\n");
  __kmp_print_structure_thread("Thread pool: ",
                               CCAST(kmp_info_t *, __kmp_thread_pool));
  __kmp_print_structure_team("Team pool: ",
                             CCAST(kmp_team_t *, __kmp_team_pool));
  __kmp_printf("\n");

  // Free team list. `list` still points at the first unfreed item because the
  // print loop above advanced it past the printed entries.
  while (list != NULL) {
    kmp_team_list_item_t *item = list;
    list = list->next;
    KMP_INTERNAL_FREE(item);
  }
}
3638
3639#endif
3640
3641//---------------------------------------------------------------------------
3642// Stuff for per-thread fast random number generator
3643// Table of primes
// Table of 64 odd 32-bit multiplier constants. __kmp_init_random selects one
// per thread (indexed by thread id modulo the table size) so that different
// threads get decorrelated linear-congruential random streams.
static const unsigned __kmp_primes[] = {
    0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
    0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
    0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
    0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
    0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
    0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
    0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
    0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
    0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
    0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
    0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3656
3657//---------------------------------------------------------------------------
3658// __kmp_get_random: Get a random number using a linear congruential method.
3659unsigned short __kmp_get_random(kmp_info_t *thread) {
3660 unsigned x = thread->th.th_x;
3661 unsigned short r = (unsigned short)(x >> 16);
3662
3663 thread->th.th_x = x * thread->th.th_a + 1;
3664
3665 KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3666 thread->th.th_info.ds.ds_tid, r));
3667
3668 return r;
3669}
3670//--------------------------------------------------------
3671// __kmp_init_random: Initialize a random number generator
3672void __kmp_init_random(kmp_info_t *thread) {
3673 unsigned seed = thread->th.th_info.ds.ds_tid;
3674
3675 thread->th.th_a =
3676 __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3677 thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3678 KA_TRACE(30,
3679 ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3680}
3681
3682#if KMP_OS_WINDOWS
3683/* reclaim array entries for root threads that are already dead, returns number
3684 * reclaimed */
3685static int __kmp_reclaim_dead_roots(void) {
3686 int i, r = 0;
3687
3688 for (i = 0; i < __kmp_threads_capacity; ++i) {
3689 if (KMP_UBER_GTID(i) &&
3690 !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3691 !__kmp_root[i]
3692 ->r.r_active) { // AC: reclaim only roots died in non-active state
3693 r += __kmp_unregister_root_other_thread(i);
3694 }
3695 }
3696 return r;
3697}
3698#endif
3699
3700/* This function attempts to create free entries in __kmp_threads and
3701 __kmp_root, and returns the number of free entries generated.
3702
3703 For Windows* OS static library, the first mechanism used is to reclaim array
3704 entries for root threads that are already dead.
3705
   On all platforms, expansion is attempted on the arrays __kmp_threads and
   __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3708 capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3709 threadprivate cache array has been created. Synchronization with
3710 __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3711
3712 After any dead root reclamation, if the clipping value allows array expansion
3713 to result in the generation of a total of nNeed free slots, the function does
3714 that expansion. If not, nothing is done beyond the possible initial root
3715 thread reclamation.
3716
3717 If any argument is negative, the behavior is undefined. */
// Grow __kmp_threads/__kmp_root so that at least nNeed additional free slots
// exist; returns the number of free entries created (possibly 0). Caller must
// hold __kmp_forkjoin_lock (see comment below).
static int __kmp_expand_threads(int nNeed) {
  int added = 0;
  int minimumRequiredCapacity;
  int newCapacity;
  kmp_info_t **newThreads;
  kmp_root_t **newRoot;

  // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
  // resizing __kmp_threads does not need additional protection if foreign
  // threads are present

#if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
  /* only for Windows static library */
  /* reclaim array entries for root threads that are already dead */
  added = __kmp_reclaim_dead_roots();

  if (nNeed) {
    nNeed -= added;
    if (nNeed < 0)
      nNeed = 0;
  }
#endif
  if (nNeed <= 0)
    return added;

  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
  // > __kmp_max_nth in one of two ways:
  //
  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
  //    may not be reused by another thread, so we may need to increase
  //    __kmp_threads_capacity to __kmp_max_nth + 1.
  //
  // 2) New foreign root(s) are encountered. We always register new foreign
  //    roots. This may cause a smaller # of threads to be allocated at
  //    subsequent parallel regions, but the worker threads hang around (and
  //    eventually go to sleep) and need slots in the __kmp_threads[] array.
  //
  // Anyway, that is the reason for moving the check to see if
  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
  // instead of having it performed here. -BB

  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);

  /* compute expansion headroom to check if we can expand */
  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
    /* possible expansion too small -- give up */
    return added;
  }
  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;

  // Double the capacity until it covers the requirement, clipping at
  // __kmp_sys_max_nth (the shift guard avoids signed overflow).
  newCapacity = __kmp_threads_capacity;
  do {
    newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
                                                          : __kmp_sys_max_nth;
  } while (newCapacity < minimumRequiredCapacity);
  // Both arrays live in one allocation: threads first, roots after them.
  newThreads = (kmp_info_t **)__kmp_allocate(
      (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
  newRoot =
      (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
  KMP_MEMCPY(dest: newThreads, src: __kmp_threads,
             n: __kmp_threads_capacity * sizeof(kmp_info_t *));
  KMP_MEMCPY(dest: newRoot, src: __kmp_root,
             n: __kmp_threads_capacity * sizeof(kmp_root_t *));
  // Put old __kmp_threads array on a list. Any ongoing references to the old
  // list will be valid. This list is cleaned up at library shutdown.
  kmp_old_threads_list_t *node =
      (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t));
  node->threads = __kmp_threads;
  node->next = __kmp_old_threads_list;
  __kmp_old_threads_list = node;

  // Publish the new arrays before the new capacity so concurrent readers
  // never see a capacity larger than the array they are reading.
  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
  added += newCapacity - __kmp_threads_capacity;
  *(volatile int *)&__kmp_threads_capacity = newCapacity;

  if (newCapacity > __kmp_tp_capacity) {
    __kmp_acquire_bootstrap_lock(lck: &__kmp_tp_cached_lock);
    if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
      __kmp_threadprivate_resize_cache(newCapacity);
    } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
      *(volatile int *)&__kmp_tp_capacity = newCapacity;
    }
    __kmp_release_bootstrap_lock(lck: &__kmp_tp_cached_lock);
  }

  return added;
}
3808
3809/* Register the current thread as a root thread and obtain our gtid. We must
3810 have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3811 thread that calls from __kmp_do_serial_initialize() */
// Register the calling thread as a root (uber) thread: find a free gtid slot,
// set up the root/hot/serial teams and thread bookkeeping, and return the
// assigned gtid. Caller holds __kmp_initz_lock; initial_thread is TRUE only
// for the call from __kmp_do_serial_initialize().
int __kmp_register_root(int initial_thread) {
  kmp_info_t *root_thread;
  kmp_root_t *root;
  int gtid;
  int capacity;
  __kmp_acquire_bootstrap_lock(lck: &__kmp_forkjoin_lock);
  KA_TRACE(20, ("__kmp_register_root: entered\n"));
  KMP_MB();

  /* 2007-03-02:
     If initial thread did not invoke OpenMP RTL yet, and this thread is not an
     initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
     work as expected -- it may return false (that means there is at least one
     empty slot in __kmp_threads array), but it is possible the only free slot
     is #0, which is reserved for initial thread and so cannot be used for this
     one. Following code workarounds this bug.

     However, right solution seems to be not reserving slot #0 for initial
     thread because:
     (1) there is no magic in slot #0,
     (2) we cannot detect initial thread reliably (the first thread which does
     serial initialization may be not a real initial thread).
  */
  capacity = __kmp_threads_capacity;
  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
    --capacity;
  }

  // If it is not for initializing the hidden helper team, we need to take
  // __kmp_hidden_helper_threads_num out of the capacity because it is included
  // in __kmp_threads_capacity.
  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
    capacity -= __kmp_hidden_helper_threads_num;
  }

  /* see if there are too many threads */
  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(nNeed: 1)) {
    // __kmp_fatal does not return.
    if (__kmp_tp_cached) {
      __kmp_fatal(KMP_MSG(CantRegisterNewThread),
                  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
                  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
    } else {
      __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
                  __kmp_msg_null);
    }
  }

  // When hidden helper task is enabled, __kmp_threads is organized as follows:
  // 0: initial thread, also a regular OpenMP thread.
  // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
  // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
  // regular OpenMP threads.
  if (TCR_4(__kmp_init_hidden_helper_threads)) {
    // Find an available thread slot for hidden helper thread. Slots for hidden
    // helper threads start from 1 to __kmp_hidden_helper_threads_num.
    for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
                   gtid <= __kmp_hidden_helper_threads_num;
         gtid++)
      ;
    KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
    KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
                 "hidden helper thread: T#%d\n",
                 gtid));
  } else {
    /* find an available thread slot */
    // Don't reassign the zero slot since we need that to only be used by
    // initial thread. Slots for hidden helper threads should also be skipped.
    if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
      gtid = 0;
    } else {
      for (gtid = __kmp_hidden_helper_threads_num + 1;
           TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
        ;
    }
    KA_TRACE(
        1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
    KMP_ASSERT(gtid < __kmp_threads_capacity);
  }

  /* update global accounting */
  __kmp_all_nth++;
  TCW_4(__kmp_nth, __kmp_nth + 1);

  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
  // numbers of procs, and method #2 (keyed API call) for higher numbers.
  if (__kmp_adjust_gtid_mode) {
    if (__kmp_all_nth >= __kmp_tls_gtid_min) {
      if (TCR_4(__kmp_gtid_mode) != 2) {
        TCW_4(__kmp_gtid_mode, 2);
      }
    } else {
      if (TCR_4(__kmp_gtid_mode) != 1) {
        TCW_4(__kmp_gtid_mode, 1);
      }
    }
  }

#ifdef KMP_ADJUST_BLOCKTIME
  /* Adjust blocktime to zero if necessary */
  /* Middle initialization might not have occurred yet */
  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
    if (__kmp_nth > __kmp_avail_proc) {
      __kmp_zero_bt = TRUE;
    }
  }
#endif /* KMP_ADJUST_BLOCKTIME */

  /* setup this new hierarchy */
  // Allocate the root structure lazily on first registration for this slot.
  if (!(root = __kmp_root[gtid])) {
    root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
    KMP_DEBUG_ASSERT(!root->r.r_root_team);
  }

#if KMP_STATS_ENABLED
  // Initialize stats as soon as possible (right after gtid assignment).
  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
  __kmp_stats_thread_ptr->startLife();
  KMP_SET_THREAD_STATE(SERIAL_REGION);
  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
#endif
  __kmp_initialize_root(root);

  /* setup new root thread structure */
  // Reuse the uber thread if this root was registered before; otherwise
  // allocate and initialize a fresh kmp_info_t.
  if (root->r.r_uber_thread) {
    root_thread = root->r.r_uber_thread;
  } else {
    root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
    if (__kmp_storage_map) {
      __kmp_print_thread_storage_map(thr: root_thread, gtid);
    }
    root_thread->th.th_info.ds.ds_gtid = gtid;
#if OMPT_SUPPORT
    root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
#endif
    root_thread->th.th_root = root;
    if (__kmp_env_consistency_check) {
      root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
    }
#if USE_FAST_MEMORY
    __kmp_initialize_fast_memory(this_thr: root_thread);
#endif /* USE_FAST_MEMORY */

#if KMP_USE_BGET
    KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
    __kmp_initialize_bget(th: root_thread);
#endif
    __kmp_init_random(thread: root_thread); // Initialize random number generator
  }

  /* setup the serial team held in reserve by the root thread */
  if (!root_thread->th.th_serial_team) {
    kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
    KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
    root_thread->th.th_serial_team = __kmp_allocate_team(
        root, new_nproc: 1, max_nproc: 1,
#if OMPT_SUPPORT
        ompt_data_none, // root parallel id
#endif
        proc_bind: proc_bind_default, new_icvs: &r_icvs, argc: 0 USE_NESTED_HOT_ARG(NULL));
  }
  KMP_ASSERT(root_thread->th.th_serial_team);
  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
                root_thread->th.th_serial_team));

  /* drop root_thread into place */
  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);

  root->r.r_root_team->t.t_threads[0] = root_thread;
  root->r.r_hot_team->t.t_threads[0] = root_thread;
  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
  // AC: the team created in reserve, not for execution (it is unused for now).
  root_thread->th.th_serial_team->t.t_serialized = 0;
  root->r.r_uber_thread = root_thread;

  /* initialize the thread, get it ready to go */
  __kmp_initialize_info(root_thread, root->r.r_root_team, tid: 0, gtid);
  TCW_4(__kmp_init_gtid, TRUE);

  /* prepare the primary thread for get_gtid() */
  __kmp_gtid_set_specific(gtid);

#if USE_ITT_BUILD
  __kmp_itt_thread_name(gtid);
#endif /* USE_ITT_BUILD */

#ifdef KMP_TDATA_GTID
  __kmp_gtid = gtid;
#endif
  __kmp_create_worker(gtid, th: root_thread, stack_size: __kmp_stksize);
  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);

  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
                "plain=%u\n",
                gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
                root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
                KMP_INIT_BARRIER_STATE));
  { // Initialize barrier data.
    int b;
    for (b = 0; b < bs_last_barrier; ++b) {
      root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
#if USE_DEBUGGER
      root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
#endif
    }
  }
  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
                   KMP_INIT_BARRIER_STATE);

#if KMP_AFFINITY_SUPPORTED
  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
#endif /* KMP_AFFINITY_SUPPORTED */
  root_thread->th.th_def_allocator = __kmp_def_allocator;
  root_thread->th.th_prev_level = 0;
  root_thread->th.th_prev_num_threads = 1;

  // Each root starts its own contention group with itself as the only member.
  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
  tmp->cg_root = root_thread;
  tmp->cg_thread_limit = __kmp_cg_max_nth;
  tmp->cg_nthreads = 1;
  KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
                 " cg_nthreads init to 1\n",
                 root_thread, tmp));
  tmp->up = NULL;
  root_thread->th.th_cg_roots = tmp;

  __kmp_root_counter++;

#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {

    // NOTE: intentionally shadows the outer root_thread with the value OMPT
    // reports for the current thread.
    kmp_info_t *root_thread = ompt_get_thread();

    ompt_set_thread_state(thread: root_thread, state: ompt_state_overhead);

    if (ompt_enabled.ompt_callback_thread_begin) {
      ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
          ompt_thread_initial, __ompt_get_thread_data_internal());
    }
    ompt_data_t *task_data;
    ompt_data_t *parallel_data;
    __ompt_get_task_info_internal(ancestor_level: 0, NULL, task_data: &task_data, NULL, parallel_data: &parallel_data,
                                  NULL);
    if (ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
    }

    ompt_set_thread_state(thread: root_thread, state: ompt_state_work_serial);
  }
#endif
#if OMPD_SUPPORT
  if (ompd_state & OMPD_ENABLE_BP)
    ompd_bp_thread_begin();
#endif

  KMP_MB();
  __kmp_release_bootstrap_lock(lck: &__kmp_forkjoin_lock);

  return gtid;
}
4075
4076#if KMP_NESTED_HOT_TEAMS
// Recursively free THR's nested hot team at nesting depth LEVEL (and all
// deeper levels, up to MAX_LEVEL). Returns the number of threads released,
// not counting each team's primary thread.
static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
                                const int max_level) {
  int i, n, nth;
  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
  if (!hot_teams || !hot_teams[level].hot_team) {
    return 0;
  }
  KMP_DEBUG_ASSERT(level < max_level);
  kmp_team_t *team = hot_teams[level].hot_team;
  nth = hot_teams[level].hot_team_nth;
  n = nth - 1; // primary thread is not freed
  if (level < max_level - 1) {
    // Free deeper-nested hot teams first, then each worker's hot-teams array
    // (the primary thread's array, i == 0, is freed by the caller).
    for (i = 0; i < nth; ++i) {
      kmp_info_t *th = team->t.t_threads[i];
      n += __kmp_free_hot_teams(root, thr: th, level: level + 1, max_level);
      if (i > 0 && th->th.th_hot_teams) {
        __kmp_free(th->th.th_hot_teams);
        th->th.th_hot_teams = NULL;
      }
    }
  }
  __kmp_free_team(root, team, NULL);
  return n;
}
4101#endif
4102
// Resets a root thread and clears its root and hot teams.
4104// Returns the number of __kmp_threads entries directly and indirectly freed.
static int __kmp_reset_root(int gtid, kmp_root_t *root) {
  kmp_team_t *root_team = root->r.r_root_team;
  kmp_team_t *hot_team = root->r.r_hot_team;
  int n = hot_team->t.t_nproc; // entries freed, starting with hot team workers
  int i;

  KMP_DEBUG_ASSERT(!root->r.r_active);

  root->r.r_root_team = NULL;
  root->r.r_hot_team = NULL;
  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
  // before call to __kmp_free_team().
  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
#if KMP_NESTED_HOT_TEAMS
  if (__kmp_hot_teams_max_level >
      0) { // need to free nested hot teams and their threads if any
    for (i = 0; i < hot_team->t.t_nproc; ++i) {
      kmp_info_t *th = hot_team->t.t_threads[i];
      if (__kmp_hot_teams_max_level > 1) {
        n += __kmp_free_hot_teams(root, thr: th, level: 1, max_level: __kmp_hot_teams_max_level);
      }
      if (th->th.th_hot_teams) {
        __kmp_free(th->th.th_hot_teams);
        th->th.th_hot_teams = NULL;
      }
    }
  }
#endif
  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));

  // Before we can reap the thread, we need to make certain that all other
  // threads in the teams that had this root as ancestor have stopped trying to
  // steal tasks.
  if (__kmp_tasking_mode != tskm_immediate_exec) {
    __kmp_wait_to_unref_task_teams();
  }

#if KMP_OS_WINDOWS
  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
  KA_TRACE(
      10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
           "\n",
           (LPVOID) & (root->r.r_uber_thread->th),
           root->r.r_uber_thread->th.th_info.ds.ds_thread));
  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
#endif /* KMP_OS_WINDOWS */

#if OMPD_SUPPORT
  if (ompd_state & OMPD_ENABLE_BP)
    ompd_bp_thread_end();
#endif

#if OMPT_SUPPORT
  // Report the end of the root's initial implicit task and thread to OMPT.
  ompt_data_t *task_data;
  ompt_data_t *parallel_data;
  __ompt_get_task_info_internal(ancestor_level: 0, NULL, task_data: &task_data, NULL, parallel_data: &parallel_data,
                                NULL);
  if (ompt_enabled.ompt_callback_implicit_task) {
    ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
        ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
  }
  if (ompt_enabled.ompt_callback_thread_end) {
    ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
        &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
  }
#endif

  TCW_4(__kmp_nth,
        __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
  // Leave this root's contention group; free it when we were the last member.
  i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
  KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
                 " to %d\n",
                 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
                 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
  if (i == 1) {
    // need to free contention group structure
    KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
                     root->r.r_uber_thread->th.th_cg_roots->cg_root);
    KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
    __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
    root->r.r_uber_thread->th.th_cg_roots = NULL;
  }
  __kmp_reap_thread(thread: root->r.r_uber_thread, is_root: 1);

  // We cannot put root thread to __kmp_thread_pool, so we have to reap it
  // instead of freeing.
  root->r.r_uber_thread = NULL;
  /* mark root as no longer in use */
  root->r.r_begin = FALSE;

  return n;
}
4197
// Unregister the calling root thread (gtid), releasing its root/hot teams and
// its __kmp_threads slot. No-op if the library has already shut down.
void __kmp_unregister_root_current_thread(int gtid) {
  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
  /* this lock should be ok, since unregister_root_current_thread is never
     called during an abort, only during a normal close. furthermore, if you
     have the forkjoin lock, you should never try to get the initz lock */
  __kmp_acquire_bootstrap_lock(lck: &__kmp_forkjoin_lock);
  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
    KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
                  "exiting T#%d\n",
                  gtid));
    __kmp_release_bootstrap_lock(lck: &__kmp_forkjoin_lock);
    return;
  }
  kmp_root_t *root = __kmp_root[gtid];

  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
  KMP_ASSERT(KMP_UBER_GTID(gtid));
  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
  KMP_ASSERT(root->r.r_active == FALSE);

  KMP_MB();

  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_team_t *team = thread->th.th_team;
  kmp_task_team_t *task_team = thread->th.th_task_team;

  // we need to wait for the proxy tasks before finishing the thread
  if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
                            task_team->tt.tt_hidden_helper_task_encountered)) {
#if OMPT_SUPPORT
    // the runtime is shutting down so we won't report any events
    thread->th.ompt_thread_info.state = ompt_state_undefined;
#endif
    __kmp_task_team_wait(this_thr: thread, team USE_ITT_BUILD_ARG(NULL));
  }

  __kmp_reset_root(gtid, root);

  KMP_MB();
  KC_TRACE(10,
           ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));

  __kmp_release_bootstrap_lock(lck: &__kmp_forkjoin_lock);
}
4242
4243#if KMP_OS_WINDOWS
4244/* __kmp_forkjoin_lock must be already held
4245 Unregisters a root thread that is not the current thread. Returns the number
4246 of __kmp_threads entries freed as a result. */
4247static int __kmp_unregister_root_other_thread(int gtid) {
4248 kmp_root_t *root = __kmp_root[gtid];
4249 int r;
4250
4251 KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4252 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4253 KMP_ASSERT(KMP_UBER_GTID(gtid));
4254 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4255 KMP_ASSERT(root->r.r_active == FALSE);
4256
4257 r = __kmp_reset_root(gtid, root);
4258 KC_TRACE(10,
4259 ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4260 return r;
4261}
4262#endif
4263
4264#if KMP_DEBUG
4265void __kmp_task_info() {
4266
4267 kmp_int32 gtid = __kmp_entry_gtid();
4268 kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4269 kmp_info_t *this_thr = __kmp_threads[gtid];
4270 kmp_team_t *steam = this_thr->th.th_serial_team;
4271 kmp_team_t *team = this_thr->th.th_team;
4272
4273 __kmp_printf(
4274 "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4275 "ptask=%p\n",
4276 gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4277 team->t.t_implicit_task_taskdata[tid].td_parent);
4278}
4279#endif // KMP_DEBUG
4280
4281/* TODO optimize with one big memclr, take out what isn't needed, split
4282 responsibility to workers as much as possible, and delay initialization of
4283 features as much as possible */
/* Bind this_thr into <team> at position <tid> and (re)initialize the per-
   thread state that depends on the team: the cached team fields, the implicit
   task, the private-commons table, the contention-group (CG) root, and the
   dynamic-dispatch buffers. Called both for freshly forked threads and for
   pool threads being reused (see __kmp_allocate_thread). */
static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
                                  int tid, int gtid) {
  /* this_thr->th.th_info.ds.ds_gtid is setup in
     kmp_allocate_thread/create_worker.
     this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
  KMP_DEBUG_ASSERT(this_thr != NULL);
  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
  KMP_DEBUG_ASSERT(team);
  KMP_DEBUG_ASSERT(team->t.t_threads);
  KMP_DEBUG_ASSERT(team->t.t_dispatch);
  kmp_info_t *master = team->t.t_threads[0];
  KMP_DEBUG_ASSERT(master);
  KMP_DEBUG_ASSERT(master->th.th_root);

  KMP_MB();

  // Publish the team pointer with release semantics before filling in the
  // rest of the thread's team-derived state.
  TCW_SYNC_PTR(this_thr->th.th_team, team);

  this_thr->th.th_info.ds.ds_tid = tid;
  this_thr->th.th_set_nproc = 0;
  if (__kmp_tasking_mode != tskm_immediate_exec)
    // When tasking is possible, threads are not safe to reap until they are
    // done tasking; this will be set when tasking code is exited in wait
    this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
  else // no tasking --> always safe to reap
    this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
  this_thr->th.th_set_proc_bind = proc_bind_default;

#if KMP_AFFINITY_SUPPORTED
  this_thr->th.th_new_place = this_thr->th.th_current_place;
#endif
  this_thr->th.th_root = master->th.th_root;

  /* setup the thread's cache of the team structure */
  this_thr->th.th_team_nproc = team->t.t_nproc;
  this_thr->th.th_team_master = master;
  this_thr->th.th_team_serialized = team->t.t_serialized;

  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);

  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
                tid, gtid, this_thr, this_thr->th.th_current_task));

  // (Re)initialize this thread's implicit task for the new team/tid.
  __kmp_init_implicit_task(loc_ref: this_thr->th.th_team_master->th.th_ident, this_thr,
                           team, tid, TRUE);

  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
                tid, gtid, this_thr, this_thr->th.th_current_task));
  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
  // __kmp_initialize_team()?

  /* TODO no worksharing in speculative threads */
  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];

  this_thr->th.th_local.this_construct = 0;

  // Lazily allocate the threadprivate "common" table the first time this
  // thread needs one; it survives team changes once created.
  if (!this_thr->th.th_pri_common) {
    this_thr->th.th_pri_common =
        (struct common_table *)__kmp_allocate(sizeof(struct common_table));
    if (__kmp_storage_map) {
      __kmp_print_storage_map_gtid(
          gtid, p1: this_thr->th.th_pri_common, p2: this_thr->th.th_pri_common + 1,
          size: sizeof(struct common_table), format: "th_%d.th_pri_common\n", gtid);
    }
    this_thr->th.th_pri_head = NULL;
  }

  if (this_thr != master && // Primary thread's CG root is initialized elsewhere
      this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
    // Make new thread's CG root same as primary thread's
    KMP_DEBUG_ASSERT(master->th.th_cg_roots);
    kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
    if (tmp) {
      // worker changes CG, need to check if old CG should be freed
      int i = tmp->cg_nthreads--;
      KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
                     " on node %p of thread %p to %d\n",
                     this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
      if (i == 1) {
        __kmp_free(tmp); // last thread left CG --> free it
      }
    }
    this_thr->th.th_cg_roots = master->th.th_cg_roots;
    // Increment new thread's CG root's counter to add the new thread
    this_thr->th.th_cg_roots->cg_nthreads++;
    KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
                   " node %p of thread %p to %d\n",
                   this_thr, this_thr->th.th_cg_roots,
                   this_thr->th.th_cg_roots->cg_root,
                   this_thr->th.th_cg_roots->cg_nthreads));
    // Inherit the thread-limit ICV from the (new) contention group.
    this_thr->th.th_current_task->td_icvs.thread_limit =
        this_thr->th.th_cg_roots->cg_thread_limit;
  }

  /* Initialize dynamic dispatch */
  {
    volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
    // Use team max_nproc since this will never change for the team.
    // Serial (max_nproc == 1) teams only need a single dispatch buffer.
    size_t disp_size =
        sizeof(dispatch_private_info_t) *
        (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
    KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
                  team->t.t_max_nproc));
    KMP_ASSERT(dispatch);
    KMP_DEBUG_ASSERT(team->t.t_dispatch);
    KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);

    dispatch->th_disp_index = 0;
    dispatch->th_doacross_buf_idx = 0;
    if (!dispatch->th_disp_buffer) {
      // First use: allocate the buffer ring.
      dispatch->th_disp_buffer =
          (dispatch_private_info_t *)__kmp_allocate(disp_size);

      if (__kmp_storage_map) {
        __kmp_print_storage_map_gtid(
            gtid, p1: &dispatch->th_disp_buffer[0],
            p2: &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
                                       ? 1
                                       : __kmp_dispatch_num_buffers],
            size: disp_size,
            format: "th_%d.th_dispatch.th_disp_buffer "
            "(team_%d.t_dispatch[%d].th_disp_buffer)",
            gtid, team->t.t_id, gtid);
      }
    } else {
      // Reuse path: just clear the previously allocated buffers.
      memset(s: &dispatch->th_disp_buffer[0], c: '\0', n: disp_size);
    }

    dispatch->th_dispatch_pr_current = 0;
    dispatch->th_dispatch_sh_current = 0;

    dispatch->th_deo_fcn = 0; /* ORDERED */
    dispatch->th_dxo_fcn = 0; /* END ORDERED */
  }

  this_thr->th.th_next_pool = NULL;

  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);

  KMP_MB();
}
4426
4427/* allocate a new thread for the requesting team. this is only called from
4428 within a forkjoin critical section. we will first try to get an available
4429 thread from the thread pool. if none is available, we will fork a new one
4430 assuming we are able to create a new one. this should be assured, as the
4431 caller should check on this first. */
kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
                                  int new_tid) {
  kmp_team_t *serial_team;
  kmp_info_t *new_thr;
  int new_gtid;

  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
  KMP_DEBUG_ASSERT(root && team);
#if !KMP_NESTED_HOT_TEAMS
  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
#endif
  KMP_MB();

  /* first, try to get one from the thread pool unless allocating thread is
   * the main hidden helper thread. The hidden helper team should always
   * allocate new OS threads. */
  if (__kmp_thread_pool && !KMP_HIDDEN_HELPER_TEAM(team)) {
    // Pop the head of the pool (caller holds the forkjoin lock).
    new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
    __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
    if (new_thr == __kmp_thread_pool_insert_pt) {
      __kmp_thread_pool_insert_pt = NULL;
    }
    TCW_4(new_thr->th.th_in_pool, FALSE);
    __kmp_suspend_initialize_thread(th: new_thr);
    // If the pool thread was counted as active, move it to the inactive
    // count now that it is leaving the pool.
    __kmp_lock_suspend_mx(th: new_thr);
    if (new_thr->th.th_active_in_pool == TRUE) {
      KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
      KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
      new_thr->th.th_active_in_pool = FALSE;
    }
    __kmp_unlock_suspend_mx(th: new_thr);

    KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
                  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
    KMP_ASSERT(!new_thr->th.th_team);
    KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);

    /* setup the thread structure */
    __kmp_initialize_info(this_thr: new_thr, team, tid: new_tid,
                          gtid: new_thr->th.th_info.ds.ds_gtid);
    KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);

    TCW_4(__kmp_nth, __kmp_nth + 1);

    new_thr->th.th_task_state = 0;

    if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
      // Make sure pool thread has transitioned to waiting on own thread struct
      KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
      // Thread activated in __kmp_allocate_team when increasing team size
    }

#ifdef KMP_ADJUST_BLOCKTIME
    /* Adjust blocktime back to zero if necessary */
    /* Middle initialization might not have occurred yet */
    if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
      if (__kmp_nth > __kmp_avail_proc) {
        __kmp_zero_bt = TRUE;
      }
    }
#endif /* KMP_ADJUST_BLOCKTIME */

#if KMP_DEBUG
    // If thread entered pool via __kmp_free_thread, wait_flag should !=
    // KMP_BARRIER_PARENT_FLAG.
    int b;
    kmp_balign_t *balign = new_thr->th.th_bar;
    for (b = 0; b < bs_last_barrier; ++b)
      KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#endif

    KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
                  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));

    KMP_MB();
    return new_thr;
  }

  /* No pool thread available -- we'll fork a new one. */
  KMP_ASSERT(KMP_HIDDEN_HELPER_TEAM(team) || __kmp_nth == __kmp_all_nth);
  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);

#if KMP_USE_MONITOR
  // If this is the first worker thread the RTL is creating, then also
  // launch the monitor thread. We try to do this as early as possible.
  if (!TCR_4(__kmp_init_monitor)) {
    __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
    if (!TCR_4(__kmp_init_monitor)) {
      KF_TRACE(10, ("before __kmp_create_monitor\n"));
      TCW_4(__kmp_init_monitor, 1);
      __kmp_create_monitor(&__kmp_monitor);
      KF_TRACE(10, ("after __kmp_create_monitor\n"));
#if KMP_OS_WINDOWS
      // AC: wait until monitor has started. This is a fix for CQ232808.
      // The reason is that if the library is loaded/unloaded in a loop with
      // small (parallel) work in between, then there is high probability that
      // monitor thread started after the library shutdown. At shutdown it is
      // too late to cope with the problem, because when the primary thread is
      // in DllMain (process detach) the monitor has no chances to start (it is
      // blocked), and primary thread has no means to inform the monitor that
      // the library has gone, because all the memory which the monitor can
      // access is going to be released/reset.
      while (TCR_4(__kmp_init_monitor) < 2) {
        KMP_YIELD(TRUE);
      }
      KF_TRACE(10, ("after monitor thread has started\n"));
#endif
    }
    __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
  }
#endif

  KMP_MB();

  {
    // Find the lowest free gtid slot. Hidden helper threads occupy gtids
    // [1, __kmp_hidden_helper_threads_num]; regular threads start after them.
    int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
                             ? 1
                             : __kmp_hidden_helper_threads_num + 1;

    for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
         ++new_gtid) {
      KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
    }

    if (TCR_4(__kmp_init_hidden_helper_threads)) {
      KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
    }
  }

  /* allocate space for it. */
  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));

  new_thr->th.th_nt_strict = false;
  new_thr->th.th_nt_loc = NULL;
  new_thr->th.th_nt_sev = severity_fatal;
  new_thr->th.th_nt_msg = NULL;

  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);

#if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
  // suppress race conditions detection on synchronization flags in debug mode
  // this helps to analyze library internals eliminating false positives
  __itt_suppress_mark_range(
      __itt_suppress_range, __itt_suppress_threading_errors,
      &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
  __itt_suppress_mark_range(
      __itt_suppress_range, __itt_suppress_threading_errors,
      &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
#if KMP_OS_WINDOWS
  __itt_suppress_mark_range(
      __itt_suppress_range, __itt_suppress_threading_errors,
      &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
#else
  __itt_suppress_mark_range(__itt_suppress_range,
                            __itt_suppress_threading_errors,
                            &new_thr->th.th_suspend_init_count,
                            sizeof(new_thr->th.th_suspend_init_count));
#endif
  // TODO: check if we need to also suppress b_arrived flags
  __itt_suppress_mark_range(__itt_suppress_range,
                            __itt_suppress_threading_errors,
                            CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
                            sizeof(new_thr->th.th_bar[0].bb.b_go));
  __itt_suppress_mark_range(__itt_suppress_range,
                            __itt_suppress_threading_errors,
                            CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
                            sizeof(new_thr->th.th_bar[1].bb.b_go));
  __itt_suppress_mark_range(__itt_suppress_range,
                            __itt_suppress_threading_errors,
                            CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
                            sizeof(new_thr->th.th_bar[2].bb.b_go));
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
  if (__kmp_storage_map) {
    __kmp_print_thread_storage_map(thr: new_thr, gtid: new_gtid);
  }

  // add the reserve serialized team, initialized from the team's primary thread
  {
    kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
    KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
    new_thr->th.th_serial_team = serial_team =
        (kmp_team_t *)__kmp_allocate_team(root, new_nproc: 1, max_nproc: 1,
#if OMPT_SUPPORT
                                          ompt_data_none, // root parallel id
#endif
                                          proc_bind: proc_bind_default, new_icvs: &r_icvs,
                                          argc: 0 USE_NESTED_HOT_ARG(NULL));
  }
  KMP_ASSERT(serial_team);
  serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
  // execution (it is unused for now).
  serial_team->t.t_threads[0] = new_thr;
  KF_TRACE(10,
           ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
            new_thr));

  /* setup the thread structures */
  __kmp_initialize_info(this_thr: new_thr, team, tid: new_tid, gtid: new_gtid);

#if USE_FAST_MEMORY
  __kmp_initialize_fast_memory(this_thr: new_thr);
#endif /* USE_FAST_MEMORY */

#if KMP_USE_BGET
  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
  __kmp_initialize_bget(th: new_thr);
#endif

  __kmp_init_random(thread: new_thr); // Initialize random number generator

  /* Initialize these only once when thread is grabbed for a team allocation */
  KA_TRACE(20,
           ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
            __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));

  // Reset all barrier structures to their quiescent state.
  int b;
  kmp_balign_t *balign = new_thr->th.th_bar;
  for (b = 0; b < bs_last_barrier; ++b) {
    balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
    balign[b].bb.team = NULL;
    balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
    balign[b].bb.use_oncore_barrier = 0;
  }

  TCW_PTR(new_thr->th.th_sleep_loc, NULL);
  new_thr->th.th_sleep_loc_type = flag_unset;

  new_thr->th.th_spin_here = FALSE;
  new_thr->th.th_next_waiting = 0;
#if KMP_OS_UNIX
  new_thr->th.th_blocking = false;
#endif

#if KMP_AFFINITY_SUPPORTED
  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
#endif
  new_thr->th.th_def_allocator = __kmp_def_allocator;
  new_thr->th.th_prev_level = 0;
  new_thr->th.th_prev_num_threads = 1;

  TCW_4(new_thr->th.th_in_pool, FALSE);
  new_thr->th.th_active_in_pool = FALSE;
  TCW_4(new_thr->th.th_active, TRUE);

  new_thr->th.th_set_nested_nth = NULL;
  new_thr->th.th_set_nested_nth_sz = 0;

  /* adjust the global counters */
  __kmp_all_nth++;
  __kmp_nth++;

  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
  // numbers of procs, and method #2 (keyed API call) for higher numbers.
  if (__kmp_adjust_gtid_mode) {
    if (__kmp_all_nth >= __kmp_tls_gtid_min) {
      if (TCR_4(__kmp_gtid_mode) != 2) {
        TCW_4(__kmp_gtid_mode, 2);
      }
    } else {
      if (TCR_4(__kmp_gtid_mode) != 1) {
        TCW_4(__kmp_gtid_mode, 1);
      }
    }
  }

#ifdef KMP_ADJUST_BLOCKTIME
  /* Adjust blocktime back to zero if necessary */
  /* Middle initialization might not have occurred yet */
  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
    if (__kmp_nth > __kmp_avail_proc) {
      __kmp_zero_bt = TRUE;
    }
  }
#endif /* KMP_ADJUST_BLOCKTIME */

#if KMP_AFFINITY_SUPPORTED
  // Set the affinity and topology information for new thread
  __kmp_affinity_set_init_mask(gtid: new_gtid, /*isa_root=*/FALSE);
#endif

  /* actually fork it and create the new worker thread */
  KF_TRACE(
      10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
  __kmp_create_worker(gtid: new_gtid, th: new_thr, stack_size: __kmp_stksize);
  KF_TRACE(10,
           ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));

  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
                new_gtid));
  KMP_MB();
  return new_thr;
}
4727
4728/* Reinitialize team for reuse.
4729 The hot team code calls this case at every fork barrier, so EPCC barrier
4730 test are extremely sensitive to changes in it, esp. writes to the team
4731 struct, which cause a cache invalidation in all threads.
4732 IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4733static void __kmp_reinitialize_team(kmp_team_t *team,
4734 kmp_internal_control_t *new_icvs,
4735 ident_t *loc) {
4736 KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4737 team->t.t_threads[0], team));
4738 KMP_DEBUG_ASSERT(team && new_icvs);
4739 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4740 KMP_CHECK_UPDATE(team->t.t_ident, loc);
4741
4742 KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4743 // Copy ICVs to the primary thread's implicit taskdata
4744 __kmp_init_implicit_task(loc_ref: loc, this_thr: team->t.t_threads[0], team, tid: 0, FALSE);
4745 copy_icvs(dst: &team->t.t_implicit_task_taskdata[0].td_icvs, src: new_icvs);
4746
4747 KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4748 team->t.t_threads[0], team));
4749}
4750
4751/* Initialize the team data structure.
4752 This assumes the t_threads and t_max_nproc are already set.
4753 Also, we don't touch the arguments */
static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
                                  kmp_internal_control_t *new_icvs,
                                  ident_t *loc) {
  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));

  /* verify */
  KMP_DEBUG_ASSERT(team);
  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
  KMP_DEBUG_ASSERT(team->t.t_threads);
  KMP_MB();

  team->t.t_master_tid = 0; /* not needed */
  /* team->t.t_master_bar; not needed */
  // A one-thread team always runs serialized.
  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
  team->t.t_nproc = new_nproc;

  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
  team->t.t_next_pool = NULL;
  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
   * up hot team */

  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
  team->t.t_invoke = NULL; /* not needed */

  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
  team->t.t_sched.sched = new_icvs->sched.sched;

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
  team->t.t_fp_control_saved = FALSE; /* not needed */
  team->t.t_x87_fpu_control_word = 0; /* not needed */
  team->t.t_mxcsr = 0; /* not needed */
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

  team->t.t_construct = 0;

  team->t.t_ordered.dt.t_value = 0;
  team->t.t_master_active = FALSE;

#ifdef KMP_DEBUG
  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
#endif
#if KMP_OS_WINDOWS
  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
#endif

  team->t.t_control_stack_top = NULL;

  // Delegate ident/id refresh and ICV propagation to the reuse path.
  __kmp_reinitialize_team(team, new_icvs, loc);

  KMP_MB();
  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
}
4806
4807#if KMP_AFFINITY_SUPPORTED
4808static inline void __kmp_set_thread_place(kmp_team_t *team, kmp_info_t *th,
4809 int first, int last, int newp) {
4810 th->th.th_first_place = first;
4811 th->th.th_last_place = last;
4812 th->th.th_new_place = newp;
4813 if (newp != th->th.th_current_place) {
4814 if (__kmp_display_affinity && team->t.t_display_affinity != 1)
4815 team->t.t_display_affinity = 1;
4816 // Copy topology information associated with the new place
4817 th->th.th_topology_ids = __kmp_affinity.ids[th->th.th_new_place];
4818 th->th.th_topology_attrs = __kmp_affinity.attrs[th->th.th_new_place];
4819 }
4820}
4821
4822// __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4823// It calculates the worker + primary thread's partition based upon the parent
4824// thread's partition, and binds each worker to a thread in their partition.
4825// The primary thread's partition should already include its current binding.
static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
  // Do not partition places for the hidden helper team
  if (KMP_HIDDEN_HELPER_TEAM(team))
    return;
  // Copy the primary thread's place partition to the team struct
  kmp_info_t *master_th = team->t.t_threads[0];
  KMP_DEBUG_ASSERT(master_th != NULL);
  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
  int first_place = master_th->th.th_first_place;
  int last_place = master_th->th.th_last_place;
  int masters_place = master_th->th.th_current_place;
  int num_masks = __kmp_affinity.num_masks;
  team->t.t_first_place = first_place;
  team->t.t_last_place = last_place;

  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
                "bound to place %d partition = [%d,%d]\n",
                proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
                team->t.t_id, masters_place, first_place, last_place));

  switch (proc_bind) {

  case proc_bind_default:
    // Serial teams might have the proc_bind policy set to proc_bind_default.
    // Not an issue -- we don't rebind primary thread for any proc_bind policy.
    KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
    break;

  case proc_bind_primary: {
    // All workers share the primary thread's place.
    int f;
    int n_th = team->t.t_nproc;
    for (f = 1; f < n_th; f++) {
      kmp_info_t *th = team->t.t_threads[f];
      KMP_DEBUG_ASSERT(th != NULL);
      __kmp_set_thread_place(team, th, first: first_place, last: last_place, newp: masters_place);

      KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
                     "partition = [%d,%d]\n",
                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
                     f, masters_place, first_place, last_place));
    }
  } break;

  case proc_bind_close: {
    int f;
    int n_th = team->t.t_nproc;
    int n_places;
    // The partition [first_place,last_place] may wrap around the end of the
    // place list; compute its length in either case.
    if (first_place <= last_place) {
      n_places = last_place - first_place + 1;
    } else {
      n_places = num_masks - first_place + last_place + 1;
    }
    if (n_th <= n_places) {
      // One thread per place: walk consecutive places starting at the
      // primary thread's place, wrapping within the partition.
      int place = masters_place;
      for (f = 1; f < n_th; f++) {
        kmp_info_t *th = team->t.t_threads[f];
        KMP_DEBUG_ASSERT(th != NULL);

        if (place == last_place) {
          place = first_place;
        } else if (place == (num_masks - 1)) {
          place = 0;
        } else {
          place++;
        }
        __kmp_set_thread_place(team, th, first: first_place, last: last_place, newp: place);

        KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
                       "partition = [%d,%d]\n",
                       __kmp_gtid_from_thread(team->t.t_threads[f]),
                       team->t.t_id, f, place, first_place, last_place));
      }
    } else {
      // More threads than places: assign S (= n_th/n_places) threads to each
      // place, and spread the rem (= n_th%n_places) extra threads over the
      // partition every <gap> places.
      int S, rem, gap, s_count;
      S = n_th / n_places;
      s_count = 0;
      rem = n_th - (S * n_places);
      gap = rem > 0 ? n_places / rem : n_places;
      int place = masters_place;
      int gap_ct = gap;
      for (f = 0; f < n_th; f++) {
        kmp_info_t *th = team->t.t_threads[f];
        KMP_DEBUG_ASSERT(th != NULL);

        __kmp_set_thread_place(team, th, first: first_place, last: last_place, newp: place);
        s_count++;

        if ((s_count == S) && rem && (gap_ct == gap)) {
          // do nothing, add an extra thread to place on next iteration
        } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
          // we added an extra thread to this place; move to next place
          if (place == last_place) {
            place = first_place;
          } else if (place == (num_masks - 1)) {
            place = 0;
          } else {
            place++;
          }
          s_count = 0;
          gap_ct = 1;
          rem--;
        } else if (s_count == S) { // place full; don't add extra
          if (place == last_place) {
            place = first_place;
          } else if (place == (num_masks - 1)) {
            place = 0;
          } else {
            place++;
          }
          gap_ct++;
          s_count = 0;
        }

        KA_TRACE(100,
                 ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
                  "partition = [%d,%d]\n",
                  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
                  th->th.th_new_place, first_place, last_place));
      }
      // Walking the whole partition must land back on the primary's place.
      KMP_DEBUG_ASSERT(place == masters_place);
    }
  } break;

  case proc_bind_spread: {
    int f;
    int n_th = team->t.t_nproc;
    int n_places;
    int thidx;
    // Partition length, accounting for a wrap-around partition.
    if (first_place <= last_place) {
      n_places = last_place - first_place + 1;
    } else {
      n_places = num_masks - first_place + last_place + 1;
    }
    if (n_th <= n_places) {
      int place = -1;

      if (n_places != num_masks) {
        // Non-uniform case: give each thread a sub-partition of S
        // (= n_places/n_th) consecutive places, plus one extra place for
        // rem of the threads, spaced every <gap> threads.
        int S = n_places / n_th;
        int s_count, rem, gap, gap_ct;

        place = masters_place;
        rem = n_places - n_th * S;
        gap = rem ? n_th / rem : 1;
        gap_ct = gap;
        thidx = n_th;
        if (update_master_only == 1)
          thidx = 1;
        for (f = 0; f < thidx; f++) {
          kmp_info_t *th = team->t.t_threads[f];
          KMP_DEBUG_ASSERT(th != NULL);

          // This thread's sub-partition starts at <fplace> and it is bound
          // to <nplace>; advance <place> to the end of the sub-partition.
          int fplace = place, nplace = place;
          s_count = 1;
          while (s_count < S) {
            if (place == last_place) {
              place = first_place;
            } else if (place == (num_masks - 1)) {
              place = 0;
            } else {
              place++;
            }
            s_count++;
          }
          if (rem && (gap_ct == gap)) {
            // Absorb one of the leftover places into this sub-partition.
            if (place == last_place) {
              place = first_place;
            } else if (place == (num_masks - 1)) {
              place = 0;
            } else {
              place++;
            }
            rem--;
            gap_ct = 0;
          }
          __kmp_set_thread_place(team, th, first: fplace, last: place, newp: nplace);
          gap_ct++;

          // Step to the start of the next thread's sub-partition.
          if (place == last_place) {
            place = first_place;
          } else if (place == (num_masks - 1)) {
            place = 0;
          } else {
            place++;
          }

          KA_TRACE(100,
                   ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
                    "partition = [%d,%d], num_masks: %u\n",
                    __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
                    f, th->th.th_new_place, th->th.th_first_place,
                    th->th.th_last_place, num_masks));
        }
      } else {
        /* Having uniform space of available computation places I can create
           T partitions of round(P/T) size and put threads into the first
           place of each partition. */
        double current = static_cast<double>(masters_place);
        double spacing =
            (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
        int first, last;
        kmp_info_t *th;

        thidx = n_th + 1;
        if (update_master_only == 1)
          thidx = 1;
        for (f = 0; f < thidx; f++) {
          // Sub-partition [first,last] for thread f, from the fractional
          // walk; values past the end wrap modulo n_places.
          first = static_cast<int>(current);
          last = static_cast<int>(current + spacing) - 1;
          KMP_DEBUG_ASSERT(last >= first);
          if (first >= n_places) {
            if (masters_place) {
              first -= n_places;
              last -= n_places;
              if (first == (masters_place + 1)) {
                KMP_DEBUG_ASSERT(f == n_th);
                first--;
              }
              if (last == masters_place) {
                KMP_DEBUG_ASSERT(f == (n_th - 1));
                last--;
              }
            } else {
              KMP_DEBUG_ASSERT(f == n_th);
              first = 0;
              last = 0;
            }
          }
          if (last >= n_places) {
            last = (n_places - 1);
          }
          place = first;
          current += spacing;
          if (f < n_th) {
            KMP_DEBUG_ASSERT(0 <= first);
            KMP_DEBUG_ASSERT(n_places > first);
            KMP_DEBUG_ASSERT(0 <= last);
            KMP_DEBUG_ASSERT(n_places > last);
            KMP_DEBUG_ASSERT(last_place >= first_place);
            th = team->t.t_threads[f];
            KMP_DEBUG_ASSERT(th);
            __kmp_set_thread_place(team, th, first, last, newp: place);
            KA_TRACE(100,
                     ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
                      "partition = [%d,%d], spacing = %.4f\n",
                      __kmp_gtid_from_thread(team->t.t_threads[f]),
                      team->t.t_id, f, th->th.th_new_place,
                      th->th.th_first_place, th->th.th_last_place, spacing));
          }
        }
      }
      KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
    } else {
      // More threads than places: same S/rem/gap distribution as the
      // proc_bind_close overflow case, but each thread's partition is
      // narrowed to exactly its own place.
      int S, rem, gap, s_count;
      S = n_th / n_places;
      s_count = 0;
      rem = n_th - (S * n_places);
      gap = rem > 0 ? n_places / rem : n_places;
      int place = masters_place;
      int gap_ct = gap;
      thidx = n_th;
      if (update_master_only == 1)
        thidx = 1;
      for (f = 0; f < thidx; f++) {
        kmp_info_t *th = team->t.t_threads[f];
        KMP_DEBUG_ASSERT(th != NULL);

        __kmp_set_thread_place(team, th, first: place, last: place, newp: place);
        s_count++;

        if ((s_count == S) && rem && (gap_ct == gap)) {
          // do nothing, add an extra thread to place on next iteration
        } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
          // we added an extra thread to this place; move on to next place
          if (place == last_place) {
            place = first_place;
          } else if (place == (num_masks - 1)) {
            place = 0;
          } else {
            place++;
          }
          s_count = 0;
          gap_ct = 1;
          rem--;
        } else if (s_count == S) { // place is full; don't add extra thread
          if (place == last_place) {
            place = first_place;
          } else if (place == (num_masks - 1)) {
            place = 0;
          } else {
            place++;
          }
          gap_ct++;
          s_count = 0;
        }

        KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
                       "partition = [%d,%d]\n",
                       __kmp_gtid_from_thread(team->t.t_threads[f]),
                       team->t.t_id, f, th->th.th_new_place,
                       th->th.th_first_place, th->th.th_last_place));
      }
      KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
    }
  } break;

  default:
    break;
  }

  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
}
5137
5138#endif // KMP_AFFINITY_SUPPORTED
5139
/* Allocate a team data structure to use; take one off of the free pool if
   available.

   Strategy, in order of preference:
     1. Reuse the root's (or nested) "hot" team, resizing it up or down to
        new_nproc as needed.
     2. Take a team with sufficient capacity (>= max_nproc) from
        __kmp_team_pool, reaping undersized pool entries along the way.
     3. Allocate a brand-new kmp_team_t.

   On return the team has its ICVs, schedule, proc-bind, barrier data and
   argv storage set up for new_nproc threads (capacity max_nproc). */
kmp_team_t *
__kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
#if OMPT_SUPPORT
                    ompt_data_t ompt_parallel_data,
#endif
                    kmp_proc_bind_t new_proc_bind,
                    kmp_internal_control_t *new_icvs,
                    int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
  int f;
  kmp_team_t *team;
  // Hot team is only usable while the root is not inside an active region.
  int use_hot_team = !root->r.r_active;
  int level = 0;
  int do_place_partition = 1;

  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
  KMP_MB();

#if KMP_NESTED_HOT_TEAMS
  kmp_hot_team_ptr_t *hot_teams;
  if (master) {
    team = master->th.th_team;
    level = team->t.t_active_level;
    if (master->th.th_teams_microtask) { // in teams construct?
      if (master->th.th_teams_size.nteams > 1 &&
          ( // #teams > 1
              team->t.t_pkfn ==
                  (microtask_t)__kmp_teams_master || // inner fork of the teams
              master->th.th_teams_level <
                  team->t.t_level)) { // or nested parallel inside the teams
        ++level; // not increment if #teams==1, or for outer fork of the teams;
        // increment otherwise
      }
      // Do not perform the place partition if inner fork of the teams
      // Wait until nested parallel region encountered inside teams construct
      if ((master->th.th_teams_size.nteams == 1 &&
           master->th.th_teams_level >= team->t.t_level) ||
          (team->t.t_pkfn == (microtask_t)__kmp_teams_master))
        do_place_partition = 0;
    }
    hot_teams = master->th.th_hot_teams;
    if (level < __kmp_hot_teams_max_level && hot_teams &&
        hot_teams[level].hot_team) {
      // hot team has already been allocated for given level
      use_hot_team = 1;
    } else {
      use_hot_team = 0;
    }
  } else {
    // check we won't access uninitialized hot_teams, just in case
    KMP_DEBUG_ASSERT(new_nproc == 1);
  }
#endif
  // Optimization to use a "hot" team
  if (use_hot_team && new_nproc > 1) {
    KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
#if KMP_NESTED_HOT_TEAMS
    team = hot_teams[level].hot_team;
#else
    team = root->r.r_hot_team;
#endif
#if KMP_DEBUG
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
                    "task_team[1] = %p before reinit\n",
                    team->t.t_task_team[0], team->t.t_task_team[1]));
    }
#endif

    if (team->t.t_nproc != new_nproc &&
        __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
      // Distributed barrier may need a resize
      int old_nthr = team->t.t_nproc;
      __kmp_resize_dist_barrier(team, old_nthreads: old_nthr, new_nthreads: new_nproc);
    }

    // If not doing the place partition, then reset the team's proc bind
    // to indicate that partitioning of all threads still needs to take place
    if (do_place_partition == 0)
      team->t.t_proc_bind = proc_bind_default;
    // Has the number of threads changed?
    /* Let's assume the most common case is that the number of threads is
       unchanged, and put that case first. */
    if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
      KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
      // This case can mean that omp_set_num_threads() was called and the hot
      // team size was already reduced, so we check the special flag
      if (team->t.t_size_changed == -1) {
        team->t.t_size_changed = 1;
      } else {
        KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
      }

      // TODO???: team->t.t_max_active_levels = new_max_active_levels;
      kmp_r_sched_t new_sched = new_icvs->sched;
      // set primary thread's schedule as new run-time schedule
      KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);

      __kmp_reinitialize_team(team, new_icvs,
                              loc: root->r.r_uber_thread->th.th_ident);

      KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
                    team->t.t_threads[0], team));
      __kmp_push_current_task_to_thread(this_thr: team->t.t_threads[0], team, tid: 0);

#if KMP_AFFINITY_SUPPORTED
      // Size and bind unchanged: places only need refreshing for spread
      // (primary thread's place may have moved); otherwise repartition fully.
      if ((team->t.t_size_changed == 0) &&
          (team->t.t_proc_bind == new_proc_bind)) {
        if (new_proc_bind == proc_bind_spread) {
          if (do_place_partition) {
            // add flag to update only master for spread
            __kmp_partition_places(team, update_master_only: 1);
          }
        }
        KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
                       "proc_bind = %d, partition = [%d,%d]\n",
                       team->t.t_id, new_proc_bind, team->t.t_first_place,
                       team->t.t_last_place));
      } else {
        if (do_place_partition) {
          KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
          __kmp_partition_places(team);
        }
      }
#else
      KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
#endif /* KMP_AFFINITY_SUPPORTED */
    } else if (team->t.t_nproc > new_nproc) {
      KA_TRACE(20,
               ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
                new_nproc));

      team->t.t_size_changed = 1;
      if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
        // Barrier size already reduced earlier in this function
        // Activate team threads via th_used_in_team
        __kmp_add_threads_to_team(team, new_nthreads: new_nproc);
      }
      // When decreasing team size, threads no longer in the team should
      // unref task team.
      if (__kmp_tasking_mode != tskm_immediate_exec) {
        for (f = new_nproc; f < team->t.t_nproc; f++) {
          kmp_info_t *th = team->t.t_threads[f];
          KMP_DEBUG_ASSERT(th);
          th->th.th_task_team = NULL;
        }
      }
#if KMP_NESTED_HOT_TEAMS
      if (__kmp_hot_teams_mode == 0) {
        // AC: saved number of threads should correspond to team's value in this
        // mode, can be bigger in mode 1, when hot team has threads in reserve
        KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
        hot_teams[level].hot_team_nth = new_nproc;
#endif // KMP_NESTED_HOT_TEAMS
        /* release the extra threads we don't need any more */
        for (f = new_nproc; f < team->t.t_nproc; f++) {
          KMP_DEBUG_ASSERT(team->t.t_threads[f]);
          __kmp_free_thread(team->t.t_threads[f]);
          team->t.t_threads[f] = NULL;
        }
#if KMP_NESTED_HOT_TEAMS
      } // (__kmp_hot_teams_mode == 0)
      else {
        // When keeping extra threads in team, switch threads to wait on own
        // b_go flag
        for (f = new_nproc; f < team->t.t_nproc; ++f) {
          KMP_DEBUG_ASSERT(team->t.t_threads[f]);
          kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
          for (int b = 0; b < bs_last_barrier; ++b) {
            if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
              balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
            }
            KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
          }
        }
      }
#endif // KMP_NESTED_HOT_TEAMS
      team->t.t_nproc = new_nproc;
      // TODO???: team->t.t_max_active_levels = new_max_active_levels;
      KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
      __kmp_reinitialize_team(team, new_icvs,
                              loc: root->r.r_uber_thread->th.th_ident);

      // Update remaining threads
      for (f = 0; f < new_nproc; ++f) {
        team->t.t_threads[f]->th.th_team_nproc = new_nproc;
      }

      // restore the current task state of the primary thread: should be the
      // implicit task
      KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
                    team->t.t_threads[0], team));

      __kmp_push_current_task_to_thread(this_thr: team->t.t_threads[0], team, tid: 0);

#ifdef KMP_DEBUG
      for (f = 0; f < team->t.t_nproc; f++) {
        KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
                         team->t.t_threads[f]->th.th_team_nproc ==
                             team->t.t_nproc);
      }
#endif

      if (do_place_partition) {
        KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
#if KMP_AFFINITY_SUPPORTED
        __kmp_partition_places(team);
#endif
      }
    } else { // team->t.t_nproc < new_nproc

      KA_TRACE(20,
               ("__kmp_allocate_team: increasing hot team thread count to %d\n",
                new_nproc));
      int old_nproc = team->t.t_nproc; // save old value and use to update only
      team->t.t_size_changed = 1;

#if KMP_NESTED_HOT_TEAMS
      int avail_threads = hot_teams[level].hot_team_nth;
      if (new_nproc < avail_threads)
        avail_threads = new_nproc;
      kmp_info_t **other_threads = team->t.t_threads;
      for (f = team->t.t_nproc; f < avail_threads; ++f) {
        // Adjust barrier data of reserved threads (if any) of the team
        // Other data will be set in __kmp_initialize_info() below.
        int b;
        kmp_balign_t *balign = other_threads[f]->th.th_bar;
        for (b = 0; b < bs_last_barrier; ++b) {
          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#if USE_DEBUGGER
          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
#endif
        }
      }
      if (hot_teams[level].hot_team_nth >= new_nproc) {
        // we have all needed threads in reserve, no need to allocate any
        // this only possible in mode 1, cannot have reserved threads in mode 0
        KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
        team->t.t_nproc = new_nproc; // just get reserved threads involved
      } else {
        // We may have some threads in reserve, but not enough;
        // get reserved threads involved if any.
        team->t.t_nproc = hot_teams[level].hot_team_nth;
        hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
#endif // KMP_NESTED_HOT_TEAMS
        if (team->t.t_max_nproc < new_nproc) {
          /* reallocate larger arrays */
          __kmp_reallocate_team_arrays(team, max_nth: new_nproc);
          __kmp_reinitialize_team(team, new_icvs, NULL);
        }

#if (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY) &&  \
    KMP_AFFINITY_SUPPORTED
        /* Temporarily set full mask for primary thread before creation of
           workers. The reason is that workers inherit the affinity from the
           primary thread, so if a lot of workers are created on the single
           core quickly, they don't get a chance to set their own affinity for
           a long time. */
        kmp_affinity_raii_t new_temp_affinity{__kmp_affin_fullMask};
#endif

        /* allocate new threads for the hot team */
        for (f = team->t.t_nproc; f < new_nproc; f++) {
          kmp_info_t *new_worker = __kmp_allocate_thread(root, team, new_tid: f);
          KMP_DEBUG_ASSERT(new_worker);
          team->t.t_threads[f] = new_worker;

          KA_TRACE(20,
                   ("__kmp_allocate_team: team %d init T#%d arrived: "
                    "join=%llu, plain=%llu\n",
                    team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
                    team->t.t_bar[bs_forkjoin_barrier].b_arrived,
                    team->t.t_bar[bs_plain_barrier].b_arrived));

          { // Initialize barrier data for new threads.
            int b;
            kmp_balign_t *balign = new_worker->th.th_bar;
            for (b = 0; b < bs_last_barrier; ++b) {
              balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
              KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
                               KMP_BARRIER_PARENT_FLAG);
#if USE_DEBUGGER
              balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
#endif
            }
          }
        }

#if (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY) &&  \
    KMP_AFFINITY_SUPPORTED
        /* Restore initial primary thread's affinity mask */
        new_temp_affinity.restore();
#endif
#if KMP_NESTED_HOT_TEAMS
      } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
#endif // KMP_NESTED_HOT_TEAMS
      if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
        // Barrier size already increased earlier in this function
        // Activate team threads via th_used_in_team
        __kmp_add_threads_to_team(team, new_nthreads: new_nproc);
      }
      /* make sure everyone is synchronized */
      // new threads below
      __kmp_initialize_team(team, new_nproc, new_icvs,
                            loc: root->r.r_uber_thread->th.th_ident);

      /* reinitialize the threads */
      KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
      for (f = 0; f < team->t.t_nproc; ++f)
        __kmp_initialize_info(this_thr: team->t.t_threads[f], team, tid: f,
                              gtid: __kmp_gtid_from_tid(tid: f, team));

      // set th_task_state for new threads in hot team with older thread's state
      kmp_uint8 old_state = team->t.t_threads[old_nproc - 1]->th.th_task_state;
      for (f = old_nproc; f < team->t.t_nproc; ++f)
        team->t.t_threads[f]->th.th_task_state = old_state;

#ifdef KMP_DEBUG
      for (f = 0; f < team->t.t_nproc; ++f) {
        KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
                         team->t.t_threads[f]->th.th_team_nproc ==
                             team->t.t_nproc);
      }
#endif

      if (do_place_partition) {
        KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
#if KMP_AFFINITY_SUPPORTED
        __kmp_partition_places(team);
#endif
      }
    } // Check changes in number of threads

    // Propagate teams-construct bookkeeping from the primary thread to all
    // workers of the (possibly resized) hot team.
    if (master->th.th_teams_microtask) {
      for (f = 1; f < new_nproc; ++f) {
        // propagate teams construct specific info to workers
        kmp_info_t *thr = team->t.t_threads[f];
        thr->th.th_teams_microtask = master->th.th_teams_microtask;
        thr->th.th_teams_level = master->th.th_teams_level;
        thr->th.th_teams_size = master->th.th_teams_size;
      }
    }
#if KMP_NESTED_HOT_TEAMS
    if (level) {
      // Sync barrier state for nested hot teams, not needed for outermost hot
      // team.
      for (f = 1; f < new_nproc; ++f) {
        kmp_info_t *thr = team->t.t_threads[f];
        int b;
        kmp_balign_t *balign = thr->th.th_bar;
        for (b = 0; b < bs_last_barrier; ++b) {
          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#if USE_DEBUGGER
          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
#endif
        }
      }
    }
#endif // KMP_NESTED_HOT_TEAMS

    /* reallocate space for arguments if necessary */
    __kmp_alloc_argv_entries(argc, team, TRUE);
    KMP_CHECK_UPDATE(team->t.t_argc, argc);
    // The hot team re-uses the previous task team,
    // if untouched during the previous release->gather phase.

    KF_TRACE(10, (" hot_team = %p\n", team));

#if KMP_DEBUG
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
                    "task_team[1] = %p after reinit\n",
                    team->t.t_task_team[0], team->t.t_task_team[1]));
    }
#endif

#if OMPT_SUPPORT
    __ompt_team_assign_id(team, ompt_pid: ompt_parallel_data);
#endif

    KMP_MB();

    return team;
  }

  /* next, let's try to take one from the team pool */
  KMP_MB();
  for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
    /* TODO: consider resizing undersized teams instead of reaping them, now
       that we have a resizing mechanism */
    if (team->t.t_max_nproc >= max_nproc) {
      /* take this team from the team pool */
      __kmp_team_pool = team->t.t_next_pool;

      if (max_nproc > 1 &&
          __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
        if (!team->t.b) { // Allocate barrier structure
          team->t.b = distributedBarrier::allocate(nThreads: __kmp_dflt_team_nth_ub);
        }
      }

      /* setup the team for fresh use */
      __kmp_initialize_team(team, new_nproc, new_icvs, NULL);

      KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
                    "task_team[1] %p to NULL\n",
                    &team->t.t_task_team[0], &team->t.t_task_team[1]));
      team->t.t_task_team[0] = NULL;
      team->t.t_task_team[1] = NULL;

      /* reallocate space for arguments if necessary */
      __kmp_alloc_argv_entries(argc, team, TRUE);
      KMP_CHECK_UPDATE(team->t.t_argc, argc);

      KA_TRACE(
          20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
               team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
      { // Initialize barrier data.
        int b;
        for (b = 0; b < bs_last_barrier; ++b) {
          team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
#if USE_DEBUGGER
          team->t.t_bar[b].b_master_arrived = 0;
          team->t.t_bar[b].b_team_arrived = 0;
#endif
        }
      }

      team->t.t_proc_bind = new_proc_bind;

      KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
                    team->t.t_id));

#if OMPT_SUPPORT
      __ompt_team_assign_id(team, ompt_pid: ompt_parallel_data);
#endif

      team->t.t_nested_nth = NULL;

      KMP_MB();

      return team;
    }

    /* reap team if it is too small, then loop back and check the next one */
    // not sure if this is wise, but, will be redone during the hot-teams
    // rewrite.
    /* TODO: Use technique to find the right size hot-team, don't reap them */
    team = __kmp_reap_team(team);
    __kmp_team_pool = team;
  }

  /* nothing available in the pool, no matter, make a new team! */
  KMP_MB();
  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));

  /* and set it up */
  team->t.t_max_nproc = max_nproc;
  if (max_nproc > 1 &&
      __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
    // Allocate barrier structure
    team->t.b = distributedBarrier::allocate(nThreads: __kmp_dflt_team_nth_ub);
  }

  /* NOTE well, for some reason allocating one big buffer and dividing it up
     seems to really hurt performance a lot on the P4, so, let's not use this */
  __kmp_allocate_team_arrays(team, max_nth: max_nproc);

  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);

  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
                "%p to NULL\n",
                &team->t.t_task_team[0], &team->t.t_task_team[1]));
  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
  // memory, no need to duplicate
  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
  // memory, no need to duplicate

  if (__kmp_storage_map) {
    __kmp_print_team_storage_map(header: "team", team, team_id: team->t.t_id, num_thr: new_nproc);
  }

  /* allocate space for arguments */
  __kmp_alloc_argv_entries(argc, team, FALSE);
  team->t.t_argc = argc;

  KA_TRACE(20,
           ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
            team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
  { // Initialize barrier data.
    int b;
    for (b = 0; b < bs_last_barrier; ++b) {
      team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
#if USE_DEBUGGER
      team->t.t_bar[b].b_master_arrived = 0;
      team->t.t_bar[b].b_team_arrived = 0;
#endif
    }
  }

  team->t.t_proc_bind = new_proc_bind;

#if OMPT_SUPPORT
  __ompt_team_assign_id(team, ompt_pid: ompt_parallel_data);
  team->t.ompt_serialized_team_info = NULL;
#endif

  KMP_MB();

  team->t.t_nested_nth = NULL;

  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
                team->t.t_id));

  return team;
}
5663
5664/* TODO implement hot-teams at all levels */
5665/* TODO implement lazy thread release on demand (disband request) */
5666
/* Free the team: return it to the team pool and release all the threads
 * associated with it.
 *
 * Hot teams (the root's hot team or a nested hot team for the computed
 * level) are NOT disbanded; for those, only contention-group bookkeeping
 * on the workers is cleaned up so the team can be re-used. Non-hot teams
 * have their task teams deleted, their workers returned to the thread
 * pool, and are then pushed onto __kmp_team_pool. */
void __kmp_free_team(kmp_root_t *root,
                     kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
  int f;
  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
                team->t.t_id));

  /* verify state */
  KMP_DEBUG_ASSERT(root);
  KMP_DEBUG_ASSERT(team);
  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
  KMP_DEBUG_ASSERT(team->t.t_threads);

  int use_hot_team = team == root->r.r_hot_team;
#if KMP_NESTED_HOT_TEAMS
  int level;
  if (master) {
    // Compute the nesting level this team was allocated at (mirrors the
    // level computation in __kmp_allocate_team) to detect nested hot teams.
    level = team->t.t_active_level - 1;
    if (master->th.th_teams_microtask) { // in teams construct?
      if (master->th.th_teams_size.nteams > 1) {
        ++level; // level was not increased in teams construct for
        // team_of_masters
      }
      if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
          master->th.th_teams_level == team->t.t_level) {
        ++level; // level was not increased in teams construct for
        // team_of_workers before the parallel
      } // team->t.t_level will be increased inside parallel
    }
#if KMP_DEBUG
    kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
#endif
    if (level < __kmp_hot_teams_max_level) {
      KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
      use_hot_team = 1;
    }
  }
#endif // KMP_NESTED_HOT_TEAMS

  /* team is done working */
  TCW_SYNC_PTR(team->t.t_pkfn,
               NULL); // Important for Debugging Support Library.
#if KMP_OS_WINDOWS
  team->t.t_copyin_counter = 0; // init counter for possible reuse
#endif
  // Do not reset pointer to parent team to NULL for hot teams.

  /* if we are non-hot team, release our threads */
  if (!use_hot_team) {
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      // Wait for threads to reach reapable state
      for (f = 1; f < team->t.t_nproc; ++f) {
        KMP_DEBUG_ASSERT(team->t.t_threads[f]);
        kmp_info_t *th = team->t.t_threads[f];
        volatile kmp_uint32 *state = &th->th.th_reap_state;
        while (*state != KMP_SAFE_TO_REAP) {
#if KMP_OS_WINDOWS
          // On Windows a thread can be killed at any time, check this
          DWORD ecode;
          if (!__kmp_is_thread_alive(th, &ecode)) {
            *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
            break;
          }
#endif
          // first check if thread is sleeping
          if (th->th.th_sleep_loc)
            __kmp_null_resume_wrapper(thr: th);
          KMP_CPU_PAUSE();
        }
      }

      // Delete task teams
      int tt_idx;
      for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
        kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
        if (task_team != NULL) {
          for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
            KMP_DEBUG_ASSERT(team->t.t_threads[f]);
            team->t.t_threads[f]->th.th_task_team = NULL;
          }
          KA_TRACE(
              20,
              ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
               __kmp_get_gtid(), task_team, team->t.t_id));
#if KMP_NESTED_HOT_TEAMS
          __kmp_free_task_team(thread: master, task_team);
#endif
          team->t.t_task_team[tt_idx] = NULL;
        }
      }
    }

    // Before clearing parent pointer, check if nested_nth list should be freed
    if (team->t.t_nested_nth && team->t.t_nested_nth != &__kmp_nested_nth &&
        team->t.t_nested_nth != team->t.t_parent->t.t_nested_nth) {
      KMP_INTERNAL_FREE(team->t.t_nested_nth->nth);
      KMP_INTERNAL_FREE(team->t.t_nested_nth);
    }
    team->t.t_nested_nth = NULL;

    // Reset pointer to parent team only for non-hot teams.
    team->t.t_parent = NULL;
    team->t.t_level = 0;
    team->t.t_active_level = 0;

    /* free the worker threads */
    for (f = 1; f < team->t.t_nproc; ++f) {
      KMP_DEBUG_ASSERT(team->t.t_threads[f]);
      if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
        // Mark thread as transitioning out of the team (1 -> 2) so the
        // distributed-barrier release below can observe it.
        (void)KMP_COMPARE_AND_STORE_ACQ32(
            &(team->t.t_threads[f]->th.th_used_in_team), 1, 2);
      }
      __kmp_free_thread(team->t.t_threads[f]);
    }

    if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
      if (team->t.b) {
        // wake up thread at old location
        team->t.b->go_release();
        if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
          for (f = 1; f < team->t.t_nproc; ++f) {
            if (team->t.b->sleep[f].sleep) {
              __kmp_atomic_resume_64(
                  target_gtid: team->t.t_threads[f]->th.th_info.ds.ds_gtid,
                  flag: (kmp_atomic_flag_64<> *)NULL);
            }
          }
        }
        // Wait for threads to be removed from team
        for (int f = 1; f < team->t.t_nproc; ++f) {
          while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
            KMP_CPU_PAUSE();
        }
      }
    }

    for (f = 1; f < team->t.t_nproc; ++f) {
      team->t.t_threads[f] = NULL;
    }

    if (team->t.t_max_nproc > 1 &&
        __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
      distributedBarrier::deallocate(db: team->t.b);
      team->t.b = NULL;
    }
    /* put the team back in the team pool */
    /* TODO limit size of team pool, call reap_team if pool too large */
    team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
    __kmp_team_pool = (volatile kmp_team_t *)team;
  } else { // Check if team was created for primary threads in teams construct
    // See if first worker is a CG root
    KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
                     team->t.t_threads[1]->th.th_cg_roots);
    if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
      // Clean up the CG root nodes on workers so that this team can be re-used
      for (f = 1; f < team->t.t_nproc; ++f) {
        kmp_info_t *thr = team->t.t_threads[f];
        KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
                         thr->th.th_cg_roots->cg_root == thr);
        // Pop current CG root off list
        kmp_cg_root_t *tmp = thr->th.th_cg_roots;
        thr->th.th_cg_roots = tmp->up;
        KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
                       " up to node %p. cg_nthreads was %d\n",
                       thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
        int i = tmp->cg_nthreads--;
        if (i == 1) {
          __kmp_free(tmp); // free CG if we are the last thread in it
        }
        // Restore current task's thread_limit from CG root
        if (thr->th.th_cg_roots)
          thr->th.th_current_task->td_icvs.thread_limit =
              thr->th.th_cg_roots->cg_thread_limit;
      }
    }
  }

  KMP_MB();
}
5847
5848/* reap the team. destroy it, reclaim all its resources and free its memory */
5849kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5850 kmp_team_t *next_pool = team->t.t_next_pool;
5851
5852 KMP_DEBUG_ASSERT(team);
5853 KMP_DEBUG_ASSERT(team->t.t_dispatch);
5854 KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5855 KMP_DEBUG_ASSERT(team->t.t_threads);
5856 KMP_DEBUG_ASSERT(team->t.t_argv);
5857
5858 /* TODO clean the threads that are a part of this? */
5859
5860 /* free stuff */
5861 __kmp_free_team_arrays(team);
5862 if (team->t.t_argv != &team->t.t_inline_argv[0])
5863 __kmp_free((void *)team->t.t_argv);
5864 __kmp_free(team);
5865
5866 KMP_MB();
5867 return next_pool;
5868}
5869
// Free the thread. Don't reap it, just place it on the pool of available
// threads.
//
// Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
// binding for the affinity mechanism to be useful.
//
// Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
// However, we want to avoid a potential performance problem by always
// scanning through the list to find the correct point at which to insert
// the thread (potential N**2 behavior). To do this we keep track of the
// last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
// With single-level parallelism, threads will always be added to the tail
// of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
// parallelism, all bets are off and we may need to scan through the entire
// free list.
//
// This change also has a potentially large performance benefit, for some
// applications. Previously, as threads were freed from the hot team, they
// would be placed back on the free list in inverse order. If the hot team
// grew back to its original size, then the freed thread would be placed
// back on the hot team in reverse order. This could cause bad cache
// locality problems on programs where the size of the hot team regularly
// grew and shrunk.
//
// Now, for single-level parallelism, the OMP tid is always == gtid.
void __kmp_free_thread(kmp_info_t *this_th) {
  int gtid;
  kmp_info_t **scan;

  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
                __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));

  KMP_DEBUG_ASSERT(this_th);

  // When moving thread to pool, switch thread to wait on own b_go flag, and
  // uninitialized (NULL team).
  int b;
  kmp_balign_t *balign = this_th->th.th_bar;
  for (b = 0; b < bs_last_barrier; ++b) {
    if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
      balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
    balign[b].bb.team = NULL;
    balign[b].bb.leaf_kids = 0;
  }
  this_th->th.th_task_state = 0;
  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;

  /* put thread back on the free pool */
  TCW_PTR(this_th->th.th_team, NULL);
  TCW_PTR(this_th->th.th_root, NULL);
  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */

  // Unwind the thread's contention-group chain, dropping its reference on
  // each node; stops after popping one node if the thread is a CG worker
  // rather than a CG root.
  while (this_th->th.th_cg_roots) {
    this_th->th.th_cg_roots->cg_nthreads--;
    KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
                   " %p of thread %p to %d\n",
                   this_th, this_th->th.th_cg_roots,
                   this_th->th.th_cg_roots->cg_root,
                   this_th->th.th_cg_roots->cg_nthreads));
    kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
    if (tmp->cg_root == this_th) { // Thread is a cg_root
      KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
      KA_TRACE(
          5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
      this_th->th.th_cg_roots = tmp->up;
      __kmp_free(tmp);
    } else { // Worker thread
      if (tmp->cg_nthreads == 0) { // last thread leaves contention group
        __kmp_free(tmp);
      }
      this_th->th.th_cg_roots = NULL;
      break;
    }
  }

  /* If the implicit task assigned to this thread can be used by other threads
   * -> multiple threads can share the data and try to free the task at
   * __kmp_reap_thread at exit. This duplicate use of the task data can happen
   * with higher probability when hot team is disabled but can occur even when
   * the hot team is enabled */
  __kmp_free_implicit_task(this_thr: this_th);
  this_th->th.th_current_task = NULL;

  // If the __kmp_thread_pool_insert_pt is already past the new insert
  // point, then we need to re-scan the entire list.
  gtid = this_th->th.th_info.ds.ds_gtid;
  if (__kmp_thread_pool_insert_pt != NULL) {
    KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
    if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
      __kmp_thread_pool_insert_pt = NULL;
    }
  }

  // Scan down the list to find the place to insert the thread.
  // scan is the address of a link in the list, possibly the address of
  // __kmp_thread_pool itself.
  //
  // In the absence of nested parallelism, the for loop will have 0 iterations.
  if (__kmp_thread_pool_insert_pt != NULL) {
    scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
  } else {
    scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
  }
  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
       scan = &((*scan)->th.th_next_pool))
    ;

  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
  // to its address.
  TCW_PTR(this_th->th.th_next_pool, *scan);
  __kmp_thread_pool_insert_pt = *scan = this_th;
  // Sorted-order invariant: our gtid precedes our successor's gtid.
  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
                   (this_th->th.th_info.ds.ds_gtid <
                    this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
  TCW_4(this_th->th.th_in_pool, TRUE);
  __kmp_suspend_initialize_thread(th: this_th);
  __kmp_lock_suspend_mx(th: this_th);
  if (this_th->th.th_active == TRUE) {
    KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
    this_th->th.th_active_in_pool = TRUE;
  }
#if KMP_DEBUG
  else {
    KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
  }
#endif
  __kmp_unlock_suspend_mx(th: this_th);

  TCW_4(__kmp_nth, __kmp_nth - 1);

#ifdef KMP_ADJUST_BLOCKTIME
  /* Adjust blocktime back to user setting or default if necessary */
  /* Middle initialization might never have occurred */
  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
    KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
    if (__kmp_nth <= __kmp_avail_proc) {
      __kmp_zero_bt = FALSE;
    }
  }
#endif /* KMP_ADJUST_BLOCKTIME */

  KMP_MB();
}
6013
6014/* ------------------------------------------------------------------------ */
6015
/* Worker thread main loop: each non-root thread runs this for its whole
   lifetime. The thread waits at the fork barrier until it is assigned to a
   team, invokes that team's microtask, passes the join barrier, and goes back
   to waiting. The loop ends when shutdown sets __kmp_global.g.g_done.
   Returns this_thr (the thread descriptor) to the OS-level entry wrapper. */
void *__kmp_launch_thread(kmp_info_t *this_thr) {
#if OMP_PROFILING_SUPPORT
  ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
  // TODO: add a configuration option for time granularity
  if (ProfileTraceFile)
    llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
#endif

  int gtid = this_thr->th.th_info.ds.ds_gtid;
  /* void *stack_data;*/
  kmp_team_t **volatile pteam;

  KMP_MB();
  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));

  if (__kmp_env_consistency_check) {
    this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
  }

#if OMPD_SUPPORT
  if (ompd_state & OMPD_ENABLE_BP)
    ompd_bp_thread_begin();
#endif

#if OMPT_SUPPORT
  // Announce the thread to the OMPT tool and start it in "overhead" state.
  ompt_data_t *thread_data = nullptr;
  if (ompt_enabled.enabled) {
    thread_data = &(this_thr->th.ompt_thread_info.thread_data);
    *thread_data = ompt_data_none;

    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
    this_thr->th.ompt_thread_info.wait_id = 0;
    this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
    this_thr->th.ompt_thread_info.parallel_flags = 0;
    if (ompt_enabled.ompt_callback_thread_begin) {
      ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
          ompt_thread_worker, thread_data);
    }
    this_thr->th.ompt_thread_info.state = ompt_state_idle;
  }
#endif

  /* This is the place where threads wait for work */
  while (!TCR_4(__kmp_global.g.g_done)) {
    KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
    KMP_MB();

    /* wait for work to do */
    KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));

    /* No tid yet since not part of a team */
    __kmp_fork_barrier(gtid, KMP_GTID_DNE);

#if OMPT_SUPPORT
    if (ompt_enabled.enabled) {
      this_thr->th.ompt_thread_info.state = ompt_state_overhead;
    }
#endif

    pteam = &this_thr->th.th_team;

    /* have we been allocated? */
    if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
      /* we were just woken up, so run our new task */
      if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
        int rc;
        KA_TRACE(20,
                 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
                  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
                  (*pteam)->t.t_pkfn));

        updateHWFPControl(team: *pteam);

#if OMPT_SUPPORT
        if (ompt_enabled.enabled) {
          this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
        }
#endif

        // Run the team's microtask; a zero return is a fatal error.
        rc = (*pteam)->t.t_invoke(gtid);
        KMP_ASSERT(rc);

        KMP_MB();
        KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
                      gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
                      (*pteam)->t.t_pkfn));
      }
#if OMPT_SUPPORT
      if (ompt_enabled.enabled) {
        /* no frame set while outside task */
        __ompt_get_task_info_object(depth: 0)->frame.exit_frame = ompt_data_none;

        this_thr->th.ompt_thread_info.state = ompt_state_overhead;
      }
#endif
      /* join barrier after parallel region */
      __kmp_join_barrier(gtid);
    }
  }

#if OMPD_SUPPORT
  if (ompd_state & OMPD_ENABLE_BP)
    ompd_bp_thread_end();
#endif

#if OMPT_SUPPORT
  if (ompt_enabled.ompt_callback_thread_end) {
    ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
  }
#endif

  this_thr->th.th_task_team = NULL;
  /* run the destructors for the threadprivate data for this thread */
  __kmp_common_destroy_gtid(gtid);

  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
  KMP_MB();

#if OMP_PROFILING_SUPPORT
  llvm::timeTraceProfilerFinishThread();
#endif
  return this_thr;
}
6139
6140/* ------------------------------------------------------------------------ */
6141
6142void __kmp_internal_end_dest(void *specific_gtid) {
6143 // Make sure no significant bits are lost
6144 int gtid;
6145 __kmp_type_convert(src: (kmp_intptr_t)specific_gtid - 1, dest: &gtid);
6146
6147 KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
6148 /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage
6149 * this is because 0 is reserved for the nothing-stored case */
6150
6151 __kmp_internal_end_thread(gtid);
6152}
6153
6154#if KMP_OS_UNIX && KMP_DYNAMIC_LIB
6155
// Shared-library destructor (Unix dynamic builds only): runs automatically
// when the library is unloaded and funnels into the common atexit shutdown.
__attribute__((destructor)) void __kmp_internal_end_dtor(void) {
  __kmp_internal_end_atexit();
}
6159
6160#endif
6161
6162/* [Windows] josh: when the atexit handler is called, there may still be more
6163 than one thread alive */
void __kmp_internal_end_atexit(void) {
  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
  /* [Windows]
     josh: ideally, we want to completely shutdown the library in this atexit
     handler, but stat code that depends on thread specific data for gtid fails
     because that data becomes unavailable at some point during the shutdown, so
     we call __kmp_internal_end_thread instead. We should eventually remove the
     dependency on __kmp_get_specific_gtid in the stat code and use
     __kmp_internal_end_library to cleanly shutdown the library.

     // TODO: Can some of this comment about GVS be removed?
     I suspect that the offending stat code is executed when the calling thread
     tries to clean up a dead root thread's data structures, resulting in GVS
     code trying to close the GVS structures for that thread, but since the stat
     code uses __kmp_get_specific_gtid to get the gtid with the assumption that
     the calling thread is cleaning up itself instead of another thread, it get
     confused. This happens because allowing a thread to unregister and cleanup
     another thread is a recent modification for addressing an issue.
     Based on the current design (20050722), a thread may end up
     trying to unregister another thread only if thread death does not trigger
     the calling of __kmp_internal_end_thread. For Linux* OS, there is the
     thread specific data destructor function to detect thread death. For
     Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
     is nothing. Thus, the workaround is applicable only for Windows static
     stat library. */
  // Pass -1: this handler may run on a thread whose gtid is not available, so
  // let the callee look the gtid up from thread-local storage itself.
  __kmp_internal_end_library(gtid: -1);
#if KMP_OS_WINDOWS
  __kmp_close_console();
#endif
}
6194
/* Release all resources owned by `thread` and free its descriptor during
   shutdown. For a worker (!is_root) this first wakes the thread out of the
   fork barrier so it can terminate, then joins the underlying OS thread and
   fixes up the pool-active counter. */
static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
  // It is assumed __kmp_forkjoin_lock is acquired.

  int gtid;

  KMP_DEBUG_ASSERT(thread != NULL);

  gtid = thread->th.th_info.ds.ds_gtid;

  if (!is_root) {
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
      /* Assume the threads are at the fork barrier here */
      KA_TRACE(
          20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
               gtid));
      if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
        while (
            !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
          KMP_CPU_PAUSE();
        __kmp_resume_32(target_gtid: gtid, flag: (kmp_flag_32<false, false> *)NULL);
      } else {
        /* Need release fence here to prevent seg faults for tree forkjoin
           barrier (GEH) */
        kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
                           thread);
        __kmp_release_64(flag: &flag);
      }
    }

    // Terminate OS thread.
    __kmp_reap_worker(th: thread);

    // The thread was killed asynchronously.  If it was actively
    // spinning in the thread pool, decrement the global count.
    //
    // There is a small timing hole here - if the worker thread was just waking
    // up after sleeping in the pool, had reset it's th_active_in_pool flag but
    // not decremented the global counter __kmp_thread_pool_active_nth yet, then
    // the global counter might not get updated.
    //
    // Currently, this can only happen as the library is unloaded,
    // so there are no harmful side effects.
    if (thread->th.th_active_in_pool) {
      thread->th.th_active_in_pool = FALSE;
      KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
      KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
    }
  }

  __kmp_free_implicit_task(this_thr: thread);

// Free the fast memory for tasking
#if USE_FAST_MEMORY
  __kmp_free_fast_memory(this_thr: thread);
#endif /* USE_FAST_MEMORY */

  __kmp_suspend_uninitialize_thread(th: thread);

  // Unlink the descriptor from the global thread table.
  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);

  --__kmp_all_nth;
  // __kmp_nth was decremented when thread is added to the pool.

#ifdef KMP_ADJUST_BLOCKTIME
  /* Adjust blocktime back to user setting or default if necessary */
  /* Middle initialization might never have occurred */
  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
    KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
    if (__kmp_nth <= __kmp_avail_proc) {
      __kmp_zero_bt = FALSE;
    }
  }
#endif /* KMP_ADJUST_BLOCKTIME */

  /* free the memory being used */
  if (__kmp_env_consistency_check) {
    if (thread->th.th_cons) {
      __kmp_free_cons_stack(ptr: thread->th.th_cons);
      thread->th.th_cons = NULL;
    }
  }

  if (thread->th.th_pri_common != NULL) {
    __kmp_free(thread->th.th_pri_common);
    thread->th.th_pri_common = NULL;
  }

#if KMP_USE_BGET
  if (thread->th.th_local.bget_data != NULL) {
    __kmp_finalize_bget(th: thread);
  }
#endif

#if KMP_AFFINITY_SUPPORTED
  if (thread->th.th_affin_mask != NULL) {
    KMP_CPU_FREE(thread->th.th_affin_mask);
    thread->th.th_affin_mask = NULL;
  }
#endif /* KMP_AFFINITY_SUPPORTED */

#if KMP_USE_HIER_SCHED
  if (thread->th.th_hier_bar_data != NULL) {
    __kmp_free(thread->th.th_hier_bar_data);
    thread->th.th_hier_bar_data = NULL;
  }
#endif

  __kmp_reap_team(team: thread->th.th_serial_team);
  thread->th.th_serial_team = NULL;
  __kmp_free(thread);

  KMP_MB();

} // __kmp_reap_thread
6310
6311static void __kmp_itthash_clean(kmp_info_t *th) {
6312#if USE_ITT_NOTIFY
6313 if (__kmp_itt_region_domains.count > 0) {
6314 for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6315 kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
6316 while (bucket) {
6317 kmp_itthash_entry_t *next = bucket->next_in_bucket;
6318 __kmp_thread_free(th, bucket);
6319 bucket = next;
6320 }
6321 }
6322 }
6323 if (__kmp_itt_barrier_domains.count > 0) {
6324 for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6325 kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
6326 while (bucket) {
6327 kmp_itthash_entry_t *next = bucket->next_in_bucket;
6328 __kmp_thread_free(th, bucket);
6329 bucket = next;
6330 }
6331 }
6332 }
6333#endif
6334}
6335
/* Common shutdown worker for __kmp_internal_end_{library,thread}; callers
   hold __kmp_initz_lock and __kmp_forkjoin_lock. Unregisters the library and
   sets the global "done" flag; then, if some root is still active, only the
   monitor is reaped, otherwise all pooled worker threads, pooled teams, and
   task teams are reaped before final cleanup. */
static void __kmp_internal_end(void) {
  int i;

  /* First, unregister the library */
  __kmp_unregister_library();

#if KMP_OS_WINDOWS
  /* In Win static library, we can't tell when a root actually dies, so we
     reclaim the data structures for any root threads that have died but not
     unregistered themselves, in order to shut down cleanly.
     In Win dynamic library we also can't tell when a thread dies. */
  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
// dead roots
#endif

  // Scan for a still-active root; loop exit with i < capacity means one exists.
  for (i = 0; i < __kmp_threads_capacity; i++)
    if (__kmp_root[i])
      if (__kmp_root[i]->r.r_active)
        break;
  KMP_MB(); /* Flush all pending memory write invalidates. */
  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);

  if (i < __kmp_threads_capacity) {
#if KMP_USE_MONITOR
    // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
    KMP_MB(); /* Flush all pending memory write invalidates. */

    // Need to check that monitor was initialized before reaping it. If we are
    // called form __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
    // __kmp_monitor will appear to contain valid data, but it is only valid in
    // the parent process, not the child.
    // New behavior (201008): instead of keying off of the flag
    // __kmp_init_parallel, the monitor thread creation is keyed off
    // of the new flag __kmp_init_monitor.
    __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
    if (TCR_4(__kmp_init_monitor)) {
      __kmp_reap_monitor(&__kmp_monitor);
      TCW_4(__kmp_init_monitor, 0);
    }
    __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
    KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
#endif // KMP_USE_MONITOR
  } else {
/* TODO move this to cleanup code */
#ifdef KMP_DEBUG
    /* make sure that everything has properly ended */
    for (i = 0; i < __kmp_threads_capacity; i++) {
      if (__kmp_root[i]) {
        // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
        // there can be uber threads alive here
        KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
      }
    }
#endif

    KMP_MB();

    // Reap the worker threads.
    // This is valid for now, but be careful if threads are reaped sooner.
    while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool.
      // Get the next thread from the pool.
      kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
      __kmp_thread_pool = thread->th.th_next_pool;
      // Reap it.
      KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
      thread->th.th_next_pool = NULL;
      thread->th.th_in_pool = FALSE;
      __kmp_reap_thread(thread, is_root: 0);
    }
    __kmp_thread_pool_insert_pt = NULL;

    // Reap teams.
    while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
      // Get the next team from the pool.
      kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
      __kmp_team_pool = team->t.t_next_pool;
      // Reap it.
      team->t.t_next_pool = NULL;
      __kmp_reap_team(team);
    }

    __kmp_reap_task_teams();

#if KMP_OS_UNIX
    // Threads that are not reaped should not access any resources since they
    // are going to be deallocated soon, so the shutdown sequence should wait
    // until all threads either exit the final spin-waiting loop or begin
    // sleeping after the given blocktime.
    for (i = 0; i < __kmp_threads_capacity; i++) {
      kmp_info_t *thr = __kmp_threads[i];
      while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
        KMP_CPU_PAUSE();
    }
#endif

    for (i = 0; i < __kmp_threads_capacity; ++i) {
      // TBD: Add some checking...
      // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
    }

    /* Make sure all threadprivate destructors get run by joining with all
       worker threads before resetting this flag */
    TCW_SYNC_4(__kmp_init_common, FALSE);

    KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
    KMP_MB();

#if KMP_USE_MONITOR
    // See note above: One of the possible fixes for CQ138434 / CQ140126
    //
    // FIXME: push both code fragments down and CSE them?
    // push them into __kmp_cleanup() ?
    __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
    if (TCR_4(__kmp_init_monitor)) {
      __kmp_reap_monitor(&__kmp_monitor);
      TCW_4(__kmp_init_monitor, 0);
    }
    __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
    KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
#endif
  } /* else !__kmp_global.t_active */
  TCW_4(__kmp_init_gtid, FALSE);
  KMP_MB(); /* Flush all pending memory write invalidates. */

  __kmp_cleanup();
#if OMPT_SUPPORT
  ompt_fini();
#endif
}
6465
/* Full library shutdown entry point (atexit / destructor paths).
   gtid_req >= 0 identifies the calling thread; pass -1 to look the gtid up
   from thread-local storage. Returns without tearing down when the runtime
   is already finished/aborting, or when the calling root is still inside an
   active parallel region. */
void __kmp_internal_end_library(int gtid_req) {
  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
  /* this shouldn't be a race condition because __kmp_internal_end() is the
     only place to clear __kmp_serial_init */
  /* we'll check this later too, after we get the lock */
  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
  // redundant, because the next check will work in any case.
  if (__kmp_global.g.g_abort) {
    KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
    /* TODO abort? */
    return;
  }
  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
    KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
    return;
  }

  // If hidden helper team has been initialized, we need to deinit it
  if (TCR_4(__kmp_init_hidden_helper) &&
      !TCR_4(__kmp_hidden_helper_team_done)) {
    TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
    // First release the main thread to let it continue its work
    __kmp_hidden_helper_main_thread_release();
    // Wait until the hidden helper team has been destroyed
    __kmp_hidden_helper_threads_deinitz_wait();
  }

  KMP_MB(); /* Flush all pending memory write invalidates. */
  /* find out who we are and what we should do */
  {
    int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
    KA_TRACE(
        10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
    if (gtid == KMP_GTID_SHUTDOWN) {
      KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
                    "already shutdown\n"));
      return;
    } else if (gtid == KMP_GTID_MONITOR) {
      KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
                    "registered, or system shutdown\n"));
      return;
    } else if (gtid == KMP_GTID_DNE) {
      KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
                    "shutdown\n"));
      /* we don't know who we are, but we may still shutdown the library */
    } else if (KMP_UBER_GTID(gtid)) {
      /* unregister ourselves as an uber thread. gtid is no longer valid */
      if (__kmp_root[gtid]->r.r_active) {
        __kmp_global.g.g_abort = -1;
        TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
        __kmp_unregister_library();
        KA_TRACE(10,
                 ("__kmp_internal_end_library: root still active, abort T#%d\n",
                  gtid));
        return;
      } else {
        __kmp_itthash_clean(th: __kmp_threads[gtid]);
        KA_TRACE(
            10,
            ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
        __kmp_unregister_root_current_thread(gtid);
      }
    } else {
/* worker threads may call this function through the atexit handler, if they
 * call exit() */
/* For now, skip the usual subsequent processing and just dump the debug buffer.
   TODO: do a thorough shutdown instead */
#ifdef DUMP_DEBUG_ON_EXIT
      if (__kmp_debug_buf)
        __kmp_dump_debug_buffer();
#endif
      // added unregister library call here when we switch to shm linux
      // if we don't, it will leave lots of files in /dev/shm
      // cleanup shared memory file before exiting.
      __kmp_unregister_library();
      return;
    }
  }
  /* synchronize the termination process */
  __kmp_acquire_bootstrap_lock(lck: &__kmp_initz_lock);

  /* have we already finished */
  if (__kmp_global.g.g_abort) {
    KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
    /* TODO abort? */
    __kmp_release_bootstrap_lock(lck: &__kmp_initz_lock);
    return;
  }
  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
    __kmp_release_bootstrap_lock(lck: &__kmp_initz_lock);
    return;
  }

  /* We need this lock to enforce mutex between this reading of
     __kmp_threads_capacity and the writing by __kmp_register_root.
     Alternatively, we can use a counter of roots that is atomically updated by
     __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
     __kmp_internal_end_*. */
  __kmp_acquire_bootstrap_lock(lck: &__kmp_forkjoin_lock);

  /* now we can safely conduct the actual termination */
  __kmp_internal_end();

  __kmp_release_bootstrap_lock(lck: &__kmp_forkjoin_lock);
  __kmp_release_bootstrap_lock(lck: &__kmp_initz_lock);

  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));

#ifdef DUMP_DEBUG_ON_EXIT
  if (__kmp_debug_buf)
    __kmp_dump_debug_buffer();
#endif

#if KMP_OS_WINDOWS
  __kmp_close_console();
#endif

  __kmp_fini_allocator();

} // __kmp_internal_end_library
6586
/* Per-thread termination entry point (TLS destructor / thread-exit path).
   Like __kmp_internal_end_library, but a plain worker thread only detaches
   itself, and full teardown happens only when no uber (root) threads remain;
   for dynamic libraries it is further deferred to the library destructor
   unless the runtime is hard-paused. */
void __kmp_internal_end_thread(int gtid_req) {
  int i;

  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
  /* this shouldn't be a race condition because __kmp_internal_end() is the
   * only place to clear __kmp_serial_init */
  /* we'll check this later too, after we get the lock */
  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
  // redundant, because the next check will work in any case.
  if (__kmp_global.g.g_abort) {
    KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
    /* TODO abort? */
    return;
  }
  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
    KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
    return;
  }

  // If hidden helper team has been initialized, we need to deinit it
  if (TCR_4(__kmp_init_hidden_helper) &&
      !TCR_4(__kmp_hidden_helper_team_done)) {
    TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
    // First release the main thread to let it continue its work
    __kmp_hidden_helper_main_thread_release();
    // Wait until the hidden helper team has been destroyed
    __kmp_hidden_helper_threads_deinitz_wait();
  }

  KMP_MB(); /* Flush all pending memory write invalidates. */

  /* find out who we are and what we should do */
  {
    int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
    KA_TRACE(10,
             ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
    if (gtid == KMP_GTID_SHUTDOWN) {
      KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
                    "already shutdown\n"));
      return;
    } else if (gtid == KMP_GTID_MONITOR) {
      KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
                    "registered, or system shutdown\n"));
      return;
    } else if (gtid == KMP_GTID_DNE) {
      KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
                    "shutdown\n"));
      return;
      /* we don't know who we are */
    } else if (KMP_UBER_GTID(gtid)) {
      /* unregister ourselves as an uber thread. gtid is no longer valid */
      if (__kmp_root[gtid]->r.r_active) {
        __kmp_global.g.g_abort = -1;
        TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
        KA_TRACE(10,
                 ("__kmp_internal_end_thread: root still active, abort T#%d\n",
                  gtid));
        return;
      } else {
        KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
                      gtid));
        __kmp_unregister_root_current_thread(gtid);
      }
    } else {
      /* just a worker thread, let's leave */
      KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));

      if (gtid >= 0) {
        __kmp_threads[gtid]->th.th_task_team = NULL;
      }

      KA_TRACE(10,
               ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
                gtid));
      return;
    }
  }
#if KMP_DYNAMIC_LIB
  if (__kmp_pause_status != kmp_hard_paused)
  // AC: lets not shutdown the dynamic library at the exit of uber thread,
  // because we will better shutdown later in the library destructor.
  {
    KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
    return;
  }
#endif
  /* synchronize the termination process */
  __kmp_acquire_bootstrap_lock(lck: &__kmp_initz_lock);

  /* have we already finished */
  if (__kmp_global.g.g_abort) {
    KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
    /* TODO abort? */
    __kmp_release_bootstrap_lock(lck: &__kmp_initz_lock);
    return;
  }
  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
    __kmp_release_bootstrap_lock(lck: &__kmp_initz_lock);
    return;
  }

  /* We need this lock to enforce mutex between this reading of
     __kmp_threads_capacity and the writing by __kmp_register_root.
     Alternatively, we can use a counter of roots that is atomically updated by
     __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
     __kmp_internal_end_*. */

  /* should we finish the run-time? are all siblings done? */
  __kmp_acquire_bootstrap_lock(lck: &__kmp_forkjoin_lock);

  for (i = 0; i < __kmp_threads_capacity; ++i) {
    if (KMP_UBER_GTID(gtid: i)) {
      KA_TRACE(
          10,
          ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
      __kmp_release_bootstrap_lock(lck: &__kmp_forkjoin_lock);
      __kmp_release_bootstrap_lock(lck: &__kmp_initz_lock);
      return;
    }
  }

  /* now we can safely conduct the actual termination */

  __kmp_internal_end();

  __kmp_release_bootstrap_lock(lck: &__kmp_forkjoin_lock);
  __kmp_release_bootstrap_lock(lck: &__kmp_initz_lock);

  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));

#ifdef DUMP_DEBUG_ON_EXIT
  if (__kmp_debug_buf)
    __kmp_dump_debug_buffer();
#endif
} // __kmp_internal_end_thread
6722
6723// -----------------------------------------------------------------------------
6724// Library registration stuff.
6725
// Random value used to indicate library initialization.
static long __kmp_registration_flag = 0;
// Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
static char *__kmp_registration_str = NULL;
6730
// Build the process-unique name used for the registration env var / shm file.
// Caller owns the returned string.
static inline char *__kmp_reg_status_name() {
/* On RHEL 3u5 if linked statically, getpid() returns different values in
   each thread. If registration and unregistration go in different threads
   (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
   env var can not be found, because the name will contain different pid. */
// macOS* complains about name being too long with additional getuid()
#if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
  return __kmp_str_format(format: "__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
                          (int)getuid());
#else
  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
#endif
} // __kmp_reg_status_name
6744
#if defined(KMP_USE_SHM)
// True when /dev/shm (POSIX shared memory) is usable for library registration.
bool __kmp_shm_available = false;
// True when /tmp is usable as a fallback registration location.
bool __kmp_tmp_available = false;
// If /dev/shm is not accessible, we will create a temporary file under /tmp.
char *temp_reg_status_file_name = nullptr;
#endif
6751
6752void __kmp_register_library_startup(void) {
6753
6754 char *name = __kmp_reg_status_name(); // Name of the environment variable.
6755 int done = 0;
6756 union {
6757 double dtime;
6758 long ltime;
6759 } time;
6760#if KMP_ARCH_X86 || KMP_ARCH_X86_64
6761 __kmp_initialize_system_tick();
6762#endif
6763 __kmp_read_system_time(delta: &time.dtime);
6764 __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6765 __kmp_registration_str =
6766 __kmp_str_format(format: "%p-%lx-%s", &__kmp_registration_flag,
6767 __kmp_registration_flag, KMP_LIBRARY_FILE);
6768
6769 KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6770 __kmp_registration_str));
6771
6772 while (!done) {
6773
6774 char *value = NULL; // Actual value of the environment variable.
6775
6776#if defined(KMP_USE_SHM)
6777 char *shm_name = nullptr;
6778 char *data1 = nullptr;
6779 __kmp_shm_available = __kmp_detect_shm();
6780 if (__kmp_shm_available) {
6781 int fd1 = -1;
6782 shm_name = __kmp_str_format(format: "/%s", name);
6783 int shm_preexist = 0;
6784 fd1 = shm_open(name: shm_name, O_CREAT | O_EXCL | O_RDWR, mode: 0600);
6785 if ((fd1 == -1) && (errno == EEXIST)) {
6786 // file didn't open because it already exists.
6787 // try opening existing file
6788 fd1 = shm_open(name: shm_name, O_RDWR, mode: 0600);
6789 if (fd1 == -1) { // file didn't open
6790 KMP_WARNING(FunctionError, "Can't open SHM");
6791 __kmp_shm_available = false;
6792 } else { // able to open existing file
6793 shm_preexist = 1;
6794 }
6795 }
6796 if (__kmp_shm_available && shm_preexist == 0) { // SHM created, set size
6797 if (ftruncate(fd: fd1, SHM_SIZE) == -1) { // error occured setting size;
6798 KMP_WARNING(FunctionError, "Can't set size of SHM");
6799 __kmp_shm_available = false;
6800 }
6801 }
6802 if (__kmp_shm_available) { // SHM exists, now map it
6803 data1 = (char *)mmap(addr: 0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
6804 fd: fd1, offset: 0);
6805 if (data1 == MAP_FAILED) { // failed to map shared memory
6806 KMP_WARNING(FunctionError, "Can't map SHM");
6807 __kmp_shm_available = false;
6808 }
6809 }
6810 if (__kmp_shm_available) { // SHM mapped
6811 if (shm_preexist == 0) { // set data to SHM, set value
6812 KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6813 }
6814 // Read value from either what we just wrote or existing file.
6815 value = __kmp_str_format(format: "%s", data1); // read value from SHM
6816 munmap(addr: data1, SHM_SIZE);
6817 }
6818 if (fd1 != -1)
6819 close(fd: fd1);
6820 }
6821 if (!__kmp_shm_available)
6822 __kmp_tmp_available = __kmp_detect_tmp();
6823 if (!__kmp_shm_available && __kmp_tmp_available) {
6824 // SHM failed to work due to an error other than that the file already
6825 // exists. Try to create a temp file under /tmp.
6826 // If /tmp isn't accessible, fall back to using environment variable.
6827 // TODO: /tmp might not always be the temporary directory. For now we will
6828 // not consider TMPDIR.
6829 int fd1 = -1;
6830 temp_reg_status_file_name = __kmp_str_format(format: "/tmp/%s", name);
6831 int tmp_preexist = 0;
6832 fd1 = open(file: temp_reg_status_file_name, O_CREAT | O_EXCL | O_RDWR, 0600);
6833 if ((fd1 == -1) && (errno == EEXIST)) {
6834 // file didn't open because it already exists.
6835 // try opening existing file
6836 fd1 = open(file: temp_reg_status_file_name, O_RDWR, 0600);
        if (fd1 == -1) { // file didn't open
6838 KMP_WARNING(FunctionError, "Can't open TEMP");
6839 __kmp_tmp_available = false;
6840 } else {
6841 tmp_preexist = 1;
6842 }
6843 }
6844 if (__kmp_tmp_available && tmp_preexist == 0) {
6845 // we created /tmp file now set size
6846 if (ftruncate(fd: fd1, SHM_SIZE) == -1) { // error occured setting size;
6847 KMP_WARNING(FunctionError, "Can't set size of /tmp file");
6848 __kmp_tmp_available = false;
6849 }
6850 }
6851 if (__kmp_tmp_available) {
6852 data1 = (char *)mmap(addr: 0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
6853 fd: fd1, offset: 0);
6854 if (data1 == MAP_FAILED) { // failed to map /tmp
6855 KMP_WARNING(FunctionError, "Can't map /tmp");
6856 __kmp_tmp_available = false;
6857 }
6858 }
6859 if (__kmp_tmp_available) {
6860 if (tmp_preexist == 0) { // set data to TMP, set value
6861 KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6862 }
6863 // Read value from either what we just wrote or existing file.
6864 value = __kmp_str_format(format: "%s", data1); // read value from SHM
6865 munmap(addr: data1, SHM_SIZE);
6866 }
6867 if (fd1 != -1)
6868 close(fd: fd1);
6869 }
6870 if (!__kmp_shm_available && !__kmp_tmp_available) {
6871 // no /dev/shm and no /tmp -- fall back to environment variable
6872 // Set environment variable, but do not overwrite if it exists.
6873 __kmp_env_set(name, value: __kmp_registration_str, overwrite: 0);
6874 // read value to see if it got set
6875 value = __kmp_env_get(name);
6876 }
6877#else // Windows and unix with static library
6878 // Set environment variable, but do not overwrite if it exists.
6879 __kmp_env_set(name, __kmp_registration_str, 0);
6880 // read value to see if it got set
6881 value = __kmp_env_get(name);
6882#endif
6883
6884 if (value != NULL && strcmp(s1: value, s2: __kmp_registration_str) == 0) {
6885 done = 1; // Ok, environment variable set successfully, exit the loop.
6886 } else {
6887 // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6888 // Check whether it alive or dead.
6889 int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6890 char *tail = value;
6891 char *flag_addr_str = NULL;
6892 char *flag_val_str = NULL;
6893 char const *file_name = NULL;
6894 __kmp_str_split(str: tail, delim: '-', head: &flag_addr_str, tail: &tail);
6895 __kmp_str_split(str: tail, delim: '-', head: &flag_val_str, tail: &tail);
6896 file_name = tail;
6897 if (tail != NULL) {
6898 unsigned long *flag_addr = 0;
6899 unsigned long flag_val = 0;
6900 KMP_SSCANF(s: flag_addr_str, format: "%p", RCAST(void **, &flag_addr));
6901 KMP_SSCANF(s: flag_val_str, format: "%lx", &flag_val);
6902 if (flag_addr != 0 && flag_val != 0 && strcmp(s1: file_name, s2: "") != 0) {
6903 // First, check whether environment-encoded address is mapped into
6904 // addr space.
6905 // If so, dereference it to see if it still has the right value.
6906 if (__kmp_is_address_mapped(addr: flag_addr) && *flag_addr == flag_val) {
6907 neighbor = 1;
6908 } else {
6909 // If not, then we know the other copy of the library is no longer
6910 // running.
6911 neighbor = 2;
6912 }
6913 }
6914 }
6915 switch (neighbor) {
6916 case 0: // Cannot parse environment variable -- neighbor status unknown.
6917 // Assume it is the incompatible format of future version of the
6918 // library. Assume the other library is alive.
6919 // WARN( ... ); // TODO: Issue a warning.
6920 file_name = "unknown library";
6921 KMP_FALLTHROUGH();
6922 // Attention! Falling to the next case. That's intentional.
6923 case 1: { // Neighbor is alive.
6924 // Check it is allowed.
6925 char *duplicate_ok = __kmp_env_get(name: "KMP_DUPLICATE_LIB_OK");
6926 if (!__kmp_str_match_true(data: duplicate_ok)) {
6927 // That's not allowed. Issue fatal error.
6928 __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6929 KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6930 }
6931 KMP_INTERNAL_FREE(duplicate_ok);
6932 __kmp_duplicate_library_ok = 1;
6933 done = 1; // Exit the loop.
6934 } break;
6935 case 2: { // Neighbor is dead.
6936
6937#if defined(KMP_USE_SHM)
6938 if (__kmp_shm_available) { // close shared memory.
6939 shm_unlink(name: shm_name); // this removes file in /dev/shm
6940 } else if (__kmp_tmp_available) {
6941 unlink(name: temp_reg_status_file_name); // this removes the temp file
6942 } else {
6943 // Clear the variable and try to register library again.
6944 __kmp_env_unset(name);
6945 }
6946#else
6947 // Clear the variable and try to register library again.
6948 __kmp_env_unset(name);
6949#endif
6950 } break;
6951 default: {
6952 KMP_DEBUG_ASSERT(0);
6953 } break;
6954 }
6955 }
6956 KMP_INTERNAL_FREE((void *)value);
6957#if defined(KMP_USE_SHM)
6958 if (shm_name)
6959 KMP_INTERNAL_FREE((void *)shm_name);
6960#endif
6961 } // while
6962 KMP_INTERNAL_FREE((void *)name);
6963
6964} // func __kmp_register_library_startup
6965
// Undo __kmp_register_library_startup(): read back the registration record
// (kept in /dev/shm, a /tmp file, or an environment variable, depending on
// what was available at registration time) and, if it still holds OUR
// __kmp_registration_str, delete it. Frees all registration bookkeeping
// either way and clears __kmp_registration_flag/__kmp_registration_str.
void __kmp_unregister_library(void) {

  char *name = __kmp_reg_status_name();
  char *value = NULL; // registration record read back from SHM/tmp/env

#if defined(KMP_USE_SHM)
  char *shm_name = nullptr;
  int fd1;
  if (__kmp_shm_available) {
    shm_name = __kmp_str_format(format: "/%s", name);
    fd1 = shm_open(name: shm_name, O_RDONLY, mode: 0600);
    if (fd1 != -1) { // File opened successfully
      char *data1 = (char *)mmap(addr: 0, SHM_SIZE, PROT_READ, MAP_SHARED, fd: fd1, offset: 0);
      if (data1 != MAP_FAILED) {
        value = __kmp_str_format(format: "%s", data1); // read value from SHM
        munmap(addr: data1, SHM_SIZE);
      }
      close(fd: fd1);
    }
  } else if (__kmp_tmp_available) { // try /tmp
    fd1 = open(file: temp_reg_status_file_name, O_RDONLY);
    if (fd1 != -1) { // File opened successfully
      char *data1 = (char *)mmap(addr: 0, SHM_SIZE, PROT_READ, MAP_SHARED, fd: fd1, offset: 0);
      if (data1 != MAP_FAILED) {
        value = __kmp_str_format(format: "%s", data1); // read value from /tmp
        munmap(addr: data1, SHM_SIZE);
      }
      close(fd: fd1);
    }
  } else { // fall back to envirable
    value = __kmp_env_get(name);
  }
#else
  value = __kmp_env_get(name);
#endif

  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
  // Only remove the record if it is still ours: another copy of the runtime
  // may have re-registered with its own string after we registered.
  if (value != NULL && strcmp(s1: value, s2: __kmp_registration_str) == 0) {
// Ok, this is our variable. Delete it.
#if defined(KMP_USE_SHM)
    if (__kmp_shm_available) {
      shm_unlink(name: shm_name); // this removes file in /dev/shm
    } else if (__kmp_tmp_available) {
      unlink(name: temp_reg_status_file_name); // this removes the temp file
    } else {
      __kmp_env_unset(name);
    }
#else
    __kmp_env_unset(name);
#endif
  }

#if defined(KMP_USE_SHM)
  if (shm_name)
    KMP_INTERNAL_FREE(shm_name);
  if (temp_reg_status_file_name)
    KMP_INTERNAL_FREE(temp_reg_status_file_name);
#endif

  KMP_INTERNAL_FREE(__kmp_registration_str);
  KMP_INTERNAL_FREE(value);
  KMP_INTERNAL_FREE(name);

  // Mark the library as unregistered.
  __kmp_registration_flag = 0;
  __kmp_registration_str = NULL;

} // __kmp_unregister_library
7034
7035// End of Library registration stuff.
7036// -----------------------------------------------------------------------------
7037
7038#if KMP_MIC_SUPPORTED
7039
7040static void __kmp_check_mic_type() {
7041 kmp_cpuid_t cpuid_state = {.eax: 0};
7042 kmp_cpuid_t *cs_p = &cpuid_state;
7043 __kmp_x86_cpuid(leaf: 1, subleaf: 0, p: cs_p);
7044 // We don't support mic1 at the moment
7045 if ((cs_p->eax & 0xff0) == 0xB10) {
7046 __kmp_mic_type = mic2;
7047 } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
7048 __kmp_mic_type = mic3;
7049 } else {
7050 __kmp_mic_type = non_mic;
7051 }
7052}
7053
7054#endif /* KMP_MIC_SUPPORTED */
7055
7056#if KMP_HAVE_UMWAIT
7057static void __kmp_user_level_mwait_init() {
7058 struct kmp_cpuid buf;
7059 __kmp_x86_cpuid(leaf: 7, subleaf: 0, p: &buf);
7060 __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1);
7061 __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
7062 __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
7063 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
7064 __kmp_umwait_enabled));
7065}
7066#elif KMP_HAVE_MWAIT
7067#ifndef AT_INTELPHIUSERMWAIT
7068// Spurious, non-existent value that should always fail to return anything.
7069// Will be replaced with the correct value when we know that.
7070#define AT_INTELPHIUSERMWAIT 10000
7071#endif
7072// getauxval() function is available in RHEL7 and SLES12. If a system with an
7073// earlier OS is used to build the RTL, we'll use the following internal
7074// function when the entry is not found.
// Weak fallback definition of getauxval() for C libraries that lack it
// (pre-RHEL7/SLES12 era); always reports "entry not found" (0). A real
// libc-provided getauxval() overrides this weak symbol at link time.
unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
unsigned long getauxval(unsigned long) { return 0; }
7077
7078static void __kmp_user_level_mwait_init() {
7079 // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
7080 // use them to find if the user-level mwait is enabled. Otherwise, forcibly
7081 // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
7082 // KMP_USER_LEVEL_MWAIT was set to TRUE.
7083 if (__kmp_mic_type == mic3) {
7084 unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
7085 if ((res & 0x1) || __kmp_user_level_mwait) {
7086 __kmp_mwait_enabled = TRUE;
7087 if (__kmp_user_level_mwait) {
7088 KMP_INFORM(EnvMwaitWarn);
7089 }
7090 } else {
7091 __kmp_mwait_enabled = FALSE;
7092 }
7093 }
7094 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
7095 "__kmp_mwait_enabled = %d\n",
7096 __kmp_mic_type, __kmp_mwait_enabled));
7097}
7098#endif /* KMP_HAVE_UMWAIT */
7099
// One-time process-wide (serial) initialization of the runtime: sanity-check
// primitive type sizes, run OMPT/OMPD pre-init, initialize locks and the
// internal allocator, register the library instance, set default ICVs and
// barrier configuration, allocate the __kmp_threads/__kmp_root arrays, and
// register the initial (uber) root thread. Caller must hold
// __kmp_initz_lock (see __kmp_serial_initialize / __kmp_parallel_initialize).
static void __kmp_do_serial_initialize(void) {
  int i, gtid;
  size_t size;

  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));

  // The runtime's ABI assumes these exact sizes; fail fast in debug builds.
  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));

#if OMPT_SUPPORT
  ompt_pre_init();
#endif
#if OMPD_SUPPORT
  __kmp_env_dump();
  ompd_init();
#endif

  __kmp_validate_locks();

#if ENABLE_LIBOMPTARGET
  /* Initialize functions from libomptarget */
  __kmp_init_omptarget();
#endif

  /* Initialize internal memory allocator */
  __kmp_init_allocator();

  /* Register the library startup via an environment variable or via mapped
     shared memory file and check to see whether another copy of the library is
     already registered. Since forked child process is often terminated, we
     postpone the registration till middle initialization in the child */
  if (__kmp_need_register_serial)
    __kmp_register_library_startup();

  /* TODO reinitialization of library */
  if (TCR_4(__kmp_global.g.g_done)) {
    KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
  }

  __kmp_global.g.g_abort = 0;
  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);

/* initialize the locks */
#if KMP_USE_ADAPTIVE_LOCKS
#if KMP_DEBUG_ADAPTIVE_LOCKS
  __kmp_init_speculative_stats();
#endif
#endif
#if KMP_STATS_ENABLED
  __kmp_stats_init();
#endif
  __kmp_init_lock(lck: &__kmp_global_lock);
  __kmp_init_atomic_lock(lck: &__kmp_atomic_lock);
  __kmp_init_atomic_lock(lck: &__kmp_atomic_lock_1i);
  __kmp_init_atomic_lock(lck: &__kmp_atomic_lock_2i);
  __kmp_init_atomic_lock(lck: &__kmp_atomic_lock_4i);
  __kmp_init_atomic_lock(lck: &__kmp_atomic_lock_4r);
  __kmp_init_atomic_lock(lck: &__kmp_atomic_lock_8i);
  __kmp_init_atomic_lock(lck: &__kmp_atomic_lock_8r);
  __kmp_init_atomic_lock(lck: &__kmp_atomic_lock_8c);
  __kmp_init_atomic_lock(lck: &__kmp_atomic_lock_10r);
  __kmp_init_atomic_lock(lck: &__kmp_atomic_lock_16r);
  __kmp_init_atomic_lock(lck: &__kmp_atomic_lock_16c);
  __kmp_init_atomic_lock(lck: &__kmp_atomic_lock_20c);
  __kmp_init_atomic_lock(lck: &__kmp_atomic_lock_32c);
  __kmp_init_bootstrap_lock(lck: &__kmp_forkjoin_lock);
  __kmp_init_bootstrap_lock(lck: &__kmp_exit_lock);
#if KMP_USE_MONITOR
  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
#endif
  __kmp_init_bootstrap_lock(lck: &__kmp_tp_cached_lock);

  /* conduct initialization and initial setup of configuration */

  __kmp_runtime_initialize();

#if KMP_MIC_SUPPORTED
  __kmp_check_mic_type();
#endif

// Some global variable initialization moved here from kmp_env_initialize()
#ifdef KMP_DEBUG
  kmp_diag = 0;
#endif
  __kmp_abort_delay = 0;

  // From __kmp_init_dflt_team_nth()
  /* assume the entire machine will be used */
  __kmp_dflt_team_nth_ub = __kmp_xproc;
  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
    __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
  }
  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
    __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
  }
  __kmp_max_nth = __kmp_sys_max_nth;
  __kmp_cg_max_nth = __kmp_sys_max_nth;
  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
    __kmp_teams_max_nth = __kmp_sys_max_nth;
  }

  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
  // part
  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
#if KMP_USE_MONITOR
  __kmp_monitor_wakeups =
      KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
  __kmp_bt_intervals =
      KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
#endif
  // From "KMP_LIBRARY" part of __kmp_env_initialize()
  __kmp_library = library_throughput;
  // From KMP_SCHEDULE initialization
  __kmp_static = kmp_sch_static_balanced;
// AC: do not use analytical here, because it is non-monotonous
//__kmp_guided = kmp_sch_guided_iterative_chunked;
//__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
// need to repeat assignment
// Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
// bit control and barrier method control parts
#if KMP_FAST_REDUCTION_BARRIER
#define kmp_reduction_barrier_gather_bb ((int)1)
#define kmp_reduction_barrier_release_bb ((int)1)
#define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
#define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
#endif // KMP_FAST_REDUCTION_BARRIER
  // Set default branch bits and gather/release patterns for every barrier
  // type; the reduction barrier gets its own (tuned) settings below.
  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
    __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
    __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
    __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
    __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
#if KMP_FAST_REDUCTION_BARRIER
    if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
      // lin_64 ): hyper,1
      __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
      __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
      __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
      __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
    }
#endif // KMP_FAST_REDUCTION_BARRIER
  }
#if KMP_FAST_REDUCTION_BARRIER
#undef kmp_reduction_barrier_release_pat
#undef kmp_reduction_barrier_gather_pat
#undef kmp_reduction_barrier_release_bb
#undef kmp_reduction_barrier_gather_bb
#endif // KMP_FAST_REDUCTION_BARRIER
#if KMP_MIC_SUPPORTED
  if (__kmp_mic_type == mic2) { // KNC
    // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
    __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
    __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
        1; // forkjoin release
    __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
    __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
  }
#if KMP_FAST_REDUCTION_BARRIER
  if (__kmp_mic_type == mic2) { // KNC
    __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
    __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
  }
#endif // KMP_FAST_REDUCTION_BARRIER
#endif // KMP_MIC_SUPPORTED

// From KMP_CHECKS initialization
#ifdef KMP_DEBUG
  __kmp_env_checks = TRUE; /* development versions have the extra checks */
#else
  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
#endif

  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
  __kmp_foreign_tp = TRUE;

  __kmp_global.g.g_dynamic = FALSE;
  __kmp_global.g.g_dynamic_mode = dynamic_default;

  __kmp_init_nesting_mode();

  // Parse environment variables; may override the defaults set above.
  __kmp_env_initialize(NULL);

#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
  __kmp_user_level_mwait_init();
#endif
// Print all messages in message catalog for testing purposes.
#ifdef KMP_DEBUG
  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
  if (__kmp_str_match_true(val)) {
    kmp_str_buf_t buffer;
    __kmp_str_buf_init(&buffer);
    __kmp_i18n_dump_catalog(&buffer);
    __kmp_printf("%s", buffer.str);
    __kmp_str_buf_free(&buffer);
  }
  __kmp_env_free(&val);
#endif

  __kmp_threads_capacity =
      __kmp_initial_threads_capacity(req_nproc: __kmp_dflt_team_nth_ub);
  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
  __kmp_tp_capacity = __kmp_default_tp_capacity(
      __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);

  // If the library is shut down properly, both pools must be NULL. Just in
  // case, set them to NULL -- some memory may leak, but subsequent code will
  // work even if pools are not freed.
  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
  __kmp_thread_pool = NULL;
  __kmp_thread_pool_insert_pt = NULL;
  __kmp_team_pool = NULL;

  /* Allocate all of the variable sized records */
  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
   * expandable */
  /* Since allocation is cache-aligned, just add extra padding at the end */
  // __kmp_threads and __kmp_root share one allocation; __kmp_root starts
  // right after the __kmp_threads entries.
  size =
      (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
      CACHE_LINE;
  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
                               sizeof(kmp_info_t *) * __kmp_threads_capacity);

  /* init thread counts */
  KMP_DEBUG_ASSERT(__kmp_all_nth ==
                   0); // Asserts fail if the library is reinitializing and
  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
  __kmp_all_nth = 0;
  __kmp_nth = 0;

  /* setup the uber master thread and hierarchy */
  gtid = __kmp_register_root(TRUE);
  KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
  KMP_ASSERT(KMP_UBER_GTID(gtid));
  KMP_ASSERT(KMP_INITIAL_GTID(gtid));

  KMP_MB(); /* Flush all pending memory write invalidates.  */

  __kmp_common_initialize();

#if KMP_OS_UNIX
  /* invoke the child fork handler */
  __kmp_register_atfork();
#endif

#if !KMP_DYNAMIC_LIB ||                                                        \
    ((KMP_COMPILER_ICC || KMP_COMPILER_ICX) && KMP_OS_DARWIN)
  {
    /* Invoke the exit handler when the program finishes, only for static
       library and macOS* dynamic. For other dynamic libraries, we already
       have _fini and DllMain. */
    int rc = atexit(__kmp_internal_end_atexit);
    if (rc != 0) {
      __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
                  __kmp_msg_null);
    }
  }
#endif

#if KMP_HANDLE_SIGNALS
#if KMP_OS_UNIX
  /* NOTE: make sure that this is called before the user installs their own
     signal handlers so that the user handlers are called first. this way they
     can return false, not call our handler, avoid terminating the library, and
     continue execution where they left off. */
  __kmp_install_signals(FALSE);
#endif /* KMP_OS_UNIX */
#if KMP_OS_WINDOWS
  __kmp_install_signals(TRUE);
#endif /* KMP_OS_WINDOWS */
#endif

  /* we have finished the serial initialization */
  __kmp_init_counter++;

  __kmp_init_serial = TRUE;

  if (__kmp_version) {
    __kmp_print_version_1();
  }

  if (__kmp_settings) {
    __kmp_env_print();
  }

  if (__kmp_display_env || __kmp_display_env_verbose) {
    __kmp_env_print_2();
  }

#if OMPT_SUPPORT
  ompt_post_init();
#endif

  KMP_MB();

  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
}
7402
7403void __kmp_serial_initialize(void) {
7404 if (__kmp_init_serial) {
7405 return;
7406 }
7407 __kmp_acquire_bootstrap_lock(lck: &__kmp_initz_lock);
7408 if (__kmp_init_serial) {
7409 __kmp_release_bootstrap_lock(lck: &__kmp_initz_lock);
7410 return;
7411 }
7412 __kmp_do_serial_initialize();
7413 __kmp_release_bootstrap_lock(lck: &__kmp_initz_lock);
7414}
7415
// Second-stage ("middle") initialization: ensures serial init has run,
// performs the deferred library registration for forked children, initializes
// affinity/topology, fixes up the nested-nth list, and establishes the final
// default team size (__kmp_dflt_team_nth), propagating it to already
// registered root threads. Caller must hold __kmp_initz_lock.
static void __kmp_do_middle_initialize(void) {
  int i, j;
  int prev_dflt_team_nth;

  if (!__kmp_init_serial) {
    __kmp_do_serial_initialize();
  }

  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));

  if (UNLIKELY(!__kmp_need_register_serial)) {
    // We are in a forked child process. The registration was skipped during
    // serial initialization in __kmp_atfork_child handler. Do it here.
    __kmp_register_library_startup();
  }

  // Save the previous value for the __kmp_dflt_team_nth so that
  // we can avoid some reinitialization if it hasn't changed.
  prev_dflt_team_nth = __kmp_dflt_team_nth;

#if KMP_AFFINITY_SUPPORTED
  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
  // number of cores on the machine.
  __kmp_affinity_initialize(affinity&: __kmp_affinity);

#endif /* KMP_AFFINITY_SUPPORTED */

  KMP_ASSERT(__kmp_xproc > 0);
  if (__kmp_avail_proc == 0) {
    __kmp_avail_proc = __kmp_xproc;
  }

  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
  // correct them now
  j = 0;
  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
    __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
        __kmp_avail_proc;
    j++;
  }

  if (__kmp_dflt_team_nth == 0) {
#ifdef KMP_DFLT_NTH_CORES
    // Default #threads = #cores
    __kmp_dflt_team_nth = __kmp_ncores;
    KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
                  "__kmp_ncores (%d)\n",
                  __kmp_dflt_team_nth));
#else
    // Default #threads = #available OS procs
    __kmp_dflt_team_nth = __kmp_avail_proc;
    KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
                  "__kmp_avail_proc(%d)\n",
                  __kmp_dflt_team_nth));
#endif /* KMP_DFLT_NTH_CORES */
  }

  // Clamp the default team size to the supported range.
  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
    __kmp_dflt_team_nth = KMP_MIN_NTH;
  }
  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
    __kmp_dflt_team_nth = __kmp_sys_max_nth;
  }

  if (__kmp_nesting_mode > 0)
    __kmp_set_nesting_mode_threads();

  // There's no harm in continuing if the following check fails,
  // but it indicates an error in the previous logic.
  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);

  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
    // Run through the __kmp_threads array and set the num threads icv for each
    // root thread that is currently registered with the RTL (which has not
    // already explicitly set its nthreads-var with a call to
    // omp_set_num_threads()).
    for (i = 0; i < __kmp_threads_capacity; i++) {
      kmp_info_t *thread = __kmp_threads[i];
      if (thread == NULL)
        continue;
      if (thread->th.th_current_task->td_icvs.nproc != 0)
        continue; // nproc already explicitly set; leave it alone

      set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
    }
  }
  KA_TRACE(
      20,
      ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
       __kmp_dflt_team_nth));

#ifdef KMP_ADJUST_BLOCKTIME
  /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
    KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
    if (__kmp_nth > __kmp_avail_proc) {
      __kmp_zero_bt = TRUE;
    }
  }
#endif /* KMP_ADJUST_BLOCKTIME */

  /* we have finished middle initialization */
  TCW_SYNC_4(__kmp_init_middle, TRUE);

  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
}
7522
7523void __kmp_middle_initialize(void) {
7524 if (__kmp_init_middle) {
7525 return;
7526 }
7527 __kmp_acquire_bootstrap_lock(lck: &__kmp_initz_lock);
7528 if (__kmp_init_middle) {
7529 __kmp_release_bootstrap_lock(lck: &__kmp_initz_lock);
7530 return;
7531 }
7532 __kmp_do_middle_initialize();
7533 __kmp_release_bootstrap_lock(lck: &__kmp_initz_lock);
7534}
7535
// Final initialization stage, run lazily before the first parallel region:
// ensures middle init has happened, saves the initial FP control state on
// x86, installs signal handlers (Unix), initializes suspension support, and
// picks the default dynamic-threads mode. Protected by __kmp_initz_lock with
// a double-checked fast path on __kmp_init_parallel.
void __kmp_parallel_initialize(void) {
  int gtid = __kmp_entry_gtid(); // this might be a new root

  /* synchronize parallel initialization (for sibling) */
  if (TCR_4(__kmp_init_parallel))
    return;
  __kmp_acquire_bootstrap_lock(lck: &__kmp_initz_lock);
  if (TCR_4(__kmp_init_parallel)) {
    __kmp_release_bootstrap_lock(lck: &__kmp_initz_lock);
    return;
  }

  /* TODO reinitialization after we have already shut down */
  if (TCR_4(__kmp_global.g.g_done)) {
    KA_TRACE(
        10,
        ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
    __kmp_infinite_loop();
  }

  /* jc: The lock __kmp_initz_lock is already held, so calling
     __kmp_serial_initialize would cause a deadlock.  So we call
     __kmp_do_serial_initialize directly. */
  if (!__kmp_init_middle) {
    __kmp_do_middle_initialize();
  }
  __kmp_assign_root_init_mask();
  __kmp_resume_if_hard_paused();

  /* begin initialization */
  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
  KMP_ASSERT(KMP_UBER_GTID(gtid));

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
  // Save the FP control regs.
  // Worker threads will set theirs to these values at thread startup.
  __kmp_store_x87_fpu_control_word(p: &__kmp_init_x87_fpu_control_word);
  __kmp_store_mxcsr(p: &__kmp_init_mxcsr);
  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

#if KMP_OS_UNIX
#if KMP_HANDLE_SIGNALS
  /*  must be after __kmp_serial_initialize  */
  __kmp_install_signals(TRUE);
#endif
#endif

  __kmp_suspend_initialize();

// Choose the default mode for dynamic thread adjustment: load balancing
// where supported, otherwise the thread-limit heuristic.
#if defined(USE_LOAD_BALANCE)
  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
    __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
  }
#else
  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
    __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
  }
#endif

  if (__kmp_version) {
    __kmp_print_version_2();
  }

  /* we have finished parallel initialization */
  TCW_SYNC_4(__kmp_init_parallel, TRUE);

  KMP_MB();
  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));

  __kmp_release_bootstrap_lock(lck: &__kmp_initz_lock);
}
7608
// Lazily initialize the hidden helper team (threads that execute hidden
// helper tasks). Requires full parallel initialization first; protected by
// __kmp_initz_lock with a double-checked fast path on
// __kmp_init_hidden_helper. Blocks until the helper threads report that
// their own initialization is complete.
void __kmp_hidden_helper_initialize() {
  if (TCR_4(__kmp_init_hidden_helper))
    return;

  // __kmp_parallel_initialize is required before we initialize hidden helper
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  // Double check. Note that this double check should not be placed before
  // __kmp_parallel_initialize as it will cause dead lock.
  __kmp_acquire_bootstrap_lock(lck: &__kmp_initz_lock);
  if (TCR_4(__kmp_init_hidden_helper)) {
    __kmp_release_bootstrap_lock(lck: &__kmp_initz_lock);
    return;
  }

#if KMP_AFFINITY_SUPPORTED
  // Initialize hidden helper affinity settings.
  // The above __kmp_parallel_initialize() will initialize
  // regular affinity (and topology) if not already done.
  if (!__kmp_hh_affinity.flags.initialized)
    __kmp_affinity_initialize(affinity&: __kmp_hh_affinity);
#endif

  // Set the count of hidden helper tasks to be executed to zero
  KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);

  // Set the global variable indicating that we're initializing hidden helper
  // team/threads
  TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);

  // Platform independent initialization
  __kmp_do_initialize_hidden_helper_threads();

  // Wait here for the finish of initialization of hidden helper teams
  __kmp_hidden_helper_threads_initz_wait();

  // We have finished hidden helper initialization
  TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);

  __kmp_release_bootstrap_lock(lck: &__kmp_initz_lock);
}
7651
7652/* ------------------------------------------------------------------------ */
7653
7654void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7655 kmp_team_t *team) {
7656 kmp_disp_t *dispatch;
7657
7658 KMP_MB();
7659
7660 /* none of the threads have encountered any constructs, yet. */
7661 this_thr->th.th_local.this_construct = 0;
7662#if KMP_CACHE_MANAGE
7663 KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7664#endif /* KMP_CACHE_MANAGE */
7665 dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7666 KMP_DEBUG_ASSERT(dispatch);
7667 KMP_DEBUG_ASSERT(team->t.t_dispatch);
7668 // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7669 // this_thr->th.th_info.ds.ds_tid ] );
7670
7671 dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7672 dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7673 if (__kmp_env_consistency_check)
7674 __kmp_push_parallel(gtid, ident: team->t.t_ident);
7675
7676 KMP_MB(); /* Flush all pending memory write invalidates. */
7677}
7678
7679void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7680 kmp_team_t *team) {
7681 if (__kmp_env_consistency_check)
7682 __kmp_pop_parallel(gtid, ident: team->t.t_ident);
7683
7684 __kmp_finish_implicit_task(this_thr);
7685}
7686
// Invoke the team's microtask (the outlined parallel-region body) for the
// calling thread, bracketed by per-thread setup/teardown and the optional
// ITT, OMPT, SSC-mark, and stats instrumentation. Returns the microtask's
// result code.
int __kmp_invoke_task_func(int gtid) {
  int rc;
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team = this_thr->th.th_team;

  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
#if USE_ITT_BUILD
  if (__itt_stack_caller_create_ptr) {
    // inform ittnotify about entering user's code
    // Serialized regions have no stack id of their own; use the parent's.
    if (team->t.t_stack_id != NULL) {
      __kmp_itt_stack_callee_enter(id: (__itt_caller)team->t.t_stack_id);
    } else {
      KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
      __kmp_itt_stack_callee_enter(
          id: (__itt_caller)team->t.t_parent->t.t_stack_id);
    }
  }
#endif /* USE_ITT_BUILD */
#if INCLUDE_SSC_MARKS
  SSC_MARK_INVOKING();
#endif

#if OMPT_SUPPORT
  void *dummy;
  void **exit_frame_p;
  ompt_data_t *my_task_data;
  ompt_data_t *my_parallel_data;
  int ompt_team_size;

  // Point exit_frame_p at the implicit task's exit frame so the microtask
  // can publish it; a dummy slot is used when OMPT is disabled at runtime.
  if (ompt_enabled.enabled) {
    exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
                         .ompt_task_info.frame.exit_frame.ptr);
  } else {
    exit_frame_p = &dummy;
  }

  my_task_data =
      &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
  if (ompt_enabled.ompt_callback_implicit_task) {
    ompt_team_size = team->t.t_nproc;
    ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
        ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
        __kmp_tid_from_gtid(gtid), ompt_task_implicit);
    OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
  }
#endif

#if KMP_STATS_ENABLED
  stats_state_e previous_state = KMP_GET_THREAD_STATE();
  if (previous_state == stats_state_e::TEAMS_REGION) {
    KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
  } else {
    KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
  }
  KMP_SET_THREAD_STATE(IMPLICIT_TASK);
#endif

  // Run the outlined parallel-region body for this thread.
  rc = __kmp_invoke_microtask(pkfn: (microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
                              npr: tid, argc: (int)team->t.t_argc, argv: (void **)team->t.t_argv
#if OMPT_SUPPORT
                              ,
                              exit_frame_ptr: exit_frame_p
#endif
  );
#if OMPT_SUPPORT
  *exit_frame_p = NULL;
  this_thr->th.ompt_thread_info.parallel_flags = ompt_parallel_team;
#endif

#if KMP_STATS_ENABLED
  if (previous_state == stats_state_e::TEAMS_REGION) {
    KMP_SET_THREAD_STATE(previous_state);
  }
  KMP_POP_PARTITIONED_TIMER();
#endif

#if USE_ITT_BUILD
  if (__itt_stack_caller_create_ptr) {
    // inform ittnotify about leaving user's code
    if (team->t.t_stack_id != NULL) {
      __kmp_itt_stack_callee_leave(id: (__itt_caller)team->t.t_stack_id);
    } else {
      KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
      __kmp_itt_stack_callee_leave(
          id: (__itt_caller)team->t.t_parent->t.t_stack_id);
    }
  }
#endif /* USE_ITT_BUILD */
  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);

  return rc;
}
7781
7782void __kmp_teams_master(int gtid) {
7783 // This routine is called by all primary threads in teams construct
7784 kmp_info_t *thr = __kmp_threads[gtid];
7785 kmp_team_t *team = thr->th.th_team;
7786 ident_t *loc = team->t.t_ident;
7787 thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7788 KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7789 KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7790 KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7791 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7792
7793 // This thread is a new CG root. Set up the proper variables.
7794 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7795 tmp->cg_root = thr; // Make thr the CG root
7796 // Init to thread limit stored when league primary threads were forked
7797 tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7798 tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7799 KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7800 " cg_nthreads to 1\n",
7801 thr, tmp));
7802 tmp->up = thr->th.th_cg_roots;
7803 thr->th.th_cg_roots = tmp;
7804
7805// Launch league of teams now, but not let workers execute
7806// (they hang on fork barrier until next parallel)
7807#if INCLUDE_SSC_MARKS
7808 SSC_MARK_FORKING();
7809#endif
7810 __kmp_fork_call(loc, gtid, call_context: fork_context_intel, argc: team->t.t_argc,
7811 microtask: (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7812 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7813#if INCLUDE_SSC_MARKS
7814 SSC_MARK_JOINING();
7815#endif
7816 // If the team size was reduced from the limit, set it to the new size
7817 if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7818 thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7819 // AC: last parameter "1" eliminates join barrier which won't work because
7820 // worker threads are in a fork barrier waiting for more parallel regions
7821 __kmp_join_call(loc, gtid
7822#if OMPT_SUPPORT
7823 ,
7824 fork_context: fork_context_intel
7825#endif
7826 ,
7827 exit_teams: 1);
7828}
7829
7830int __kmp_invoke_teams_master(int gtid) {
7831 kmp_info_t *this_thr = __kmp_threads[gtid];
7832 kmp_team_t *team = this_thr->th.th_team;
7833#if KMP_DEBUG
7834 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7835 KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7836 (void *)__kmp_teams_master);
7837#endif
7838 __kmp_run_before_invoked_task(gtid, tid: 0, this_thr, team);
7839#if OMPT_SUPPORT
7840 int tid = __kmp_tid_from_gtid(gtid);
7841 ompt_data_t *task_data =
7842 &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7843 ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7844 if (ompt_enabled.ompt_callback_implicit_task) {
7845 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7846 ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7847 ompt_task_initial);
7848 OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7849 }
7850#endif
7851 __kmp_teams_master(gtid);
7852#if OMPT_SUPPORT
7853 this_thr->th.ompt_thread_info.parallel_flags = ompt_parallel_league;
7854#endif
7855 __kmp_run_after_invoked_task(gtid, tid: 0, this_thr, team);
7856 return 1;
7857}
7858
7859/* this sets the requested number of threads for the next parallel region
7860 encountered by this team. since this should be enclosed in the forkjoin
7861 critical section it should avoid race conditions with asymmetrical nested
7862 parallelism */
7863void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7864 kmp_info_t *thr = __kmp_threads[gtid];
7865
7866 if (num_threads > 0)
7867 thr->th.th_set_nproc = num_threads;
7868}
7869
7870void __kmp_push_num_threads_list(ident_t *id, int gtid, kmp_uint32 list_length,
7871 int *num_threads_list) {
7872 kmp_info_t *thr = __kmp_threads[gtid];
7873
7874 KMP_DEBUG_ASSERT(list_length > 1);
7875
7876 if (num_threads_list[0] > 0)
7877 thr->th.th_set_nproc = num_threads_list[0];
7878 thr->th.th_set_nested_nth =
7879 (int *)KMP_INTERNAL_MALLOC(list_length * sizeof(int));
7880 for (kmp_uint32 i = 0; i < list_length; ++i)
7881 thr->th.th_set_nested_nth[i] = num_threads_list[i];
7882 thr->th.th_set_nested_nth_sz = list_length;
7883}
7884
7885void __kmp_set_strict_num_threads(ident_t *loc, int gtid, int sev,
7886 const char *msg) {
7887 kmp_info_t *thr = __kmp_threads[gtid];
7888 thr->th.th_nt_strict = true;
7889 thr->th.th_nt_loc = loc;
7890 // if sev is unset make fatal
7891 if (sev == severity_warning)
7892 thr->th.th_nt_sev = sev;
7893 else
7894 thr->th.th_nt_sev = severity_fatal;
7895 // if msg is unset, use an appropriate message
7896 if (msg)
7897 thr->th.th_nt_msg = msg;
7898 else
7899 thr->th.th_nt_msg = "Cannot form team with number of threads specified by "
7900 "strict num_threads clause.";
7901}
7902
7903static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7904 int num_threads) {
7905 KMP_DEBUG_ASSERT(thr);
7906 // Remember the number of threads for inner parallel regions
7907 if (!TCR_4(__kmp_init_middle))
7908 __kmp_middle_initialize(); // get internal globals calculated
7909 __kmp_assign_root_init_mask();
7910 KMP_DEBUG_ASSERT(__kmp_avail_proc);
7911 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7912
7913 if (num_threads == 0) {
7914 if (__kmp_teams_thread_limit > 0) {
7915 num_threads = __kmp_teams_thread_limit;
7916 } else {
7917 num_threads = __kmp_avail_proc / num_teams;
7918 }
7919 // adjust num_threads w/o warning as it is not user setting
7920 // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7921 // no thread_limit clause specified - do not change thread-limit-var ICV
7922 if (num_threads > __kmp_dflt_team_nth) {
7923 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7924 }
7925 if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7926 num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7927 } // prevent team size to exceed thread-limit-var
7928 if (num_teams * num_threads > __kmp_teams_max_nth) {
7929 num_threads = __kmp_teams_max_nth / num_teams;
7930 }
7931 if (num_threads == 0) {
7932 num_threads = 1;
7933 }
7934 } else {
7935 if (num_threads < 0) {
7936 __kmp_msg(severity: kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
7937 __kmp_msg_null);
7938 num_threads = 1;
7939 }
7940 // This thread will be the primary thread of the league primary threads
7941 // Store new thread limit; old limit is saved in th_cg_roots list
7942 thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7943 // num_threads = min(num_threads, nthreads-var)
7944 if (num_threads > __kmp_dflt_team_nth) {
7945 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7946 }
7947 if (num_teams * num_threads > __kmp_teams_max_nth) {
7948 int new_threads = __kmp_teams_max_nth / num_teams;
7949 if (new_threads == 0) {
7950 new_threads = 1;
7951 }
7952 if (new_threads != num_threads) {
7953 if (!__kmp_reserve_warn) { // user asked for too many threads
7954 __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7955 __kmp_msg(severity: kmp_ms_warning,
7956 KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7957 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7958 }
7959 }
7960 num_threads = new_threads;
7961 }
7962 }
7963 thr->th.th_teams_size.nth = num_threads;
7964}
7965
7966/* this sets the requested number of teams for the teams region and/or
7967 the number of threads for the next parallel region encountered */
7968void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7969 int num_threads) {
7970 kmp_info_t *thr = __kmp_threads[gtid];
7971 if (num_teams < 0) {
7972 // OpenMP specification requires requested values to be positive,
7973 // but people can send us any value, so we'd better check
7974 __kmp_msg(severity: kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
7975 __kmp_msg_null);
7976 num_teams = 1;
7977 }
7978 if (num_teams == 0) {
7979 if (__kmp_nteams > 0) {
7980 num_teams = __kmp_nteams;
7981 } else {
7982 num_teams = 1; // default number of teams is 1.
7983 }
7984 }
7985 if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7986 if (!__kmp_reserve_warn) {
7987 __kmp_reserve_warn = 1;
7988 __kmp_msg(severity: kmp_ms_warning,
7989 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7990 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7991 }
7992 num_teams = __kmp_teams_max_nth;
7993 }
7994 // Set number of teams (number of threads in the outer "parallel" of the
7995 // teams)
7996 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7997
7998 __kmp_push_thread_limit(thr, num_teams, num_threads);
7999}
8000
8001/* This sets the requested number of teams for the teams region and/or
8002 the number of threads for the next parallel region encountered */
8003void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
8004 int num_teams_ub, int num_threads) {
8005 kmp_info_t *thr = __kmp_threads[gtid];
8006 KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
8007 KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
8008 KMP_DEBUG_ASSERT(num_threads >= 0);
8009
8010 if (num_teams_lb > num_teams_ub) {
8011 __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
8012 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
8013 }
8014
8015 int num_teams = 1; // defalt number of teams is 1.
8016
8017 if (num_teams_lb == 0 && num_teams_ub > 0)
8018 num_teams_lb = num_teams_ub;
8019
8020 if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
8021 num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
8022 if (num_teams > __kmp_teams_max_nth) {
8023 if (!__kmp_reserve_warn) {
8024 __kmp_reserve_warn = 1;
8025 __kmp_msg(severity: kmp_ms_warning,
8026 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
8027 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
8028 }
8029 num_teams = __kmp_teams_max_nth;
8030 }
8031 } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
8032 num_teams = num_teams_ub;
8033 } else { // num_teams_lb <= num_teams <= num_teams_ub
8034 if (num_threads <= 0) {
8035 if (num_teams_ub > __kmp_teams_max_nth) {
8036 num_teams = num_teams_lb;
8037 } else {
8038 num_teams = num_teams_ub;
8039 }
8040 } else {
8041 num_teams = (num_threads > __kmp_teams_max_nth)
8042 ? num_teams
8043 : __kmp_teams_max_nth / num_threads;
8044 if (num_teams < num_teams_lb) {
8045 num_teams = num_teams_lb;
8046 } else if (num_teams > num_teams_ub) {
8047 num_teams = num_teams_ub;
8048 }
8049 }
8050 }
8051 // Set number of teams (number of threads in the outer "parallel" of the
8052 // teams)
8053 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
8054
8055 __kmp_push_thread_limit(thr, num_teams, num_threads);
8056}
8057
8058// Set the proc_bind var to use in the following parallel region.
8059void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
8060 kmp_info_t *thr = __kmp_threads[gtid];
8061 thr->th.th_set_proc_bind = proc_bind;
8062}
8063
8064/* Launch the worker threads into the microtask. */
8065
8066void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
8067 kmp_info_t *this_thr = __kmp_threads[gtid];
8068
8069#ifdef KMP_DEBUG
8070 int f;
8071#endif /* KMP_DEBUG */
8072
8073 KMP_DEBUG_ASSERT(team);
8074 KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8075 KMP_ASSERT(KMP_MASTER_GTID(gtid));
8076 KMP_MB(); /* Flush all pending memory write invalidates. */
8077
8078 team->t.t_construct = 0; /* no single directives seen yet */
8079 team->t.t_ordered.dt.t_value =
8080 0; /* thread 0 enters the ordered section first */
8081
8082 /* Reset the identifiers on the dispatch buffer */
8083 KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
8084 if (team->t.t_max_nproc > 1) {
8085 int i;
8086 for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
8087 team->t.t_disp_buffer[i].buffer_index = i;
8088 team->t.t_disp_buffer[i].doacross_buf_idx = i;
8089 }
8090 } else {
8091 team->t.t_disp_buffer[0].buffer_index = 0;
8092 team->t.t_disp_buffer[0].doacross_buf_idx = 0;
8093 }
8094
8095 KMP_MB(); /* Flush all pending memory write invalidates. */
8096 KMP_ASSERT(this_thr->th.th_team == team);
8097
8098#ifdef KMP_DEBUG
8099 for (f = 0; f < team->t.t_nproc; f++) {
8100 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
8101 team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
8102 }
8103#endif /* KMP_DEBUG */
8104
8105 /* release the worker threads so they may begin working */
8106 __kmp_fork_barrier(gtid, tid: 0);
8107}
8108
// Join the team at the end of a parallel region: the primary thread waits at
// the join barrier for all workers, then fires the OMPT end-of-barrier and
// end-of-implicit-task callbacks that were deferred while waiting.
// Called by the primary thread only (asserted below).
void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
  kmp_info_t *this_thr = __kmp_threads[gtid];

  KMP_DEBUG_ASSERT(team);
  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
  KMP_ASSERT(KMP_MASTER_GTID(gtid));
  KMP_MB(); /* Flush all pending memory write invalidates. */

  /* Join barrier after fork */

#ifdef KMP_DEBUG
  // Sanity check: this thread's recorded team size must match the team's.
  // Dump the relevant structures before asserting so the mismatch is visible.
  if (__kmp_threads[gtid] &&
      __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
    __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
                 __kmp_threads[gtid]);
    __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
                 "team->t.t_nproc=%d\n",
                 gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
                 team->t.t_nproc);
    __kmp_print_structure();
  }
  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
                   __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
#endif /* KMP_DEBUG */

  __kmp_join_barrier(gtid); /* wait for everyone */
#if OMPT_SUPPORT
  // If the thread is still marked as waiting at an implicit/teams barrier,
  // report the end of that sync region (and of the implicit task for
  // non-primary threads) now, and move the thread to the overhead state.
  ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state;
  if (ompt_enabled.enabled &&
      (ompt_state == ompt_state_wait_barrier_teams ||
       ompt_state == ompt_state_wait_barrier_implicit_parallel)) {
    int ds_tid = this_thr->th.th_info.ds.ds_tid;
    ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
#if OMPT_OPTIONAL
    void *codeptr = NULL;
    // Only the primary thread supplies a return address for the region,
    // and only when some sync-region callback is registered.
    if (KMP_MASTER_TID(ds_tid) &&
        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
      codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;

    // Report a teams barrier when this thread belongs to a league.
    ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
    if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league)
      sync_kind = ompt_sync_region_barrier_teams;
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          sync_kind, ompt_scope_end, NULL, task_data, codeptr);
    }
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          sync_kind, ompt_scope_end, NULL, task_data, codeptr);
    }
#endif
    if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, task_data, 0, ds_tid,
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
    }
  }
#endif

  KMP_MB(); /* Flush all pending memory write invalidates. */
  KMP_ASSERT(this_thr->th.th_team == team);
}
8173
8174/* ------------------------------------------------------------------------ */
8175
8176#ifdef USE_LOAD_BALANCE
8177
8178// Return the worker threads actively spinning in the hot team, if we
8179// are at the outermost level of parallelism. Otherwise, return 0.
8180static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
8181 int i;
8182 int retval;
8183 kmp_team_t *hot_team;
8184
8185 if (root->r.r_active) {
8186 return 0;
8187 }
8188 hot_team = root->r.r_hot_team;
8189 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
8190 return hot_team->t.t_nproc - 1; // Don't count primary thread
8191 }
8192
8193 // Skip the primary thread - it is accounted for elsewhere.
8194 retval = 0;
8195 for (i = 1; i < hot_team->t.t_nproc; i++) {
8196 if (hot_team->t.t_threads[i]->th.th_active) {
8197 retval++;
8198 }
8199 }
8200 return retval;
8201}
8202
8203// Perform an automatic adjustment to the number of
8204// threads used by the next parallel region.
8205static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
8206 int retval;
8207 int pool_active;
8208 int hot_team_active;
8209 int team_curr_active;
8210 int system_active;
8211
8212 KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
8213 set_nproc));
8214 KMP_DEBUG_ASSERT(root);
8215 KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
8216 ->th.th_current_task->td_icvs.dynamic == TRUE);
8217 KMP_DEBUG_ASSERT(set_nproc > 1);
8218
8219 if (set_nproc == 1) {
8220 KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
8221 return 1;
8222 }
8223
8224 // Threads that are active in the thread pool, active in the hot team for this
8225 // particular root (if we are at the outer par level), and the currently
8226 // executing thread (to become the primary thread) are available to add to the
8227 // new team, but are currently contributing to the system load, and must be
8228 // accounted for.
8229 pool_active = __kmp_thread_pool_active_nth;
8230 hot_team_active = __kmp_active_hot_team_nproc(root);
8231 team_curr_active = pool_active + hot_team_active + 1;
8232
8233 // Check the system load.
8234 system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
8235 KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
8236 "hot team active = %d\n",
8237 system_active, pool_active, hot_team_active));
8238
8239 if (system_active < 0) {
8240 // There was an error reading the necessary info from /proc, so use the
8241 // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
8242 // = dynamic_thread_limit, we shouldn't wind up getting back here.
8243 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
8244 KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
8245
8246 // Make this call behave like the thread limit algorithm.
8247 retval = __kmp_avail_proc - __kmp_nth +
8248 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
8249 if (retval > set_nproc) {
8250 retval = set_nproc;
8251 }
8252 if (retval < KMP_MIN_NTH) {
8253 retval = KMP_MIN_NTH;
8254 }
8255
8256 KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
8257 retval));
8258 return retval;
8259 }
8260
8261 // There is a slight delay in the load balance algorithm in detecting new
8262 // running procs. The real system load at this instant should be at least as
8263 // large as the #active omp thread that are available to add to the team.
8264 if (system_active < team_curr_active) {
8265 system_active = team_curr_active;
8266 }
8267 retval = __kmp_avail_proc - system_active + team_curr_active;
8268 if (retval > set_nproc) {
8269 retval = set_nproc;
8270 }
8271 if (retval < KMP_MIN_NTH) {
8272 retval = KMP_MIN_NTH;
8273 }
8274
8275 KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
8276 return retval;
8277} // __kmp_load_balance_nproc()
8278
8279#endif /* USE_LOAD_BALANCE */
8280
8281/* ------------------------------------------------------------------------ */
8282
/* NOTE: this is called with the __kmp_init_lock held */
// Tear down the runtime library: de-initialize in reverse order of setup
// (parallel -> middle -> serial), then free every global structure the
// library still owns (thread/root arrays, lock tables, nested-nth and
// proc-bind lists, the affinity format string, i18n catalog, stats).
void __kmp_cleanup(void) {
  int f;

  KA_TRACE(10, ("__kmp_cleanup: enter\n"));

  // Undo parallel-level initialization (signal handlers).
  if (TCR_4(__kmp_init_parallel)) {
#if KMP_HANDLE_SIGNALS
    __kmp_remove_signals();
#endif
    TCW_4(__kmp_init_parallel, FALSE);
  }

  // Undo middle-level initialization (affinity, thread hierarchy).
  if (TCR_4(__kmp_init_middle)) {
#if KMP_AFFINITY_SUPPORTED
    __kmp_affinity_uninitialize();
#endif /* KMP_AFFINITY_SUPPORTED */
    __kmp_cleanup_hierarchy();
    TCW_4(__kmp_init_middle, FALSE);
  }

  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));

  if (__kmp_init_serial) {
    __kmp_runtime_destroy();
    __kmp_init_serial = FALSE;
  }

  __kmp_cleanup_threadprivate_caches();

  // Free each root structure that was ever allocated.
  for (f = 0; f < __kmp_threads_capacity; f++) {
    if (__kmp_root[f] != NULL) {
      __kmp_free(__kmp_root[f]);
      __kmp_root[f] = NULL;
    }
  }
  __kmp_free(__kmp_threads);
  // __kmp_threads and __kmp_root were allocated at once, as single block, so
  // there is no need in freeing __kmp_root.
  __kmp_threads = NULL;
  __kmp_root = NULL;
  __kmp_threads_capacity = 0;

  // Free old __kmp_threads arrays if they exist.
  kmp_old_threads_list_t *ptr = __kmp_old_threads_list;
  while (ptr) {
    kmp_old_threads_list_t *next = ptr->next;
    __kmp_free(ptr->threads);
    __kmp_free(ptr);
    ptr = next;
  }

#if KMP_USE_DYNAMIC_LOCK
  __kmp_cleanup_indirect_user_locks();
#else
  __kmp_cleanup_user_locks();
#endif
#if OMPD_SUPPORT
  if (ompd_state) {
    __kmp_free(ompd_env_block);
    ompd_env_block = NULL;
    ompd_env_block_size = 0;
  }
#endif

#if KMP_AFFINITY_SUPPORTED
  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
  __kmp_cpuinfo_file = NULL;
#endif /* KMP_AFFINITY_SUPPORTED */

#if KMP_USE_ADAPTIVE_LOCKS
#if KMP_DEBUG_ADAPTIVE_LOCKS
  __kmp_print_speculative_stats();
#endif
#endif
  // Release the nested nthreads list and the nested proc-bind list.
  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
  __kmp_nested_nth.nth = NULL;
  __kmp_nested_nth.size = 0;
  __kmp_nested_nth.used = 0;

  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
  __kmp_nested_proc_bind.bind_types = NULL;
  __kmp_nested_proc_bind.size = 0;
  __kmp_nested_proc_bind.used = 0;
  if (__kmp_affinity_format) {
    KMP_INTERNAL_FREE(__kmp_affinity_format);
    __kmp_affinity_format = NULL;
  }

  __kmp_i18n_catclose();

#if KMP_USE_HIER_SCHED
  __kmp_hier_scheds.deallocate();
#endif

#if KMP_STATS_ENABLED
  __kmp_stats_fini();
#endif

  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
}
8384
8385/* ------------------------------------------------------------------------ */
8386
8387int __kmp_ignore_mppbeg(void) {
8388 char *env;
8389
8390 if ((env = getenv(name: "KMP_IGNORE_MPPBEG")) != NULL) {
8391 if (__kmp_str_match_false(data: env))
8392 return FALSE;
8393 }
8394 // By default __kmpc_begin() is no-op.
8395 return TRUE;
8396}
8397
8398int __kmp_ignore_mppend(void) {
8399 char *env;
8400
8401 if ((env = getenv(name: "KMP_IGNORE_MPPEND")) != NULL) {
8402 if (__kmp_str_match_false(data: env))
8403 return FALSE;
8404 }
8405 // By default __kmpc_end() is no-op.
8406 return TRUE;
8407}
8408
8409void __kmp_internal_begin(void) {
8410 int gtid;
8411 kmp_root_t *root;
8412
8413 /* this is a very important step as it will register new sibling threads
8414 and assign these new uber threads a new gtid */
8415 gtid = __kmp_entry_gtid();
8416 root = __kmp_threads[gtid]->th.th_root;
8417 KMP_ASSERT(KMP_UBER_GTID(gtid));
8418
8419 if (root->r.r_begin)
8420 return;
8421 __kmp_acquire_lock(lck: &root->r.r_begin_lock, gtid);
8422 if (root->r.r_begin) {
8423 __kmp_release_lock(lck: &root->r.r_begin_lock, gtid);
8424 return;
8425 }
8426
8427 root->r.r_begin = TRUE;
8428
8429 __kmp_release_lock(lck: &root->r.r_begin_lock, gtid);
8430}
8431
8432/* ------------------------------------------------------------------------ */
8433
8434void __kmp_user_set_library(enum library_type arg) {
8435 int gtid;
8436 kmp_root_t *root;
8437 kmp_info_t *thread;
8438
8439 /* first, make sure we are initialized so we can get our gtid */
8440
8441 gtid = __kmp_entry_gtid();
8442 thread = __kmp_threads[gtid];
8443
8444 root = thread->th.th_root;
8445
8446 KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8447 library_serial));
8448 if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8449 thread */
8450 KMP_WARNING(SetLibraryIncorrectCall);
8451 return;
8452 }
8453
8454 switch (arg) {
8455 case library_serial:
8456 thread->th.th_set_nproc = 0;
8457 set__nproc(thread, 1);
8458 break;
8459 case library_turnaround:
8460 thread->th.th_set_nproc = 0;
8461 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8462 : __kmp_dflt_team_nth_ub);
8463 break;
8464 case library_throughput:
8465 thread->th.th_set_nproc = 0;
8466 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8467 : __kmp_dflt_team_nth_ub);
8468 break;
8469 default:
8470 KMP_FATAL(UnknownLibraryType, arg);
8471 }
8472
8473 __kmp_aux_set_library(arg);
8474}
8475
8476void __kmp_aux_set_stacksize(size_t arg) {
8477 if (!__kmp_init_serial)
8478 __kmp_serial_initialize();
8479
8480#if KMP_OS_DARWIN
8481 if (arg & (0x1000 - 1)) {
8482 arg &= ~(0x1000 - 1);
8483 if (arg + 0x1000) /* check for overflow if we round up */
8484 arg += 0x1000;
8485 }
8486#endif
8487 __kmp_acquire_bootstrap_lock(lck: &__kmp_initz_lock);
8488
8489 /* only change the default stacksize before the first parallel region */
8490 if (!TCR_4(__kmp_init_parallel)) {
8491 size_t value = arg; /* argument is in bytes */
8492
8493 if (value < __kmp_sys_min_stksize)
8494 value = __kmp_sys_min_stksize;
8495 else if (value > KMP_MAX_STKSIZE)
8496 value = KMP_MAX_STKSIZE;
8497
8498 __kmp_stksize = value;
8499
8500 __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8501 }
8502
8503 __kmp_release_bootstrap_lock(lck: &__kmp_initz_lock);
8504}
8505
8506/* set the behaviour of the runtime library */
8507/* TODO this can cause some odd behaviour with sibling parallelism... */
8508void __kmp_aux_set_library(enum library_type arg) {
8509 __kmp_library = arg;
8510
8511 switch (__kmp_library) {
8512 case library_serial: {
8513 KMP_INFORM(LibraryIsSerial);
8514 } break;
8515 case library_turnaround:
8516 if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8517 __kmp_use_yield = 2; // only yield when oversubscribed
8518 break;
8519 case library_throughput:
8520 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8521 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
8522 break;
8523 default:
8524 KMP_FATAL(UnknownLibraryType, arg);
8525 }
8526}
8527
/* Getting team information common for all team API */
// Returns NULL if not in teams construct
// On success, returns the team at the teams-construct level and sets
// teams_serialized to the remaining serialization count at that level
// (> 1 means the teams region itself was serialized).
static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
  kmp_info_t *thr = __kmp_entry_thread();
  teams_serialized = 0;
  if (thr->th.th_teams_microtask) {
    kmp_team_t *team = thr->th.th_team;
    int tlevel = thr->th.th_teams_level; // the level of the teams construct
    int ii = team->t.t_level;
    teams_serialized = team->t.t_serialized;
    int level = tlevel + 1;
    KMP_DEBUG_ASSERT(ii >= tlevel);
    // Walk up from the current team until reaching the level just above the
    // teams construct, consuming serialized nesting levels along the way.
    while (ii > level) {
      // Serialized levels within this team count toward the levels walked.
      for (teams_serialized = team->t.t_serialized;
           (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
      }
      // All of this team's serialization consumed: move to the parent team
      // without consuming another level.
      if (team->t.t_serialized && (!teams_serialized)) {
        team = team->t.t_parent;
        continue;
      }
      // Otherwise step one level up the team tree.
      if (ii > level) {
        team = team->t.t_parent;
        ii--;
      }
    }
    return team;
  }
  return NULL;
}
8557
8558int __kmp_aux_get_team_num() {
8559 int serialized;
8560 kmp_team_t *team = __kmp_aux_get_team_info(teams_serialized&: serialized);
8561 if (team) {
8562 if (serialized > 1) {
8563 return 0; // teams region is serialized ( 1 team of 1 thread ).
8564 } else {
8565 return team->t.t_master_tid;
8566 }
8567 }
8568 return 0;
8569}
8570
8571int __kmp_aux_get_num_teams() {
8572 int serialized;
8573 kmp_team_t *team = __kmp_aux_get_team_info(teams_serialized&: serialized);
8574 if (team) {
8575 if (serialized > 1) {
8576 return 1;
8577 } else {
8578 return team->t.t_parent->t.t_nproc;
8579 }
8580 }
8581 return 1;
8582}
8583
8584/* ------------------------------------------------------------------------ */
8585
8586/*
8587 * Affinity Format Parser
8588 *
8589 * Field is in form of: %[[[0].]size]type
8590 * % and type are required (%% means print a literal '%')
8591 * type is either single char or long name surrounded by {},
8592 * e.g., N or {num_threads}
8593 * 0 => leading zeros
8594 * . => right justified when size is specified
8595 * by default output is left justified
8596 * size is the *minimum* field length
8597 * All other characters are printed as is
8598 *
8599 * Available field types:
 * t {team_num}         - omp_get_team_num()
 * T {num_teams}        - omp_get_num_teams()
 * L {nesting_level}    - omp_get_level()
 * n {thread_num}       - omp_get_thread_num()
 * N {num_threads}      - omp_get_num_threads()
 * a {ancestor_tnum}    - omp_get_ancestor_thread_num(omp_get_level()-1)
 * H {host}             - name of host machine
 * P {process_id}       - process id (integer)
 * i {native_thread_id} - native thread identifier (integer)
 * A {thread_affinity}  - comma separated list of integers or integer ranges
 *                        (values of affinity mask)
8609 *
8610 * Implementation-specific field types can be added
8611 * If a type is unknown, print "undefined"
8612 */
8613
// Structure holding the short name, long name, and corresponding data type
// for snprintf. A table of these will represent the entire valid keyword
// field types.
typedef struct kmp_affinity_format_field_t {
  char short_name; // single-character field type, e.g., 'L'
  const char *long_name; // brace-enclosed long field name, e.g., "nesting_level"
  char field_format; // conversion character for snprintf (typically 'd' or 's'
  // for integer or string)
} kmp_affinity_format_field_t;
8623
8624static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8625#if KMP_AFFINITY_SUPPORTED
8626 {.short_name: 'A', .long_name: "thread_affinity", .field_format: 's'},
8627#endif
8628 {.short_name: 't', .long_name: "team_num", .field_format: 'd'},
8629 {.short_name: 'T', .long_name: "num_teams", .field_format: 'd'},
8630 {.short_name: 'L', .long_name: "nesting_level", .field_format: 'd'},
8631 {.short_name: 'n', .long_name: "thread_num", .field_format: 'd'},
8632 {.short_name: 'N', .long_name: "num_threads", .field_format: 'd'},
8633 {.short_name: 'a', .long_name: "ancestor_tnum", .field_format: 'd'},
8634 {.short_name: 'H', .long_name: "host", .field_format: 's'},
8635 {.short_name: 'P', .long_name: "process_id", .field_format: 'd'},
8636 {.short_name: 'i', .long_name: "native_thread_id", .field_format: 'd'}};
8637
// Parse a single %-field specification at *ptr (which must point at '%'),
// expand its value into field_buffer, and advance *ptr past the field.
// Return the number of characters it takes to hold field
static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
                                            const char **ptr,
                                            kmp_str_buf_t *field_buffer) {
  int rc, format_index, field_value;
  const char *width_left, *width_right; // half-open digit span [left, right)
  bool pad_zeros, right_justify, parse_long_name, found_valid_name;
  // Scratch snprintf format; worst case is '%' '-' '0' + 8 width digits +
  // conversion char + NUL = 12 chars, so 20 is ample (asserted at the end).
  static const int FORMAT_SIZE = 20;
  char format[FORMAT_SIZE] = {0};
  char absolute_short_name = 0; // canonicalized field name; 0 => unknown

  KMP_DEBUG_ASSERT(gtid >= 0);
  KMP_DEBUG_ASSERT(th);
  KMP_DEBUG_ASSERT(**ptr == '%');
  KMP_DEBUG_ASSERT(field_buffer);

  __kmp_str_buf_clear(buffer: field_buffer);

  // Skip the initial %
  (*ptr)++;

  // Check for %% first: emits a literal percent sign
  if (**ptr == '%') {
    __kmp_str_buf_cat(buffer: field_buffer, str: "%", len: 1);
    (*ptr)++; // skip over the second %
    return 1;
  }

  // Parse field modifiers if they are present
  pad_zeros = false;
  if (**ptr == '0') {
    pad_zeros = true;
    (*ptr)++; // skip over 0
  }
  right_justify = false;
  if (**ptr == '.') {
    right_justify = true;
    (*ptr)++; // skip over .
  }
  // Parse width of field: [width_left, width_right)
  width_left = width_right = NULL;
  if (**ptr >= '0' && **ptr <= '9') {
    width_left = *ptr;
    SKIP_DIGITS(*ptr);
    width_right = *ptr;
  }

  // Create the format for KMP_SNPRINTF based on flags parsed above
  format_index = 0;
  format[format_index++] = '%';
  if (!right_justify)
    format[format_index++] = '-'; // default is left-justified
  if (pad_zeros)
    format[format_index++] = '0';
  if (width_left && width_right) {
    int i = 0;
    // Only allow 8 digit number widths.
    // This also prevents overflowing format variable
    while (i < 8 && width_left < width_right) {
      format[format_index++] = *width_left;
      width_left++;
      i++;
    }
  }

  // Parse a name (long or short)
  // Canonicalize the name into absolute_short_name
  found_valid_name = false;
  parse_long_name = (**ptr == '{'); // long names are written as {long_name}
  if (parse_long_name)
    (*ptr)++; // skip initial left brace
  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
                             sizeof(__kmp_affinity_format_table[0]);
       ++i) {
    char short_name = __kmp_affinity_format_table[i].short_name;
    const char *long_name = __kmp_affinity_format_table[i].long_name;
    char field_format = __kmp_affinity_format_table[i].field_format;
    if (parse_long_name) {
      size_t length = KMP_STRLEN(s: long_name);
      if (strncmp(s1: *ptr, s2: long_name, n: length) == 0) {
        found_valid_name = true;
        (*ptr) += length; // skip the long name
      }
    } else if (**ptr == short_name) {
      found_valid_name = true;
      (*ptr)++; // skip the short name
    }
    if (found_valid_name) {
      // Append the conversion character ('d' or 's') and terminate the format
      format[format_index++] = field_format;
      format[format_index++] = '\0';
      absolute_short_name = short_name;
      break;
    }
  }
  if (parse_long_name) {
    if (**ptr != '}') {
      // Malformed long name (no closing brace): treat field as unknown
      absolute_short_name = 0;
    } else {
      (*ptr)++; // skip over the right brace
    }
  }

  // Attempt to fill the buffer with the requested
  // value using snprintf within __kmp_str_buf_print()
  switch (absolute_short_name) {
  case 't':
    rc = __kmp_str_buf_print(buffer: field_buffer, format, __kmp_aux_get_team_num());
    break;
  case 'T':
    rc = __kmp_str_buf_print(buffer: field_buffer, format, __kmp_aux_get_num_teams());
    break;
  case 'L':
    rc = __kmp_str_buf_print(buffer: field_buffer, format, th->th.th_team->t.t_level);
    break;
  case 'n':
    rc = __kmp_str_buf_print(buffer: field_buffer, format, __kmp_tid_from_gtid(gtid));
    break;
  case 'H': {
    // Host name is expanded into a local fixed-size buffer first
    static const int BUFFER_SIZE = 256;
    char buf[BUFFER_SIZE];
    __kmp_expand_host_name(buffer: buf, size: BUFFER_SIZE);
    rc = __kmp_str_buf_print(buffer: field_buffer, format, buf);
  } break;
  case 'P':
    rc = __kmp_str_buf_print(buffer: field_buffer, format, getpid());
    break;
  case 'i':
    rc = __kmp_str_buf_print(buffer: field_buffer, format, __kmp_gettid());
    break;
  case 'N':
    rc = __kmp_str_buf_print(buffer: field_buffer, format, th->th.th_team->t.t_nproc);
    break;
  case 'a':
    field_value =
        __kmp_get_ancestor_thread_num(gtid, level: th->th.th_team->t.t_level - 1);
    rc = __kmp_str_buf_print(buffer: field_buffer, format, field_value);
    break;
#if KMP_AFFINITY_SUPPORTED
  case 'A': {
    // Render the thread's affinity mask into a temporary string buffer
    kmp_str_buf_t buf;
    __kmp_str_buf_init(&buf);
    __kmp_affinity_str_buf_mask(buf: &buf, mask: th->th.th_affin_mask);
    rc = __kmp_str_buf_print(buffer: field_buffer, format, buf.str);
    __kmp_str_buf_free(buffer: &buf);
  } break;
#endif
  default:
    // According to spec, If an implementation does not have info for field
    // type, then "undefined" is printed
    rc = __kmp_str_buf_print(buffer: field_buffer, format: "%s", "undefined");
    // Skip the field so parsing can resume after it
    if (parse_long_name) {
      SKIP_TOKEN(*ptr);
      if (**ptr == '}')
        (*ptr)++;
    } else {
      (*ptr)++;
    }
  }

  KMP_ASSERT(format_index <= FORMAT_SIZE);
  return rc;
}
8801
8802/*
8803 * Return number of characters needed to hold the affinity string
8804 * (not including null byte character)
8805 * The resultant string is printed to buffer, which the caller can then
8806 * handle afterwards
8807 */
8808size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8809 kmp_str_buf_t *buffer) {
8810 const char *parse_ptr;
8811 size_t retval;
8812 const kmp_info_t *th;
8813 kmp_str_buf_t field;
8814
8815 KMP_DEBUG_ASSERT(buffer);
8816 KMP_DEBUG_ASSERT(gtid >= 0);
8817
8818 __kmp_str_buf_init(&field);
8819 __kmp_str_buf_clear(buffer);
8820
8821 th = __kmp_threads[gtid];
8822 retval = 0;
8823
8824 // If format is NULL or zero-length string, then we use
8825 // affinity-format-var ICV
8826 parse_ptr = format;
8827 if (parse_ptr == NULL || *parse_ptr == '\0') {
8828 parse_ptr = __kmp_affinity_format;
8829 }
8830 KMP_DEBUG_ASSERT(parse_ptr);
8831
8832 while (*parse_ptr != '\0') {
8833 // Parse a field
8834 if (*parse_ptr == '%') {
8835 // Put field in the buffer
8836 int rc = __kmp_aux_capture_affinity_field(gtid, th, ptr: &parse_ptr, field_buffer: &field);
8837 __kmp_str_buf_catbuf(dest: buffer, src: &field);
8838 retval += rc;
8839 } else {
8840 // Put literal character in buffer
8841 __kmp_str_buf_cat(buffer, str: parse_ptr, len: 1);
8842 retval++;
8843 parse_ptr++;
8844 }
8845 }
8846 __kmp_str_buf_free(buffer: &field);
8847 return retval;
8848}
8849
8850// Displays the affinity string to stdout
8851void __kmp_aux_display_affinity(int gtid, const char *format) {
8852 kmp_str_buf_t buf;
8853 __kmp_str_buf_init(&buf);
8854 __kmp_aux_capture_affinity(gtid, format, buffer: &buf);
8855 __kmp_fprintf(stream: kmp_out, format: "%s" KMP_END_OF_LINE, buf.str);
8856 __kmp_str_buf_free(buffer: &buf);
8857}
8858
8859/* ------------------------------------------------------------------------ */
8860void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8861 int blocktime = arg; /* argument is in microseconds */
8862#if KMP_USE_MONITOR
8863 int bt_intervals;
8864#endif
8865 kmp_int8 bt_set;
8866
8867 __kmp_save_internal_controls(thread);
8868
8869 /* Normalize and set blocktime for the teams */
8870 if (blocktime < KMP_MIN_BLOCKTIME)
8871 blocktime = KMP_MIN_BLOCKTIME;
8872 else if (blocktime > KMP_MAX_BLOCKTIME)
8873 blocktime = KMP_MAX_BLOCKTIME;
8874
8875 set__blocktime_team(thread->th.th_team, tid, blocktime);
8876 set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8877
8878#if KMP_USE_MONITOR
8879 /* Calculate and set blocktime intervals for the teams */
8880 bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8881
8882 set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8883 set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8884#endif
8885
8886 /* Set whether blocktime has been set to "TRUE" */
8887 bt_set = TRUE;
8888
8889 set__bt_set_team(thread->th.th_team, tid, bt_set);
8890 set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8891#if KMP_USE_MONITOR
8892 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8893 "bt_intervals=%d, monitor_updates=%d\n",
8894 __kmp_gtid_from_tid(tid, thread->th.th_team),
8895 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8896 __kmp_monitor_wakeups));
8897#else
8898 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8899 __kmp_gtid_from_tid(tid, thread->th.th_team),
8900 thread->th.th_team->t.t_id, tid, blocktime));
8901#endif
8902}
8903
8904void __kmp_aux_set_defaults(char const *str, size_t len) {
8905 if (!__kmp_init_serial) {
8906 __kmp_serial_initialize();
8907 }
8908 __kmp_env_initialize(str);
8909
8910 if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8911 __kmp_env_print();
8912 }
8913} // __kmp_aux_set_defaults
8914
8915/* ------------------------------------------------------------------------ */
8916/* internal fast reduction routines */
8917
8918PACKED_REDUCTION_METHOD_T
8919__kmp_determine_reduction_method(
8920 ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8921 void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8922 kmp_critical_name *lck) {
8923
8924 // Default reduction method: critical construct ( lck != NULL, like in current
8925 // PAROPT )
8926 // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8927 // can be selected by RTL
8928 // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8929 // can be selected by RTL
8930 // Finally, it's up to OpenMP RTL to make a decision on which method to select
8931 // among generated by PAROPT.
8932
8933 PACKED_REDUCTION_METHOD_T retval;
8934
8935 int team_size;
8936
8937 KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8938
8939#define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8940 (loc && \
8941 ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)))
8942#define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8943
8944 retval = critical_reduce_block;
8945
8946 // another choice of getting a team size (with 1 dynamic deference) is slower
8947 team_size = __kmp_get_team_num_threads(global_tid);
8948 if (team_size == 1) {
8949
8950 retval = empty_reduce_block;
8951
8952 } else {
8953
8954 int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8955
8956#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
8957 KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || \
8958 KMP_ARCH_VE || KMP_ARCH_S390X || KMP_ARCH_WASM
8959
8960#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8961 KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HAIKU || \
8962 KMP_OS_HURD || KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
8963
8964 int teamsize_cutoff = 4;
8965
8966#if KMP_MIC_SUPPORTED
8967 if (__kmp_mic_type != non_mic) {
8968 teamsize_cutoff = 8;
8969 }
8970#endif
8971 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8972 if (tree_available) {
8973 if (team_size <= teamsize_cutoff) {
8974 if (atomic_available) {
8975 retval = atomic_reduce_block;
8976 }
8977 } else {
8978 retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8979 }
8980 } else if (atomic_available) {
8981 retval = atomic_reduce_block;
8982 }
8983#else
8984#error "Unknown or unsupported OS"
8985#endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8986 // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HAIKU ||
8987 // KMP_OS_HURD || KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
8988
8989#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS || \
8990 KMP_ARCH_WASM || KMP_ARCH_PPC || KMP_ARCH_AARCH64_32 || KMP_ARCH_SPARC
8991
8992#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8993 KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_HAIKU || KMP_OS_HURD || \
8994 KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
8995
8996 // basic tuning
8997
8998 if (atomic_available) {
8999 if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
9000 retval = atomic_reduce_block;
9001 }
9002 } // otherwise: use critical section
9003
9004#elif KMP_OS_DARWIN
9005
9006 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
9007 if (atomic_available && (num_vars <= 3)) {
9008 retval = atomic_reduce_block;
9009 } else if (tree_available) {
9010 if ((reduce_size > (9 * sizeof(kmp_real64))) &&
9011 (reduce_size < (2000 * sizeof(kmp_real64)))) {
9012 retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
9013 }
9014 } // otherwise: use critical section
9015
9016#else
9017#error "Unknown or unsupported OS"
9018#endif
9019
9020#else
9021#error "Unknown or unsupported architecture"
9022#endif
9023 }
9024
9025 // KMP_FORCE_REDUCTION
9026
9027 // If the team is serialized (team_size == 1), ignore the forced reduction
9028 // method and stay with the unsynchronized method (empty_reduce_block)
9029 if (__kmp_force_reduction_method != reduction_method_not_defined &&
9030 team_size != 1) {
9031
9032 PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
9033
9034 int atomic_available, tree_available;
9035
9036 switch ((forced_retval = __kmp_force_reduction_method)) {
9037 case critical_reduce_block:
9038 KMP_ASSERT(lck); // lck should be != 0
9039 break;
9040
9041 case atomic_reduce_block:
9042 atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
9043 if (!atomic_available) {
9044 KMP_WARNING(RedMethodNotSupported, "atomic");
9045 forced_retval = critical_reduce_block;
9046 }
9047 break;
9048
9049 case tree_reduce_block:
9050 tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
9051 if (!tree_available) {
9052 KMP_WARNING(RedMethodNotSupported, "tree");
9053 forced_retval = critical_reduce_block;
9054 } else {
9055#if KMP_FAST_REDUCTION_BARRIER
9056 forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
9057#endif
9058 }
9059 break;
9060
9061 default:
9062 KMP_ASSERT(0); // "unsupported method specified"
9063 }
9064
9065 retval = forced_retval;
9066 }
9067
9068 KA_TRACE(10, ("reduction method selected=%08x\n", retval));
9069
9070#undef FAST_REDUCTION_TREE_METHOD_GENERATED
9071#undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
9072
9073 return (retval);
9074}
// this function is for testing set/get/determine reduce method
// Returns the packed reduction method of the calling thread, shifted right by
// 8 bits (the lower bits are dropped).
kmp_int32 __kmp_get_reduce_method(void) {
  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
}
9079
// Soft pause sets up threads to ignore blocktime and just go to sleep.
// Spin-wait code checks __kmp_pause_status and reacts accordingly.
void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
9083
// Hard pause shuts down the runtime completely. Resume happens naturally when
// OpenMP is used subsequently.
void __kmp_hard_pause() {
  __kmp_pause_status = kmp_hard_paused;
  // Tear down the runtime for all threads (gtid -1 = not a specific thread)
  __kmp_internal_end_thread(gtid_req: -1);
}
9090
// Soft resume sets __kmp_pause_status, and wakes up all threads.
// No-op unless the runtime is currently soft-paused.
void __kmp_resume_if_soft_paused() {
  if (__kmp_pause_status == kmp_soft_paused) {
    __kmp_pause_status = kmp_not_paused;

    // gtid 0 (initial thread) is skipped; it is the one running this code
    for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
      kmp_info_t *thread = __kmp_threads[gtid];
      if (thread) { // Wake it if sleeping
        kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
                         thread);
        if (fl.is_sleeping())
          fl.resume(th_gtid: gtid);
        else if (__kmp_try_suspend_mx(th: thread)) { // got suspend lock
          __kmp_unlock_suspend_mx(th: thread); // unlock it; it won't sleep
        } else { // thread holds the lock and may sleep soon
          do { // until either the thread sleeps, or we can get the lock
            if (fl.is_sleeping()) {
              fl.resume(th_gtid: gtid);
              break;
            } else if (__kmp_try_suspend_mx(th: thread)) {
              __kmp_unlock_suspend_mx(th: thread);
              break;
            }
          } while (1);
        }
      }
    }
  }
}
9120
9121// This function is called via __kmpc_pause_resource. Returns 0 if successful.
9122// TODO: add warning messages
9123int __kmp_pause_resource(kmp_pause_status_t level) {
9124 if (level == kmp_not_paused) { // requesting resume
9125 if (__kmp_pause_status == kmp_not_paused) {
9126 // error message about runtime not being paused, so can't resume
9127 return 1;
9128 } else {
9129 KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
9130 __kmp_pause_status == kmp_hard_paused);
9131 __kmp_pause_status = kmp_not_paused;
9132 return 0;
9133 }
9134 } else if (level == kmp_soft_paused) { // requesting soft pause
9135 if (__kmp_pause_status != kmp_not_paused) {
9136 // error message about already being paused
9137 return 1;
9138 } else {
9139 __kmp_soft_pause();
9140 return 0;
9141 }
9142 } else if (level == kmp_hard_paused || level == kmp_stop_tool_paused) {
9143 // requesting hard pause or stop_tool pause
9144 if (__kmp_pause_status != kmp_not_paused) {
9145 // error message about already being paused
9146 return 1;
9147 } else {
9148 __kmp_hard_pause();
9149 return 0;
9150 }
9151 } else {
9152 // error message about invalid level
9153 return 1;
9154 }
9155}
9156
// Print the OMP_DISPLAY_ENV output (verbose or terse), serially initializing
// the runtime first if needed. Holds the init bootstrap lock throughout.
void __kmp_omp_display_env(int verbose) {
  __kmp_acquire_bootstrap_lock(lck: &__kmp_initz_lock);
  if (__kmp_init_serial == 0)
    __kmp_do_serial_initialize();
  __kmp_display_env_impl(display_env: !verbose, display_env_verbose: verbose);
  __kmp_release_bootstrap_lock(lck: &__kmp_initz_lock);
}
9164
// The team size is changing, so distributed barrier must be modified.
// th_used_in_team states used here: 0 = not in team, 1 = in team,
// 2 = transitioning out, 3 = transitioning in (see __kmp_add_threads_to_team).
void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
                               int new_nthreads) {
  KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
                   bp_dist_bar);
  kmp_info_t **other_threads = team->t.t_threads;

  // We want all the workers to stop waiting on the barrier while we adjust the
  // size of the team.
  for (int f = 1; f < old_nthreads; ++f) {
    KMP_DEBUG_ASSERT(other_threads[f] != NULL);
    // Ignore threads that are already inactive or not present in the team
    if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
      // teams construct causes thread_limit to get passed in, and some of
      // those could be inactive; just ignore them
      continue;
    }
    // If thread is transitioning still to in_use state, wait for it
    if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
      while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
        KMP_CPU_PAUSE();
    }
    // The thread should be in_use now
    KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
    // Transition to unused state
    team->t.t_threads[f]->th.th_used_in_team.store(i: 2);
    KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
  }
  // Release all the workers
  team->t.b->go_release();

  // Full memory fence so workers observe the state stores above
  KMP_MFENCE();

  // Workers should see transition status 2 and move to 0; but may need to be
  // woken up first
  int count = old_nthreads - 1;
  while (count > 0) {
    // Re-scan all workers each pass until every one has reached state 0
    count = old_nthreads - 1;
    for (int f = 1; f < old_nthreads; ++f) {
      if (other_threads[f]->th.th_used_in_team.load() != 0) {
        if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
          kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
              void *, other_threads[f]->th.th_sleep_loc);
          __kmp_atomic_resume_64(target_gtid: other_threads[f]->th.th_info.ds.ds_gtid, flag);
        }
      } else {
        KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
        count--;
      }
    }
  }
  // Now update the barrier size
  team->t.b->update_num_threads(nthr: new_nthreads);
  team->t.b->go_reset();
}
9220
// Rejoin threads [1, new_nthreads) to `team` after a distributed-barrier
// resize, then wait until all of them have completed the transition.
void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
  // Add the threads back to the team
  KMP_DEBUG_ASSERT(team);
  // Threads were paused and pointed at th_used_in_team temporarily during a
  // resize of the team. We're going to set th_used_in_team to 3 to indicate to
  // the thread that it should transition itself back into the team. Then, if
  // blocktime isn't infinite, the thread could be sleeping, so we send a resume
  // to wake it up.
  for (int f = 1; f < new_nthreads; ++f) {
    KMP_DEBUG_ASSERT(team->t.t_threads[f]);
    // CAS 0 -> 3; a thread not currently at 0 is left alone
    (void)KMP_COMPARE_AND_STORE_ACQ32(
        &(team->t.t_threads[f]->th.th_used_in_team), 0, 3);
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
      __kmp_resume_32(target_gtid: team->t.t_threads[f]->th.th_info.ds.ds_gtid,
                      flag: (kmp_flag_32<false, false> *)NULL);
    }
  }
  // The threads should be transitioning to the team; when they are done, they
  // should have set th_used_in_team to 1. This loop forces master to wait until
  // all threads have moved into the team and are waiting in the barrier.
  int count = new_nthreads - 1;
  while (count > 0) {
    count = new_nthreads - 1;
    for (int f = 1; f < new_nthreads; ++f) {
      if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
        count--;
      }
    }
  }
}
9251
// Globals and functions for hidden helper task
kmp_info_t **__kmp_hidden_helper_threads; // array of hidden helper threads
kmp_info_t *__kmp_hidden_helper_main_thread; // root of hidden helper team
std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
#if KMP_OS_LINUX
// Hidden helper threads are enabled by default only on Linux
kmp_int32 __kmp_hidden_helper_threads_num = 8;
kmp_int32 __kmp_enable_hidden_helper = TRUE;
#else
kmp_int32 __kmp_hidden_helper_threads_num = 0;
kmp_int32 __kmp_enable_hidden_helper = FALSE;
#endif
9263
namespace {
// Number of hidden helper threads that have entered the wrapper below; used
// as a rendezvous so all helpers are known to be running.
std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;

// Microtask run by every hidden helper thread after the team is forked.
void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
  // This is an explicit synchronization on all hidden helper threads in case
  // that when a regular thread pushes a hidden helper task to one hidden
  // helper thread, the thread has not been awaken once since they're released
  // by the main thread after creating the team.
  KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
  // Spin until every hidden helper thread has checked in
  while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
         __kmp_hidden_helper_threads_num)
    ;

  // If main thread, then wait for signal
  if (__kmpc_master(nullptr, global_tid: *gtid)) {
    // First, unset the initial state and release the initial thread
    TCW_4(__kmp_init_hidden_helper_threads, FALSE);
    __kmp_hidden_helper_initz_release();
    __kmp_hidden_helper_main_thread_wait();
    // Now wake up all worker threads
    for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
      __kmp_hidden_helper_worker_thread_signal();
    }
  }
}
} // namespace
9290
// Entry routine for hidden helper initialization: registers a new root,
// forks the hidden helper team, and releases the deinit waiter when done.
void __kmp_hidden_helper_threads_initz_routine() {
  // Create a new root for hidden helper team/threads
  const int gtid = __kmp_register_root(TRUE);
  __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
  __kmp_hidden_helper_threads = &__kmp_threads[gtid];
  // Request the configured number of threads for the upcoming fork
  __kmp_hidden_helper_main_thread->th.th_set_nproc =
      __kmp_hidden_helper_threads_num;

  // Reset the rendezvous counter used by the wrapper function
  KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);

  __kmpc_fork_call(nullptr, nargs: 0, microtask: __kmp_hidden_helper_wrapper_fn);

  // Set the initialization flag to FALSE
  TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);

  __kmp_hidden_helper_threads_deinitz_release();
}
9308
9309/* Nesting Mode:
9310 Set via KMP_NESTING_MODE, which takes an integer.
9311 Note: we skip duplicate topology levels, and skip levels with only
9312 one entity.
9313 KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
9314 KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
9315 in the topology, and initializes the number of threads at each of those
9316 levels to the number of entities at each level, respectively, below the
9317 entity at the parent level.
9318 KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
9319 but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
9320 the user to turn nesting on explicitly. This is an even more experimental
9321 option to this experimental feature, and may change or go away in the
9322 future.
9323*/
9324
9325// Allocate space to store nesting levels
9326void __kmp_init_nesting_mode() {
9327 int levels = KMP_HW_LAST;
9328 __kmp_nesting_mode_nlevels = levels;
9329 __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
9330 for (int i = 0; i < levels; ++i)
9331 __kmp_nesting_nth_level[i] = 0;
9332 if (__kmp_nested_nth.size < levels) {
9333 __kmp_nested_nth.nth =
9334 (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
9335 __kmp_nested_nth.size = levels;
9336 }
9337}
9338
// Set # threads for top levels of nesting; must be called after topology set
void __kmp_set_nesting_mode_threads() {
  kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];

  if (__kmp_nesting_mode == 1)
    __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
  else if (__kmp_nesting_mode > 1)
    __kmp_nesting_mode_nlevels = __kmp_nesting_mode;

  if (__kmp_topology) { // use topology info
    int loc, hw_level;
    for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
                                loc < __kmp_nesting_mode_nlevels;
         loc++, hw_level++) {
      __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(level: hw_level);
      // Skip levels with only one entity: back up `loc` so the next hardware
      // level overwrites this slot
      if (__kmp_nesting_nth_level[loc] == 1)
        loc--;
    }
    // Make sure all cores are used
    if (__kmp_nesting_mode > 1 && loc > 1) {
      int core_level = __kmp_topology->get_level(type: KMP_HW_CORE);
      int num_cores = __kmp_topology->get_count(level: core_level);
      int upper_levels = 1;
      // Product of thread counts of all levels above the last one
      for (int level = 0; level < loc - 1; ++level)
        upper_levels *= __kmp_nesting_nth_level[level];
      // If the product of all levels does not cover every core, bump the
      // last level so that it does
      if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
        __kmp_nesting_nth_level[loc - 1] =
            num_cores / __kmp_nesting_nth_level[loc - 2];
    }
    __kmp_nesting_mode_nlevels = loc;
    __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
  } else { // no topology info available; provide a reasonable guesstimation
    if (__kmp_avail_proc >= 4) {
      __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
      __kmp_nesting_nth_level[1] = 2;
      __kmp_nesting_mode_nlevels = 2;
    } else {
      __kmp_nesting_nth_level[0] = __kmp_avail_proc;
      __kmp_nesting_mode_nlevels = 1;
    }
    __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
  }
  // Publish the per-level thread counts to the nested-nth ICV table
  for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
    __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
  }
  set__nproc(thread, __kmp_nesting_nth_level[0]);
  if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
    __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
  if (get__max_active_levels(thread) > 1) {
    // if max levels was set, set nesting mode levels to same
    __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
  }
  if (__kmp_nesting_mode == 1) // turn on nesting for this case only
    set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
}
9394
// Empty symbols to export (see exports_so.txt) when feature is disabled
extern "C" {
#if !KMP_STATS_ENABLED
// Stats collection compiled out: exported symbol becomes a no-op
void __kmp_reset_stats() {}
#endif
#if !USE_DEBUGGER
// Debugger support compiled out: export inert flags
int __kmp_omp_debug_struct_info = FALSE;
int __kmp_debugging = FALSE;
#endif
#if !USE_ITT_BUILD || !USE_ITT_NOTIFY
// ITT notification support compiled out: exported init/fini become no-ops
void __kmp_itt_fini_ittlib() {}
void __kmp_itt_init_ittlib() {}
#endif
}
9409
9410// end of file
9411

// source code of openmp/runtime/src/kmp_runtime.cpp