kmp_tasking.cpp source code [openmp/runtime/src/kmp_tasking.cpp]

1	/*
2	* kmp_tasking.cpp -- OpenMP 3.0 tasking support.
3	*/
4
5	//===----------------------------------------------------------------------===//
6	//
7	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8	// See https://llvm.org/LICENSE.txt for license information.
9	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10	//
11	//===----------------------------------------------------------------------===//
12
13	#include "kmp.h"
14	#include "kmp_i18n.h"
15	#include "kmp_itt.h"
16	#include "kmp_stats.h"
17	#include "kmp_wait_release.h"
18	#include "kmp_taskdeps.h"
19
20	#if OMPT_SUPPORT
21	#include "ompt-specific.h"
22	#endif
23
#if ENABLE_LIBOMPTARGET
// Pointer to the "__tgt_target_nowait_query" entry point. It has static
// storage, so it is null until __kmp_init_target_task() fills it in.
static void (*tgt_target_nowait_query)(void **);

// __kmp_init_target_task: look up "__tgt_target_nowait_query" with KMP_DLSYM
// and remember the result in tgt_target_nowait_query.
void __kmp_init_target_task() {
  // The lookup result is stored through a void ** view of the function
  // pointer's storage instead of being converted to a function pointer type.
  *(void **)(&tgt_target_nowait_query) = KMP_DLSYM("__tgt_target_nowait_query");
}
#endif
31
32	/ forward declaration /
33	static void __kmp_enable_tasking(kmp_task_team_t *task_team,
34	kmp_info_t *this_thr);
35	static void __kmp_alloc_task_deque(kmp_info_t *thread,
36	kmp_thread_data_t *thread_data);
37	static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
38	kmp_task_team_t *task_team);
39	static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
40	#if OMPX_TASKGRAPH
41	static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id);
42	int __kmp_taskloop_task(int gtid, void *ptask);
43	#endif
44
45	#ifdef BUILD_TIED_TASK_STACK
46
47	// __kmp_trace_task_stack: print the tied tasks from the task stack in order
48	// from top do bottom
49	//
50	// gtid: global thread identifier for thread containing stack
51	// thread_data: thread data for task team thread containing stack
52	// threshold: value above which the trace statement triggers
53	// location: string identifying call site of this function (for trace)
54	static void __kmp_trace_task_stack(kmp_int32 gtid,
55	kmp_thread_data_t *thread_data,
56	int threshold, char *location) {
57	kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
58	kmp_taskdata_t **stack_top = task_stack->ts_top;
59	kmp_int32 entries = task_stack->ts_entries;
60	kmp_taskdata_t *tied_task;
61
62	KA_TRACE(
63	threshold,
64	("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
65	"first_block = %p, stack_top = %p \n",
66	location, gtid, entries, task_stack->ts_first_block, stack_top));
67
68	KMP_DEBUG_ASSERT(stack_top != NULL);
69	KMP_DEBUG_ASSERT(entries > `0`);
70
71	while (entries != `0`) {
72	KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[`0`]);
73	// fix up ts_top if we need to pop from previous block
74	if (entries & TASK_STACK_INDEX_MASK == `0`) {
75	kmp_stack_block_t stack_block = (kmp_stack_block_t )(stack_top);
76
77	stack_block = stack_block->sb_prev;
78	stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
79	}
80
81	// finish bookkeeping
82	stack_top--;
83	entries--;
84
85	tied_task = *stack_top;
86
87	KMP_DEBUG_ASSERT(tied_task != NULL);
88	KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
89
90	KA_TRACE(threshold,
91	("__kmp_trace_task_stack(%s): gtid=%d, entry=%d, "
92	"stack_top=%p, tied_task=%p\n",
93	location, gtid, entries, stack_top, tied_task));
94	}
95	KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[`0`]);
96
97	KA_TRACE(threshold,
98	("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
99	location, gtid));
100	}
101
102	// __kmp_init_task_stack: initialize the task stack for the first time
103	// after a thread_data structure is created.
104	// It should not be necessary to do this again (assuming the stack works).
105	//
106	// gtid: global thread identifier of calling thread
107	// thread_data: thread data for task team thread containing stack
108	static void __kmp_init_task_stack(kmp_int32 gtid,
109	kmp_thread_data_t *thread_data) {
110	kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
111	kmp_stack_block_t *first_block;
112
113	// set up the first block of the stack
114	first_block = &task_stack->ts_first_block;
115	task_stack->ts_top = (kmp_taskdata_t **)first_block;
116	memset((void *)first_block, `'\0'`,
117	TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));
118
119	// initialize the stack to be empty
120	task_stack->ts_entries = TASK_STACK_EMPTY;
121	first_block->sb_next = NULL;
122	first_block->sb_prev = NULL;
123	}
124
125	// __kmp_free_task_stack: free the task stack when thread_data is destroyed.
126	//
127	// gtid: global thread identifier for calling thread
128	// thread_data: thread info for thread containing stack
129	static void __kmp_free_task_stack(kmp_int32 gtid,
130	kmp_thread_data_t *thread_data) {
131	kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
132	kmp_stack_block_t *stack_block = &task_stack->ts_first_block;
133
134	KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY);
135	// free from the second block of the stack
136	while (stack_block != NULL) {
137	kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL;
138
139	stack_block->sb_next = NULL;
140	stack_block->sb_prev = NULL;
141	if (stack_block != &task_stack->ts_first_block) {
142	__kmp_thread_free(thread,
143	stack_block); // free the block, if not the first
144	}
145	stack_block = next_block;
146	}
147	// initialize the stack to be empty
148	task_stack->ts_entries = `0`;
149	task_stack->ts_top = NULL;
150	}
151
152	// __kmp_push_task_stack: Push the tied task onto the task stack.
153	// Grow the stack if necessary by allocating another block.
154	//
155	// gtid: global thread identifier for calling thread
156	// thread: thread info for thread containing stack
157	// tied_task: the task to push on the stack
158	static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread,
159	kmp_taskdata_t *tied_task) {
160	// GEH - need to consider what to do if tt_threads_data not allocated yet
161	kmp_thread_data_t *thread_data =
162	&thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
163	kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
164
165	if (tied_task->td_flags.team_serial \|\| tied_task->td_flags.tasking_ser) {
166	return; // Don't push anything on stack if team or team tasks are serialized
167	}
168
169	KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
170	KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
171
172	KA_TRACE(`20`,
173	("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
174	gtid, thread, tied_task));
175	// Store entry
176	*(task_stack->ts_top) = tied_task;
177
178	// Do bookkeeping for next push
179	task_stack->ts_top++;
180	task_stack->ts_entries++;
181
182	if (task_stack->ts_entries & TASK_STACK_INDEX_MASK == `0`) {
183	// Find beginning of this task block
184	kmp_stack_block_t *stack_block =
185	(kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE);
186
187	// Check if we already have a block
188	if (stack_block->sb_next !=
189	NULL) { // reset ts_top to beginning of next block
190	task_stack->ts_top = &stack_block->sb_next->sb_block[`0`];
191	} else { // Alloc new block and link it up
192	kmp_stack_block_t new_block = (kmp_stack_block_t )__kmp_thread_calloc(
193	thread, sizeof(kmp_stack_block_t));
194
195	task_stack->ts_top = &new_block->sb_block[`0`];
196	stack_block->sb_next = new_block;
197	new_block->sb_prev = stack_block;
198	new_block->sb_next = NULL;
199
200	KA_TRACE(
201	`30`,
202	("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
203	gtid, tied_task, new_block));
204	}
205	}
206	KA_TRACE(`20`, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
207	tied_task));
208	}
209
210	// __kmp_pop_task_stack: Pop the tied task from the task stack. Don't return
211	// the task, just check to make sure it matches the ending task passed in.
212	//
213	// gtid: global thread identifier for the calling thread
214	// thread: thread info structure containing stack
215	// tied_task: the task popped off the stack
216	// ending_task: the task that is ending (should match popped task)
217	static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread,
218	kmp_taskdata_t *ending_task) {
219	// GEH - need to consider what to do if tt_threads_data not allocated yet
220	kmp_thread_data_t *thread_data =
221	&thread->th.th_task_team->tt_threads_data[__kmp_tid_from_gtid(gtid)];
222	kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
223	kmp_taskdata_t *tied_task;
224
225	if (ending_task->td_flags.team_serial \|\| ending_task->td_flags.tasking_ser) {
226	// Don't pop anything from stack if team or team tasks are serialized
227	return;
228	}
229
230	KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
231	KMP_DEBUG_ASSERT(task_stack->ts_entries > `0`);
232
233	KA_TRACE(`20`, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid,
234	thread));
235
236	// fix up ts_top if we need to pop from previous block
237	if (task_stack->ts_entries & TASK_STACK_INDEX_MASK == `0`) {
238	kmp_stack_block_t stack_block = (kmp_stack_block_t )(task_stack->ts_top);
239
240	stack_block = stack_block->sb_prev;
241	task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
242	}
243
244	// finish bookkeeping
245	task_stack->ts_top--;
246	task_stack->ts_entries--;
247
248	tied_task = *(task_stack->ts_top);
249
250	KMP_DEBUG_ASSERT(tied_task != NULL);
251	KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
252	KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly
253
254	KA_TRACE(`20`, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
255	tied_task));
256	return;
257	}
258	#endif /* BUILD_TIED_TASK_STACK */
259
260	// returns 1 if new task is allowed to execute, 0 otherwise
261	// checks Task Scheduling constraint (if requested) and
262	// mutexinoutset dependencies if any
263	static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
264	const kmp_taskdata_t *tasknew,
265	const kmp_taskdata_t *taskcurr) {
266	if (is_constrained && (tasknew->td_flags.tiedness == TASK_TIED)) {
267	// Check if the candidate obeys the Task Scheduling Constraints (TSC)
268	// only descendant of all deferred tied tasks can be scheduled, checking
269	// the last one is enough, as it in turn is the descendant of all others
270	kmp_taskdata_t *current = taskcurr->td_last_tied;
271	KMP_DEBUG_ASSERT(current != NULL);
272	// check if the task is not suspended on barrier
273	if (current->td_flags.tasktype == TASK_EXPLICIT \|\|
274	current->td_taskwait_thread > `0`) { // <= 0 on barrier
275	kmp_int32 level = current->td_level;
276	kmp_taskdata_t *parent = tasknew->td_parent;
277	while (parent != current && parent->td_level > level) {
278	// check generation up to the level of the current task
279	parent = parent->td_parent;
280	KMP_DEBUG_ASSERT(parent != NULL);
281	}
282	if (parent != current)
283	return false;
284	}
285	}
286	// Check mutexinoutset dependencies, acquire locks
287	kmp_depnode_t *node = tasknew->td_depnode;
288	#if OMPX_TASKGRAPH
289	if (!tasknew->is_taskgraph && UNLIKELY(node && (node->dn.mtx_num_locks > `0`))) {
290	#else
291	if (UNLIKELY(node && (node->dn.mtx_num_locks > `0`))) {
292	#endif
293	for (int i = `0`; i < node->dn.mtx_num_locks; ++i) {
294	KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
295	if (__kmp_test_lock(lck: node->dn.mtx_locks[i], gtid))
296	continue;
297	// could not get the lock, release previous locks
298	for (int j = i - `1`; j >= `0`; --j)
299	__kmp_release_lock(lck: node->dn.mtx_locks[j], gtid);
300	return false;
301	}
302	// negative num_locks means all locks acquired successfully
303	node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
304	}
305	return true;
306	}
307
308	// __kmp_realloc_task_deque:
309	// Re-allocates a task deque for a particular thread, copies the content from
310	// the old deque and adjusts the necessary data structures relating to the
311	// deque. This operation must be done with the deque_lock being held
312	static void __kmp_realloc_task_deque(kmp_info_t *thread,
313	kmp_thread_data_t *thread_data) {
314	kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
315	KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == size);
316	kmp_int32 new_size = `2` * size;
317
318	KE_TRACE(`10`, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
319	"%d] for thread_data %p\n",
320	__kmp_gtid_from_thread(thread), size, new_size, thread_data));
321
322	kmp_taskdata_t **new_deque =
323	(kmp_taskdata_t *)__kmp_allocate(new_size sizeof(kmp_taskdata_t *));
324
325	int i, j;
326	for (i = thread_data->td.td_deque_head, j = `0`; j < size;
327	i = (i + `1`) & TASK_DEQUE_MASK(thread_data->td), j++)
328	new_deque[j] = thread_data->td.td_deque[i];
329
330	__kmp_free(thread_data->td.td_deque);
331
332	thread_data->td.td_deque_head = `0`;
333	thread_data->td.td_deque_tail = size;
334	thread_data->td.td_deque = new_deque;
335	thread_data->td.td_deque_size = new_size;
336	}
337
338	static kmp_task_pri_t *__kmp_alloc_task_pri_list() {
339	kmp_task_pri_t l = (kmp_task_pri_t )__kmp_allocate(sizeof(kmp_task_pri_t));
340	kmp_thread_data_t *thread_data = &l->td;
341	__kmp_init_bootstrap_lock(lck: &thread_data->td.td_deque_lock);
342	thread_data->td.td_deque_last_stolen = -`1`;
343	KE_TRACE(`20`, ("__kmp_alloc_task_pri_list: T#%d allocating deque[%d] "
344	"for thread_data %p\n",
345	__kmp_get_gtid(), INITIAL_TASK_DEQUE_SIZE, thread_data));
346	thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
347	INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
348	thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
349	return l;
350	}
351
352	// The function finds the deque of priority tasks with given priority, or
353	// allocates a new deque and put it into sorted (high -> low) list of deques.
354	// Deques of non-default priority tasks are shared between all threads in team,
355	// as opposed to per-thread deques of tasks with default priority.
356	// The function is called under the lock task_team->tt.tt_task_pri_lock.
357	static kmp_thread_data_t *
358	__kmp_get_priority_deque_data(kmp_task_team_t *task_team, kmp_int32 pri) {
359	kmp_thread_data_t *thread_data;
360	kmp_task_pri_t *lst = task_team->tt.tt_task_pri_list;
361	if (lst->priority == pri) {
362	// Found queue of tasks with given priority.
363	thread_data = &lst->td;
364	} else if (lst->priority < pri) {
365	// All current priority queues contain tasks with lower priority.
366	// Allocate new one for given priority tasks.
367	kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
368	thread_data = &list->td;
369	list->priority = pri;
370	list->next = lst;
371	task_team->tt.tt_task_pri_list = list;
372	} else { // task_team->tt.tt_task_pri_list->priority > pri
373	kmp_task_pri_t *next_queue = lst->next;
374	while (next_queue && next_queue->priority > pri) {
375	lst = next_queue;
376	next_queue = lst->next;
377	}
378	// lst->priority > pri && (next == NULL \|\| pri >= next->priority)
379	if (next_queue == NULL) {
380	// No queue with pri priority, need to allocate new one.
381	kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
382	thread_data = &list->td;
383	list->priority = pri;
384	list->next = NULL;
385	lst->next = list;
386	} else if (next_queue->priority == pri) {
387	// Found queue of tasks with given priority.
388	thread_data = &next_queue->td;
389	} else { // lst->priority > pri > next->priority
390	// insert newly allocated between existed queues
391	kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
392	thread_data = &list->td;
393	list->priority = pri;
394	list->next = next_queue;
395	lst->next = list;
396	}
397	}
398	return thread_data;
399	}
400
401	// __kmp_push_priority_task: Add a task to the team's priority task deque
402	static kmp_int32 __kmp_push_priority_task(kmp_int32 gtid, kmp_info_t *thread,
403	kmp_taskdata_t *taskdata,
404	kmp_task_team_t *task_team,
405	kmp_int32 pri) {
406	kmp_thread_data_t *thread_data = NULL;
407	KA_TRACE(`20`,
408	("__kmp_push_priority_task: T#%d trying to push task %p, pri %d.\n",
409	gtid, taskdata, pri));
410
411	// Find task queue specific to priority value
412	kmp_task_pri_t *lst = task_team->tt.tt_task_pri_list;
413	if (UNLIKELY(lst == NULL)) {
414	__kmp_acquire_bootstrap_lock(lck: &task_team->tt.tt_task_pri_lock);
415	if (task_team->tt.tt_task_pri_list == NULL) {
416	// List of queues is still empty, allocate one.
417	kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
418	thread_data = &list->td;
419	list->priority = pri;
420	list->next = NULL;
421	task_team->tt.tt_task_pri_list = list;
422	} else {
423	// Other thread initialized a queue. Check if it fits and get thread_data.
424	thread_data = __kmp_get_priority_deque_data(task_team, pri);
425	}
426	__kmp_release_bootstrap_lock(lck: &task_team->tt.tt_task_pri_lock);
427	} else {
428	if (lst->priority == pri) {
429	// Found queue of tasks with given priority.
430	thread_data = &lst->td;
431	} else {
432	__kmp_acquire_bootstrap_lock(lck: &task_team->tt.tt_task_pri_lock);
433	thread_data = __kmp_get_priority_deque_data(task_team, pri);
434	__kmp_release_bootstrap_lock(lck: &task_team->tt.tt_task_pri_lock);
435	}
436	}
437	KMP_DEBUG_ASSERT(thread_data);
438
439	__kmp_acquire_bootstrap_lock(lck: &thread_data->td.td_deque_lock);
440	// Check if deque is full
441	if (TCR_4(thread_data->td.td_deque_ntasks) >=
442	TASK_DEQUE_SIZE(thread_data->td)) {
443	if (__kmp_enable_task_throttling &&
444	__kmp_task_is_allowed(gtid, is_constrained: __kmp_task_stealing_constraint, tasknew: taskdata,
445	taskcurr: thread->th.th_current_task)) {
446	__kmp_release_bootstrap_lock(lck: &thread_data->td.td_deque_lock);
447	KA_TRACE(`20`, ("__kmp_push_priority_task: T#%d deque is full; returning "
448	"TASK_NOT_PUSHED for task %p\n",
449	gtid, taskdata));
450	return TASK_NOT_PUSHED;
451	} else {
452	// expand deque to push the task which is not allowed to execute
453	__kmp_realloc_task_deque(thread, thread_data);
454	}
455	}
456	KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
457	TASK_DEQUE_SIZE(thread_data->td));
458	// Push taskdata.
459	thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
460	// Wrap index.
461	thread_data->td.td_deque_tail =
462	(thread_data->td.td_deque_tail + `1`) & TASK_DEQUE_MASK(thread_data->td);
463	TCW_4(thread_data->td.td_deque_ntasks,
464	TCR_4(thread_data->td.td_deque_ntasks) + `1`); // Adjust task count
465	KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
466	KMP_FSYNC_RELEASING(taskdata); // releasing child
467	KA_TRACE(`20`, ("__kmp_push_priority_task: T#%d returning "
468	"TASK_SUCCESSFULLY_PUSHED: task=%p ntasks=%d head=%u tail=%u\n",
469	gtid, taskdata, thread_data->td.td_deque_ntasks,
470	thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
471	__kmp_release_bootstrap_lock(lck: &thread_data->td.td_deque_lock);
472	task_team->tt.tt_num_task_pri ++; // atomic inc
473	return TASK_SUCCESSFULLY_PUSHED;
474	}
475
476	// __kmp_push_task: Add a task to the thread's deque
477	static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
478	kmp_info_t *thread = __kmp_threads[gtid];
479	kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
480
481	// If we encounter a hidden helper task, and the current thread is not a
482	// hidden helper thread, we have to give the task to any hidden helper thread
483	// starting from its shadow one.
484	if (UNLIKELY(taskdata->td_flags.hidden_helper &&
485	!KMP_HIDDEN_HELPER_THREAD(gtid))) {
486	kmp_int32 shadow_gtid = KMP_GTID_TO_SHADOW_GTID(gtid);
487	__kmpc_give_task(ptask: task, start: __kmp_tid_from_gtid(gtid: shadow_gtid));
488	// Signal the hidden helper threads.
489	__kmp_hidden_helper_worker_thread_signal();
490	return TASK_SUCCESSFULLY_PUSHED;
491	}
492
493	kmp_task_team_t *task_team = thread->th.th_task_team;
494	kmp_int32 tid = __kmp_tid_from_gtid(gtid);
495	kmp_thread_data_t *thread_data;
496
497	KA_TRACE(`20`,
498	("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));
499
500	if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
501	// untied task needs to increment counter so that the task structure is not
502	// freed prematurely
503	kmp_int32 counter = `1` + KMP_ATOMIC_INC(&taskdata->td_untied_count);
504	KMP_DEBUG_USE_VAR(counter);
505	KA_TRACE(
506	`20`,
507	("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
508	gtid, counter, taskdata));
509	}
510
511	// The first check avoids building task_team thread data if serialized
512	if (UNLIKELY(taskdata->td_flags.task_serial)) {
513	KA_TRACE(`20`, ("__kmp_push_task: T#%d team serialized; returning "
514	"TASK_NOT_PUSHED for task %p\n",
515	gtid, taskdata));
516	return TASK_NOT_PUSHED;
517	}
518
519	// Now that serialized tasks have returned, we can assume that we are not in
520	// immediate exec mode
521	KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
522	if (UNLIKELY(!KMP_TASKING_ENABLED(task_team))) {
523	__kmp_enable_tasking(task_team, this_thr: thread);
524	}
525	KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
526	KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);
527
528	if (taskdata->td_flags.priority_specified && task->data2.priority > `0` &&
529	__kmp_max_task_priority > `0`) {
530	int pri = KMP_MIN(task->data2.priority, __kmp_max_task_priority);
531	return __kmp_push_priority_task(gtid, thread, taskdata, task_team, pri);
532	}
533
534	// Find tasking deque specific to encountering thread
535	thread_data = &task_team->tt.tt_threads_data[tid];
536
537	// No lock needed since only owner can allocate. If the task is hidden_helper,
538	// we don't need it either because we have initialized the dequeue for hidden
539	// helper thread data.
540	if (UNLIKELY(thread_data->td.td_deque == NULL)) {
541	__kmp_alloc_task_deque(thread, thread_data);
542	}
543
544	int locked = `0`;
545	// Check if deque is full
546	if (TCR_4(thread_data->td.td_deque_ntasks) >=
547	TASK_DEQUE_SIZE(thread_data->td)) {
548	if (__kmp_enable_task_throttling &&
549	__kmp_task_is_allowed(gtid, is_constrained: __kmp_task_stealing_constraint, tasknew: taskdata,
550	taskcurr: thread->th.th_current_task)) {
551	KA_TRACE(`20`, ("__kmp_push_task: T#%d deque is full; returning "
552	"TASK_NOT_PUSHED for task %p\n",
553	gtid, taskdata));
554	return TASK_NOT_PUSHED;
555	} else {
556	__kmp_acquire_bootstrap_lock(lck: &thread_data->td.td_deque_lock);
557	locked = `1`;
558	if (TCR_4(thread_data->td.td_deque_ntasks) >=
559	TASK_DEQUE_SIZE(thread_data->td)) {
560	// expand deque to push the task which is not allowed to execute
561	__kmp_realloc_task_deque(thread, thread_data);
562	}
563	}
564	}
565	// Lock the deque for the task push operation
566	if (!locked) {
567	__kmp_acquire_bootstrap_lock(lck: &thread_data->td.td_deque_lock);
568	// Need to recheck as we can get a proxy task from thread outside of OpenMP
569	if (TCR_4(thread_data->td.td_deque_ntasks) >=
570	TASK_DEQUE_SIZE(thread_data->td)) {
571	if (__kmp_enable_task_throttling &&
572	__kmp_task_is_allowed(gtid, is_constrained: __kmp_task_stealing_constraint, tasknew: taskdata,
573	taskcurr: thread->th.th_current_task)) {
574	__kmp_release_bootstrap_lock(lck: &thread_data->td.td_deque_lock);
575	KA_TRACE(`20`, ("__kmp_push_task: T#%d deque is full on 2nd check; "
576	"returning TASK_NOT_PUSHED for task %p\n",
577	gtid, taskdata));
578	return TASK_NOT_PUSHED;
579	} else {
580	// expand deque to push the task which is not allowed to execute
581	__kmp_realloc_task_deque(thread, thread_data);
582	}
583	}
584	}
585	// Must have room since no thread can add tasks but calling thread
586	KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
587	TASK_DEQUE_SIZE(thread_data->td));
588
589	thread_data->td.td_deque[thread_data->td.td_deque_tail] =
590	taskdata; // Push taskdata
591	// Wrap index.
592	thread_data->td.td_deque_tail =
593	(thread_data->td.td_deque_tail + `1`) & TASK_DEQUE_MASK(thread_data->td);
594	TCW_4(thread_data->td.td_deque_ntasks,
595	TCR_4(thread_data->td.td_deque_ntasks) + `1`); // Adjust task count
596	KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
597	KMP_FSYNC_RELEASING(taskdata); // releasing child
598	KA_TRACE(`20`, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
599	"task=%p ntasks=%d head=%u tail=%u\n",
600	gtid, taskdata, thread_data->td.td_deque_ntasks,
601	thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
602
603	__kmp_release_bootstrap_lock(lck: &thread_data->td.td_deque_lock);
604
605	return TASK_SUCCESSFULLY_PUSHED;
606	}
607
608	// __kmp_pop_current_task_from_thread: set up current task from called thread
609	// when team ends
610	//
611	// this_thr: thread structure to set current_task in.
612	void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
613	KF_TRACE(`10`, ("__kmp_pop_current_task_from_thread(enter): T#%d "
614	"this_thread=%p, curtask=%p, "
615	"curtask_parent=%p\n",
616	`0`, this_thr, this_thr->th.th_current_task,
617	this_thr->th.th_current_task->td_parent));
618
619	this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;
620
621	KF_TRACE(`10`, ("__kmp_pop_current_task_from_thread(exit): T#%d "
622	"this_thread=%p, curtask=%p, "
623	"curtask_parent=%p\n",
624	`0`, this_thr, this_thr->th.th_current_task,
625	this_thr->th.th_current_task->td_parent));
626	}
627
628	// __kmp_push_current_task_to_thread: set up current task in called thread for a
629	// new team
630	//
631	// this_thr: thread structure to set up
632	// team: team for implicit task data
633	// tid: thread within team to set up
634	void __kmp_push_current_task_to_thread(kmp_info_t this_thr, kmp_team_t team,
635	int tid) {
636	// current task of the thread is a parent of the new just created implicit
637	// tasks of new team
638	KF_TRACE(`10`, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
639	"curtask=%p "
640	"parent_task=%p\n",
641	tid, this_thr, this_thr->th.th_current_task,
642	team->t.t_implicit_task_taskdata[tid].td_parent));
643
644	KMP_DEBUG_ASSERT(this_thr != NULL);
645
646	if (tid == `0`) {
647	if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[`0`]) {
648	team->t.t_implicit_task_taskdata[`0`].td_parent =
649	this_thr->th.th_current_task;
650	this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[`0`];
651	}
652	} else {
653	team->t.t_implicit_task_taskdata[tid].td_parent =
654	team->t.t_implicit_task_taskdata[`0`].td_parent;
655	this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
656	}
657
658	KF_TRACE(`10`, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
659	"curtask=%p "
660	"parent_task=%p\n",
661	tid, this_thr, this_thr->th.th_current_task,
662	team->t.t_implicit_task_taskdata[tid].td_parent));
663	}
664
665	// __kmp_task_start: bookkeeping for a task starting execution
666	//
667	// GTID: global thread id of calling thread
668	// task: task starting execution
669	// current_task: task suspending
670	static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
671	kmp_taskdata_t *current_task) {
672	kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
673	kmp_info_t *thread = __kmp_threads[gtid];
674
675	KA_TRACE(`10`,
676	("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
677	gtid, taskdata, current_task));
678
679	KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
680
681	// mark currently executing task as suspended
682	// TODO: GEH - make sure root team implicit task is initialized properly.
683	// KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
684	current_task->td_flags.executing = `0`;
685
686	// Add task to stack if tied
687	#ifdef BUILD_TIED_TASK_STACK
688	if (taskdata->td_flags.tiedness == TASK_TIED) {
689	__kmp_push_task_stack(gtid, thread, taskdata);
690	}
691	#endif /* BUILD_TIED_TASK_STACK */
692
693	// mark starting task as executing and as current task
694	thread->th.th_current_task = taskdata;
695
696	KMP_DEBUG_ASSERT(taskdata->td_flags.started == `0` \|\|
697	taskdata->td_flags.tiedness == TASK_UNTIED);
698	KMP_DEBUG_ASSERT(taskdata->td_flags.executing == `0` \|\|
699	taskdata->td_flags.tiedness == TASK_UNTIED);
700	taskdata->td_flags.started = `1`;
701	taskdata->td_flags.executing = `1`;
702	KMP_DEBUG_ASSERT(taskdata->td_flags.complete == `0`);
703	KMP_DEBUG_ASSERT(taskdata->td_flags.freed == `0`);
704
705	// GEH TODO: shouldn't we pass some sort of location identifier here?
706	// APT: yes, we will pass location here.
707	// need to store current thread state (in a thread or taskdata structure)
708	// before setting work_state, otherwise wrong state is set after end of task
709
710	KA_TRACE(`10`, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));
711
712	return;
713	}
714
715	#if OMPT_SUPPORT
716	//------------------------------------------------------------------------------
717
718	// __ompt_task_start:
719	// Build and trigger task-begin event
720	static inline void __ompt_task_start(kmp_task_t *task,
721	kmp_taskdata_t *current_task,
722	kmp_int32 gtid) {
723	kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
724	ompt_task_status_t status = ompt_task_switch;
725	if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) {
726	status = ompt_task_yield;
727	__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = `0`;
728	}
729	/ let OMPT know that we're about to run this task /
730	if (ompt_enabled.ompt_callback_task_schedule) {
731	ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
732	&(current_task->ompt_task_info.task_data), status,
733	&(taskdata->ompt_task_info.task_data));
734	}
735	taskdata->ompt_task_info.scheduling_parent = current_task;
736	}
737
738	// __ompt_task_finish:
739	// Build and trigger final task-schedule event
740	static inline void __ompt_task_finish(kmp_task_t *task,
741	kmp_taskdata_t *resumed_task,
742	ompt_task_status_t status) {
743	if (ompt_enabled.ompt_callback_task_schedule) {
744	kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
745	if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
746	taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
747	status = ompt_task_cancel;
748	}
749
750	/ let OMPT know that we're returning to the callee task /
751	ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
752	&(taskdata->ompt_task_info.task_data), status,
753	(resumed_task ? &(resumed_task->ompt_task_info.task_data) : NULL));
754	}
755	}
756	#endif
757
758	template <bool ompt>
759	static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
760	kmp_task_t *task,
761	void *frame_address,
762	void *return_address) {
763	kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
764	kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
765
766	KA_TRACE(`10`, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
767	"current_task=%p\n",
768	gtid, loc_ref, taskdata, current_task));
769
770	if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
771	// untied task needs to increment counter so that the task structure is not
772	// freed prematurely
773	kmp_int32 counter = `1` + KMP_ATOMIC_INC(&taskdata->td_untied_count);
774	KMP_DEBUG_USE_VAR(counter);
775	KA_TRACE(`20`, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
776	"incremented for task %p\n",
777	gtid, counter, taskdata));
778	}
779
780	taskdata->td_flags.task_serial =
781	`1`; // Execute this task immediately, not deferred.
782	__kmp_task_start(gtid, task, current_task);
783
784	#if OMPT_SUPPORT
785	if (ompt) {
786	if (current_task->ompt_task_info.frame.enter_frame.ptr == NULL) {
787	current_task->ompt_task_info.frame.enter_frame.ptr =
788	taskdata->ompt_task_info.frame.exit_frame.ptr = frame_address;
789	current_task->ompt_task_info.frame.enter_frame_flags =
790	taskdata->ompt_task_info.frame.exit_frame_flags =
791	OMPT_FRAME_FLAGS_APP;
792	}
793	if (ompt_enabled.ompt_callback_task_create) {
794	ompt_task_info_t *parent_info = &(current_task->ompt_task_info);
795	ompt_callbacks.ompt_callback(ompt_callback_task_create)(
796	&(parent_info->task_data), &(parent_info->frame),
797	&(taskdata->ompt_task_info.task_data),
798	TASK_TYPE_DETAILS_FORMAT(taskdata), `0`, return_address);
799	}
800	__ompt_task_start(task, current_task, gtid);
801	}
802	#endif // OMPT_SUPPORT
803
804	KA_TRACE(`10`, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
805	loc_ref, taskdata));
806	}
807
#if OMPT_SUPPORT
// __kmpc_omp_task_begin_if0_ompt: non-inlined entry that instantiates the
// shared task-begin body with the OMPT section compiled in.
// All arguments are forwarded unchanged.
OMPT_NOINLINE
static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                           kmp_task_t *task,
                                           void *frame_address,
                                           void *return_address) {
  __kmpc_omp_task_begin_if0_template</*ompt=*/true>(
      loc_ref, gtid, task, frame_address, return_address);
}
#endif // OMPT_SUPPORT
818
819	// __kmpc_omp_task_begin_if0: report that a given serialized task has started
820	// execution
821	//
822	// loc_ref: source location information; points to beginning of task block.
823	// gtid: global thread number.
824	// task: task thunk for the started task.
825	#ifdef __s390x__
826	// This is required for OMPT_GET_FRAME_ADDRESS(1) to compile on s390x.
827	// In order for it to work correctly, the caller also needs to be compiled with
828	// backchain. If a caller is compiled without backchain,
829	// OMPT_GET_FRAME_ADDRESS(1) will produce an incorrect value, but will not
830	// crash.
831	__attribute__((target("backchain")))
832	#endif
833	void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
834	kmp_task_t *task) {
835	#if OMPT_SUPPORT
836	if (UNLIKELY(ompt_enabled.enabled)) {
837	OMPT_STORE_RETURN_ADDRESS(gtid);
838	__kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task,
839	OMPT_GET_FRAME_ADDRESS(`1`),
840	OMPT_LOAD_RETURN_ADDRESS(gtid));
841	return;
842	}
843	#endif
844	__kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL);
845	}
846
#ifdef TASK_UNUSED
// __kmpc_omp_task_begin: report that a given task has started execution
// NEVER GENERATED BY COMPILER, DEPRECATED!!!
//
// loc_ref: source location information (only used in trace output here)
// gtid: global thread number
// task: task thunk for the started task
//
// loc_ref and task are pointers, matching __kmpc_omp_task_begin_if0: loc_ref is
// printed with %p and task is handed to __kmp_task_start.
void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
  // The task currently installed on this thread is passed to __kmp_task_start
  // together with the new task.
  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;

  KA_TRACE(
      10,
      ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
       gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));

  __kmp_task_start(gtid, task, current_task);

  KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
  return;
}
#endif // TASK_UNUSED
865
866	// __kmp_free_task: free the current task space and the space for shareds
867	//
868	// gtid: Global thread ID of calling thread
869	// taskdata: task to free
870	// thread: thread data structure of caller
871	static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
872	kmp_info_t *thread) {
873	KA_TRACE(`30`, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
874	taskdata));
875
876	// Check to make sure all flags and counters have the correct values
877	KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
878	KMP_DEBUG_ASSERT(taskdata->td_flags.executing == `0`);
879	KMP_DEBUG_ASSERT(taskdata->td_flags.complete == `1`);
880	KMP_DEBUG_ASSERT(taskdata->td_flags.freed == `0`);
881	KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == `0` \|\|
882	taskdata->td_flags.task_serial == `1`);
883	KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == `0`);
884	kmp_task_t *task = KMP_TASKDATA_TO_TASK(taskdata);
885	// Clear data to not be re-used later by mistake.
886	task->data1.destructors = NULL;
887	task->data2.priority = `0`;
888
889	taskdata->td_flags.freed = `1`;
890	#if OMPX_TASKGRAPH
891	// do not free tasks in taskgraph
892	if (!taskdata->is_taskgraph) {
893	#endif
894	// deallocate the taskdata and shared variable blocks associated with this task
895	#if USE_FAST_MEMORY
896	__kmp_fast_free(thread, taskdata);
897	#else /* ! USE_FAST_MEMORY */
898	__kmp_thread_free(thread, taskdata);
899	#endif
900	#if OMPX_TASKGRAPH
901	} else {
902	taskdata->td_flags.complete = `0`;
903	taskdata->td_flags.started = `0`;
904	taskdata->td_flags.freed = `0`;
905	taskdata->td_flags.executing = `0`;
906	taskdata->td_flags.task_serial =
907	(taskdata->td_parent->td_flags.final \|\|
908	taskdata->td_flags.team_serial \|\| taskdata->td_flags.tasking_ser);
909
910	// taskdata->td_allow_completion_event.pending_events_count = 1;
911	KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, `0`);
912	KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, `0`);
913	// start at one because counts current task and children
914	KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, `1`);
915	}
916	#endif
917
918	KA_TRACE(`20`, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
919	}
920
921	// __kmp_free_task_and_ancestors: free the current task and ancestors without
922	// children
923	//
924	// gtid: Global thread ID of calling thread
925	// taskdata: task to free
926	// thread: thread data structure of caller
927	static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
928	kmp_taskdata_t *taskdata,
929	kmp_info_t *thread) {
930	// Proxy tasks must always be allowed to free their parents
931	// because they can be run in background even in serial mode.
932	kmp_int32 team_serial =
933	(taskdata->td_flags.team_serial \|\| taskdata->td_flags.tasking_ser) &&
934	!taskdata->td_flags.proxy;
935	KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
936
937	kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - `1`;
938	KMP_DEBUG_ASSERT(children >= `0`);
939
940	// Now, go up the ancestor tree to see if any ancestors can now be freed.
941	while (children == `0`) {
942	kmp_taskdata_t *parent_taskdata = taskdata->td_parent;
943
944	KA_TRACE(`20`, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
945	"and freeing itself\n",
946	gtid, taskdata));
947
948	// --- Deallocate my ancestor task ---
949	__kmp_free_task(gtid, taskdata, thread);
950
951	taskdata = parent_taskdata;
952
953	if (team_serial)
954	return;
955	// Stop checking ancestors at implicit task instead of walking up ancestor
956	// tree to avoid premature deallocation of ancestors.
957	if (taskdata->td_flags.tasktype == TASK_IMPLICIT) {
958	if (taskdata->td_dephash) { // do we need to cleanup dephash?
959	int children = KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks);
960	kmp_tasking_flags_t flags_old = taskdata->td_flags;
961	if (children == `0` && flags_old.complete == `1`) {
962	kmp_tasking_flags_t flags_new = flags_old;
963	flags_new.complete = `0`;
964	if (KMP_COMPARE_AND_STORE_ACQ32(
965	RCAST(kmp_int32 *, &taskdata->td_flags),
966	RCAST(kmp_int32 , &flags_old),
967	RCAST(kmp_int32 , &flags_new))) {
968	KA_TRACE(`100`, ("__kmp_free_task_and_ancestors: T#%d cleans "
969	"dephash of implicit task %p\n",
970	gtid, taskdata));
971	// cleanup dephash of finished implicit task
972	__kmp_dephash_free_entries(thread, h: taskdata->td_dephash);
973	}
974	}
975	}
976	return;
977	}
978	// Predecrement simulated by "- 1" calculation
979	children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - `1`;
980	KMP_DEBUG_ASSERT(children >= `0`);
981	}
982
983	KA_TRACE(
984	`20`, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
985	"not freeing it yet\n",
986	gtid, taskdata, children));
987	}
988
989	// Only need to keep track of child task counts if any of the following:
990	// 1. team parallel and tasking not serialized;
991	// 2. it is a proxy or detachable or hidden helper task
992	// 3. the children counter of its parent task is greater than 0.
993	// The reason for the 3rd one is for serialized team that found detached task,
994	// hidden helper task, T. In this case, the execution of T is still deferred,
995	// and it is also possible that a regular task depends on T. In this case, if we
996	// don't track the children, task synchronization will be broken.
997	static bool __kmp_track_children_task(kmp_taskdata_t *taskdata) {
998	kmp_tasking_flags_t flags = taskdata->td_flags;
999	bool ret = !(flags.team_serial \|\| flags.tasking_ser);
1000	ret = ret \|\| flags.proxy == TASK_PROXY \|\|
1001	flags.detachable == TASK_DETACHABLE \|\| flags.hidden_helper;
1002	ret = ret \|\|
1003	KMP_ATOMIC_LD_ACQ(&taskdata->td_parent->td_incomplete_child_tasks) > `0`;
1004	#if OMPX_TASKGRAPH
1005	if (taskdata->td_taskgroup && taskdata->is_taskgraph)
1006	ret = ret \|\| KMP_ATOMIC_LD_ACQ(&taskdata->td_taskgroup->count) > `0`;
1007	#endif
1008	return ret;
1009	}
1010
1011	// __kmp_task_finish: bookkeeping to do when a task finishes execution
1012	//
1013	// gtid: global thread ID for calling thread
1014	// task: task to be finished
1015	// resumed_task: task to be resumed. (may be NULL if task is serialized)
1016	//
1017	// template<ompt>: effectively ompt_enabled.enabled!=0
1018	// the version with ompt=false is inlined, allowing to optimize away all ompt
1019	// code in this case
1020	template <bool ompt>
1021	static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
1022	kmp_taskdata_t *resumed_task) {
1023	kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
1024	kmp_info_t *thread = __kmp_threads[gtid];
1025	kmp_task_team_t *task_team =
1026	thread->th.th_task_team; // might be NULL for serial teams...
1027	#if OMPX_TASKGRAPH
1028	// to avoid seg fault when we need to access taskdata->td_flags after free when using vanilla taskloop
1029	bool is_taskgraph;
1030	#endif
1031	#if KMP_DEBUG
1032	kmp_int32 children = `0`;
1033	#endif
1034	KA_TRACE(`10`, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
1035	"task %p\n",
1036	gtid, taskdata, resumed_task));
1037
1038	KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
1039
1040	#if OMPX_TASKGRAPH
1041	is_taskgraph = taskdata->is_taskgraph;
1042	#endif
1043
1044	// Pop task from stack if tied
1045	#ifdef BUILD_TIED_TASK_STACK
1046	if (taskdata->td_flags.tiedness == TASK_TIED) {
1047	__kmp_pop_task_stack(gtid, thread, taskdata);
1048	}
1049	#endif /* BUILD_TIED_TASK_STACK */
1050
1051	if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
1052	// untied task needs to check the counter so that the task structure is not
1053	// freed prematurely
1054	kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - `1`;
1055	KA_TRACE(
1056	`20`,
1057	("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
1058	gtid, counter, taskdata));
1059	if (counter > `0`) {
1060	// untied task is not done, to be continued possibly by other thread, do
1061	// not free it now
1062	if (resumed_task == NULL) {
1063	KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
1064	resumed_task = taskdata->td_parent; // In a serialized task, the resumed
1065	// task is the parent
1066	}
1067	thread->th.th_current_task = resumed_task; // restore current_task
1068	resumed_task->td_flags.executing = `1`; // resume previous task
1069	KA_TRACE(`10`, ("__kmp_task_finish(exit): T#%d partially done task %p, "
1070	"resuming task %p\n",
1071	gtid, taskdata, resumed_task));
1072	return;
1073	}
1074	}
1075
1076	// bookkeeping for resuming task:
1077	// GEH - note tasking_ser => task_serial
1078	KMP_DEBUG_ASSERT(
1079	(taskdata->td_flags.tasking_ser \|\| taskdata->td_flags.task_serial) ==
1080	taskdata->td_flags.task_serial);
1081	if (taskdata->td_flags.task_serial) {
1082	if (resumed_task == NULL) {
1083	resumed_task = taskdata->td_parent; // In a serialized task, the resumed
1084	// task is the parent
1085	}
1086	} else {
1087	KMP_DEBUG_ASSERT(resumed_task !=
1088	NULL); // verify that resumed task is passed as argument
1089	}
1090
1091	/ If the tasks' destructor thunk flag has been set, we need to invoke the*
1092	destructor thunk that has been generated by the compiler. The code is
1093	placed here, since at this point other tasks might have been released
1094	hence overlapping the destructor invocations with some other work in the
1095	released tasks. The OpenMP spec is not specific on when the destructors
1096	are invoked, so we should be free to choose. /*
1097	if (UNLIKELY(taskdata->td_flags.destructors_thunk)) {
1098	kmp_routine_entry_t destr_thunk = task->data1.destructors;
1099	KMP_ASSERT(destr_thunk);
1100	destr_thunk(gtid, task);
1101	}
1102
1103	KMP_DEBUG_ASSERT(taskdata->td_flags.complete == `0`);
1104	KMP_DEBUG_ASSERT(taskdata->td_flags.started == `1`);
1105	KMP_DEBUG_ASSERT(taskdata->td_flags.freed == `0`);
1106
1107	bool completed = true;
1108	if (UNLIKELY(taskdata->td_flags.detachable == TASK_DETACHABLE)) {
1109	if (taskdata->td_allow_completion_event.type ==
1110	KMP_EVENT_ALLOW_COMPLETION) {
1111	// event hasn't been fulfilled yet. Try to detach task.
1112	__kmp_acquire_tas_lock(lck: &taskdata->td_allow_completion_event.lock, gtid);
1113	if (taskdata->td_allow_completion_event.type ==
1114	KMP_EVENT_ALLOW_COMPLETION) {
1115	// task finished execution
1116	KMP_DEBUG_ASSERT(taskdata->td_flags.executing == `1`);
1117	taskdata->td_flags.executing = `0`; // suspend the finishing task
1118
1119	#if OMPT_SUPPORT
1120	// For a detached task, which is not completed, we switch back
1121	// the omp_fulfill_event signals completion
1122	// locking is necessary to avoid a race with ompt_task_late_fulfill
1123	if (ompt)
1124	__ompt_task_finish(task, resumed_task, status: ompt_task_detach);
1125	#endif
1126
1127	// no access to taskdata after this point!
1128	// __kmp_fulfill_event might free taskdata at any time from now
1129
1130	taskdata->td_flags.proxy = TASK_PROXY; // proxify!
1131	completed = false;
1132	}
1133	__kmp_release_tas_lock(lck: &taskdata->td_allow_completion_event.lock, gtid);
1134	}
1135	}
1136
1137	// Tasks with valid target async handles must be re-enqueued.
1138	if (taskdata->td_target_data.async_handle != NULL) {
1139	// Note: no need to translate gtid to its shadow. If the current thread is a
1140	// hidden helper one, then the gtid is already correct. Otherwise, hidden
1141	// helper threads are disabled, and gtid refers to a OpenMP thread.
1142	#if OMPT_SUPPORT
1143	if (ompt) {
1144	__ompt_task_finish(task, resumed_task, status: ompt_task_switch);
1145	}
1146	#endif
1147	__kmpc_give_task(ptask: task, start: __kmp_tid_from_gtid(gtid));
1148	if (KMP_HIDDEN_HELPER_THREAD(gtid))
1149	__kmp_hidden_helper_worker_thread_signal();
1150	completed = false;
1151	}
1152
1153	if (completed) {
1154	taskdata->td_flags.complete = `1`; // mark the task as completed
1155	#if OMPX_TASKGRAPH
1156	taskdata->td_flags.onced = `1`; // mark the task as ran once already
1157	#endif
1158
1159	#if OMPT_SUPPORT
1160	// This is not a detached task, we are done here
1161	if (ompt)
1162	__ompt_task_finish(task, resumed_task, status: ompt_task_complete);
1163	#endif
1164	// TODO: What would be the balance between the conditions in the function
1165	// and an atomic operation?
1166	if (__kmp_track_children_task(taskdata)) {
1167	__kmp_release_deps(gtid, task: taskdata);
1168	// Predecrement simulated by "- 1" calculation
1169	#if KMP_DEBUG
1170	children = -`1` +
1171	#endif
1172	KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
1173	KMP_DEBUG_ASSERT(children >= `0`);
1174	#if OMPX_TASKGRAPH
1175	if (taskdata->td_taskgroup && !taskdata->is_taskgraph)
1176	#else
1177	if (taskdata->td_taskgroup)
1178	#endif
1179	KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
1180	} else if (task_team && (task_team->tt.tt_found_proxy_tasks \|\|
1181	task_team->tt.tt_hidden_helper_task_encountered)) {
1182	// if we found proxy or hidden helper tasks there could exist a dependency
1183	// chain with the proxy task as origin
1184	__kmp_release_deps(gtid, task: taskdata);
1185	}
1186	// td_flags.executing must be marked as 0 after __kmp_release_deps has been
1187	// called. Othertwise, if a task is executed immediately from the
1188	// release_deps code, the flag will be reset to 1 again by this same
1189	// function
1190	KMP_DEBUG_ASSERT(taskdata->td_flags.executing == `1`);
1191	taskdata->td_flags.executing = `0`; // suspend the finishing task
1192
1193	// Decrement the counter of hidden helper tasks to be executed.
1194	if (taskdata->td_flags.hidden_helper) {
1195	// Hidden helper tasks can only be executed by hidden helper threads.
1196	KMP_ASSERT(KMP_HIDDEN_HELPER_THREAD(gtid));
1197	KMP_ATOMIC_DEC(&__kmp_unexecuted_hidden_helper_tasks);
1198	}
1199	}
1200
1201	KA_TRACE(
1202	`20`, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
1203	gtid, taskdata, children));
1204
1205	// Free this task and then ancestor tasks if they have no children.
1206	// Restore th_current_task first as suggested by John:
1207	// johnmc: if an asynchronous inquiry peers into the runtime system
1208	// it doesn't see the freed task as the current task.
1209	thread->th.th_current_task = resumed_task;
1210	if (completed)
1211	__kmp_free_task_and_ancestors(gtid, taskdata, thread);
1212
1213	// TODO: GEH - make sure root team implicit task is initialized properly.
1214	// KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
1215	resumed_task->td_flags.executing = `1`; // resume previous task
1216
1217	#if OMPX_TASKGRAPH
1218	if (is_taskgraph && __kmp_track_children_task(taskdata) &&
1219	taskdata->td_taskgroup) {
1220	// TDG: we only release taskgroup barrier here because
1221	// free_task_and_ancestors will call
1222	// __kmp_free_task, which resets all task parameters such as
1223	// taskdata->started, etc. If we release the barrier earlier, these
1224	// parameters could be read before being reset. This is not an issue for
1225	// non-TDG implementation because we never reuse a task(data) structure
1226	KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
1227	}
1228	#endif
1229
1230	KA_TRACE(
1231	`10`, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
1232	gtid, taskdata, resumed_task));
1233
1234	return;
1235	}
1236
1237	template <bool ompt>
1238	static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
1239	kmp_int32 gtid,
1240	kmp_task_t *task) {
1241	KA_TRACE(`10`, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
1242	gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
1243	KMP_DEBUG_ASSERT(gtid >= `0`);
1244	// this routine will provide task to resume
1245	__kmp_task_finish<ompt>(gtid, task, NULL);
1246
1247	KA_TRACE(`10`, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
1248	gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
1249
1250	#if OMPT_SUPPORT
1251	if (ompt) {
1252	ompt_frame_t *ompt_frame;
1253	__ompt_get_task_info_internal(ancestor_level: `0`, NULL, NULL, task_frame: &ompt_frame, NULL, NULL);
1254	ompt_frame->enter_frame = ompt_data_none;
1255	ompt_frame->enter_frame_flags = OMPT_FRAME_FLAGS_RUNTIME;
1256	}
1257	#endif
1258
1259	return;
1260	}
1261
#if OMPT_SUPPORT
// __kmpc_omp_task_complete_if0_ompt: non-inlined entry that instantiates the
// shared task-complete body with the OMPT section compiled in.
OMPT_NOINLINE
void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                       kmp_task_t *task) {
  __kmpc_omp_task_complete_if0_template</*ompt=*/true>(loc_ref, gtid, task);
}
#endif // OMPT_SUPPORT
1269
1270	// __kmpc_omp_task_complete_if0: report that a task has completed execution
1271	//
1272	// loc_ref: source location information; points to end of task block.
1273	// gtid: global thread number.
1274	// task: task thunk for the completed task.
1275	void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
1276	kmp_task_t *task) {
1277	#if OMPT_SUPPORT
1278	if (UNLIKELY(ompt_enabled.enabled)) {
1279	__kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
1280	return;
1281	}
1282	#endif
1283	__kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task);
1284	}
1285
#ifdef TASK_UNUSED
// __kmpc_omp_task_complete: report that a task has completed execution
// NEVER GENERATED BY COMPILER, DEPRECATED!!!
//
// loc_ref: source location information (only used in trace output here)
// gtid: global thread number
// task: task thunk for the completed task
void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
                              kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));

  // Not sure how to find task to resume, so none is passed.
  __kmp_task_finish</*ompt=*/false>(gtid, task, NULL);

  KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
}
#endif // TASK_UNUSED
1302
1303	// __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
1304	// task for a given thread
1305	//
1306	// loc_ref: reference to source location of parallel region
1307	// this_thr: thread data structure corresponding to implicit task
1308	// team: team for this_thr
1309	// tid: thread id of given thread within team
1310	// set_curr_task: TRUE if need to push current task to thread
1311	// NOTE: Routine does not set up the implicit task ICVS. This is assumed to
1312	// have already been done elsewhere.
1313	// TODO: Get better loc_ref. Value passed in may be NULL
1314	void __kmp_init_implicit_task(ident_t loc_ref, kmp_info_t this_thr,
1315	kmp_team_t team, int* tid, int set_curr_task) {
1316	kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];
1317
1318	KF_TRACE(
1319	`10`,
1320	("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
1321	tid, team, task, set_curr_task ? "TRUE" : "FALSE"));
1322
1323	task->td_task_id = KMP_GEN_TASK_ID();
1324	task->td_team = team;
1325	// task->td_parent = NULL; // fix for CQ230101 (broken parent task info
1326	// in debugger)
1327	task->td_ident = loc_ref;
1328	task->td_taskwait_ident = NULL;
1329	task->td_taskwait_counter = `0`;
1330	task->td_taskwait_thread = `0`;
1331
1332	task->td_flags.tiedness = TASK_TIED;
1333	task->td_flags.tasktype = TASK_IMPLICIT;
1334	task->td_flags.proxy = TASK_FULL;
1335
1336	// All implicit tasks are executed immediately, not deferred
1337	task->td_flags.task_serial = `1`;
1338	task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1339	task->td_flags.team_serial = (team->t.t_serialized) ? `1` : `0`;
1340
1341	task->td_flags.started = `1`;
1342	task->td_flags.executing = `1`;
1343	task->td_flags.complete = `0`;
1344	task->td_flags.freed = `0`;
1345	#if OMPX_TASKGRAPH
1346	task->td_flags.onced = `0`;
1347	#endif
1348
1349	task->td_depnode = NULL;
1350	task->td_last_tied = task;
1351	task->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1352
1353	if (set_curr_task) { // only do this init first time thread is created
1354	KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, `0`);
1355	// Not used: don't need to deallocate implicit task
1356	KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, `0`);
1357	task->td_taskgroup = NULL; // An implicit task does not have taskgroup
1358	task->td_dephash = NULL;
1359	__kmp_push_current_task_to_thread(this_thr, team, tid);
1360	} else {
1361	KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == `0`);
1362	KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == `0`);
1363	}
1364
1365	#if OMPT_SUPPORT
1366	if (UNLIKELY(ompt_enabled.enabled))
1367	__ompt_task_init(task, tid);
1368	#endif
1369
1370	KF_TRACE(`10`, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
1371	team, task));
1372	}
1373
1374	// __kmp_finish_implicit_task: Release resources associated to implicit tasks
1375	// at the end of parallel regions. Some resources are kept for reuse in the next
1376	// parallel region.
1377	//
1378	// thread: thread data structure corresponding to implicit task
1379	void __kmp_finish_implicit_task(kmp_info_t *thread) {
1380	kmp_taskdata_t *task = thread->th.th_current_task;
1381	if (task->td_dephash) {
1382	int children;
1383	task->td_flags.complete = `1`;
1384	#if OMPX_TASKGRAPH
1385	task->td_flags.onced = `1`;
1386	#endif
1387	children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks);
1388	kmp_tasking_flags_t flags_old = task->td_flags;
1389	if (children == `0` && flags_old.complete == `1`) {
1390	kmp_tasking_flags_t flags_new = flags_old;
1391	flags_new.complete = `0`;
1392	if (KMP_COMPARE_AND_STORE_ACQ32(RCAST(kmp_int32 *, &task->td_flags),
1393	RCAST(kmp_int32 , &flags_old),
1394	RCAST(kmp_int32 , &flags_new))) {
1395	KA_TRACE(`100`, ("__kmp_finish_implicit_task: T#%d cleans "
1396	"dephash of implicit task %p\n",
1397	thread->th.th_info.ds.ds_gtid, task));
1398	__kmp_dephash_free_entries(thread, h: task->td_dephash);
1399	}
1400	}
1401	}
1402	}
1403
1404	// __kmp_free_implicit_task: Release resources associated to implicit tasks
1405	// when these are destroyed regions
1406	//
1407	// thread: thread data structure corresponding to implicit task
1408	void __kmp_free_implicit_task(kmp_info_t *thread) {
1409	kmp_taskdata_t *task = thread->th.th_current_task;
1410	if (task && task->td_dephash) {
1411	__kmp_dephash_free(thread, h: task->td_dephash);
1412	task->td_dephash = NULL;
1413	}
1414	}
1415
// __kmp_round_up_to_val: round a size up to a multiple of val, where val is a
// power of two. Used to insert padding between structures co-allocated using
// a single malloc() call.
//
// size: size to round
// val: alignment; the mask arithmetic assumes a power of two
// returns: size if it is already a multiple of val; otherwise the next
//          multiple of val, or size rounded DOWN to a multiple of val when
//          rounding up would exceed KMP_SIZE_T_MAX
static size_t __kmp_round_up_to_val(size_t size, size_t val) {
  const size_t low_bits = val - 1;

  // Already aligned: nothing to do.
  if ((size & low_bits) == 0)
    return size;

  size_t rounded = size & ~low_bits;
  if (rounded <= KMP_SIZE_T_MAX - val)
    rounded += val; // Round up if there is no overflow.
  return rounded;
} // __kmp_round_up_to_val
1427
1428	// __kmp_task_alloc: Allocate the taskdata and task data structures for a task
1429	//
1430	// loc_ref: source location information
1431	// gtid: global thread number.
1432	// flags: include tiedness & task type (explicit vs. implicit) of the ''new''
1433	// task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
1434	// sizeof_kmp_task_t: Size in bytes of kmp_task_t data structure including
1435	// private vars accessed in task.
1436	// sizeof_shareds: Size in bytes of array of pointers to shared vars accessed
1437	// in task.
1438	// task_entry: Pointer to task code entry point generated by compiler.
1439	// returns: a pointer to the allocated kmp_task_t structure (task).
1440	kmp_task_t __kmp_task_alloc(ident_t loc_ref, kmp_int32 gtid,
1441	kmp_tasking_flags_t *flags,
1442	size_t sizeof_kmp_task_t, size_t sizeof_shareds,
1443	kmp_routine_entry_t task_entry) {
1444	kmp_task_t *task;
1445	kmp_taskdata_t *taskdata;
1446	kmp_info_t *thread = __kmp_threads[gtid];
1447	kmp_team_t *team = thread->th.th_team;
1448	kmp_taskdata_t *parent_task = thread->th.th_current_task;
1449	size_t shareds_offset;
1450
1451	if (UNLIKELY(!TCR_4(__kmp_init_middle)))
1452	__kmp_middle_initialize();
1453
1454	if (flags->hidden_helper) {
1455	if (__kmp_enable_hidden_helper) {
1456	if (!TCR_4(__kmp_init_hidden_helper))
1457	__kmp_hidden_helper_initialize();
1458	} else {
1459	// If the hidden helper task is not enabled, reset the flag to FALSE.
1460	flags->hidden_helper = FALSE;
1461	}
1462	}
1463
1464	KA_TRACE(`10`, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
1465	"sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1466	gtid, loc_ref, ((kmp_int32 )flags), sizeof_kmp_task_t,
1467	sizeof_shareds, task_entry));
1468
1469	KMP_DEBUG_ASSERT(parent_task);
1470	if (parent_task->td_flags.final) {
1471	if (flags->merged_if0) {
1472	}
1473	flags->final = `1`;
1474	}
1475
1476	if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
1477	// Untied task encountered causes the TSC algorithm to check entire deque of
1478	// the victim thread. If no untied task encountered, then checking the head
1479	// of the deque should be enough.
1480	KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, `1`);
1481	}
1482
1483	// Detachable tasks are not proxy tasks yet but could be in the future. Doing
1484	// the tasking setup
1485	// when that happens is too late.
1486	if (UNLIKELY(flags->proxy == TASK_PROXY \|\|
1487	flags->detachable == TASK_DETACHABLE \|\| flags->hidden_helper)) {
1488	if (flags->proxy == TASK_PROXY) {
1489	flags->tiedness = TASK_UNTIED;
1490	flags->merged_if0 = `1`;
1491	}
1492	/ are we running in a sequential parallel or tskm_immediate_exec... we need*
1493	tasking support enabled /*
1494	if ((thread->th.th_task_team) == NULL) {
1495	/ This should only happen if the team is serialized*
1496	setup a task team and propagate it to the thread /*
1497	KMP_DEBUG_ASSERT(team->t.t_serialized);
1498	KA_TRACE(`30`,
1499	("T#%d creating task team in __kmp_task_alloc for proxy task\n",
1500	gtid));
1501	__kmp_task_team_setup(this_thr: thread, team);
1502	thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
1503	}
1504	kmp_task_team_t *task_team = thread->th.th_task_team;
1505
1506	/ tasking must be enabled now as the task might not be pushed /
1507	if (!KMP_TASKING_ENABLED(task_team)) {
1508	KA_TRACE(
1509	`30`,
1510	("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
1511	__kmp_enable_tasking(task_team, this_thr: thread);
1512	kmp_int32 tid = thread->th.th_info.ds.ds_tid;
1513	kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
1514	// No lock needed since only owner can allocate
1515	if (thread_data->td.td_deque == NULL) {
1516	__kmp_alloc_task_deque(thread, thread_data);
1517	}
1518	}
1519
1520	if ((flags->proxy == TASK_PROXY \|\| flags->detachable == TASK_DETACHABLE) &&
1521	task_team->tt.tt_found_proxy_tasks == FALSE)
1522	TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
1523	if (flags->hidden_helper &&
1524	task_team->tt.tt_hidden_helper_task_encountered == FALSE)
1525	TCW_4(task_team->tt.tt_hidden_helper_task_encountered, TRUE);
1526	}
1527
1528	// Calculate shared structure offset including padding after kmp_task_t struct
1529	// to align pointers in shared struct
1530	shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
1531	shareds_offset = __kmp_round_up_to_val(size: shareds_offset, val: sizeof(void *));
1532
1533	// Allocate a kmp_taskdata_t block and a kmp_task_t block.
1534	KA_TRACE(`30`, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
1535	shareds_offset));
1536	KA_TRACE(`30`, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
1537	sizeof_shareds));
1538
1539	// Avoid double allocation here by combining shareds with taskdata
1540	#if USE_FAST_MEMORY
1541	taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
1542	sizeof_shareds);
1543	#else /* ! USE_FAST_MEMORY */
1544	taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
1545	sizeof_shareds);
1546	#endif /* USE_FAST_MEMORY */
1547
1548	task = KMP_TASKDATA_TO_TASK(taskdata);
1549
1550	// Make sure task & taskdata are aligned appropriately
1551	#if KMP_ARCH_X86 \|\| KMP_ARCH_PPC64 \|\| KMP_ARCH_S390X \|\| !KMP_HAVE_QUAD
1552	KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - `1`)) == `0`);
1553	KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - `1`)) == `0`);
1554	#else
1555	KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - `1`)) == `0`);
1556	KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - `1`)) == `0`);
1557	#endif
1558	if (sizeof_shareds > `0`) {
1559	// Avoid double allocation here by combining shareds with taskdata
1560	task->shareds = &((char *)taskdata)[shareds_offset];
1561	// Make sure shareds struct is aligned to pointer size
1562	KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - `1`)) ==
1563	`0`);
1564	} else {
1565	task->shareds = NULL;
1566	}
1567	task->routine = task_entry;
1568	task->part_id = `0`; // AC: Always start with 0 part id
1569
1570	taskdata->td_task_id = KMP_GEN_TASK_ID();
1571	taskdata->td_team = thread->th.th_team;
1572	taskdata->td_alloc_thread = thread;
1573	taskdata->td_parent = parent_task;
1574	taskdata->td_level = parent_task->td_level + `1`; // increment nesting level
1575	KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, `0`);
1576	taskdata->td_ident = loc_ref;
1577	taskdata->td_taskwait_ident = NULL;
1578	taskdata->td_taskwait_counter = `0`;
1579	taskdata->td_taskwait_thread = `0`;
1580	KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
1581	// avoid copying icvs for proxy tasks
1582	if (flags->proxy == TASK_FULL)
1583	copy_icvs(dst: &taskdata->td_icvs, src: &taskdata->td_parent->td_icvs);
1584
1585	taskdata->td_flags = *flags;
1586	taskdata->td_task_team = thread->th.th_task_team;
1587	taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
1588	taskdata->td_flags.tasktype = TASK_EXPLICIT;
1589	// If it is hidden helper task, we need to set the team and task team
1590	// correspondingly.
1591	if (flags->hidden_helper) {
1592	kmp_info_t *shadow_thread = __kmp_threads[KMP_GTID_TO_SHADOW_GTID(gtid)];
1593	taskdata->td_team = shadow_thread->th.th_team;
1594	taskdata->td_task_team = shadow_thread->th.th_task_team;
1595	}
1596
1597	// GEH - TODO: fix this to copy parent task's value of tasking_ser flag
1598	taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1599
1600	// GEH - TODO: fix this to copy parent task's value of team_serial flag
1601	taskdata->td_flags.team_serial = (team->t.t_serialized) ? `1` : `0`;
1602
1603	// GEH - Note we serialize the task if the team is serialized to make sure
1604	// implicit parallel region tasks are not left until program termination to
1605	// execute. Also, it helps locality to execute immediately.
1606
1607	taskdata->td_flags.task_serial =
1608	(parent_task->td_flags.final \|\| taskdata->td_flags.team_serial \|\|
1609	taskdata->td_flags.tasking_ser \|\| flags->merged_if0);
1610
1611	taskdata->td_flags.started = `0`;
1612	taskdata->td_flags.executing = `0`;
1613	taskdata->td_flags.complete = `0`;
1614	taskdata->td_flags.freed = `0`;
1615	#if OMPX_TASKGRAPH
1616	taskdata->td_flags.onced = `0`;
1617	taskdata->is_taskgraph = `0`;
1618	taskdata->tdg = nullptr;
1619	#endif
1620	KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, `0`);
1621	// start at one because counts current task and children
1622	KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, `1`);
1623	taskdata->td_taskgroup =
1624	parent_task->td_taskgroup; // task inherits taskgroup from the parent task
1625	taskdata->td_dephash = NULL;
1626	taskdata->td_depnode = NULL;
1627	taskdata->td_target_data.async_handle = NULL;
1628	if (flags->tiedness == TASK_UNTIED)
1629	taskdata->td_last_tied = NULL; // will be set when the task is scheduled
1630	else
1631	taskdata->td_last_tied = taskdata;
1632	taskdata->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1633	#if OMPT_SUPPORT
1634	if (UNLIKELY(ompt_enabled.enabled))
1635	__ompt_task_init(task: taskdata, tid: gtid);
1636	#endif
1637	// TODO: What would be the balance between the conditions in the function and
1638	// an atomic operation?
1639	if (__kmp_track_children_task(taskdata)) {
1640	KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
1641	if (parent_task->td_taskgroup)
1642	KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
1643	// Only need to keep track of allocated child tasks for explicit tasks since
1644	// implicit not deallocated
1645	if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
1646	KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
1647	}
1648	if (flags->hidden_helper) {
1649	taskdata->td_flags.task_serial = FALSE;
1650	// Increment the number of hidden helper tasks to be executed
1651	KMP_ATOMIC_INC(&__kmp_unexecuted_hidden_helper_tasks);
1652	}
1653	}
1654
1655	#if OMPX_TASKGRAPH
1656	kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
1657	if (tdg && __kmp_tdg_is_recording(tdg->tdg_status) &&
1658	(task_entry != (kmp_routine_entry_t)__kmp_taskloop_task)) {
1659	taskdata->is_taskgraph = `1`;
1660	taskdata->tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
1661	taskdata->td_task_id = KMP_GEN_TASK_ID();
1662	taskdata->td_tdg_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id);
1663	}
1664	#endif
1665	KA_TRACE(`20`, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
1666	gtid, taskdata, taskdata->td_parent));
1667
1668	return task;
1669	}
1670
1671	kmp_task_t __kmpc_omp_task_alloc(ident_t loc_ref, kmp_int32 gtid,
1672	kmp_int32 flags, size_t sizeof_kmp_task_t,
1673	size_t sizeof_shareds,
1674	kmp_routine_entry_t task_entry) {
1675	kmp_task_t *retval;
1676	kmp_tasking_flags_t input_flags = (kmp_tasking_flags_t )&flags;
1677	__kmp_assert_valid_gtid(gtid);
1678	input_flags->native = FALSE;
1679	// __kmp_task_alloc() sets up all other runtime flags
1680	KA_TRACE(`10`, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s %s) "
1681	"sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1682	gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
1683	input_flags->proxy ? "proxy" : "",
1684	input_flags->detachable ? "detachable" : "", sizeof_kmp_task_t,
1685	sizeof_shareds, task_entry));
1686
1687	retval = __kmp_task_alloc(loc_ref, gtid, flags: input_flags, sizeof_kmp_task_t,
1688	sizeof_shareds, task_entry);
1689
1690	KA_TRACE(`20`, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));
1691
1692	return retval;
1693	}
1694
1695	kmp_task_t __kmpc_omp_target_task_alloc(ident_t loc_ref, kmp_int32 gtid,
1696	kmp_int32 flags,
1697	size_t sizeof_kmp_task_t,
1698	size_t sizeof_shareds,
1699	kmp_routine_entry_t task_entry,
1700	kmp_int64 device_id) {
1701	auto &input_flags = reinterpret_cast<kmp_tasking_flags_t &>(flags);
1702	// target task is untied defined in the specification
1703	input_flags.tiedness = TASK_UNTIED;
1704	input_flags.target = `1`;
1705
1706	if (__kmp_enable_hidden_helper)
1707	input_flags.hidden_helper = TRUE;
1708
1709	return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t,
1710	sizeof_shareds, task_entry);
1711	}
1712
1713	/!*
1714	@ingroup TASKING
1715	@param loc_ref location of the original task directive
1716	@param gtid Global Thread ID of encountering thread
1717	@param new_task task thunk allocated by __kmpc_omp_task_alloc() for the ''new
1718	task''
1719	@param naffins Number of affinity items
1720	@param affin_list List of affinity items
1721	@return Returns non-zero if registering affinity information was not successful.
1722	Returns 0 if registration was successful
1723	This entry registers the affinity information attached to a task with the task
1724	thunk structure kmp_taskdata_t.
1725	*/
1726	kmp_int32
1727	__kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid,
1728	kmp_task_t *new_task, kmp_int32 naffins,
1729	kmp_task_affinity_info_t *affin_list) {
1730	return `0`;
1731	}
1732
1733	// __kmp_invoke_task: invoke the specified task
1734	//
1735	// gtid: global thread ID of caller
1736	// task: the task to invoke
1737	// current_task: the task to resume after task invocation
1738	#ifdef __s390x__
1739	__attribute__((target("backchain")))
1740	#endif
1741	static void
1742	__kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
1743	kmp_taskdata_t *current_task) {
1744	kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
1745	kmp_info_t *thread;
1746	int discard = `0` / false /;
1747	KA_TRACE(
1748	`30`, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
1749	gtid, taskdata, current_task));
1750	KMP_DEBUG_ASSERT(task);
1751	if (UNLIKELY(taskdata->td_flags.proxy == TASK_PROXY &&
1752	taskdata->td_flags.complete == `1`)) {
1753	// This is a proxy task that was already completed but it needs to run
1754	// its bottom-half finish
1755	KA_TRACE(
1756	`30`,
1757	("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
1758	gtid, taskdata));
1759
1760	__kmp_bottom_half_finish_proxy(gtid, ptask: task);
1761
1762	KA_TRACE(`30`, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
1763	"proxy task %p, resuming task %p\n",
1764	gtid, taskdata, current_task));
1765
1766	return;
1767	}
1768
1769	#if OMPT_SUPPORT
1770	// For untied tasks, the first task executed only calls __kmpc_omp_task and
1771	// does not execute code.
1772	ompt_thread_info_t oldInfo;
1773	if (UNLIKELY(ompt_enabled.enabled)) {
1774	// Store the threads states and restore them after the task
1775	thread = __kmp_threads[gtid];
1776	oldInfo = thread->th.ompt_thread_info;
1777	thread->th.ompt_thread_info.wait_id = `0`;
1778	thread->th.ompt_thread_info.state = (thread->th.th_team_serialized)
1779	? ompt_state_work_serial
1780	: ompt_state_work_parallel;
1781	taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(`0`);
1782	}
1783	#endif
1784
1785	// Proxy tasks are not handled by the runtime
1786	if (taskdata->td_flags.proxy != TASK_PROXY) {
1787	__kmp_task_start(gtid, task, current_task); // OMPT only if not discarded
1788	}
1789
1790	// TODO: cancel tasks if the parallel region has also been cancelled
1791	// TODO: check if this sequence can be hoisted above __kmp_task_start
1792	// if cancellation has been enabled for this run ...
1793	if (UNLIKELY(__kmp_omp_cancellation)) {
1794	thread = __kmp_threads[gtid];
1795	kmp_team_t *this_team = thread->th.th_team;
1796	kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
1797	if ((taskgroup && taskgroup->cancel_request) \|\|
1798	(this_team->t.t_cancel_request == cancel_parallel)) {
1799	#if OMPT_SUPPORT && OMPT_OPTIONAL
1800	ompt_data_t *task_data;
1801	if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) {
1802	__ompt_get_task_info_internal(ancestor_level: `0`, NULL, task_data: &task_data, NULL, NULL, NULL);
1803	ompt_callbacks.ompt_callback(ompt_callback_cancel)(
1804	task_data,
1805	((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup
1806	: ompt_cancel_parallel) \|
1807	ompt_cancel_discarded_task,
1808	NULL);
1809	}
1810	#endif
1811	KMP_COUNT_BLOCK(TASK_cancelled);
1812	// this task belongs to a task group and we need to cancel it
1813	discard = `1` / true /;
1814	}
1815	}
1816
1817	// Invoke the task routine and pass in relevant data.
1818	// Thunks generated by gcc take a different argument list.
1819	if (!discard) {
1820	if (taskdata->td_flags.tiedness == TASK_UNTIED) {
1821	taskdata->td_last_tied = current_task->td_last_tied;
1822	KMP_DEBUG_ASSERT(taskdata->td_last_tied);
1823	}
1824	#if KMP_STATS_ENABLED
1825	KMP_COUNT_BLOCK(TASK_executed);
1826	switch (KMP_GET_THREAD_STATE()) {
1827	case FORK_JOIN_BARRIER:
1828	KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
1829	break;
1830	case PLAIN_BARRIER:
1831	KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
1832	break;
1833	case TASKYIELD:
1834	KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
1835	break;
1836	case TASKWAIT:
1837	KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
1838	break;
1839	case TASKGROUP:
1840	KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
1841	break;
1842	default:
1843	KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
1844	break;
1845	}
1846	#endif // KMP_STATS_ENABLED
1847
1848	// OMPT task begin
1849	#if OMPT_SUPPORT
1850	if (UNLIKELY(ompt_enabled.enabled))
1851	__ompt_task_start(task, current_task, gtid);
1852	#endif
1853	#if OMPT_SUPPORT && OMPT_OPTIONAL
1854	if (UNLIKELY(ompt_enabled.ompt_callback_dispatch &&
1855	taskdata->ompt_task_info.dispatch_chunk.iterations > `0`)) {
1856	ompt_data_t instance = ompt_data_none;
1857	instance.ptr = &(taskdata->ompt_task_info.dispatch_chunk);
1858	ompt_team_info_t *team_info = __ompt_get_teaminfo(depth: `0`, NULL);
1859	ompt_callbacks.ompt_callback(ompt_callback_dispatch)(
1860	&(team_info->parallel_data), &(taskdata->ompt_task_info.task_data),
1861	ompt_dispatch_taskloop_chunk, instance);
1862	taskdata->ompt_task_info.dispatch_chunk = {.start: `0`, .iterations: `0`};
1863	}
1864	#endif // OMPT_SUPPORT && OMPT_OPTIONAL
1865
1866	#if OMPD_SUPPORT
1867	if (ompd_state & OMPD_ENABLE_BP)
1868	ompd_bp_task_begin();
1869	#endif
1870
1871	#if USE_ITT_BUILD && USE_ITT_NOTIFY
1872	kmp_uint64 cur_time;
1873	kmp_int32 kmp_itt_count_task =
1874	__kmp_forkjoin_frames_mode == `3` && !taskdata->td_flags.task_serial &&
1875	current_task->td_flags.tasktype == TASK_IMPLICIT;
1876	if (kmp_itt_count_task) {
1877	thread = __kmp_threads[gtid];
1878	// Time outer level explicit task on barrier for adjusting imbalance time
1879	if (thread->th.th_bar_arrive_time)
1880	cur_time = __itt_get_timestamp();
1881	else
1882	kmp_itt_count_task = `0`; // thread is not on a barrier - skip timing
1883	}
1884	KMP_FSYNC_ACQUIRED(taskdata); // acquired self (new task)
1885	#endif
1886
1887	#if ENABLE_LIBOMPTARGET
1888	if (taskdata->td_target_data.async_handle != NULL) {
1889	// If we have a valid target async handle, that means that we have already
1890	// executed the task routine once. We must query for the handle completion
1891	// instead of re-executing the routine.
1892	KMP_ASSERT(tgt_target_nowait_query);
1893	tgt_target_nowait_query(&taskdata->td_target_data.async_handle);
1894	} else
1895	#endif
1896	if (task->routine != NULL) {
1897	#ifdef KMP_GOMP_COMPAT
1898	if (taskdata->td_flags.native) {
1899	((void ()(void* ))((task->routine)))(task->shareds);
1900	} else
1901	#endif /* KMP_GOMP_COMPAT */
1902	{
1903	(*(task->routine))(gtid, task);
1904	}
1905	}
1906	KMP_POP_PARTITIONED_TIMER();
1907
1908	#if USE_ITT_BUILD && USE_ITT_NOTIFY
1909	if (kmp_itt_count_task) {
1910	// Barrier imbalance - adjust arrive time with the task duration
1911	thread->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
1912	}
1913	KMP_FSYNC_CANCEL(taskdata); // destroy self (just executed)
1914	KMP_FSYNC_RELEASING(taskdata->td_parent); // releasing parent
1915	#endif
1916	}
1917
1918	#if OMPD_SUPPORT
1919	if (ompd_state & OMPD_ENABLE_BP)
1920	ompd_bp_task_end();
1921	#endif
1922
1923	// Proxy tasks are not handled by the runtime
1924	if (taskdata->td_flags.proxy != TASK_PROXY) {
1925	#if OMPT_SUPPORT
1926	if (UNLIKELY(ompt_enabled.enabled)) {
1927	thread->th.ompt_thread_info = oldInfo;
1928	if (taskdata->td_flags.tiedness == TASK_TIED) {
1929	taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
1930	}
1931	__kmp_task_finish<true>(gtid, task, resumed_task: current_task);
1932	} else
1933	#endif
1934	__kmp_task_finish<false>(gtid, task, resumed_task: current_task);
1935	}
1936	#if OMPT_SUPPORT
1937	else if (UNLIKELY(ompt_enabled.enabled && taskdata->td_flags.target)) {
1938	__ompt_task_finish(task, resumed_task: current_task, status: ompt_task_switch);
1939	}
1940	#endif
1941
1942	KA_TRACE(
1943	`30`,
1944	("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
1945	gtid, taskdata, current_task));
1946	return;
1947	}
1948
1949	// __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
1950	//
1951	// loc_ref: location of original task pragma (ignored)
1952	// gtid: Global Thread ID of encountering thread
1953	// new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task''
1954	// Returns:
1955	// TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1956	// be resumed later.
1957	// TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1958	// resumed later.
1959	kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
1960	kmp_task_t *new_task) {
1961	kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1962
1963	KA_TRACE(`10`, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid,
1964	loc_ref, new_taskdata));
1965
1966	#if OMPT_SUPPORT
1967	kmp_taskdata_t *parent;
1968	if (UNLIKELY(ompt_enabled.enabled)) {
1969	parent = new_taskdata->td_parent;
1970	if (ompt_enabled.ompt_callback_task_create) {
1971	ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1972	&(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
1973	&(new_taskdata->ompt_task_info.task_data),
1974	TASK_TYPE_DETAILS_FORMAT(new_taskdata), `0`,
1975	OMPT_GET_RETURN_ADDRESS(`0`));
1976	}
1977	}
1978	#endif
1979
1980	/ Should we execute the new task or queue it? For now, let's just always try*
1981	to queue it. If the queue fills up, then we'll execute it. /*
1982
1983	if (__kmp_push_task(gtid, task: new_task) == TASK_NOT_PUSHED) // if cannot defer
1984	{ // Execute this task immediately
1985	kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1986	new_taskdata->td_flags.task_serial = `1`;
1987	__kmp_invoke_task(gtid, task: new_task, current_task);
1988	}
1989
1990	KA_TRACE(
1991	`10`,
1992	("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: "
1993	"loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n",
1994	gtid, loc_ref, new_taskdata));
1995
1996	#if OMPT_SUPPORT
1997	if (UNLIKELY(ompt_enabled.enabled)) {
1998	parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1999	parent->ompt_task_info.frame.enter_frame_flags = OMPT_FRAME_FLAGS_RUNTIME;
2000	}
2001	#endif
2002	return TASK_CURRENT_NOT_QUEUED;
2003	}
2004
2005	// __kmp_omp_task: Schedule a non-thread-switchable task for execution
2006	//
2007	// gtid: Global Thread ID of encountering thread
2008	// new_task:non-thread-switchable task thunk allocated by __kmp_omp_task_alloc()
2009	// serialize_immediate: if TRUE then if the task is executed immediately its
2010	// execution will be serialized
2011	// Returns:
2012	// TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
2013	// be resumed later.
2014	// TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
2015	// resumed later.
2016	kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
2017	bool serialize_immediate) {
2018	kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
2019
2020	#if OMPX_TASKGRAPH
2021	if (new_taskdata->is_taskgraph &&
2022	__kmp_tdg_is_recording(new_taskdata->tdg->tdg_status)) {
2023	kmp_tdg_info_t *tdg = new_taskdata->tdg;
2024	// extend the record_map if needed
2025	if (new_taskdata->td_tdg_task_id >= new_taskdata->tdg->map_size) {
2026	__kmp_acquire_bootstrap_lock(&tdg->graph_lock);
2027	// map_size could have been updated by another thread if recursive
2028	// taskloop
2029	if (new_taskdata->td_tdg_task_id >= tdg->map_size) {
2030	kmp_uint old_size = tdg->map_size;
2031	kmp_uint new_size = old_size * `2`;
2032	kmp_node_info_t *old_record = tdg->record_map;
2033	kmp_node_info_t new_record = (kmp_node_info_t )__kmp_allocate(
2034	new_size * sizeof(kmp_node_info_t));
2035
2036	KMP_MEMCPY(new_record, old_record, old_size * sizeof(kmp_node_info_t));
2037	tdg->record_map = new_record;
2038
2039	__kmp_free(old_record);
2040
2041	for (kmp_int i = old_size; i < new_size; i++) {
2042	kmp_int32 successorsList = (kmp_int32 )__kmp_allocate(
2043	__kmp_successors_size * sizeof(kmp_int32));
2044	new_record[i].task = nullptr;
2045	new_record[i].successors = successorsList;
2046	new_record[i].nsuccessors = `0`;
2047	new_record[i].npredecessors = `0`;
2048	new_record[i].successors_size = __kmp_successors_size;
2049	KMP_ATOMIC_ST_REL(&new_record[i].npredecessors_counter, `0`);
2050	}
2051	// update the size at the end, so that we avoid other
2052	// threads use old_record while map_size is already updated
2053	tdg->map_size = new_size;
2054	}
2055	__kmp_release_bootstrap_lock(&tdg->graph_lock);
2056	}
2057	// record a task
2058	if (tdg->record_map[new_taskdata->td_tdg_task_id].task == nullptr) {
2059	tdg->record_map[new_taskdata->td_tdg_task_id].task = new_task;
2060	tdg->record_map[new_taskdata->td_tdg_task_id].parent_task =
2061	new_taskdata->td_parent;
2062	KMP_ATOMIC_INC(&tdg->num_tasks);
2063	}
2064	}
2065	#endif
2066
2067	/ Should we execute the new task or queue it? For now, let's just always try*
2068	to queue it. If the queue fills up, then we'll execute it. /*
2069	if (new_taskdata->td_flags.proxy == TASK_PROXY \|\|
2070	__kmp_push_task(gtid, task: new_task) == TASK_NOT_PUSHED) // if cannot defer
2071	{ // Execute this task immediately
2072	kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
2073	if (serialize_immediate)
2074	new_taskdata->td_flags.task_serial = `1`;
2075	__kmp_invoke_task(gtid, task: new_task, current_task);
2076	} else if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
2077	__kmp_wpolicy_passive) {
2078	kmp_info_t *this_thr = __kmp_threads[gtid];
2079	kmp_team_t *team = this_thr->th.th_team;
2080	kmp_int32 nthreads = this_thr->th.th_team_nproc;
2081	for (int i = `0`; i < nthreads; ++i) {
2082	kmp_info_t *thread = team->t.t_threads[i];
2083	if (thread == this_thr)
2084	continue;
2085	if (thread->th.th_sleep_loc != NULL) {
2086	__kmp_null_resume_wrapper(thr: thread);
2087	break; // awake one thread at a time
2088	}
2089	}
2090	}
2091	return TASK_CURRENT_NOT_QUEUED;
2092	}
2093
2094	// __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a
2095	// non-thread-switchable task from the parent thread only!
2096	//
2097	// loc_ref: location of original task pragma (ignored)
2098	// gtid: Global Thread ID of encountering thread
2099	// new_task: non-thread-switchable task thunk allocated by
2100	// __kmp_omp_task_alloc()
2101	// Returns:
2102	// TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
2103	// be resumed later.
2104	// TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
2105	// resumed later.
2106	kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
2107	kmp_task_t *new_task) {
2108	kmp_int32 res;
2109	KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
2110
2111	#if KMP_DEBUG \|\| OMPT_SUPPORT
2112	kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
2113	#endif
2114	KA_TRACE(`10`, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
2115	new_taskdata));
2116	__kmp_assert_valid_gtid(gtid);
2117
2118	#if OMPT_SUPPORT
2119	kmp_taskdata_t *parent = NULL;
2120	if (UNLIKELY(ompt_enabled.enabled)) {
2121	if (!new_taskdata->td_flags.started) {
2122	OMPT_STORE_RETURN_ADDRESS(gtid);
2123	parent = new_taskdata->td_parent;
2124	if (!parent->ompt_task_info.frame.enter_frame.ptr) {
2125	parent->ompt_task_info.frame.enter_frame.ptr =
2126	OMPT_GET_FRAME_ADDRESS(`0`);
2127	}
2128	if (ompt_enabled.ompt_callback_task_create) {
2129	ompt_callbacks.ompt_callback(ompt_callback_task_create)(
2130	&(parent->ompt_task_info.task_data),
2131	&(parent->ompt_task_info.frame),
2132	&(new_taskdata->ompt_task_info.task_data),
2133	TASK_TYPE_DETAILS_FORMAT(new_taskdata), `0`,
2134	OMPT_LOAD_RETURN_ADDRESS(gtid));
2135	}
2136	} else {
2137	// We are scheduling the continuation of an UNTIED task.
2138	// Scheduling back to the parent task.
2139	__ompt_task_finish(task: new_task,
2140	resumed_task: new_taskdata->ompt_task_info.scheduling_parent,
2141	status: ompt_task_switch);
2142	new_taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
2143	}
2144	}
2145	#endif
2146
2147	res = __kmp_omp_task(gtid, new_task, serialize_immediate: true);
2148
2149	KA_TRACE(`10`, ("__kmpc_omp_task(exit): T#%d returning "
2150	"TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
2151	gtid, loc_ref, new_taskdata));
2152	#if OMPT_SUPPORT
2153	if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
2154	parent->ompt_task_info.frame.enter_frame = ompt_data_none;
2155	}
2156	#endif
2157	return res;
2158	}
2159
2160	// __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule
2161	// a taskloop task with the correct OMPT return address
2162	//
2163	// loc_ref: location of original task pragma (ignored)
2164	// gtid: Global Thread ID of encountering thread
2165	// new_task: non-thread-switchable task thunk allocated by
2166	// __kmp_omp_task_alloc()
2167	// codeptr_ra: return address for OMPT callback
2168	// Returns:
2169	// TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
2170	// be resumed later.
2171	// TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
2172	// resumed later.
2173	kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid,
2174	kmp_task_t new_task, void* *codeptr_ra) {
2175	kmp_int32 res;
2176	KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
2177
2178	#if KMP_DEBUG \|\| OMPT_SUPPORT
2179	kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
2180	#endif
2181	KA_TRACE(`10`, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
2182	new_taskdata));
2183
2184	#if OMPT_SUPPORT
2185	kmp_taskdata_t *parent = NULL;
2186	if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) {
2187	parent = new_taskdata->td_parent;
2188	if (!parent->ompt_task_info.frame.enter_frame.ptr)
2189	parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(`0`);
2190	if (ompt_enabled.ompt_callback_task_create) {
2191	ompt_callbacks.ompt_callback(ompt_callback_task_create)(
2192	&(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
2193	&(new_taskdata->ompt_task_info.task_data),
2194	TASK_TYPE_DETAILS_FORMAT(new_taskdata), `0`, codeptr_ra);
2195	}
2196	}
2197	#endif
2198
2199	res = __kmp_omp_task(gtid, new_task, serialize_immediate: true);
2200
2201	KA_TRACE(`10`, ("__kmpc_omp_task(exit): T#%d returning "
2202	"TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
2203	gtid, loc_ref, new_taskdata));
2204	#if OMPT_SUPPORT
2205	if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
2206	parent->ompt_task_info.frame.enter_frame = ompt_data_none;
2207	}
2208	#endif
2209	return res;
2210	}
2211
2212	template <bool ompt>
2213	static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
2214	void *frame_address,
2215	void *return_address) {
2216	kmp_taskdata_t taskdata = nullptr*;
2217	kmp_info_t *thread;
2218	int thread_finished = FALSE;
2219	KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);
2220
2221	KA_TRACE(`10`, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref));
2222	KMP_DEBUG_ASSERT(gtid >= `0`);
2223
2224	if (__kmp_tasking_mode != tskm_immediate_exec) {
2225	thread = __kmp_threads[gtid];
2226	taskdata = thread->th.th_current_task;
2227
2228	#if OMPT_SUPPORT && OMPT_OPTIONAL
2229	ompt_data_t *my_task_data;
2230	ompt_data_t *my_parallel_data;
2231
2232	if (ompt) {
2233	my_task_data = &(taskdata->ompt_task_info.task_data);
2234	my_parallel_data = OMPT_CUR_TEAM_DATA(thread);
2235
2236	taskdata->ompt_task_info.frame.enter_frame.ptr = frame_address;
2237
2238	if (ompt_enabled.ompt_callback_sync_region) {
2239	ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2240	ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
2241	my_task_data, return_address);
2242	}
2243
2244	if (ompt_enabled.ompt_callback_sync_region_wait) {
2245	ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2246	ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
2247	my_task_data, return_address);
2248	}
2249	}
2250	#endif // OMPT_SUPPORT && OMPT_OPTIONAL
2251
2252	// Debugger: The taskwait is active. Store location and thread encountered the
2253	// taskwait.
2254	#if USE_ITT_BUILD
2255	// Note: These values are used by ITT events as well.
2256	#endif /* USE_ITT_BUILD */
2257	taskdata->td_taskwait_counter += `1`;
2258	taskdata->td_taskwait_ident = loc_ref;
2259	taskdata->td_taskwait_thread = gtid + `1`;
2260
2261	#if USE_ITT_BUILD
2262	void *itt_sync_obj = NULL;
2263	#if USE_ITT_NOTIFY
2264	KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2265	#endif /* USE_ITT_NOTIFY */
2266	#endif /* USE_ITT_BUILD */
2267
2268	bool must_wait =
2269	!taskdata->td_flags.team_serial && !taskdata->td_flags.final;
2270
2271	must_wait = must_wait \|\| (thread->th.th_task_team != NULL &&
2272	thread->th.th_task_team->tt.tt_found_proxy_tasks);
2273	// If hidden helper thread is encountered, we must enable wait here.
2274	must_wait =
2275	must_wait \|\|
2276	(__kmp_enable_hidden_helper && thread->th.th_task_team != NULL &&
2277	thread->th.th_task_team->tt.tt_hidden_helper_task_encountered);
2278
2279	if (must_wait) {
2280	kmp_flag_32<false, false> flag(
2281	RCAST(std::atomic<kmp_uint32> *,
2282	&(taskdata->td_incomplete_child_tasks)),
2283	`0U`);
2284	while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != `0`) {
2285	flag.execute_tasks(this_thr: thread, gtid, FALSE,
2286	thread_finished: &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2287	is_constrained: __kmp_task_stealing_constraint);
2288	}
2289	}
2290	#if USE_ITT_BUILD
2291	KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2292	KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with children
2293	#endif /* USE_ITT_BUILD */
2294
2295	// Debugger: The taskwait is completed. Location remains, but thread is
2296	// negated.
2297	taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
2298
2299	#if OMPT_SUPPORT && OMPT_OPTIONAL
2300	if (ompt) {
2301	if (ompt_enabled.ompt_callback_sync_region_wait) {
2302	ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2303	ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
2304	my_task_data, return_address);
2305	}
2306	if (ompt_enabled.ompt_callback_sync_region) {
2307	ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2308	ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
2309	my_task_data, return_address);
2310	}
2311	taskdata->ompt_task_info.frame.enter_frame = ompt_data_none;
2312	}
2313	#endif // OMPT_SUPPORT && OMPT_OPTIONAL
2314	}
2315
2316	KA_TRACE(`10`, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
2317	"returning TASK_CURRENT_NOT_QUEUED\n",
2318	gtid, taskdata));
2319
2320	return TASK_CURRENT_NOT_QUEUED;
2321	}
2322
#if OMPT_SUPPORT && OMPT_OPTIONAL
// __kmpc_omp_taskwait_ompt: non-inlined instantiation of the taskwait body
// with the OMPT callbacks compiled in (ompt == true). All arguments are
// forwarded unchanged.
OMPT_NOINLINE
static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                          void *frame_address,
                                          void *return_address) {
  return __kmpc_omp_taskwait_template<true>(loc_ref, gtid, frame_address,
                                            return_address);
}
#endif // OMPT_SUPPORT && OMPT_OPTIONAL
2332
2333	// __kmpc_omp_taskwait: Wait until all tasks generated by the current task are
2334	// complete
2335	kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {
2336	#if OMPT_SUPPORT && OMPT_OPTIONAL
2337	if (UNLIKELY(ompt_enabled.enabled)) {
2338	OMPT_STORE_RETURN_ADDRESS(gtid);
2339	return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(`0`),
2340	OMPT_LOAD_RETURN_ADDRESS(gtid));
2341	}
2342	#endif
2343	return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL);
2344	}
2345
2346	// __kmpc_omp_taskyield: switch to a different task
2347	kmp_int32 __kmpc_omp_taskyield(ident_t loc_ref, kmp_int32 gtid, int* end_part) {
2348	kmp_taskdata_t *taskdata = NULL;
2349	kmp_info_t *thread;
2350	int thread_finished = FALSE;
2351
2352	KMP_COUNT_BLOCK(OMP_TASKYIELD);
2353	KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);
2354
2355	KA_TRACE(`10`, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
2356	gtid, loc_ref, end_part));
2357	__kmp_assert_valid_gtid(gtid);
2358
2359	if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) {
2360	thread = __kmp_threads[gtid];
2361	taskdata = thread->th.th_current_task;
2362	// Should we model this as a task wait or not?
2363	// Debugger: The taskwait is active. Store location and thread encountered the
2364	// taskwait.
2365	#if USE_ITT_BUILD
2366	// Note: These values are used by ITT events as well.
2367	#endif /* USE_ITT_BUILD */
2368	taskdata->td_taskwait_counter += `1`;
2369	taskdata->td_taskwait_ident = loc_ref;
2370	taskdata->td_taskwait_thread = gtid + `1`;
2371
2372	#if USE_ITT_BUILD
2373	void *itt_sync_obj = NULL;
2374	#if USE_ITT_NOTIFY
2375	KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2376	#endif /* USE_ITT_NOTIFY */
2377	#endif /* USE_ITT_BUILD */
2378	if (!taskdata->td_flags.team_serial) {
2379	kmp_task_team_t *task_team = thread->th.th_task_team;
2380	if (task_team != NULL) {
2381	if (KMP_TASKING_ENABLED(task_team)) {
2382	#if OMPT_SUPPORT
2383	if (UNLIKELY(ompt_enabled.enabled))
2384	thread->th.ompt_thread_info.ompt_task_yielded = `1`;
2385	#endif
2386	__kmp_execute_tasks_32(
2387	thread, gtid, flag: (kmp_flag_32<> *)NULL, FALSE,
2388	thread_finished: &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2389	is_constrained: __kmp_task_stealing_constraint);
2390	#if OMPT_SUPPORT
2391	if (UNLIKELY(ompt_enabled.enabled))
2392	thread->th.ompt_thread_info.ompt_task_yielded = `0`;
2393	#endif
2394	}
2395	}
2396	}
2397	#if USE_ITT_BUILD
2398	KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2399	#endif /* USE_ITT_BUILD */
2400
2401	// Debugger: The taskwait is completed. Location remains, but thread is
2402	// negated.
2403	taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
2404	}
2405
2406	KA_TRACE(`10`, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
2407	"returning TASK_CURRENT_NOT_QUEUED\n",
2408	gtid, taskdata));
2409
2410	return TASK_CURRENT_NOT_QUEUED;
2411	}
2412
2413	// Task Reduction implementation
2414	//
2415	// Note: initial implementation didn't take into account the possibility
2416	// to specify omp_orig for initializer of the UDR (user defined reduction).
2417	// Corrected implementation takes into account the omp_orig object.
2418	// Compiler is free to use old implementation if omp_orig is not specified.
2419
/*!
@ingroup BASIC_TYPES
@{
*/
2424
/*!
Flags for special info per task reduction item.
*/
typedef struct kmp_taskred_flags {
  /*! 1 - use lazy alloc/init (e.g. big objects, num tasks < num threads) */
  unsigned lazy_priv : 1;
  /*! remaining bits of the 32-bit flag word, currently unused */
  unsigned reserved31 : 31;
} kmp_taskred_flags_t;
2433
2434	/!*
2435	Internal struct for reduction data item related info set up by compiler.
2436	*/
2437	typedef struct kmp_task_red_input {
2438	void reduce_shar; /*< shared between tasks item to reduce into /*
2439	size_t reduce_size; /< size of data item in bytes /*
2440	// three compiler-generated routines (init, fini are optional):
2441	void reduce_init; /*< data initialization routine (single parameter) /*
2442	void reduce_fini; /*< data finalization routine /*
2443	void reduce_comb; /*< data combiner routine /*
2444	kmp_taskred_flags_t flags; /< flags for additional info from compiler /*
2445	} kmp_task_red_input_t;
2446
2447	/!*
2448	Internal struct for reduction data item related info saved by the library.
2449	*/
2450	typedef struct kmp_taskred_data {
2451	void reduce_shar; /*< shared between tasks item to reduce into /*
2452	size_t reduce_size; /< size of data item /*
2453	kmp_taskred_flags_t flags; /< flags for additional info from compiler /*
2454	void reduce_priv; /*< array of thread specific items /*
2455	void reduce_pend; /*< end of private data for faster comparison op /*
2456	// three compiler-generated routines (init, fini are optional):
2457	void reduce_comb; /*< data combiner routine /*
2458	void reduce_init; /*< data initialization routine (two parameters) /*
2459	void reduce_fini; /*< data finalization routine /*
2460	void reduce_orig; /*< original item (can be used in UDR initializer) /*
2461	} kmp_taskred_data_t;
2462
2463	/!*
2464	Internal struct for reduction data item related info set up by compiler.
2465
2466	New interface: added reduce_orig field to provide omp_orig for UDR initializer.
2467	*/
2468	typedef struct kmp_taskred_input {
2469	void reduce_shar; /*< shared between tasks item to reduce into /*
2470	void reduce_orig; /*< original reduction item used for initialization /*
2471	size_t reduce_size; /< size of data item /*
2472	// three compiler-generated routines (init, fini are optional):
2473	void reduce_init; /*< data initialization routine (two parameters) /*
2474	void reduce_fini; /*< data finalization routine /*
2475	void reduce_comb; /*< data combiner routine /*
2476	kmp_taskred_flags_t flags; /< flags for additional info from compiler /*
2477	} kmp_taskred_input_t;
/*!
@}
*/
2481
2482	template <typename T> void __kmp_assign_orig(kmp_taskred_data_t &item, T &src);
2483	template <>
2484	void __kmp_assign_orig<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2485	kmp_task_red_input_t &src) {
2486	item.reduce_orig = NULL;
2487	}
2488	template <>
2489	void __kmp_assign_orig<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2490	kmp_taskred_input_t &src) {
2491	if (src.reduce_orig != NULL) {
2492	item.reduce_orig = src.reduce_orig;
2493	} else {
2494	item.reduce_orig = src.reduce_shar;
2495	} // non-NULL reduce_orig means new interface used
2496	}
2497
2498	template <typename T> void __kmp_call_init(kmp_taskred_data_t &item, size_t j);
2499	template <>
2500	void __kmp_call_init<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2501	size_t offset) {
2502	((void ()(void* ))item.reduce_init)((char* *)(item.reduce_priv) + offset);
2503	}
2504	template <>
2505	void __kmp_call_init<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2506	size_t offset) {
2507	((void ()(void* , void* *))item.reduce_init)(
2508	(char *)(item.reduce_priv) + offset, item.reduce_orig);
2509	}
2510
2511	template <typename T>
2512	void __kmp_task_reduction_init(int* gtid, int num, T *data) {
2513	__kmp_assert_valid_gtid(gtid);
2514	kmp_info_t *thread = __kmp_threads[gtid];
2515	kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
2516	kmp_uint32 nth = thread->th.th_team_nproc;
2517	kmp_taskred_data_t *arr;
2518
2519	// check input data just in case
2520	KMP_ASSERT(tg != NULL);
2521	KMP_ASSERT(data != NULL);
2522	KMP_ASSERT(num > `0`);
2523	if (nth == `1` && !__kmp_enable_hidden_helper) {
2524	KA_TRACE(`10`, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
2525	gtid, tg));
2526	return (void *)tg;
2527	}
2528	KA_TRACE(`10`, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
2529	gtid, tg, num));
2530	arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2531	thread, num * sizeof(kmp_taskred_data_t));
2532	for (int i = `0`; i < num; ++i) {
2533	size_t size = data[i].reduce_size - `1`;
2534	// round the size up to cache line per thread-specific item
2535	size += CACHE_LINE - size % CACHE_LINE;
2536	KMP_ASSERT(data[i].reduce_comb != NULL); // combiner is mandatory
2537	arr[i].reduce_shar = data[i].reduce_shar;
2538	arr[i].reduce_size = size;
2539	arr[i].flags = data[i].flags;
2540	arr[i].reduce_comb = data[i].reduce_comb;
2541	arr[i].reduce_init = data[i].reduce_init;
2542	arr[i].reduce_fini = data[i].reduce_fini;
2543	__kmp_assign_orig<T>(arr[i], data[i]);
2544	if (!arr[i].flags.lazy_priv) {
2545	// allocate cache-line aligned block and fill it with zeros
2546	arr[i].reduce_priv = __kmp_allocate(nth * size);
2547	arr[i].reduce_pend = (char )(arr[i].reduce_priv) + nth size;
2548	if (arr[i].reduce_init != NULL) {
2549	// initialize all thread-specific items
2550	for (size_t j = `0`; j < nth; ++j) {
2551	__kmp_call_init<T>(arr[i], j * size);
2552	}
2553	}
2554	} else {
2555	// only allocate space for pointers now,
2556	// objects will be lazily allocated/initialized if/when requested
2557	// note that __kmp_allocate zeroes the allocated memory
2558	arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *));
2559	}
2560	}
2561	tg->reduce_data = (void *)arr;
2562	tg->reduce_num_data = num;
2563	return (void *)tg;
2564	}
2565
2566	/!*
2567	@ingroup TASKING
2568	@param gtid Global thread ID
2569	@param num Number of data items to reduce
2570	@param data Array of data for reduction
2571	@return The taskgroup identifier
2572
2573	Initialize task reduction for the taskgroup.
2574
2575	Note: this entry supposes the optional compiler-generated initializer routine
2576	has single parameter - pointer to object to be initialized. That means
2577	the reduction either does not use omp_orig object, or the omp_orig is accessible
2578	without help of the runtime library.
2579	*/
2580	void __kmpc_task_reduction_init(int* gtid, int num, void *data) {
2581	#if OMPX_TASKGRAPH
2582	kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
2583	if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) {
2584	kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
2585	this_tdg->rec_taskred_data =
2586	__kmp_allocate(sizeof(kmp_task_red_input_t) * num);
2587	this_tdg->rec_num_taskred = num;
2588	KMP_MEMCPY(this_tdg->rec_taskred_data, data,
2589	sizeof(kmp_task_red_input_t) * num);
2590	}
2591	#endif
2592	return __kmp_task_reduction_init(gtid, num, data: (kmp_task_red_input_t *)data);
2593	}
2594
2595	/!*
2596	@ingroup TASKING
2597	@param gtid Global thread ID
2598	@param num Number of data items to reduce
2599	@param data Array of data for reduction
2600	@return The taskgroup identifier
2601
2602	Initialize task reduction for the taskgroup.
2603
2604	Note: this entry supposes the optional compiler-generated initializer routine
2605	has two parameters, pointer to object to be initialized and pointer to omp_orig
2606	*/
2607	void __kmpc_taskred_init(int* gtid, int num, void *data) {
2608	#if OMPX_TASKGRAPH
2609	kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
2610	if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) {
2611	kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
2612	this_tdg->rec_taskred_data =
2613	__kmp_allocate(sizeof(kmp_task_red_input_t) * num);
2614	this_tdg->rec_num_taskred = num;
2615	KMP_MEMCPY(this_tdg->rec_taskred_data, data,
2616	sizeof(kmp_task_red_input_t) * num);
2617	}
2618	#endif
2619	return __kmp_task_reduction_init(gtid, num, data: (kmp_taskred_input_t *)data);
2620	}
2621
2622	// Copy task reduction data (except for shared pointers).
2623	template <typename T>
2624	void __kmp_task_reduction_init_copy(kmp_info_t thr, int* num, T *data,
2625	kmp_taskgroup_t tg, void* *reduce_data) {
2626	kmp_taskred_data_t *arr;
2627	KA_TRACE(`20`, ("__kmp_task_reduction_init_copy: Th %p, init taskgroup %p,"
2628	" from data %p\n",
2629	thr, tg, reduce_data));
2630	arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2631	thr, num * sizeof(kmp_taskred_data_t));
2632	// threads will share private copies, thunk routines, sizes, flags, etc.:
2633	KMP_MEMCPY(dest: arr, src: reduce_data, n: num * sizeof(kmp_taskred_data_t));
2634	for (int i = `0`; i < num; ++i) {
2635	arr[i].reduce_shar = data[i].reduce_shar; // init unique shared pointers
2636	}
2637	tg->reduce_data = (void *)arr;
2638	tg->reduce_num_data = num;
2639	}
2640
2641	/!*
2642	@ingroup TASKING
2643	@param gtid Global thread ID
2644	@param tskgrp The taskgroup ID (optional)
2645	@param data Shared location of the item
2646	@return The pointer to per-thread data
2647
2648	Get thread-specific location of data item
2649	*/
2650	void __kmpc_task_reduction_get_th_data(int* gtid, void tskgrp, void* *data) {
2651	__kmp_assert_valid_gtid(gtid);
2652	kmp_info_t *thread = __kmp_threads[gtid];
2653	kmp_int32 nth = thread->th.th_team_nproc;
2654	if (nth == `1`)
2655	return data; // nothing to do
2656
2657	kmp_taskgroup_t tg = (kmp_taskgroup_t )tskgrp;
2658	if (tg == NULL)
2659	tg = thread->th.th_current_task->td_taskgroup;
2660	KMP_ASSERT(tg != NULL);
2661	kmp_taskred_data_t *arr;
2662	kmp_int32 num;
2663	kmp_int32 tid = thread->th.th_info.ds.ds_tid;
2664
2665	#if OMPX_TASKGRAPH
2666	if ((thread->th.th_current_task->is_taskgraph) &&
2667	(!__kmp_tdg_is_recording(
2668	__kmp_global_tdgs[__kmp_curr_tdg_idx]->tdg_status))) {
2669	tg = thread->th.th_current_task->td_taskgroup;
2670	KMP_ASSERT(tg != NULL);
2671	KMP_ASSERT(tg->reduce_data != NULL);
2672	arr = (kmp_taskred_data_t *)(tg->reduce_data);
2673	num = tg->reduce_num_data;
2674	}
2675	#endif
2676
2677	KMP_ASSERT(data != NULL);
2678	while (tg != NULL) {
2679	arr = (kmp_taskred_data_t *)(tg->reduce_data);
2680	num = tg->reduce_num_data;
2681	for (int i = `0`; i < num; ++i) {
2682	if (!arr[i].flags.lazy_priv) {
2683	if (data == arr[i].reduce_shar \|\|
2684	(data >= arr[i].reduce_priv && data < arr[i].reduce_pend))
2685	return (char )(arr[i].reduce_priv) + tid arr[i].reduce_size;
2686	} else {
2687	// check shared location first
2688	void *p_priv = (void* **)(arr[i].reduce_priv);
2689	if (data == arr[i].reduce_shar)
2690	goto found;
2691	// check if we get some thread specific location as parameter
2692	for (int j = `0`; j < nth; ++j)
2693	if (data == p_priv[j])
2694	goto found;
2695	continue; // not found, continue search
2696	found:
2697	if (p_priv[tid] == NULL) {
2698	// allocate thread specific object lazily
2699	p_priv[tid] = __kmp_allocate(arr[i].reduce_size);
2700	if (arr[i].reduce_init != NULL) {
2701	if (arr[i].reduce_orig != NULL) { // new interface
2702	((void ()(void* , void* *))arr[i].reduce_init)(
2703	p_priv[tid], arr[i].reduce_orig);
2704	} else { // old interface (single parameter)
2705	((void ()(void* *))arr[i].reduce_init)(p_priv[tid]);
2706	}
2707	}
2708	}
2709	return p_priv[tid];
2710	}
2711	}
2712	KMP_ASSERT(tg->parent);
2713	tg = tg->parent;
2714	}
2715	KMP_ASSERT2(`0`, "Unknown task reduction item");
2716	return NULL; // ERROR, this line never executed
2717	}
2718
2719	// Finalize task reduction.
2720	// Called from __kmpc_end_taskgroup()
2721	static void __kmp_task_reduction_fini(kmp_info_t th, kmp_taskgroup_t tg) {
2722	kmp_int32 nth = th->th.th_team_nproc;
2723	KMP_DEBUG_ASSERT(
2724	nth > `1` \|\|
2725	__kmp_enable_hidden_helper); // should not be called if nth == 1 unless we
2726	// are using hidden helper threads
2727	kmp_taskred_data_t arr = (kmp_taskred_data_t )tg->reduce_data;
2728	kmp_int32 num = tg->reduce_num_data;
2729	for (int i = `0`; i < num; ++i) {
2730	void *sh_data = arr[i].reduce_shar;
2731	void (f_fini)(void* ) = (void* ()(void* *))(arr[i].reduce_fini);
2732	void (f_comb)(void* , void* *) =
2733	(void ()(void* , void* *))(arr[i].reduce_comb);
2734	if (!arr[i].flags.lazy_priv) {
2735	void *pr_data = arr[i].reduce_priv;
2736	size_t size = arr[i].reduce_size;
2737	for (int j = `0`; j < nth; ++j) {
2738	void priv_data = (char* )pr_data + j size;
2739	f_comb(sh_data, priv_data); // combine results
2740	if (f_fini)
2741	f_fini(priv_data); // finalize if needed
2742	}
2743	} else {
2744	void *pr_data = (void* **)(arr[i].reduce_priv);
2745	for (int j = `0`; j < nth; ++j) {
2746	if (pr_data[j] != NULL) {
2747	f_comb(sh_data, pr_data[j]); // combine results
2748	if (f_fini)
2749	f_fini(pr_data[j]); // finalize if needed
2750	__kmp_free(pr_data[j]);
2751	}
2752	}
2753	}
2754	__kmp_free(arr[i].reduce_priv);
2755	}
2756	__kmp_thread_free(th, arr);
2757	tg->reduce_data = NULL;
2758	tg->reduce_num_data = `0`;
2759	}
2760
2761	// Cleanup task reduction data for parallel or worksharing,
2762	// do not touch task private data other threads still working with.
2763	// Called from __kmpc_end_taskgroup()
2764	static void __kmp_task_reduction_clean(kmp_info_t th, kmp_taskgroup_t tg) {
2765	__kmp_thread_free(th, tg->reduce_data);
2766	tg->reduce_data = NULL;
2767	tg->reduce_num_data = `0`;
2768	}
2769
2770	template <typename T>
2771	void __kmp_task_reduction_modifier_init(ident_t loc, int gtid, int is_ws,
2772	int num, T *data) {
2773	__kmp_assert_valid_gtid(gtid);
2774	kmp_info_t *thr = __kmp_threads[gtid];
2775	kmp_int32 nth = thr->th.th_team_nproc;
2776	__kmpc_taskgroup(loc, gtid); // form new taskgroup first
2777	if (nth == `1`) {
2778	KA_TRACE(`10`,
2779	("__kmpc_reduction_modifier_init: T#%d, tg %p, exiting nth=1\n",
2780	gtid, thr->th.th_current_task->td_taskgroup));
2781	return (void *)thr->th.th_current_task->td_taskgroup;
2782	}
2783	kmp_team_t *team = thr->th.th_team;
2784	void *reduce_data;
2785	kmp_taskgroup_t *tg;
2786	reduce_data = KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[is_ws]);
2787	if (reduce_data == NULL &&
2788	__kmp_atomic_compare_store(p: &team->t.t_tg_reduce_data[is_ws], expected: reduce_data,
2789	desired: (void *)`1`)) {
2790	// single thread enters this block to initialize common reduction data
2791	KMP_DEBUG_ASSERT(reduce_data == NULL);
2792	// first initialize own data, then make a copy other threads can use
2793	tg = (kmp_taskgroup_t *)__kmp_task_reduction_init<T>(gtid, num, data);
2794	reduce_data = __kmp_thread_malloc(thr, num * sizeof(kmp_taskred_data_t));
2795	KMP_MEMCPY(dest: reduce_data, src: tg->reduce_data, n: num * sizeof(kmp_taskred_data_t));
2796	// fini counters should be 0 at this point
2797	KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[`0`]) == `0`);
2798	KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[`1`]) == `0`);
2799	KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[is_ws], reduce_data);
2800	} else {
2801	while (
2802	(reduce_data = KMP_ATOMIC_LD_ACQ(&team->t.t_tg_reduce_data[is_ws])) ==
2803	(void )`1`) { // wait for task reduction initialization*
2804	KMP_CPU_PAUSE();
2805	}
2806	KMP_DEBUG_ASSERT(reduce_data > (void )`1`); // should be valid pointer here*
2807	tg = thr->th.th_current_task->td_taskgroup;
2808	__kmp_task_reduction_init_copy<T>(thr, num, data, tg, reduce_data);
2809	}
2810	return tg;
2811	}
2812
2813	/!*
2814	@ingroup TASKING
2815	@param loc Source location info
2816	@param gtid Global thread ID
2817	@param is_ws Is 1 if the reduction is for worksharing, 0 otherwise
2818	@param num Number of data items to reduce
2819	@param data Array of data for reduction
2820	@return The taskgroup identifier
2821
2822	Initialize task reduction for a parallel or worksharing.
2823
2824	Note: this entry supposes the optional compiler-generated initializer routine
2825	has single parameter - pointer to object to be initialized. That means
2826	the reduction either does not use omp_orig object, or the omp_orig is accessible
2827	without help of the runtime library.
2828	*/
2829	void __kmpc_task_reduction_modifier_init(ident_t loc, int gtid, int is_ws,
2830	int num, void *data) {
2831	return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2832	data: (kmp_task_red_input_t *)data);
2833	}
2834
2835	/!*
2836	@ingroup TASKING
2837	@param loc Source location info
2838	@param gtid Global thread ID
2839	@param is_ws Is 1 if the reduction is for worksharing, 0 otherwise
2840	@param num Number of data items to reduce
2841	@param data Array of data for reduction
2842	@return The taskgroup identifier
2843
2844	Initialize task reduction for a parallel or worksharing.
2845
2846	Note: this entry supposes the optional compiler-generated initializer routine
2847	has two parameters, pointer to object to be initialized and pointer to omp_orig
2848	*/
2849	void __kmpc_taskred_modifier_init(ident_t loc, int gtid, int is_ws, int num,
2850	void *data) {
2851	return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2852	data: (kmp_taskred_input_t *)data);
2853	}
2854
2855	/!*
2856	@ingroup TASKING
2857	@param loc Source location info
2858	@param gtid Global thread ID
2859	@param is_ws Is 1 if the reduction is for worksharing, 0 otherwise
2860
2861	Finalize task reduction for a parallel or worksharing.
2862	*/
2863	void __kmpc_task_reduction_modifier_fini(ident_t loc, int* gtid, int is_ws) {
2864	__kmpc_end_taskgroup(loc, gtid);
2865	}
2866
2867	// __kmpc_taskgroup: Start a new taskgroup
2868	void __kmpc_taskgroup(ident_t loc, int* gtid) {
2869	__kmp_assert_valid_gtid(gtid);
2870	kmp_info_t *thread = __kmp_threads[gtid];
2871	kmp_taskdata_t *taskdata = thread->th.th_current_task;
2872	kmp_taskgroup_t *tg_new =
2873	(kmp_taskgroup_t )__kmp_thread_malloc(thread, sizeof*(kmp_taskgroup_t));
2874	KA_TRACE(`10`, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new));
2875	KMP_ATOMIC_ST_RLX(&tg_new->count, `0`);
2876	KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq);
2877	tg_new->parent = taskdata->td_taskgroup;
2878	tg_new->reduce_data = NULL;
2879	tg_new->reduce_num_data = `0`;
2880	tg_new->gomp_data = NULL;
2881	taskdata->td_taskgroup = tg_new;
2882
2883	#if OMPT_SUPPORT && OMPT_OPTIONAL
2884	if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2885	void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2886	if (!codeptr)
2887	codeptr = OMPT_GET_RETURN_ADDRESS(`0`);
2888	kmp_team_t *team = thread->th.th_team;
2889	ompt_data_t my_task_data = taskdata->ompt_task_info.task_data;
2890	// FIXME: I think this is wrong for lwt!
2891	ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data;
2892
2893	ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2894	ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2895	&(my_task_data), codeptr);
2896	}
2897	#endif
2898	}
2899
2900	// __kmpc_end_taskgroup: Wait until all tasks generated by the current task
2901	// and its descendants are complete
2902	void __kmpc_end_taskgroup(ident_t loc, int* gtid) {
2903	__kmp_assert_valid_gtid(gtid);
2904	kmp_info_t *thread = __kmp_threads[gtid];
2905	kmp_taskdata_t *taskdata = thread->th.th_current_task;
2906	kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
2907	int thread_finished = FALSE;
2908
2909	#if OMPT_SUPPORT && OMPT_OPTIONAL
2910	kmp_team_t *team;
2911	ompt_data_t my_task_data;
2912	ompt_data_t my_parallel_data;
2913	void codeptr = nullptr*;
2914	if (UNLIKELY(ompt_enabled.enabled)) {
2915	team = thread->th.th_team;
2916	my_task_data = taskdata->ompt_task_info.task_data;
2917	// FIXME: I think this is wrong for lwt!
2918	my_parallel_data = team->t.ompt_team_info.parallel_data;
2919	codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2920	if (!codeptr)
2921	codeptr = OMPT_GET_RETURN_ADDRESS(`0`);
2922	}
2923	#endif
2924
2925	KA_TRACE(`10`, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc));
2926	KMP_DEBUG_ASSERT(taskgroup != NULL);
2927	KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);
2928
2929	if (__kmp_tasking_mode != tskm_immediate_exec) {
2930	// mark task as waiting not on a barrier
2931	taskdata->td_taskwait_counter += `1`;
2932	taskdata->td_taskwait_ident = loc;
2933	taskdata->td_taskwait_thread = gtid + `1`;
2934	#if USE_ITT_BUILD
2935	// For ITT the taskgroup wait is similar to taskwait until we need to
2936	// distinguish them
2937	void *itt_sync_obj = NULL;
2938	#if USE_ITT_NOTIFY
2939	KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2940	#endif /* USE_ITT_NOTIFY */
2941	#endif /* USE_ITT_BUILD */
2942
2943	#if OMPT_SUPPORT && OMPT_OPTIONAL
2944	if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2945	ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2946	ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2947	&(my_task_data), codeptr);
2948	}
2949	#endif
2950
2951	if (!taskdata->td_flags.team_serial \|\|
2952	(thread->th.th_task_team != NULL &&
2953	(thread->th.th_task_team->tt.tt_found_proxy_tasks \|\|
2954	thread->th.th_task_team->tt.tt_hidden_helper_task_encountered))) {
2955	kmp_flag_32<false, false> flag(
2956	RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)), `0U`);
2957	while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != `0`) {
2958	flag.execute_tasks(this_thr: thread, gtid, FALSE,
2959	thread_finished: &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2960	is_constrained: __kmp_task_stealing_constraint);
2961	}
2962	}
2963	taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting
2964
2965	#if OMPT_SUPPORT && OMPT_OPTIONAL
2966	if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2967	ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2968	ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2969	&(my_task_data), codeptr);
2970	}
2971	#endif
2972
2973	#if USE_ITT_BUILD
2974	KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2975	KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with descendants
2976	#endif /* USE_ITT_BUILD */
2977	}
2978	KMP_DEBUG_ASSERT(taskgroup->count == `0`);
2979
2980	if (taskgroup->reduce_data != NULL &&
2981	!taskgroup->gomp_data) { // need to reduce?
2982	int cnt;
2983	void *reduce_data;
2984	kmp_team_t *t = thread->th.th_team;
2985	kmp_taskred_data_t arr = (kmp_taskred_data_t )taskgroup->reduce_data;
2986	// check if <priv> data of the first reduction variable shared for the team
2987	void *priv0 = arr[`0`].reduce_priv;
2988	if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[`0`])) != NULL &&
2989	((kmp_taskred_data_t *)reduce_data)[`0`].reduce_priv == priv0) {
2990	// finishing task reduction on parallel
2991	cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[`0`]);
2992	if (cnt == thread->th.th_team_nproc - `1`) {
2993	// we are the last thread passing __kmpc_reduction_modifier_fini()
2994	// finalize task reduction:
2995	__kmp_task_reduction_fini(th: thread, tg: taskgroup);
2996	// cleanup fields in the team structure:
2997	// TODO: is relaxed store enough here (whole barrier should follow)?
2998	__kmp_thread_free(thread, reduce_data);
2999	KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[`0`], NULL);
3000	KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[`0`], `0`);
3001	} else {
3002	// we are not the last thread passing __kmpc_reduction_modifier_fini(),
3003	// so do not finalize reduction, just clean own copy of the data
3004	__kmp_task_reduction_clean(th: thread, tg: taskgroup);
3005	}
3006	} else if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[`1`])) !=
3007	NULL &&
3008	((kmp_taskred_data_t *)reduce_data)[`0`].reduce_priv == priv0) {
3009	// finishing task reduction on worksharing
3010	cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[`1`]);
3011	if (cnt == thread->th.th_team_nproc - `1`) {
3012	// we are the last thread passing __kmpc_reduction_modifier_fini()
3013	__kmp_task_reduction_fini(th: thread, tg: taskgroup);
3014	// cleanup fields in team structure:
3015	// TODO: is relaxed store enough here (whole barrier should follow)?
3016	__kmp_thread_free(thread, reduce_data);
3017	KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[`1`], NULL);
3018	KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[`1`], `0`);
3019	} else {
3020	// we are not the last thread passing __kmpc_reduction_modifier_fini(),
3021	// so do not finalize reduction, just clean own copy of the data
3022	__kmp_task_reduction_clean(th: thread, tg: taskgroup);
3023	}
3024	} else {
3025	// finishing task reduction on taskgroup
3026	__kmp_task_reduction_fini(th: thread, tg: taskgroup);
3027	}
3028	}
3029	// Restore parent taskgroup for the current task
3030	taskdata->td_taskgroup = taskgroup->parent;
3031	__kmp_thread_free(thread, taskgroup);
3032
3033	KA_TRACE(`10`, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
3034	gtid, taskdata));
3035
3036	#if OMPT_SUPPORT && OMPT_OPTIONAL
3037	if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
3038	ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
3039	ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
3040	&(my_task_data), codeptr);
3041	}
3042	#endif
3043	}
3044
3045	static kmp_task_t *__kmp_get_priority_task(kmp_int32 gtid,
3046	kmp_task_team_t *task_team,
3047	kmp_int32 is_constrained) {
3048	kmp_task_t *task = NULL;
3049	kmp_taskdata_t *taskdata;
3050	kmp_taskdata_t *current;
3051	kmp_thread_data_t *thread_data;
3052	int ntasks = task_team->tt.tt_num_task_pri;
3053	if (ntasks == `0`) {
3054	KA_TRACE(
3055	`20`, ("__kmp_get_priority_task(exit #1): T#%d No tasks to get\n", gtid));
3056	return NULL;
3057	}
3058	do {
3059	// decrement num_tasks to "reserve" one task to get for execution
3060	if (__kmp_atomic_compare_store(p: &task_team->tt.tt_num_task_pri, expected: ntasks,
3061	desired: ntasks - `1`))
3062	break;
3063	ntasks = task_team->tt.tt_num_task_pri;
3064	} while (ntasks > `0`);
3065	if (ntasks == `0`) {
3066	KA_TRACE(`20`, ("__kmp_get_priority_task(exit #2): T#%d No tasks to get\n",
3067	__kmp_get_gtid()));
3068	return NULL;
3069	}
3070	// We got a "ticket" to get a "reserved" priority task
3071	int deque_ntasks;
3072	kmp_task_pri_t *list = task_team->tt.tt_task_pri_list;
3073	do {
3074	KMP_ASSERT(list != NULL);
3075	thread_data = &list->td;
3076	__kmp_acquire_bootstrap_lock(lck: &thread_data->td.td_deque_lock);
3077	deque_ntasks = thread_data->td.td_deque_ntasks;
3078	if (deque_ntasks == `0`) {
3079	__kmp_release_bootstrap_lock(lck: &thread_data->td.td_deque_lock);
3080	KA_TRACE(`20`, ("__kmp_get_priority_task: T#%d No tasks to get from %p\n",
3081	__kmp_get_gtid(), thread_data));
3082	list = list->next;
3083	}
3084	} while (deque_ntasks == `0`);
3085	KMP_DEBUG_ASSERT(deque_ntasks);
3086	int target = thread_data->td.td_deque_head;
3087	current = __kmp_threads[gtid]->th.th_current_task;
3088	taskdata = thread_data->td.td_deque[target];
3089	if (__kmp_task_is_allowed(gtid, is_constrained, tasknew: taskdata, taskcurr: current)) {
3090	// Bump head pointer and Wrap.
3091	thread_data->td.td_deque_head =
3092	(target + `1`) & TASK_DEQUE_MASK(thread_data->td);
3093	} else {
3094	if (!task_team->tt.tt_untied_task_encountered) {
3095	// The TSC does not allow to steal victim task
3096	__kmp_release_bootstrap_lock(lck: &thread_data->td.td_deque_lock);
3097	KA_TRACE(`20`, ("__kmp_get_priority_task(exit #3): T#%d could not get task "
3098	"from %p: task_team=%p ntasks=%d head=%u tail=%u\n",
3099	gtid, thread_data, task_team, deque_ntasks, target,
3100	thread_data->td.td_deque_tail));
3101	task_team->tt.tt_num_task_pri ++; // atomic inc, restore value
3102	return NULL;
3103	}
3104	int i;
3105	// walk through the deque trying to steal any task
3106	taskdata = NULL;
3107	for (i = `1`; i < deque_ntasks; ++i) {
3108	target = (target + `1`) & TASK_DEQUE_MASK(thread_data->td);
3109	taskdata = thread_data->td.td_deque[target];
3110	if (__kmp_task_is_allowed(gtid, is_constrained, tasknew: taskdata, taskcurr: current)) {
3111	break; // found task to execute
3112	} else {
3113	taskdata = NULL;
3114	}
3115	}
3116	if (taskdata == NULL) {
3117	// No appropriate candidate found to execute
3118	__kmp_release_bootstrap_lock(lck: &thread_data->td.td_deque_lock);
3119	KA_TRACE(
3120	`10`, ("__kmp_get_priority_task(exit #4): T#%d could not get task from "
3121	"%p: task_team=%p ntasks=%d head=%u tail=%u\n",
3122	gtid, thread_data, task_team, deque_ntasks,
3123	thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3124	task_team->tt.tt_num_task_pri ++; // atomic inc, restore value
3125	return NULL;
3126	}
3127	int prev = target;
3128	for (i = i + `1`; i < deque_ntasks; ++i) {
3129	// shift remaining tasks in the deque left by 1
3130	target = (target + `1`) & TASK_DEQUE_MASK(thread_data->td);
3131	thread_data->td.td_deque[prev] = thread_data->td.td_deque[target];
3132	prev = target;
3133	}
3134	KMP_DEBUG_ASSERT(
3135	thread_data->td.td_deque_tail ==
3136	(kmp_uint32)((target + `1`) & TASK_DEQUE_MASK(thread_data->td)));
3137	thread_data->td.td_deque_tail = target; // tail -= 1 (wrapped))
3138	}
3139	thread_data->td.td_deque_ntasks = deque_ntasks - `1`;
3140	__kmp_release_bootstrap_lock(lck: &thread_data->td.td_deque_lock);
3141	task = KMP_TASKDATA_TO_TASK(taskdata);
3142	return task;
3143	}
3144
3145	// __kmp_remove_my_task: remove a task from my own deque
3146	static kmp_task_t __kmp_remove_my_task(kmp_info_t thread, kmp_int32 gtid,
3147	kmp_task_team_t *task_team,
3148	kmp_int32 is_constrained) {
3149	kmp_task_t *task;
3150	kmp_taskdata_t *taskdata;
3151	kmp_thread_data_t *thread_data;
3152	kmp_uint32 tail;
3153
3154	KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3155	KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data !=
3156	NULL); // Caller should check this condition
3157
3158	thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
3159
3160	KA_TRACE(`10`, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
3161	gtid, thread_data->td.td_deque_ntasks,
3162	thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3163
3164	if (TCR_4(thread_data->td.td_deque_ntasks) == `0`) {
3165	KA_TRACE(`10`,
3166	("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: "
3167	"ntasks=%d head=%u tail=%u\n",
3168	gtid, thread_data->td.td_deque_ntasks,
3169	thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3170	return NULL;
3171	}
3172
3173	__kmp_acquire_bootstrap_lock(lck: &thread_data->td.td_deque_lock);
3174
3175	if (TCR_4(thread_data->td.td_deque_ntasks) == `0`) {
3176	__kmp_release_bootstrap_lock(lck: &thread_data->td.td_deque_lock);
3177	KA_TRACE(`10`,
3178	("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
3179	"ntasks=%d head=%u tail=%u\n",
3180	gtid, thread_data->td.td_deque_ntasks,
3181	thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3182	return NULL;
3183	}
3184
3185	tail = (thread_data->td.td_deque_tail - `1`) &
3186	TASK_DEQUE_MASK(thread_data->td); // Wrap index.
3187	taskdata = thread_data->td.td_deque[tail];
3188
3189	if (!__kmp_task_is_allowed(gtid, is_constrained, tasknew: taskdata,
3190	taskcurr: thread->th.th_current_task)) {
3191	// The TSC does not allow to steal victim task
3192	__kmp_release_bootstrap_lock(lck: &thread_data->td.td_deque_lock);
3193	KA_TRACE(`10`,
3194	("__kmp_remove_my_task(exit #3): T#%d TSC blocks tail task: "
3195	"ntasks=%d head=%u tail=%u\n",
3196	gtid, thread_data->td.td_deque_ntasks,
3197	thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3198	return NULL;
3199	}
3200
3201	thread_data->td.td_deque_tail = tail;
3202	TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - `1`);
3203
3204	__kmp_release_bootstrap_lock(lck: &thread_data->td.td_deque_lock);
3205
3206	KA_TRACE(`10`, ("__kmp_remove_my_task(exit #4): T#%d task %p removed: "
3207	"ntasks=%d head=%u tail=%u\n",
3208	gtid, taskdata, thread_data->td.td_deque_ntasks,
3209	thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3210
3211	task = KMP_TASKDATA_TO_TASK(taskdata);
3212	return task;
3213	}
3214
3215	// __kmp_steal_task: remove a task from another thread's deque
3216	// Assume that calling thread has already checked existence of
3217	// task_team thread_data before calling this routine.
3218	static kmp_task_t *__kmp_steal_task(kmp_int32 victim_tid, kmp_int32 gtid,
3219	kmp_task_team_t *task_team,
3220	std::atomic<kmp_int32> *unfinished_threads,
3221	int *thread_finished,
3222	kmp_int32 is_constrained) {
3223	kmp_task_t *task;
3224	kmp_taskdata_t *taskdata;
3225	kmp_taskdata_t *current;
3226	kmp_thread_data_t victim_td, threads_data;
3227	kmp_int32 target;
3228	kmp_info_t *victim_thr;
3229
3230	KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3231
3232	threads_data = task_team->tt.tt_threads_data;
3233	KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition
3234	KMP_DEBUG_ASSERT(victim_tid >= `0`);
3235	KMP_DEBUG_ASSERT(victim_tid < task_team->tt.tt_max_threads);
3236
3237	victim_td = &threads_data[victim_tid];
3238	victim_thr = victim_td->td.td_thr;
3239	(void)victim_thr; // Use in TRACE messages which aren't always enabled.
3240
3241	KA_TRACE(`10`, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
3242	"task_team=%p ntasks=%d head=%u tail=%u\n",
3243	gtid, __kmp_gtid_from_thread(victim_thr), task_team,
3244	victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
3245	victim_td->td.td_deque_tail));
3246
3247	if (TCR_4(victim_td->td.td_deque_ntasks) == `0`) {
3248	KA_TRACE(`10`, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
3249	"task_team=%p ntasks=%d head=%u tail=%u\n",
3250	gtid, __kmp_gtid_from_thread(victim_thr), task_team,
3251	victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
3252	victim_td->td.td_deque_tail));
3253	return NULL;
3254	}
3255
3256	__kmp_acquire_bootstrap_lock(lck: &victim_td->td.td_deque_lock);
3257
3258	int ntasks = TCR_4(victim_td->td.td_deque_ntasks);
3259	// Check again after we acquire the lock
3260	if (ntasks == `0`) {
3261	__kmp_release_bootstrap_lock(lck: &victim_td->td.td_deque_lock);
3262	KA_TRACE(`10`, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
3263	"task_team=%p ntasks=%d head=%u tail=%u\n",
3264	gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3265	victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3266	return NULL;
3267	}
3268
3269	KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL);
3270	current = __kmp_threads[gtid]->th.th_current_task;
3271	taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
3272	if (__kmp_task_is_allowed(gtid, is_constrained, tasknew: taskdata, taskcurr: current)) {
3273	// Bump head pointer and Wrap.
3274	victim_td->td.td_deque_head =
3275	(victim_td->td.td_deque_head + `1`) & TASK_DEQUE_MASK(victim_td->td);
3276	} else {
3277	if (!task_team->tt.tt_untied_task_encountered) {
3278	// The TSC does not allow to steal victim task
3279	__kmp_release_bootstrap_lock(lck: &victim_td->td.td_deque_lock);
3280	KA_TRACE(`10`, ("__kmp_steal_task(exit #3): T#%d could not steal from "
3281	"T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
3282	gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3283	victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3284	return NULL;
3285	}
3286	int i;
3287	// walk through victim's deque trying to steal any task
3288	target = victim_td->td.td_deque_head;
3289	taskdata = NULL;
3290	for (i = `1`; i < ntasks; ++i) {
3291	target = (target + `1`) & TASK_DEQUE_MASK(victim_td->td);
3292	taskdata = victim_td->td.td_deque[target];
3293	if (__kmp_task_is_allowed(gtid, is_constrained, tasknew: taskdata, taskcurr: current)) {
3294	break; // found victim task
3295	} else {
3296	taskdata = NULL;
3297	}
3298	}
3299	if (taskdata == NULL) {
3300	// No appropriate candidate to steal found
3301	__kmp_release_bootstrap_lock(lck: &victim_td->td.td_deque_lock);
3302	KA_TRACE(`10`, ("__kmp_steal_task(exit #4): T#%d could not steal from "
3303	"T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
3304	gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3305	victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3306	return NULL;
3307	}
3308	int prev = target;
3309	for (i = i + `1`; i < ntasks; ++i) {
3310	// shift remaining tasks in the deque left by 1
3311	target = (target + `1`) & TASK_DEQUE_MASK(victim_td->td);
3312	victim_td->td.td_deque[prev] = victim_td->td.td_deque[target];
3313	prev = target;
3314	}
3315	KMP_DEBUG_ASSERT(
3316	victim_td->td.td_deque_tail ==
3317	(kmp_uint32)((target + `1`) & TASK_DEQUE_MASK(victim_td->td)));
3318	victim_td->td.td_deque_tail = target; // tail -= 1 (wrapped))
3319	}
3320	if (*thread_finished) {
3321	// We need to un-mark this victim as a finished victim. This must be done
3322	// before releasing the lock, or else other threads (starting with the
3323	// primary thread victim) might be prematurely released from the barrier!!!
3324	#if KMP_DEBUG
3325	kmp_int32 count =
3326	#endif
3327	KMP_ATOMIC_INC(unfinished_threads);
3328	KA_TRACE(
3329	`20`,
3330	("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
3331	gtid, count + `1`, task_team));
3332	*thread_finished = FALSE;
3333	}
3334	TCW_4(victim_td->td.td_deque_ntasks, ntasks - `1`);
3335
3336	__kmp_release_bootstrap_lock(lck: &victim_td->td.td_deque_lock);
3337
3338	KMP_COUNT_BLOCK(TASK_stolen);
3339	KA_TRACE(`10`,
3340	("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: "
3341	"task_team=%p ntasks=%d head=%u tail=%u\n",
3342	gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team,
3343	ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3344
3345	task = KMP_TASKDATA_TO_TASK(taskdata);
3346	return task;
3347	}
3348
3349	// __kmp_execute_tasks_template: Choose and execute tasks until either the
3350	// condition is statisfied (return true) or there are none left (return false).
3351	//
3352	// final_spin is TRUE if this is the spin at the release barrier.
3353	// thread_finished indicates whether the thread is finished executing all
3354	// the tasks it has on its deque, and is at the release barrier.
3355	// spinner is the location on which to spin.
3356	// spinner == NULL means only execute a single task and return.
3357	// checker is the value to check to terminate the spin.
3358	template <class C>
3359	static inline int __kmp_execute_tasks_template(
3360	kmp_info_t thread, kmp_int32 gtid, C flag, int final_spin,
3361	int thread_finished USE_ITT_BUILD_ARG(void* *itt_sync_obj),
3362	kmp_int32 is_constrained) {
3363	kmp_task_team_t *task_team = thread->th.th_task_team;
3364	kmp_thread_data_t *threads_data;
3365	kmp_task_t *task;
3366	kmp_info_t *other_thread;
3367	kmp_taskdata_t *current_task = thread->th.th_current_task;
3368	std::atomic<kmp_int32> *unfinished_threads;
3369	kmp_int32 nthreads, victim_tid = -`2`, use_own_tasks = `1`, new_victim = `0`,
3370	tid = thread->th.th_info.ds.ds_tid;
3371
3372	KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3373	KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]);
3374
3375	if (task_team == NULL \|\| current_task == NULL)
3376	return FALSE;
3377
3378	KA_TRACE(`15`, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d "
3379	"*thread_finished=%d\n",
3380	gtid, final_spin, *thread_finished));
3381
3382	thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
3383	threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3384
3385	KMP_DEBUG_ASSERT(threads_data != NULL);
3386
3387	nthreads = task_team->tt.tt_nproc;
3388	unfinished_threads = &(task_team->tt.tt_unfinished_threads);
3389	KMP_DEBUG_ASSERT(*unfinished_threads >= `0`);
3390
3391	while (`1`) { // Outer loop keeps trying to find tasks in case of single thread
3392	// getting tasks from target constructs
3393	while (`1`) { // Inner loop to find a task and execute it
3394	task = NULL;
3395	if (task_team->tt.tt_num_task_pri) { // get priority task first
3396	task = __kmp_get_priority_task(gtid, task_team, is_constrained);
3397	}
3398	if (task == NULL && use_own_tasks) { // check own queue next
3399	task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained);
3400	}
3401	if ((task == NULL) && (nthreads > `1`)) { // Steal a task finally
3402	int asleep = `1`;
3403	use_own_tasks = `0`;
3404	// Try to steal from the last place I stole from successfully.
3405	if (victim_tid == -`2`) { // haven't stolen anything yet
3406	victim_tid = threads_data[tid].td.td_deque_last_stolen;
3407	if (victim_tid !=
3408	-`1`) // if we have a last stolen from victim, get the thread
3409	other_thread = threads_data[victim_tid].td.td_thr;
3410	}
3411	if (victim_tid != -`1`) { // found last victim
3412	asleep = `0`;
3413	} else if (!new_victim) { // no recent steals and we haven't already
3414	// used a new victim; select a random thread
3415	do { // Find a different thread to steal work from.
3416	// Pick a random thread. Initial plan was to cycle through all the
3417	// threads, and only return if we tried to steal from every thread,
3418	// and failed. Arch says that's not such a great idea.
3419	victim_tid = __kmp_get_random(thread) % (nthreads - `1`);
3420	if (victim_tid >= tid) {
3421	++victim_tid; // Adjusts random distribution to exclude self
3422	}
3423	// Found a potential victim
3424	other_thread = threads_data[victim_tid].td.td_thr;
3425	// There is a slight chance that __kmp_enable_tasking() did not wake
3426	// up all threads waiting at the barrier. If victim is sleeping,
3427	// then wake it up. Since we were going to pay the cache miss
3428	// penalty for referencing another thread's kmp_info_t struct
3429	// anyway,
3430	// the check shouldn't cost too much performance at this point. In
3431	// extra barrier mode, tasks do not sleep at the separate tasking
3432	// barrier, so this isn't a problem.
3433	asleep = `0`;
3434	if ((__kmp_tasking_mode == tskm_task_teams) &&
3435	(__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
3436	(TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) !=
3437	NULL)) {
3438	asleep = `1`;
3439	__kmp_null_resume_wrapper(thr: other_thread);
3440	// A sleeping thread should not have any tasks on it's queue.
3441	// There is a slight possibility that it resumes, steals a task
3442	// from another thread, which spawns more tasks, all in the time
3443	// that it takes this thread to check => don't write an assertion
3444	// that the victim's queue is empty. Try stealing from a
3445	// different thread.
3446	}
3447	} while (asleep);
3448	}
3449
3450	if (!asleep) {
3451	// We have a victim to try to steal from
3452	task =
3453	__kmp_steal_task(victim_tid, gtid, task_team, unfinished_threads,
3454	thread_finished, is_constrained);
3455	}
3456	if (task != NULL) { // set last stolen to victim
3457	if (threads_data[tid].td.td_deque_last_stolen != victim_tid) {
3458	threads_data[tid].td.td_deque_last_stolen = victim_tid;
3459	// The pre-refactored code did not try more than 1 successful new
3460	// vicitm, unless the last one generated more local tasks;
3461	// new_victim keeps track of this
3462	new_victim = `1`;
3463	}
3464	} else { // No tasks found; unset last_stolen
3465	KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -`1`);
3466	victim_tid = -`2`; // no successful victim found
3467	}
3468	}
3469
3470	if (task == NULL)
3471	break; // break out of tasking loop
3472
3473	// Found a task; execute it
3474	#if USE_ITT_BUILD && USE_ITT_NOTIFY
3475	if (__itt_sync_create_ptr \|\| KMP_ITT_DEBUG) {
3476	if (itt_sync_obj == NULL) { // we are at fork barrier where we could not
3477	// get the object reliably
3478	itt_sync_obj = __kmp_itt_barrier_object(gtid, bt: bs_forkjoin_barrier);
3479	}
3480	__kmp_itt_task_starting(object: itt_sync_obj);
3481	}
3482	#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
3483	__kmp_invoke_task(gtid, task, current_task);
3484	#if USE_ITT_BUILD
3485	if (itt_sync_obj != NULL)
3486	__kmp_itt_task_finished(object: itt_sync_obj);
3487	#endif /* USE_ITT_BUILD */
3488	// If this thread is only partway through the barrier and the condition is
3489	// met, then return now, so that the barrier gather/release pattern can
3490	// proceed. If this thread is in the last spin loop in the barrier,
3491	// waiting to be released, we know that the termination condition will not
3492	// be satisfied, so don't waste any cycles checking it.
3493	if (flag == NULL \|\| (!final_spin && flag->done_check())) {
3494	KA_TRACE(
3495	`15`,
3496	("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3497	gtid));
3498	return TRUE;
3499	}
3500	if (thread->th.th_task_team == NULL) {
3501	break;
3502	}
3503	KMP_YIELD(__kmp_library == library_throughput); // Yield before next task
3504	// If execution of a stolen task results in more tasks being placed on our
3505	// run queue, reset use_own_tasks
3506	if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != `0`) {
3507	KA_TRACE(`20`, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
3508	"other tasks, restart\n",
3509	gtid));
3510	use_own_tasks = `1`;
3511	new_victim = `0`;
3512	}
3513	}
3514
3515	// The task source has been exhausted. If in final spin loop of barrier,
3516	// check if termination condition is satisfied. The work queue may be empty
3517	// but there might be proxy tasks still executing.
3518	if (final_spin &&
3519	KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks) == `0`) {
3520	// First, decrement the #unfinished threads, if that has not already been
3521	// done. This decrement might be to the spin location, and result in the
3522	// termination condition being satisfied.
3523	if (!*thread_finished) {
3524	#if KMP_DEBUG
3525	kmp_int32 count = -`1` +
3526	#endif
3527	KMP_ATOMIC_DEC(unfinished_threads);
3528	KA_TRACE(`20`, ("__kmp_execute_tasks_template: T#%d dec "
3529	"unfinished_threads to %d task_team=%p\n",
3530	gtid, count, task_team));
3531	*thread_finished = TRUE;
3532	}
3533
3534	// It is now unsafe to reference thread->th.th_team !!!
3535	// Decrementing task_team->tt.tt_unfinished_threads can allow the primary
3536	// thread to pass through the barrier, where it might reset each thread's
3537	// th.th_team field for the next parallel region. If we can steal more
3538	// work, we know that this has not happened yet.
3539	if (flag != NULL && flag->done_check()) {
3540	KA_TRACE(
3541	`15`,
3542	("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3543	gtid));
3544	return TRUE;
3545	}
3546	}
3547
3548	// If this thread's task team is NULL, primary thread has recognized that
3549	// there are no more tasks; bail out
3550	if (thread->th.th_task_team == NULL) {
3551	KA_TRACE(`15`,
3552	("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
3553	return FALSE;
3554	}
3555
3556	// Check the flag again to see if it has already done in case to be trapped
3557	// into infinite loop when a if0 task depends on a hidden helper task
3558	// outside any parallel region. Detached tasks are not impacted in this case
3559	// because the only thread executing this function has to execute the proxy
3560	// task so it is in another code path that has the same check.
3561	if (flag == NULL \|\| (!final_spin && flag->done_check())) {
3562	KA_TRACE(`15`,
3563	("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3564	gtid));
3565	return TRUE;
3566	}
3567
3568	// We could be getting tasks from target constructs; if this is the only
3569	// thread, keep trying to execute tasks from own queue
3570	if (nthreads == `1` &&
3571	KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks))
3572	use_own_tasks = `1`;
3573	else {
3574	KA_TRACE(`15`,
3575	("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
3576	return FALSE;
3577	}
3578	}
3579	}
3580
3581	template <bool C, bool S>
3582	int __kmp_execute_tasks_32(
3583	kmp_info_t thread, kmp_int32 gtid, kmp_flag_32<C, S> flag, int final_spin,
3584	int thread_finished USE_ITT_BUILD_ARG(void* *itt_sync_obj),
3585	kmp_int32 is_constrained) {
3586	return __kmp_execute_tasks_template(
3587	thread, gtid, flag, final_spin,
3588	thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3589	}
3590
3591	template <bool C, bool S>
3592	int __kmp_execute_tasks_64(
3593	kmp_info_t thread, kmp_int32 gtid, kmp_flag_64<C, S> flag, int final_spin,
3594	int thread_finished USE_ITT_BUILD_ARG(void* *itt_sync_obj),
3595	kmp_int32 is_constrained) {
3596	return __kmp_execute_tasks_template(
3597	thread, gtid, flag, final_spin,
3598	thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3599	}
3600
3601	template <bool C, bool S>
3602	int __kmp_atomic_execute_tasks_64(
3603	kmp_info_t thread, kmp_int32 gtid, kmp_atomic_flag_64<C, S> flag,
3604	int final_spin, int thread_finished USE_ITT_BUILD_ARG(void* *itt_sync_obj),
3605	kmp_int32 is_constrained) {
3606	return __kmp_execute_tasks_template(
3607	thread, gtid, flag, final_spin,
3608	thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3609	}
3610
3611	int __kmp_execute_tasks_oncore(
3612	kmp_info_t thread, kmp_int32 gtid, kmp_flag_oncore flag, int final_spin,
3613	int thread_finished USE_ITT_BUILD_ARG(void* *itt_sync_obj),
3614	kmp_int32 is_constrained) {
3615	return __kmp_execute_tasks_template(
3616	thread, gtid, flag, final_spin,
3617	thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3618	}
3619
3620	template int
3621	__kmp_execute_tasks_32<false, false>(kmp_info_t *, kmp_int32,
3622	kmp_flag_32<false, false> , int*,
3623	int USE_ITT_BUILD_ARG(void* *), kmp_int32);
3624
3625	template int __kmp_execute_tasks_64<false, true>(kmp_info_t *, kmp_int32,
3626	kmp_flag_64<false, true> *,
3627	int,
3628	int USE_ITT_BUILD_ARG(void* *),
3629	kmp_int32);
3630
3631	template int __kmp_execute_tasks_64<true, false>(kmp_info_t *, kmp_int32,
3632	kmp_flag_64<true, false> *,
3633	int,
3634	int USE_ITT_BUILD_ARG(void* *),
3635	kmp_int32);
3636
3637	template int __kmp_atomic_execute_tasks_64<false, true>(
3638	kmp_info_t , kmp_int32, kmp_atomic_flag_64<false, true> , int,
3639	int USE_ITT_BUILD_ARG(void* *), kmp_int32);
3640
3641	template int __kmp_atomic_execute_tasks_64<true, false>(
3642	kmp_info_t , kmp_int32, kmp_atomic_flag_64<true, false> , int,
3643	int USE_ITT_BUILD_ARG(void* *), kmp_int32);
3644
3645	// __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
3646	// next barrier so they can assist in executing enqueued tasks.
3647	// First thread in allocates the task team atomically.
3648	static void __kmp_enable_tasking(kmp_task_team_t *task_team,
3649	kmp_info_t *this_thr) {
3650	kmp_thread_data_t *threads_data;
3651	int nthreads, i, is_init_thread;
3652
3653	KA_TRACE(`10`, ("__kmp_enable_tasking(enter): T#%d\n",
3654	__kmp_gtid_from_thread(this_thr)));
3655
3656	KMP_DEBUG_ASSERT(task_team != NULL);
3657	KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);
3658
3659	nthreads = task_team->tt.tt_nproc;
3660	KMP_DEBUG_ASSERT(nthreads > `0`);
3661	KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);
3662
3663	// Allocate or increase the size of threads_data if necessary
3664	is_init_thread = __kmp_realloc_task_threads_data(thread: this_thr, task_team);
3665
3666	if (!is_init_thread) {
3667	// Some other thread already set up the array.
3668	KA_TRACE(
3669	`20`,
3670	("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
3671	__kmp_gtid_from_thread(this_thr)));
3672	return;
3673	}
3674	threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3675	KMP_DEBUG_ASSERT(threads_data != NULL);
3676
3677	if (__kmp_tasking_mode == tskm_task_teams &&
3678	(__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
3679	// Release any threads sleeping at the barrier, so that they can steal
3680	// tasks and execute them. In extra barrier mode, tasks do not sleep
3681	// at the separate tasking barrier, so this isn't a problem.
3682	for (i = `0`; i < nthreads; i++) {
3683	void *sleep_loc;
3684	kmp_info_t *thread = threads_data[i].td.td_thr;
3685
3686	if (i == this_thr->th.th_info.ds.ds_tid) {
3687	continue;
3688	}
3689	// Since we haven't locked the thread's suspend mutex lock at this
3690	// point, there is a small window where a thread might be putting
3691	// itself to sleep, but hasn't set the th_sleep_loc field yet.
3692	// To work around this, __kmp_execute_tasks_template() periodically checks
3693	// see if other threads are sleeping (using the same random mechanism that
3694	// is used for task stealing) and awakens them if they are.
3695	if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3696	NULL) {
3697	KF_TRACE(`50`, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
3698	__kmp_gtid_from_thread(this_thr),
3699	__kmp_gtid_from_thread(thread)));
3700	__kmp_null_resume_wrapper(thr: thread);
3701	} else {
3702	KF_TRACE(`50`, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
3703	__kmp_gtid_from_thread(this_thr),
3704	__kmp_gtid_from_thread(thread)));
3705	}
3706	}
3707	}
3708
3709	KA_TRACE(`10`, ("__kmp_enable_tasking(exit): T#%d\n",
3710	__kmp_gtid_from_thread(this_thr)));
3711	}
3712
/* // TODO: Check the comment consistency
 * Utility routines for "task teams".  A task team (kmp_task_team_t) is kind of
 * like a shadow of the kmp_team_t data struct, with a different lifetime.
 * After a child thread checks into a barrier and calls __kmp_release() from
 * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
 * longer assume that the kmp_team_t structure is intact (at any moment, the
 * primary thread may exit the barrier code and free the team data structure,
 * and return the threads to the thread pool).
 *
 * This does not work with the tasking code, as the thread is still
 * expected to participate in the execution of any tasks that may have been
 * spawned by a member of the team, and the thread still needs access to
 * each thread in the team, so that it can steal work from it.
 *
 * Enter the existence of the kmp_task_team_t struct.  It employs a reference
 * counting mechanism, and is allocated by the primary thread before calling
 * __kmp_<barrier_kind>_release, and then is released by the last thread to
 * exit __kmp_<barrier_kind>_release at the next barrier.  I.e. the lifetimes
 * of the kmp_task_team_t structs for consecutive barriers can overlap
 * (and will, unless the primary thread is the last thread to exit the barrier
 * release phase, which is not typical). The existence of such a struct is
 * useful outside the context of tasking.
 *
 * We currently use the existence of the threads array as an indicator that
 * tasks were spawned since the last barrier.  If the structure is to be
 * useful outside the context of tasking, then this will have to change, but
 * not setting the field minimizes the performance impact of tasking on
 * barriers, when no explicit tasks were spawned (pushed, actually).
 */
3742
3743	static kmp_task_team_t *__kmp_free_task_teams =
3744	NULL; // Free list for task_team data structures
3745	// Lock for task team data structures
3746	kmp_bootstrap_lock_t __kmp_task_team_lock =
3747	KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);
3748
3749	// __kmp_alloc_task_deque:
3750	// Allocates a task deque for a particular thread, and initialize the necessary
3751	// data structures relating to the deque. This only happens once per thread
3752	// per task team since task teams are recycled. No lock is needed during
3753	// allocation since each thread allocates its own deque.
3754	static void __kmp_alloc_task_deque(kmp_info_t *thread,
3755	kmp_thread_data_t *thread_data) {
3756	__kmp_init_bootstrap_lock(lck: &thread_data->td.td_deque_lock);
3757	KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);
3758
3759	// Initialize last stolen task field to "none"
3760	thread_data->td.td_deque_last_stolen = -`1`;
3761
3762	KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == `0`);
3763	KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == `0`);
3764	KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == `0`);
3765
3766	KE_TRACE(
3767	`10`,
3768	("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
3769	__kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
3770	// Allocate space for task deque, and zero the deque
3771	// Cannot use __kmp_thread_calloc() because threads not around for
3772	// kmp_reap_task_team( ).
3773	thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
3774	INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
3775	thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
3776	}
3777
3778	// __kmp_free_task_deque:
3779	// Deallocates a task deque for a particular thread. Happens at library
3780	// deallocation so don't need to reset all thread data fields.
3781	static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) {
3782	if (thread_data->td.td_deque != NULL) {
3783	__kmp_acquire_bootstrap_lock(lck: &thread_data->td.td_deque_lock);
3784	TCW_4(thread_data->td.td_deque_ntasks, `0`);
3785	__kmp_free(thread_data->td.td_deque);
3786	thread_data->td.td_deque = NULL;
3787	__kmp_release_bootstrap_lock(lck: &thread_data->td.td_deque_lock);
3788	}
3789
3790	#ifdef BUILD_TIED_TASK_STACK
3791	// GEH: Figure out what to do here for td_susp_tied_tasks
3792	if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) {
3793	__kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data);
3794	}
3795	#endif // BUILD_TIED_TASK_STACK
3796	}
3797
3798	// __kmp_realloc_task_threads_data:
3799	// Allocates a threads_data array for a task team, either by allocating an
3800	// initial array or enlarging an existing array. Only the first thread to get
3801	// the lock allocs or enlarges the array and re-initializes the array elements.
3802	// That thread returns "TRUE", the rest return "FALSE".
3803	// Assumes that the new array size is given by task_team -> tt.tt_nproc.
3804	// The current size is given by task_team -> tt.tt_max_threads.
3805	static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
3806	kmp_task_team_t *task_team) {
3807	kmp_thread_data_t **threads_data_p;
3808	kmp_int32 nthreads, maxthreads;
3809	int is_init_thread = FALSE;
3810
3811	if (TCR_4(task_team->tt.tt_found_tasks)) {
3812	// Already reallocated and initialized.
3813	return FALSE;
3814	}
3815
3816	threads_data_p = &task_team->tt.tt_threads_data;
3817	nthreads = task_team->tt.tt_nproc;
3818	maxthreads = task_team->tt.tt_max_threads;
3819
3820	// All threads must lock when they encounter the first task of the implicit
3821	// task region to make sure threads_data fields are (re)initialized before
3822	// used.
3823	__kmp_acquire_bootstrap_lock(lck: &task_team->tt.tt_threads_lock);
3824
3825	if (!TCR_4(task_team->tt.tt_found_tasks)) {
3826	// first thread to enable tasking
3827	kmp_team_t *team = thread->th.th_team;
3828	int i;
3829
3830	is_init_thread = TRUE;
3831	if (maxthreads < nthreads) {
3832
3833	if (*threads_data_p != NULL) {
3834	kmp_thread_data_t old_data = threads_data_p;
3835	kmp_thread_data_t *new_data = NULL;
3836
3837	KE_TRACE(
3838	`10`,
3839	("__kmp_realloc_task_threads_data: T#%d reallocating "
3840	"threads data for task_team %p, new_size = %d, old_size = %d\n",
3841	__kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads));
3842	// Reallocate threads_data to have more elements than current array
3843	// Cannot use __kmp_thread_realloc() because threads not around for
3844	// kmp_reap_task_team( ). Note all new array entries are initialized
3845	// to zero by __kmp_allocate().
3846	new_data = (kmp_thread_data_t *)__kmp_allocate(
3847	nthreads * sizeof(kmp_thread_data_t));
3848	// copy old data to new data
3849	KMP_MEMCPY_S((void )new_data, nthreads sizeof(kmp_thread_data_t),
3850	(void )old_data, maxthreads sizeof(kmp_thread_data_t));
3851
3852	#ifdef BUILD_TIED_TASK_STACK
3853	// GEH: Figure out if this is the right thing to do
3854	for (i = maxthreads; i < nthreads; i++) {
3855	kmp_thread_data_t thread_data = &(threads_data_p)[i];
3856	__kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
3857	}
3858	#endif // BUILD_TIED_TASK_STACK
3859	// Install the new data and free the old data
3860	(*threads_data_p) = new_data;
3861	__kmp_free(old_data);
3862	} else {
3863	KE_TRACE(`10`, ("__kmp_realloc_task_threads_data: T#%d allocating "
3864	"threads data for task_team %p, size = %d\n",
3865	__kmp_gtid_from_thread(thread), task_team, nthreads));
3866	// Make the initial allocate for threads_data array, and zero entries
3867	// Cannot use __kmp_thread_calloc() because threads not around for
3868	// kmp_reap_task_team( ).
3869	threads_data_p = (kmp_thread_data_t )__kmp_allocate(
3870	nthreads * sizeof(kmp_thread_data_t));
3871	#ifdef BUILD_TIED_TASK_STACK
3872	// GEH: Figure out if this is the right thing to do
3873	for (i = `0`; i < nthreads; i++) {
3874	kmp_thread_data_t thread_data = &(threads_data_p)[i];
3875	__kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
3876	}
3877	#endif // BUILD_TIED_TASK_STACK
3878	}
3879	task_team->tt.tt_max_threads = nthreads;
3880	} else {
3881	// If array has (more than) enough elements, go ahead and use it
3882	KMP_DEBUG_ASSERT(*threads_data_p != NULL);
3883	}
3884
3885	// initialize threads_data pointers back to thread_info structures
3886	for (i = `0`; i < nthreads; i++) {
3887	kmp_thread_data_t thread_data = &(threads_data_p)[i];
3888	thread_data->td.td_thr = team->t.t_threads[i];
3889
3890	if (thread_data->td.td_deque_last_stolen >= nthreads) {
3891	// The last stolen field survives across teams / barrier, and the number
3892	// of threads may have changed. It's possible (likely?) that a new
3893	// parallel region will exhibit the same behavior as previous region.
3894	thread_data->td.td_deque_last_stolen = -`1`;
3895	}
3896	}
3897
3898	KMP_MB();
3899	TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE);
3900	}
3901
3902	__kmp_release_bootstrap_lock(lck: &task_team->tt.tt_threads_lock);
3903	return is_init_thread;
3904	}
3905
3906	// __kmp_free_task_threads_data:
3907	// Deallocates a threads_data array for a task team, including any attached
3908	// tasking deques. Only occurs at library shutdown.
3909	static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) {
3910	__kmp_acquire_bootstrap_lock(lck: &task_team->tt.tt_threads_lock);
3911	if (task_team->tt.tt_threads_data != NULL) {
3912	int i;
3913	for (i = `0`; i < task_team->tt.tt_max_threads; i++) {
3914	__kmp_free_task_deque(thread_data: &task_team->tt.tt_threads_data[i]);
3915	}
3916	__kmp_free(task_team->tt.tt_threads_data);
3917	task_team->tt.tt_threads_data = NULL;
3918	}
3919	__kmp_release_bootstrap_lock(lck: &task_team->tt.tt_threads_lock);
3920	}
3921
3922	// __kmp_free_task_pri_list:
3923	// Deallocates tasking deques used for priority tasks.
3924	// Only occurs at library shutdown.
3925	static void __kmp_free_task_pri_list(kmp_task_team_t *task_team) {
3926	__kmp_acquire_bootstrap_lock(lck: &task_team->tt.tt_task_pri_lock);
3927	if (task_team->tt.tt_task_pri_list != NULL) {
3928	kmp_task_pri_t *list = task_team->tt.tt_task_pri_list;
3929	while (list != NULL) {
3930	kmp_task_pri_t *next = list->next;
3931	__kmp_free_task_deque(thread_data: &list->td);
3932	__kmp_free(list);
3933	list = next;
3934	}
3935	task_team->tt.tt_task_pri_list = NULL;
3936	}
3937	__kmp_release_bootstrap_lock(lck: &task_team->tt.tt_task_pri_lock);
3938	}
3939
3940	static inline void __kmp_task_team_init(kmp_task_team_t *task_team,
3941	kmp_team_t *team) {
3942	int team_nth = team->t.t_nproc;
3943	// Only need to init if task team is isn't active or team size changed
3944	if (!task_team->tt.tt_active \|\| team_nth != task_team->tt.tt_nproc) {
3945	TCW_4(task_team->tt.tt_found_tasks, FALSE);
3946	TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3947	TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
3948	TCW_4(task_team->tt.tt_nproc, team_nth);
3949	KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, team_nth);
3950	TCW_4(task_team->tt.tt_active, TRUE);
3951	}
3952	}
3953
3954	// __kmp_allocate_task_team:
3955	// Allocates a task team associated with a specific team, taking it from
3956	// the global task team free list if possible. Also initializes data
3957	// structures.
3958	static kmp_task_team_t __kmp_allocate_task_team(kmp_info_t thread,
3959	kmp_team_t *team) {
3960	kmp_task_team_t *task_team = NULL;
3961
3962	KA_TRACE(`20`, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
3963	(thread ? __kmp_gtid_from_thread(thread) : -`1`), team));
3964
3965	if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3966	// Take a task team from the task team pool
3967	__kmp_acquire_bootstrap_lock(lck: &__kmp_task_team_lock);
3968	if (__kmp_free_task_teams != NULL) {
3969	task_team = __kmp_free_task_teams;
3970	TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next);
3971	task_team->tt.tt_next = NULL;
3972	}
3973	__kmp_release_bootstrap_lock(lck: &__kmp_task_team_lock);
3974	}
3975
3976	if (task_team == NULL) {
3977	KE_TRACE(`10`, ("__kmp_allocate_task_team: T#%d allocating "
3978	"task team for team %p\n",
3979	__kmp_gtid_from_thread(thread), team));
3980	// Allocate a new task team if one is not available. Cannot use
3981	// __kmp_thread_malloc because threads not around for kmp_reap_task_team.
3982	task_team = (kmp_task_team_t )__kmp_allocate(sizeof*(kmp_task_team_t));
3983	__kmp_init_bootstrap_lock(lck: &task_team->tt.tt_threads_lock);
3984	__kmp_init_bootstrap_lock(lck: &task_team->tt.tt_task_pri_lock);
3985	#if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
3986	// suppress race conditions detection on synchronization flags in debug mode
3987	// this helps to analyze library internals eliminating false positives
3988	__itt_suppress_mark_range(
3989	__itt_suppress_range, __itt_suppress_threading_errors,
3990	&task_team->tt.tt_found_tasks, sizeof(task_team->tt.tt_found_tasks));
3991	__itt_suppress_mark_range(__itt_suppress_range,
3992	__itt_suppress_threading_errors,
3993	CCAST(kmp_uint32 *, &task_team->tt.tt_active),
3994	sizeof(task_team->tt.tt_active));
3995	#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
3996	// Note: __kmp_allocate zeroes returned memory, othewise we would need:
3997	// task_team->tt.tt_threads_data = NULL;
3998	// task_team->tt.tt_max_threads = 0;
3999	// task_team->tt.tt_next = NULL;
4000	}
4001
4002	__kmp_task_team_init(task_team, team);
4003
4004	KA_TRACE(`20`, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
4005	"unfinished_threads init'd to %d\n",
4006	(thread ? __kmp_gtid_from_thread(thread) : -`1`), task_team,
4007	KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads)));
4008	return task_team;
4009	}
4010
4011	// __kmp_free_task_team:
4012	// Frees the task team associated with a specific thread, and adds it
4013	// to the global task team free list.
4014	void __kmp_free_task_team(kmp_info_t thread, kmp_task_team_t task_team) {
4015	KA_TRACE(`20`, ("__kmp_free_task_team: T#%d task_team = %p\n",
4016	thread ? __kmp_gtid_from_thread(thread) : -`1`, task_team));
4017
4018	// Put task team back on free list
4019	__kmp_acquire_bootstrap_lock(lck: &__kmp_task_team_lock);
4020
4021	KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL);
4022	task_team->tt.tt_next = __kmp_free_task_teams;
4023	TCW_PTR(__kmp_free_task_teams, task_team);
4024
4025	__kmp_release_bootstrap_lock(lck: &__kmp_task_team_lock);
4026	}
4027
4028	// __kmp_reap_task_teams:
4029	// Free all the task teams on the task team free list.
4030	// Should only be done during library shutdown.
4031	// Cannot do anything that needs a thread structure or gtid since they are
4032	// already gone.
4033	void __kmp_reap_task_teams(void) {
4034	kmp_task_team_t *task_team;
4035
4036	if (TCR_PTR(__kmp_free_task_teams) != NULL) {
4037	// Free all task_teams on the free list
4038	__kmp_acquire_bootstrap_lock(lck: &__kmp_task_team_lock);
4039	while ((task_team = __kmp_free_task_teams) != NULL) {
4040	__kmp_free_task_teams = task_team->tt.tt_next;
4041	task_team->tt.tt_next = NULL;
4042
4043	// Free threads_data if necessary
4044	if (task_team->tt.tt_threads_data != NULL) {
4045	__kmp_free_task_threads_data(task_team);
4046	}
4047	if (task_team->tt.tt_task_pri_list != NULL) {
4048	__kmp_free_task_pri_list(task_team);
4049	}
4050	__kmp_free(task_team);
4051	}
4052	__kmp_release_bootstrap_lock(lck: &__kmp_task_team_lock);
4053	}
4054	}
4055
4056	// View the array of two task team pointers as a pair of pointers:
4057	// 1) a single task_team pointer
4058	// 2) next pointer for stack
4059	// Serial teams can create a stack of task teams for nested serial teams.
4060	void __kmp_push_task_team_node(kmp_info_t thread, kmp_team_t team) {
4061	KMP_DEBUG_ASSERT(team->t.t_nproc == `1`);
4062	kmp_task_team_list_t *current =
4063	(kmp_task_team_list_t *)(&team->t.t_task_team[`0`]);
4064	kmp_task_team_list_t *node =
4065	(kmp_task_team_list_t )__kmp_allocate(sizeof*(kmp_task_team_list_t));
4066	node->task_team = current->task_team;
4067	node->next = current->next;
4068	thread->th.th_task_team = current->task_team = NULL;
4069	current->next = node;
4070	}
4071
4072	// Serial team pops a task team off the stack
4073	void __kmp_pop_task_team_node(kmp_info_t thread, kmp_team_t team) {
4074	KMP_DEBUG_ASSERT(team->t.t_nproc == `1`);
4075	kmp_task_team_list_t *current =
4076	(kmp_task_team_list_t *)(&team->t.t_task_team[`0`]);
4077	if (current->task_team) {
4078	__kmp_free_task_team(thread, task_team: current->task_team);
4079	}
4080	kmp_task_team_list_t *next = current->next;
4081	if (next) {
4082	current->task_team = next->task_team;
4083	current->next = next->next;
4084	KMP_DEBUG_ASSERT(next != current);
4085	__kmp_free(next);
4086	thread->th.th_task_team = current->task_team;
4087	}
4088	}
4089
4090	// __kmp_wait_to_unref_task_teams:
4091	// Some threads could still be in the fork barrier release code, possibly
4092	// trying to steal tasks. Wait for each thread to unreference its task team.
4093	void __kmp_wait_to_unref_task_teams(void) {
4094	kmp_info_t *thread;
4095	kmp_uint32 spins;
4096	kmp_uint64 time;
4097	int done;
4098
4099	KMP_INIT_YIELD(spins);
4100	KMP_INIT_BACKOFF(time);
4101
4102	for (;;) {
4103	done = TRUE;
4104
4105	// TODO: GEH - this may be is wrong because some sync would be necessary
4106	// in case threads are added to the pool during the traversal. Need to
4107	// verify that lock for thread pool is held when calling this routine.
4108	for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL;
4109	thread = thread->th.th_next_pool) {
4110	#if KMP_OS_WINDOWS
4111	DWORD exit_val;
4112	#endif
4113	if (TCR_PTR(thread->th.th_task_team) == NULL) {
4114	KA_TRACE(`10`, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
4115	__kmp_gtid_from_thread(thread)));
4116	continue;
4117	}
4118	#if KMP_OS_WINDOWS
4119	// TODO: GEH - add this check for Linux OS / OS X* as well?*
4120	if (!__kmp_is_thread_alive(thread, &exit_val)) {
4121	thread->th.th_task_team = NULL;
4122	continue;
4123	}
4124	#endif
4125
4126	done = FALSE; // Because th_task_team pointer is not NULL for this thread
4127
4128	KA_TRACE(`10`, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to "
4129	"unreference task_team\n",
4130	__kmp_gtid_from_thread(thread)));
4131
4132	if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
4133	void *sleep_loc;
4134	// If the thread is sleeping, awaken it.
4135	if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
4136	NULL) {
4137	KA_TRACE(
4138	`10`,
4139	("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
4140	__kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread)));
4141	__kmp_null_resume_wrapper(thr: thread);
4142	}
4143	}
4144	}
4145	if (done) {
4146	break;
4147	}
4148
4149	// If oversubscribed or have waited a bit, yield.
4150	KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
4151	}
4152	}
4153
4154	// __kmp_task_team_setup: Create a task_team for the current team, but use
4155	// an already created, unused one if it already exists.
4156	void __kmp_task_team_setup(kmp_info_t this_thr, kmp_team_t team) {
4157	KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
4158
4159	// For the serial and root teams, setup the first task team pointer to point
4160	// to task team. The other pointer is a stack of task teams from previous
4161	// serial levels.
4162	if (team == this_thr->th.th_serial_team \|\|
4163	team == this_thr->th.th_root->r.r_root_team) {
4164	KMP_DEBUG_ASSERT(team->t.t_nproc == `1`);
4165	if (team->t.t_task_team[`0`] == NULL) {
4166	team->t.t_task_team[`0`] = __kmp_allocate_task_team(thread: this_thr, team);
4167	KA_TRACE(
4168	`20`, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
4169	" for serial/root team %p\n",
4170	__kmp_gtid_from_thread(this_thr), team->t.t_task_team[`0`], team));
4171
4172	} else
4173	__kmp_task_team_init(task_team: team->t.t_task_team[`0`], team);
4174	return;
4175	}
4176
4177	// If this task_team hasn't been created yet, allocate it. It will be used in
4178	// the region after the next.
4179	// If it exists, it is the current task team and shouldn't be touched yet as
4180	// it may still be in use.
4181	if (team->t.t_task_team[this_thr->th.th_task_state] == NULL) {
4182	team->t.t_task_team[this_thr->th.th_task_state] =
4183	__kmp_allocate_task_team(thread: this_thr, team);
4184	KA_TRACE(`20`, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
4185	" for team %d at parity=%d\n",
4186	__kmp_gtid_from_thread(this_thr),
4187	team->t.t_task_team[this_thr->th.th_task_state], team->t.t_id,
4188	this_thr->th.th_task_state));
4189	}
4190
4191	// After threads exit the release, they will call sync, and then point to this
4192	// other task_team; make sure it is allocated and properly initialized. As
4193	// threads spin in the barrier release phase, they will continue to use the
4194	// previous task_team struct(above), until they receive the signal to stop
4195	// checking for tasks (they can't safely reference the kmp_team_t struct,
4196	// which could be reallocated by the primary thread).
4197	int other_team = `1` - this_thr->th.th_task_state;
4198	KMP_DEBUG_ASSERT(other_team >= `0` && other_team < `2`);
4199	if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
4200	team->t.t_task_team[other_team] = __kmp_allocate_task_team(thread: this_thr, team);
4201	KA_TRACE(`20`, ("__kmp_task_team_setup: Primary T#%d created second new "
4202	"task_team %p for team %d at parity=%d\n",
4203	__kmp_gtid_from_thread(this_thr),
4204	team->t.t_task_team[other_team], team->t.t_id, other_team));
4205	} else { // Leave the old task team struct in place for the upcoming region;
4206	// adjust as needed
4207	kmp_task_team_t *task_team = team->t.t_task_team[other_team];
4208	__kmp_task_team_init(task_team, team);
4209	// if team size has changed, the first thread to enable tasking will
4210	// realloc threads_data if necessary
4211	KA_TRACE(`20`, ("__kmp_task_team_setup: Primary T#%d reset next task_team "
4212	"%p for team %d at parity=%d\n",
4213	__kmp_gtid_from_thread(this_thr),
4214	team->t.t_task_team[other_team], team->t.t_id, other_team));
4215	}
4216
4217	// For regular thread, task enabling should be called when the task is going
4218	// to be pushed to a dequeue. However, for the hidden helper thread, we need
4219	// it ahead of time so that some operations can be performed without race
4220	// condition.
4221	if (this_thr == __kmp_hidden_helper_main_thread) {
4222	for (int i = `0`; i < `2`; ++i) {
4223	kmp_task_team_t *task_team = team->t.t_task_team[i];
4224	if (KMP_TASKING_ENABLED(task_team)) {
4225	continue;
4226	}
4227	__kmp_enable_tasking(task_team, this_thr);
4228	for (int j = `0`; j < task_team->tt.tt_nproc; ++j) {
4229	kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[j];
4230	if (thread_data->td.td_deque == NULL) {
4231	__kmp_alloc_task_deque(thread: __kmp_hidden_helper_threads[j], thread_data);
4232	}
4233	}
4234	}
4235	}
4236	}
4237
4238	// __kmp_task_team_sync: Propagation of task team data from team to threads
4239	// which happens just after the release phase of a team barrier. This may be
4240	// called by any thread. This is not called for serial or root teams.
4241	void __kmp_task_team_sync(kmp_info_t this_thr, kmp_team_t team) {
4242	KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
4243	KMP_DEBUG_ASSERT(team != this_thr->th.th_serial_team);
4244	KMP_DEBUG_ASSERT(team != this_thr->th.th_root->r.r_root_team);
4245
4246	// Toggle the th_task_state field, to switch which task_team this thread
4247	// refers to
4248	this_thr->th.th_task_state = (kmp_uint8)(`1` - this_thr->th.th_task_state);
4249
4250	// It is now safe to propagate the task team pointer from the team struct to
4251	// the current thread.
4252	TCW_PTR(this_thr->th.th_task_team,
4253	team->t.t_task_team[this_thr->th.th_task_state]);
4254	KA_TRACE(`20`,
4255	("__kmp_task_team_sync: Thread T#%d task team switched to task_team "
4256	"%p from Team #%d (parity=%d)\n",
4257	__kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team,
4258	team->t.t_id, this_thr->th.th_task_state));
4259	}
4260
4261	// __kmp_task_team_wait: Primary thread waits for outstanding tasks after the
4262	// barrier gather phase. Only called by the primary thread.
4263	//
4264	// wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
4265	// by passing in 0 optionally as the last argument. When wait is zero, primary
4266	// thread does not wait for unfinished_threads to reach 0.
4267	void __kmp_task_team_wait(
4268	kmp_info_t *this_thr,
4269	kmp_team_t team USE_ITT_BUILD_ARG(void* itt_sync_obj), int* wait) {
4270	kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];
4271
4272	KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
4273	KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team);
4274
4275	if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
4276	if (wait) {
4277	KA_TRACE(`20`, ("__kmp_task_team_wait: Primary T#%d waiting for all tasks "
4278	"(for unfinished_threads to reach 0) on task_team = %p\n",
4279	__kmp_gtid_from_thread(this_thr), task_team));
4280	// Worker threads may have dropped through to release phase, but could
4281	// still be executing tasks. Wait here for tasks to complete. To avoid
4282	// memory contention, only primary thread checks termination condition.
4283	kmp_flag_32<false, false> flag(
4284	RCAST(std::atomic<kmp_uint32> *,
4285	&task_team->tt.tt_unfinished_threads),
4286	`0U`);
4287	flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
4288	}
4289	// Deactivate the old task team, so that the worker threads will stop
4290	// referencing it while spinning.
4291	KA_TRACE(
4292	`20`,
4293	("__kmp_task_team_wait: Primary T#%d deactivating task_team %p: "
4294	"setting active to false, setting local and team's pointer to NULL\n",
4295	__kmp_gtid_from_thread(this_thr), task_team));
4296	TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
4297	TCW_SYNC_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
4298	KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, `0`);
4299	TCW_SYNC_4(task_team->tt.tt_active, FALSE);
4300	KMP_MB();
4301
4302	TCW_PTR(this_thr->th.th_task_team, NULL);
4303	}
4304	}
4305
4306	// __kmp_tasking_barrier:
4307	// This routine is called only when __kmp_tasking_mode == tskm_extra_barrier.
4308	// Internal function to execute all tasks prior to a regular barrier or a join
4309	// barrier. It is a full barrier itself, which unfortunately turns regular
4310	// barriers into double barriers and join barriers into 1 1/2 barriers.
4311	void __kmp_tasking_barrier(kmp_team_t team, kmp_info_t thread, int gtid) {
4312	std::atomic<kmp_uint32> *spin = RCAST(
4313	std::atomic<kmp_uint32> *,
4314	&team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads);
4315	int flag = FALSE;
4316	KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier);
4317
4318	#if USE_ITT_BUILD
4319	KMP_FSYNC_SPIN_INIT(spin, NULL);
4320	#endif /* USE_ITT_BUILD */
4321	kmp_flag_32<false, false> spin_flag(spin, `0U`);
4322	while (!spin_flag.execute_tasks(this_thr: thread, gtid, TRUE,
4323	thread_finished: &flag USE_ITT_BUILD_ARG(NULL), is_constrained: `0`)) {
4324	#if USE_ITT_BUILD
4325	// TODO: What about itt_sync_obj??
4326	KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin));
4327	#endif /* USE_ITT_BUILD */
4328
4329	if (TCR_4(__kmp_global.g.g_done)) {
4330	if (__kmp_global.g.g_abort)
4331	__kmp_abort_thread();
4332	break;
4333	}
4334	KMP_YIELD(TRUE);
4335	}
4336	#if USE_ITT_BUILD
4337	KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin));
4338	#endif /* USE_ITT_BUILD */
4339	}
4340
4341	// __kmp_give_task puts a task into a given thread queue if:
4342	// - the queue for that thread was created
4343	// - there's space in that queue
4344	// Because of this, __kmp_push_task needs to check if there's space after
4345	// getting the lock
4346	static bool __kmp_give_task(kmp_info_t thread, kmp_int32 tid, kmp_task_t task,
4347	kmp_int32 pass) {
4348	kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4349	kmp_task_team_t *task_team = taskdata->td_task_team;
4350
4351	KA_TRACE(`20`, ("__kmp_give_task: trying to give task %p to thread %d.\n",
4352	taskdata, tid));
4353
4354	// If task_team is NULL something went really bad...
4355	KMP_DEBUG_ASSERT(task_team != NULL);
4356
4357	bool result = false;
4358	kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
4359
4360	if (thread_data->td.td_deque == NULL) {
4361	// There's no queue in this thread, go find another one
4362	// We're guaranteed that at least one thread has a queue
4363	KA_TRACE(`30`,
4364	("__kmp_give_task: thread %d has no queue while giving task %p.\n",
4365	tid, taskdata));
4366	return result;
4367	}
4368
4369	if (TCR_4(thread_data->td.td_deque_ntasks) >=
4370	TASK_DEQUE_SIZE(thread_data->td)) {
4371	KA_TRACE(
4372	`30`,
4373	("__kmp_give_task: queue is full while giving task %p to thread %d.\n",
4374	taskdata, tid));
4375
4376	// if this deque is bigger than the pass ratio give a chance to another
4377	// thread
4378	if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
4379	return result;
4380
4381	__kmp_acquire_bootstrap_lock(lck: &thread_data->td.td_deque_lock);
4382	if (TCR_4(thread_data->td.td_deque_ntasks) >=
4383	TASK_DEQUE_SIZE(thread_data->td)) {
4384	// expand deque to push the task which is not allowed to execute
4385	__kmp_realloc_task_deque(thread, thread_data);
4386	}
4387
4388	} else {
4389
4390	__kmp_acquire_bootstrap_lock(lck: &thread_data->td.td_deque_lock);
4391
4392	if (TCR_4(thread_data->td.td_deque_ntasks) >=
4393	TASK_DEQUE_SIZE(thread_data->td)) {
4394	KA_TRACE(`30`, ("__kmp_give_task: queue is full while giving task %p to "
4395	"thread %d.\n",
4396	taskdata, tid));
4397
4398	// if this deque is bigger than the pass ratio give a chance to another
4399	// thread
4400	if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
4401	goto release_and_exit;
4402
4403	__kmp_realloc_task_deque(thread, thread_data);
4404	}
4405	}
4406
4407	// lock is held here, and there is space in the deque
4408
4409	thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
4410	// Wrap index.
4411	thread_data->td.td_deque_tail =
4412	(thread_data->td.td_deque_tail + `1`) & TASK_DEQUE_MASK(thread_data->td);
4413	TCW_4(thread_data->td.td_deque_ntasks,
4414	TCR_4(thread_data->td.td_deque_ntasks) + `1`);
4415
4416	result = true;
4417	KA_TRACE(`30`, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
4418	taskdata, tid));
4419
4420	release_and_exit:
4421	__kmp_release_bootstrap_lock(lck: &thread_data->td.td_deque_lock);
4422
4423	return result;
4424	}
4425
4426	#define PROXY_TASK_FLAG 0x40000000
4427	/ The finish of the proxy tasks is divided in two pieces:*
4428	- the top half is the one that can be done from a thread outside the team
4429	- the bottom half must be run from a thread within the team
4430
4431	In order to run the bottom half the task gets queued back into one of the
4432	threads of the team. Once the td_incomplete_child_task counter of the parent
4433	is decremented the threads can leave the barriers. So, the bottom half needs
4434	to be queued before the counter is decremented. The top half is therefore
4435	divided in two parts:
4436	- things that can be run before queuing the bottom half
4437	- things that must be run after queuing the bottom half
4438
4439	This creates a second race as the bottom half can free the task before the
4440	second top half is executed. To avoid this we use the
4441	td_incomplete_child_task of the proxy task to synchronize the top and bottom
4442	half. /*
4443	static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
4444	KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
4445	KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4446	KMP_DEBUG_ASSERT(taskdata->td_flags.complete == `0`);
4447	KMP_DEBUG_ASSERT(taskdata->td_flags.freed == `0`);
4448
4449	taskdata->td_flags.complete = `1`; // mark the task as completed
4450	#if OMPX_TASKGRAPH
4451	taskdata->td_flags.onced = `1`;
4452	#endif
4453
4454	if (taskdata->td_taskgroup)
4455	KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
4456
4457	// Create an imaginary children for this task so the bottom half cannot
4458	// release the task before we have completed the second top half
4459	KMP_ATOMIC_OR(&taskdata->td_incomplete_child_tasks, PROXY_TASK_FLAG);
4460	}
4461
4462	static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
4463	#if KMP_DEBUG
4464	kmp_int32 children = `0`;
4465	// Predecrement simulated by "- 1" calculation
4466	children = -`1` +
4467	#endif
4468	KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
4469	KMP_DEBUG_ASSERT(children >= `0`);
4470
4471	// Remove the imaginary children
4472	KMP_ATOMIC_AND(&taskdata->td_incomplete_child_tasks, ~PROXY_TASK_FLAG);
4473	}
4474
4475	static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
4476	kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4477	kmp_info_t *thread = __kmp_threads[gtid];
4478
4479	KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4480	KMP_DEBUG_ASSERT(taskdata->td_flags.complete ==
4481	`1`); // top half must run before bottom half
4482
4483	// We need to wait to make sure the top half is finished
4484	// Spinning here should be ok as this should happen quickly
4485	while ((KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) &
4486	PROXY_TASK_FLAG) > `0`)
4487	;
4488
4489	__kmp_release_deps(gtid, task: taskdata);
4490	__kmp_free_task_and_ancestors(gtid, taskdata, thread);
4491	}
4492
4493	/!*
4494	@ingroup TASKING
4495	@param gtid Global Thread ID of encountering thread
4496	@param ptask Task which execution is completed
4497
4498	Execute the completion of a proxy task from a thread of that is part of the
4499	team. Run first and bottom halves directly.
4500	*/
4501	void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) {
4502	KMP_DEBUG_ASSERT(ptask != NULL);
4503	kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4504	KA_TRACE(
4505	`10`, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n",
4506	gtid, taskdata));
4507	__kmp_assert_valid_gtid(gtid);
4508	KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4509
4510	__kmp_first_top_half_finish_proxy(taskdata);
4511	__kmp_second_top_half_finish_proxy(taskdata);
4512	__kmp_bottom_half_finish_proxy(gtid, ptask);
4513
4514	KA_TRACE(`10`,
4515	("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n",
4516	gtid, taskdata));
4517	}
4518
4519	void __kmpc_give_task(kmp_task_t *ptask, kmp_int32 start = `0`) {
4520	KMP_DEBUG_ASSERT(ptask != NULL);
4521	kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4522
4523	// Enqueue task to complete bottom half completion from a thread within the
4524	// corresponding team
4525	kmp_team_t *team = taskdata->td_team;
4526	kmp_int32 nthreads = team->t.t_nproc;
4527	kmp_info_t *thread;
4528
4529	// This should be similar to start_k = __kmp_get_random( thread ) % nthreads
4530	// but we cannot use __kmp_get_random here
4531	kmp_int32 start_k = start % nthreads;
4532	kmp_int32 pass = `1`;
4533	kmp_int32 k = start_k;
4534
4535	do {
4536	// For now we're just linearly trying to find a thread
4537	thread = team->t.t_threads[k];
4538	k = (k + `1`) % nthreads;
4539
4540	// we did a full pass through all the threads
4541	if (k == start_k)
4542	pass = pass << `1`;
4543
4544	} while (!__kmp_give_task(thread, tid: k, task: ptask, pass));
4545
4546	if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && __kmp_wpolicy_passive) {
4547	// awake at least one thread to execute given task
4548	for (int i = `0`; i < nthreads; ++i) {
4549	thread = team->t.t_threads[i];
4550	if (thread->th.th_sleep_loc != NULL) {
4551	__kmp_null_resume_wrapper(thr: thread);
4552	break;
4553	}
4554	}
4555	}
4556	}
4557
4558	/!*
4559	@ingroup TASKING
4560	@param ptask Task which execution is completed
4561
4562	Execute the completion of a proxy task from a thread that could not belong to
4563	the team.
4564	*/
4565	void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) {
4566	KMP_DEBUG_ASSERT(ptask != NULL);
4567	kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4568
4569	KA_TRACE(
4570	`10`,
4571	("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n",
4572	taskdata));
4573
4574	KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4575
4576	__kmp_first_top_half_finish_proxy(taskdata);
4577
4578	__kmpc_give_task(ptask);
4579
4580	__kmp_second_top_half_finish_proxy(taskdata);
4581
4582	KA_TRACE(
4583	`10`,
4584	("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n",
4585	taskdata));
4586	}
4587
4588	kmp_event_t __kmpc_task_allow_completion_event(ident_t loc_ref, int gtid,
4589	kmp_task_t *task) {
4590	kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task);
4591	if (td->td_allow_completion_event.type == KMP_EVENT_UNINITIALIZED) {
4592	td->td_allow_completion_event.type = KMP_EVENT_ALLOW_COMPLETION;
4593	td->td_allow_completion_event.ed.task = task;
4594	__kmp_init_tas_lock(lck: &td->td_allow_completion_event.lock);
4595	}
4596	return &td->td_allow_completion_event;
4597	}
4598
4599	void __kmp_fulfill_event(kmp_event_t *event) {
4600	if (event->type == KMP_EVENT_ALLOW_COMPLETION) {
4601	kmp_task_t *ptask = event->ed.task;
4602	kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4603	bool detached = false;
4604	int gtid = __kmp_get_gtid();
4605
4606	// The associated task might have completed or could be completing at this
4607	// point.
4608	// We need to take the lock to avoid races
4609	__kmp_acquire_tas_lock(lck: &event->lock, gtid);
4610	if (taskdata->td_flags.proxy == TASK_PROXY) {
4611	detached = true;
4612	} else {
4613	#if OMPT_SUPPORT
4614	// The OMPT event must occur under mutual exclusion,
4615	// otherwise the tool might access ptask after free
4616	if (UNLIKELY(ompt_enabled.enabled))
4617	__ompt_task_finish(task: ptask, NULL, status: ompt_task_early_fulfill);
4618	#endif
4619	}
4620	event->type = KMP_EVENT_UNINITIALIZED;
4621	__kmp_release_tas_lock(lck: &event->lock, gtid);
4622
4623	if (detached) {
4624	#if OMPT_SUPPORT
4625	// We free ptask afterwards and know the task is finished,
4626	// so locking is not necessary
4627	if (UNLIKELY(ompt_enabled.enabled))
4628	__ompt_task_finish(task: ptask, NULL, status: ompt_task_late_fulfill);
4629	#endif
4630	// If the task detached complete the proxy task
4631	if (gtid >= `0`) {
4632	kmp_team_t *team = taskdata->td_team;
4633	kmp_info_t *thread = __kmp_get_thread();
4634	if (thread->th.th_team == team) {
4635	__kmpc_proxy_task_completed(gtid, ptask);
4636	return;
4637	}
4638	}
4639
4640	// fallback
4641	__kmpc_proxy_task_completed_ooo(ptask);
4642	}
4643	}
4644	}
4645
4646	// __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task
4647	// for taskloop
4648	//
4649	// thread: allocating thread
4650	// task_src: pointer to source task to be duplicated
4651	// taskloop_recur: used only when dealing with taskgraph,
4652	// indicating whether we need to update task->td_task_id
4653	// returns: a pointer to the allocated kmp_task_t structure (task).
4654	kmp_task_t __kmp_task_dup_alloc(kmp_info_t thread, kmp_task_t *task_src
4655	#if OMPX_TASKGRAPH
4656	, int taskloop_recur
4657	#endif
4658	) {
4659	kmp_task_t *task;
4660	kmp_taskdata_t *taskdata;
4661	kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
4662	kmp_taskdata_t parent_task = taskdata_src->td_parent; // same parent task*
4663	size_t shareds_offset;
4664	size_t task_size;
4665
4666	KA_TRACE(`10`, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
4667	task_src));
4668	KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
4669	TASK_FULL); // it should not be proxy task
4670	KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);
4671	task_size = taskdata_src->td_size_alloc;
4672
4673	// Allocate a kmp_taskdata_t block and a kmp_task_t block.
4674	KA_TRACE(`30`, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread,
4675	task_size));
4676	#if USE_FAST_MEMORY
4677	taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size);
4678	#else
4679	taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size);
4680	#endif /* USE_FAST_MEMORY */
4681	KMP_MEMCPY(dest: taskdata, src: taskdata_src, n: task_size);
4682
4683	task = KMP_TASKDATA_TO_TASK(taskdata);
4684
4685	// Initialize new task (only specific fields not affected by memcpy)
4686	#if OMPX_TASKGRAPH
4687	if (taskdata->is_taskgraph && !taskloop_recur &&
4688	__kmp_tdg_is_recording(taskdata_src->tdg->tdg_status))
4689	taskdata->td_tdg_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id);
4690	#endif
4691	taskdata->td_task_id = KMP_GEN_TASK_ID();
4692	if (task->shareds != NULL) { // need setup shareds pointer
4693	shareds_offset = (char )task_src->shareds - (char* *)taskdata_src;
4694	task->shareds = &((char *)taskdata)[shareds_offset];
4695	KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - `1`)) ==
4696	`0`);
4697	}
4698	taskdata->td_alloc_thread = thread;
4699	taskdata->td_parent = parent_task;
4700	// task inherits the taskgroup from the parent task
4701	taskdata->td_taskgroup = parent_task->td_taskgroup;
4702	// tied task needs to initialize the td_last_tied at creation,
4703	// untied one does this when it is scheduled for execution
4704	if (taskdata->td_flags.tiedness == TASK_TIED)
4705	taskdata->td_last_tied = taskdata;
4706
4707	// Only need to keep track of child task counts if team parallel and tasking
4708	// not serialized
4709	if (!(taskdata->td_flags.team_serial \|\| taskdata->td_flags.tasking_ser)) {
4710	KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
4711	if (parent_task->td_taskgroup)
4712	KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
4713	// Only need to keep track of allocated child tasks for explicit tasks since
4714	// implicit not deallocated
4715	if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT)
4716	KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
4717	}
4718
4719	KA_TRACE(`20`,
4720	("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
4721	thread, taskdata, taskdata->td_parent));
4722	#if OMPT_SUPPORT
4723	if (UNLIKELY(ompt_enabled.enabled))
4724	__ompt_task_init(task: taskdata, tid: thread->th.th_info.ds.ds_gtid);
4725	#endif
4726	return task;
4727	}
4728
4729	// Routine optionally generated by the compiler for setting the lastprivate flag
4730	// and calling needed constructors for private/firstprivate objects
4731	// (used to form taskloop tasks from pattern task)
4732	// Parameters: dest task, src task, lastprivate flag.
4733	typedef void (p_task_dup_t)(kmp_task_t , kmp_task_t *, kmp_int32);
4734
4735	KMP_BUILD_ASSERT(sizeof(long) == `4` \|\| sizeof(long) == `8`);
4736
4737	// class to encapsulate manipulating loop bounds in a taskloop task.
4738	// this abstracts away the Intel vs GOMP taskloop interface for setting/getting
4739	// the loop bound variables.
4740	class kmp_taskloop_bounds_t {
4741	kmp_task_t *task;
4742	const kmp_taskdata_t *taskdata;
4743	size_t lower_offset;
4744	size_t upper_offset;
4745
4746	public:
4747	kmp_taskloop_bounds_t(kmp_task_t _task, kmp_uint64 lb, kmp_uint64 *ub)
4748	: task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)),
4749	lower_offset((char )lb - (char* *)task),
4750	upper_offset((char )ub - (char* *)task) {
4751	KMP_DEBUG_ASSERT((char )lb > (char* *)_task);
4752	KMP_DEBUG_ASSERT((char )ub > (char* *)_task);
4753	}
4754	kmp_taskloop_bounds_t(kmp_task_t _task, const* kmp_taskloop_bounds_t &bounds)
4755	: task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)),
4756	lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {}
4757	size_t get_lower_offset() const { return lower_offset; }
4758	size_t get_upper_offset() const { return upper_offset; }
4759	kmp_uint64 get_lb() const {
4760	kmp_int64 retval;
4761	#if defined(KMP_GOMP_COMPAT)
4762	// Intel task just returns the lower bound normally
4763	if (!taskdata->td_flags.native) {
4764	retval = (kmp_int64 )((char *)task + lower_offset);
4765	} else {
4766	// GOMP task has to take into account the sizeof(long)
4767	if (taskdata->td_size_loop_bounds == `4`) {
4768	kmp_int32 lb = RCAST(kmp_int32 , task->shareds);
4769	retval = (kmp_int64)*lb;
4770	} else {
4771	kmp_int64 lb = RCAST(kmp_int64 , task->shareds);
4772	retval = (kmp_int64)*lb;
4773	}
4774	}
4775	#else
4776	(void)taskdata;
4777	retval = (kmp_int64 )((char *)task + lower_offset);
4778	#endif // defined(KMP_GOMP_COMPAT)
4779	return retval;
4780	}
4781	kmp_uint64 get_ub() const {
4782	kmp_int64 retval;
4783	#if defined(KMP_GOMP_COMPAT)
4784	// Intel task just returns the upper bound normally
4785	if (!taskdata->td_flags.native) {
4786	retval = (kmp_int64 )((char *)task + upper_offset);
4787	} else {
4788	// GOMP task has to take into account the sizeof(long)
4789	if (taskdata->td_size_loop_bounds == `4`) {
4790	kmp_int32 ub = RCAST(kmp_int32 , task->shareds) + `1`;
4791	retval = (kmp_int64)*ub;
4792	} else {
4793	kmp_int64 ub = RCAST(kmp_int64 , task->shareds) + `1`;
4794	retval = (kmp_int64)*ub;
4795	}
4796	}
4797	#else
4798	retval = (kmp_int64 )((char *)task + upper_offset);
4799	#endif // defined(KMP_GOMP_COMPAT)
4800	return retval;
4801	}
4802	void set_lb(kmp_uint64 lb) {
4803	#if defined(KMP_GOMP_COMPAT)
4804	// Intel task just sets the lower bound normally
4805	if (!taskdata->td_flags.native) {
4806	(kmp_uint64 )((char *)task + lower_offset) = lb;
4807	} else {
4808	// GOMP task has to take into account the sizeof(long)
4809	if (taskdata->td_size_loop_bounds == `4`) {
4810	kmp_uint32 lower = RCAST(kmp_uint32 , task->shareds);
4811	*lower = (kmp_uint32)lb;
4812	} else {
4813	kmp_uint64 lower = RCAST(kmp_uint64 , task->shareds);
4814	*lower = (kmp_uint64)lb;
4815	}
4816	}
4817	#else
4818	(kmp_uint64 )((char *)task + lower_offset) = lb;
4819	#endif // defined(KMP_GOMP_COMPAT)
4820	}
4821	void set_ub(kmp_uint64 ub) {
4822	#if defined(KMP_GOMP_COMPAT)
4823	// Intel task just sets the upper bound normally
4824	if (!taskdata->td_flags.native) {
4825	(kmp_uint64 )((char *)task + upper_offset) = ub;
4826	} else {
4827	// GOMP task has to take into account the sizeof(long)
4828	if (taskdata->td_size_loop_bounds == `4`) {
4829	kmp_uint32 upper = RCAST(kmp_uint32 , task->shareds) + `1`;
4830	*upper = (kmp_uint32)ub;
4831	} else {
4832	kmp_uint64 upper = RCAST(kmp_uint64 , task->shareds) + `1`;
4833	*upper = (kmp_uint64)ub;
4834	}
4835	}
4836	#else
4837	(kmp_uint64 )((char *)task + upper_offset) = ub;
4838	#endif // defined(KMP_GOMP_COMPAT)
4839	}
4840	};
4841
4842	// __kmp_taskloop_linear: Start tasks of the taskloop linearly
4843	//
4844	// loc Source location information
4845	// gtid Global thread ID
4846	// task Pattern task, exposes the loop iteration range
4847	// lb Pointer to loop lower bound in task structure
4848	// ub Pointer to loop upper bound in task structure
4849	// st Loop stride
4850	// ub_glob Global upper bound (used for lastprivate check)
4851	// num_tasks Number of tasks to execute
4852	// grainsize Number of loop iterations per task
4853	// extras Number of chunks with grainsize+1 iterations
4854	// last_chunk Reduction of grainsize for last task
4855	// tc Iterations count
4856	// task_dup Tasks duplication routine
4857	// codeptr_ra Return address for OMPT events
4858	void __kmp_taskloop_linear(ident_t loc, int* gtid, kmp_task_t *task,
4859	kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
4860	kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4861	kmp_uint64 grainsize, kmp_uint64 extras,
4862	kmp_int64 last_chunk, kmp_uint64 tc,
4863	#if OMPT_SUPPORT
4864	void *codeptr_ra,
4865	#endif
4866	void *task_dup) {
4867	KMP_COUNT_BLOCK(OMP_TASKLOOP);
4868	KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
4869	p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4870	// compiler provides global bounds here
4871	kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4872	kmp_uint64 lower = task_bounds.get_lb();
4873	kmp_uint64 upper = task_bounds.get_ub();
4874	kmp_uint64 i;
4875	kmp_info_t *thread = __kmp_threads[gtid];
4876	kmp_taskdata_t *current_task = thread->th.th_current_task;
4877	kmp_task_t *next_task;
4878	kmp_int32 lastpriv = `0`;
4879
4880	KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
4881	(last_chunk < `0` ? last_chunk : extras));
4882	KMP_DEBUG_ASSERT(num_tasks > extras);
4883	KMP_DEBUG_ASSERT(num_tasks > `0`);
4884	KA_TRACE(`20`, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, "
4885	"extras %lld, last_chunk %lld, i=%lld,%lld(%d)%lld, dup %p\n",
4886	gtid, num_tasks, grainsize, extras, last_chunk, lower, upper,
4887	ub_glob, st, task_dup));
4888
4889	// Launch num_tasks tasks, assign grainsize iterations each task
4890	for (i = `0`; i < num_tasks; ++i) {
4891	kmp_uint64 chunk_minus_1;
4892	if (extras == `0`) {
4893	chunk_minus_1 = grainsize - `1`;
4894	} else {
4895	chunk_minus_1 = grainsize;
4896	--extras; // first extras iterations get bigger chunk (grainsize+1)
4897	}
4898	upper = lower + st * chunk_minus_1;
4899	if (upper > *ub) {
4900	upper = *ub;
4901	}
4902	if (i == num_tasks - `1`) {
4903	// schedule the last task, set lastprivate flag if needed
4904	if (st == `1`) { // most common case
4905	KMP_DEBUG_ASSERT(upper == *ub);
4906	if (upper == ub_glob)
4907	lastpriv = `1`;
4908	} else if (st > `0`) { // positive loop stride
4909	KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper);
4910	if ((kmp_uint64)st > ub_glob - upper)
4911	lastpriv = `1`;
4912	} else { // negative loop stride
4913	KMP_DEBUG_ASSERT(upper + st < *ub);
4914	if (upper - ub_glob < (kmp_uint64)(-st))
4915	lastpriv = `1`;
4916	}
4917	}
4918
4919	#if OMPX_TASKGRAPH
4920	next_task = __kmp_task_dup_alloc(thread, task, / taskloop_recur / `0`);
4921	#else
4922	next_task = __kmp_task_dup_alloc(thread, task_src: task); // allocate new task
4923	#endif
4924
4925	kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task);
4926	kmp_taskloop_bounds_t next_task_bounds =
4927	kmp_taskloop_bounds_t (next_task, task_bounds);
4928
4929	// adjust task-specific bounds
4930	next_task_bounds.set_lb(lower);
4931	if (next_taskdata->td_flags.native) {
4932	next_task_bounds.set_ub(upper + (st > `0` ? `1` : -`1`));
4933	} else {
4934	next_task_bounds.set_ub(upper);
4935	}
4936	if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates,
4937	// etc.
4938	ptask_dup(next_task, task, lastpriv);
4939	KA_TRACE(`40`,
4940	("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, "
4941	"upper %lld stride %lld, (offsets %p %p)\n",
4942	gtid, i, next_task, lower, upper, st,
4943	next_task_bounds.get_lower_offset(),
4944	next_task_bounds.get_upper_offset()));
4945	#if OMPT_SUPPORT
4946	__kmp_omp_taskloop_task(NULL, gtid, new_task: next_task,
4947	codeptr_ra); // schedule new task
4948	#if OMPT_OPTIONAL
4949	if (ompt_enabled.ompt_callback_dispatch) {
4950	OMPT_GET_DISPATCH_CHUNK(next_taskdata->ompt_task_info.dispatch_chunk,
4951	lower, upper, st);
4952	}
4953	#endif // OMPT_OPTIONAL
4954	#else
4955	__kmp_omp_task(gtid, next_task, true); // schedule new task
4956	#endif
4957	lower = upper + st; // adjust lower bound for the next iteration
4958	}
4959	// free the pattern task and exit
4960	__kmp_task_start(gtid, task, current_task); // make internal bookkeeping
4961	// do not execute the pattern task, just do internal bookkeeping
4962	__kmp_task_finish<false>(gtid, task, resumed_task: current_task);
4963	}
4964
4965	// Structure to keep taskloop parameters for auxiliary task
4966	// kept in the shareds of the task structure.
4967	typedef struct __taskloop_params {
4968	kmp_task_t *task;
4969	kmp_uint64 *lb;
4970	kmp_uint64 *ub;
4971	void *task_dup;
4972	kmp_int64 st;
4973	kmp_uint64 ub_glob;
4974	kmp_uint64 num_tasks;
4975	kmp_uint64 grainsize;
4976	kmp_uint64 extras;
4977	kmp_int64 last_chunk;
4978	kmp_uint64 tc;
4979	kmp_uint64 num_t_min;
4980	#if OMPT_SUPPORT
4981	void *codeptr_ra;
4982	#endif
4983	} __taskloop_params_t;
4984
4985	void __kmp_taskloop_recur(ident_t , int, kmp_task_t , kmp_uint64 *,
4986	kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64,
4987	kmp_uint64, kmp_uint64, kmp_int64, kmp_uint64,
4988	kmp_uint64,
4989	#if OMPT_SUPPORT
4990	void *,
4991	#endif
4992	void *);
4993
4994	// Execute part of the taskloop submitted as a task.
4995	int __kmp_taskloop_task(int gtid, void *ptask) {
4996	__taskloop_params_t *p =
4997	(__taskloop_params_t )((kmp_task_t )ptask)->shareds;
4998	kmp_task_t *task = p->task;
4999	kmp_uint64 *lb = p->lb;
5000	kmp_uint64 *ub = p->ub;
5001	void *task_dup = p->task_dup;
5002	// p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
5003	kmp_int64 st = p->st;
5004	kmp_uint64 ub_glob = p->ub_glob;
5005	kmp_uint64 num_tasks = p->num_tasks;
5006	kmp_uint64 grainsize = p->grainsize;
5007	kmp_uint64 extras = p->extras;
5008	kmp_int64 last_chunk = p->last_chunk;
5009	kmp_uint64 tc = p->tc;
5010	kmp_uint64 num_t_min = p->num_t_min;
5011	#if OMPT_SUPPORT
5012	void *codeptr_ra = p->codeptr_ra;
5013	#endif
5014	#if KMP_DEBUG
5015	kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
5016	KMP_DEBUG_ASSERT(task != NULL);
5017	KA_TRACE(`20`,
5018	("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize"
5019	" %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
5020	gtid, taskdata, num_tasks, grainsize, extras, last_chunk, lb, ub,
5021	st, task_dup));
5022	#endif
5023	KMP_DEBUG_ASSERT(num_tasks * `2` + `1` > num_t_min);
5024	if (num_tasks > num_t_min)
5025	__kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
5026	grainsize, extras, last_chunk, tc, num_t_min,
5027	#if OMPT_SUPPORT
5028	codeptr_ra,
5029	#endif
5030	task_dup);
5031	else
5032	__kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
5033	grainsize, extras, last_chunk, tc,
5034	#if OMPT_SUPPORT
5035	codeptr_ra,
5036	#endif
5037	task_dup);
5038
5039	KA_TRACE(`40`, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
5040	return `0`;
5041	}
5042
5043	// Schedule part of the taskloop as a task,
5044	// execute the rest of the taskloop.
5045	//
5046	// loc Source location information
5047	// gtid Global thread ID
5048	// task Pattern task, exposes the loop iteration range
5049	// lb Pointer to loop lower bound in task structure
5050	// ub Pointer to loop upper bound in task structure
5051	// st Loop stride
5052	// ub_glob Global upper bound (used for lastprivate check)
5053	// num_tasks Number of tasks to execute
5054	// grainsize Number of loop iterations per task
5055	// extras Number of chunks with grainsize+1 iterations
5056	// last_chunk Reduction of grainsize for last task
5057	// tc Iterations count
5058	// num_t_min Threshold to launch tasks recursively
5059	// task_dup Tasks duplication routine
5060	// codeptr_ra Return address for OMPT events
5061	void __kmp_taskloop_recur(ident_t loc, int* gtid, kmp_task_t *task,
5062	kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
5063	kmp_uint64 ub_glob, kmp_uint64 num_tasks,
5064	kmp_uint64 grainsize, kmp_uint64 extras,
5065	kmp_int64 last_chunk, kmp_uint64 tc,
5066	kmp_uint64 num_t_min,
5067	#if OMPT_SUPPORT
5068	void *codeptr_ra,
5069	#endif
5070	void *task_dup) {
5071	kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
5072	KMP_DEBUG_ASSERT(task != NULL);
5073	KMP_DEBUG_ASSERT(num_tasks > num_t_min);
5074	KA_TRACE(`20`,
5075	("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
5076	" %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
5077	gtid, taskdata, num_tasks, grainsize, extras, last_chunk, lb, ub,
5078	st, task_dup));
5079	p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
5080	kmp_uint64 lower = *lb;
5081	kmp_info_t *thread = __kmp_threads[gtid];
5082	// kmp_taskdata_t current_task = thread->th.th_current_task;*
5083	kmp_task_t *next_task;
5084	size_t lower_offset =
5085	(char )lb - (char* )task; // remember offset of lb in the task structure*
5086	size_t upper_offset =
5087	(char )ub - (char* )task; // remember offset of ub in the task structure*
5088
5089	KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
5090	(last_chunk < `0` ? last_chunk : extras));
5091	KMP_DEBUG_ASSERT(num_tasks > extras);
5092	KMP_DEBUG_ASSERT(num_tasks > `0`);
5093
5094	// split the loop in two halves
5095	kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
5096	kmp_int64 last_chunk0 = `0`, last_chunk1 = `0`;
5097	kmp_uint64 gr_size0 = grainsize;
5098	kmp_uint64 n_tsk0 = num_tasks >> `1`; // num_tasks/2 to execute
5099	kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task
5100	if (last_chunk < `0`) {
5101	ext0 = ext1 = `0`;
5102	last_chunk1 = last_chunk;
5103	tc0 = grainsize * n_tsk0;
5104	tc1 = tc - tc0;
5105	} else if (n_tsk0 <= extras) {
5106	gr_size0++; // integrate extras into grainsize
5107	ext0 = `0`; // no extra iters in 1st half
5108	ext1 = extras - n_tsk0; // remaining extras
5109	tc0 = gr_size0 * n_tsk0;
5110	tc1 = tc - tc0;
5111	} else { // n_tsk0 > extras
5112	ext1 = `0`; // no extra iters in 2nd half
5113	ext0 = extras;
5114	tc1 = grainsize * n_tsk1;
5115	tc0 = tc - tc1;
5116	}
5117	ub0 = lower + st * (tc0 - `1`);
5118	lb1 = ub0 + st;
5119
5120	// create pattern task for 2nd half of the loop
5121	#if OMPX_TASKGRAPH
5122	next_task = __kmp_task_dup_alloc(thread, task,
5123	/ taskloop_recur / `1`);
5124	#else
5125	next_task = __kmp_task_dup_alloc(thread, task_src: task); // duplicate the task
5126	#endif
5127	// adjust lower bound (upper bound is not changed) for the 2nd half
5128	(kmp_uint64 )((char *)next_task + lower_offset) = lb1;
5129	if (ptask_dup != NULL) // construct firstprivates, etc.
5130	ptask_dup(next_task, task, `0`);
5131	ub = ub0; // adjust upper bound for the 1st half*
5132
5133	// create auxiliary task for 2nd half of the loop
5134	// make sure new task has same parent task as the pattern task
5135	kmp_taskdata_t *current_task = thread->th.th_current_task;
5136	thread->th.th_current_task = taskdata->td_parent;
5137	kmp_task_t *new_task =
5138	__kmpc_omp_task_alloc(loc_ref: loc, gtid, flags: `1`, sizeof_kmp_task_t: `3` * sizeof(void *),
5139	sizeof_shareds: sizeof(__taskloop_params_t), task_entry: &__kmp_taskloop_task);
5140	// restore current task
5141	thread->th.th_current_task = current_task;
5142	__taskloop_params_t p = (__taskloop_params_t )new_task->shareds;
5143	p->task = next_task;
5144	p->lb = (kmp_uint64 )((char* *)next_task + lower_offset);
5145	p->ub = (kmp_uint64 )((char* *)next_task + upper_offset);
5146	p->task_dup = task_dup;
5147	p->st = st;
5148	p->ub_glob = ub_glob;
5149	p->num_tasks = n_tsk1;
5150	p->grainsize = grainsize;
5151	p->extras = ext1;
5152	p->last_chunk = last_chunk1;
5153	p->tc = tc1;
5154	p->num_t_min = num_t_min;
5155	#if OMPT_SUPPORT
5156	p->codeptr_ra = codeptr_ra;
5157	#endif
5158
5159	#if OMPX_TASKGRAPH
5160	kmp_taskdata_t *new_task_data = KMP_TASK_TO_TASKDATA(new_task);
5161	new_task_data->tdg = taskdata->tdg;
5162	new_task_data->is_taskgraph = `0`;
5163	#endif
5164
5165	#if OMPT_SUPPORT
5166	// schedule new task with correct return address for OMPT events
5167	__kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra);
5168	#else
5169	__kmp_omp_task(gtid, new_task, true); // schedule new task
5170	#endif
5171
5172	// execute the 1st half of current subrange
5173	if (n_tsk0 > num_t_min)
5174	__kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks: n_tsk0, grainsize: gr_size0,
5175	extras: ext0, last_chunk: last_chunk0, tc: tc0, num_t_min,
5176	#if OMPT_SUPPORT
5177	codeptr_ra,
5178	#endif
5179	task_dup);
5180	else
5181	__kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks: n_tsk0,
5182	grainsize: gr_size0, extras: ext0, last_chunk: last_chunk0, tc: tc0,
5183	#if OMPT_SUPPORT
5184	codeptr_ra,
5185	#endif
5186	task_dup);
5187
5188	KA_TRACE(`40`, ("__kmp_taskloop_recur(exit): T#%d\n", gtid));
5189	}
5190
5191	static void __kmp_taskloop(ident_t loc, int* gtid, kmp_task_t task, int* if_val,
5192	kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
5193	int nogroup, int sched, kmp_uint64 grainsize,
5194	int modifier, void *task_dup) {
5195	kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
5196	KMP_DEBUG_ASSERT(task != NULL);
5197	if (nogroup == `0`) {
5198	#if OMPT_SUPPORT && OMPT_OPTIONAL
5199	OMPT_STORE_RETURN_ADDRESS(gtid);
5200	#endif
5201	__kmpc_taskgroup(loc, gtid);
5202	}
5203
5204	#if OMPX_TASKGRAPH
5205	KMP_ATOMIC_DEC(&__kmp_tdg_task_id);
5206	#endif
5207	// =========================================================================
5208	// calculate loop parameters
5209	kmp_taskloop_bounds_t task_bounds(task, lb, ub);
5210	kmp_uint64 tc;
5211	// compiler provides global bounds here
5212	kmp_uint64 lower = task_bounds.get_lb();
5213	kmp_uint64 upper = task_bounds.get_ub();
5214	kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag
5215	kmp_uint64 num_tasks = `0`, extras = `0`;
5216	kmp_int64 last_chunk =
5217	`0`; // reduce grainsize of last task by last_chunk in strict mode
5218	kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks;
5219	kmp_info_t *thread = __kmp_threads[gtid];
5220	kmp_taskdata_t *current_task = thread->th.th_current_task;
5221
5222	KA_TRACE(`20`, ("__kmp_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, "
5223	"grain %llu(%d, %d), dup %p\n",
5224	gtid, taskdata, lower, upper, st, grainsize, sched, modifier,
5225	task_dup));
5226
5227	// compute trip count
5228	if (st == `1`) { // most common case
5229	tc = upper - lower + `1`;
5230	} else if (st < `0`) {
5231	tc = (lower - upper) / (-st) + `1`;
5232	} else { // st > 0
5233	tc = (upper - lower) / st + `1`;
5234	}
5235	if (tc == `0`) {
5236	KA_TRACE(`20`, ("__kmp_taskloop(exit): T#%d zero-trip loop\n", gtid));
5237	// free the pattern task and exit
5238	__kmp_task_start(gtid, task, current_task);
5239	// do not execute anything for zero-trip loop
5240	__kmp_task_finish<false>(gtid, task, resumed_task: current_task);
5241	return;
5242	}
5243
5244	#if OMPT_SUPPORT && OMPT_OPTIONAL
5245	ompt_team_info_t *team_info = __ompt_get_teaminfo(depth: `0`, NULL);
5246	ompt_task_info_t *task_info = __ompt_get_task_info_object(depth: `0`);
5247	if (ompt_enabled.ompt_callback_work) {
5248	ompt_callbacks.ompt_callback(ompt_callback_work)(
5249	ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data),
5250	&(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(`0`));
5251	}
5252	#endif
5253
5254	if (num_tasks_min == `0`)
5255	// TODO: can we choose better default heuristic?
5256	num_tasks_min =
5257	KMP_MIN(thread->th.th_team_nproc * `10`, INITIAL_TASK_DEQUE_SIZE);
5258
5259	// compute num_tasks/grainsize based on the input provided
5260	switch (sched) {
5261	case `0`: // no schedule clause specified, we can choose the default
5262	// let's try to schedule (team_size10) tasks*
5263	grainsize = thread->th.th_team_nproc * static_cast<kmp_uint64>(`10`);
5264	KMP_FALLTHROUGH();
5265	case `2`: // num_tasks provided
5266	if (grainsize > tc) {
5267	num_tasks = tc; // too big num_tasks requested, adjust values
5268	grainsize = `1`;
5269	extras = `0`;
5270	} else {
5271	num_tasks = grainsize;
5272	grainsize = tc / num_tasks;
5273	extras = tc % num_tasks;
5274	}
5275	break;
5276	case `1`: // grainsize provided
5277	if (grainsize > tc) {
5278	num_tasks = `1`;
5279	grainsize = tc; // too big grainsize requested, adjust values
5280	extras = `0`;
5281	} else {
5282	if (modifier) {
5283	num_tasks = (tc + grainsize - `1`) / grainsize;
5284	last_chunk = tc - (num_tasks * grainsize);
5285	extras = `0`;
5286	} else {
5287	num_tasks = tc / grainsize;
5288	// adjust grainsize for balanced distribution of iterations
5289	grainsize = tc / num_tasks;
5290	extras = tc % num_tasks;
5291	}
5292	}
5293	break;
5294	default:
5295	KMP_ASSERT2(`0`, "unknown scheduling of taskloop");
5296	}
5297
5298	KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
5299	(last_chunk < `0` ? last_chunk : extras));
5300	KMP_DEBUG_ASSERT(num_tasks > extras);
5301	KMP_DEBUG_ASSERT(num_tasks > `0`);
5302	// =========================================================================
5303
5304	// check if clause value first
5305	// Also require GOMP_taskloop to reduce to linear (taskdata->td_flags.native)
5306	if (if_val == `0`) { // if(0) specified, mark task as serial
5307	taskdata->td_flags.task_serial = `1`;
5308	taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
5309	// always start serial tasks linearly
5310	__kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5311	grainsize, extras, last_chunk, tc,
5312	#if OMPT_SUPPORT
5313	OMPT_GET_RETURN_ADDRESS(`0`),
5314	#endif
5315	task_dup);
5316	// !taskdata->td_flags.native => currently force linear spawning of tasks
5317	// for GOMP_taskloop
5318	} else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) {
5319	KA_TRACE(`20`, ("__kmp_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
5320	"(%lld), grain %llu, extras %llu, last_chunk %lld\n",
5321	gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
5322	last_chunk));
5323	__kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5324	grainsize, extras, last_chunk, tc, num_t_min: num_tasks_min,
5325	#if OMPT_SUPPORT
5326	OMPT_GET_RETURN_ADDRESS(`0`),
5327	#endif
5328	task_dup);
5329	} else {
5330	KA_TRACE(`20`, ("__kmp_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
5331	"(%lld), grain %llu, extras %llu, last_chunk %lld\n",
5332	gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
5333	last_chunk));
5334	__kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5335	grainsize, extras, last_chunk, tc,
5336	#if OMPT_SUPPORT
5337	OMPT_GET_RETURN_ADDRESS(`0`),
5338	#endif
5339	task_dup);
5340	}
5341
5342	#if OMPT_SUPPORT && OMPT_OPTIONAL
5343	if (ompt_enabled.ompt_callback_work) {
5344	ompt_callbacks.ompt_callback(ompt_callback_work)(
5345	ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data),
5346	&(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(`0`));
5347	}
5348	#endif
5349
5350	if (nogroup == `0`) {
5351	#if OMPT_SUPPORT && OMPT_OPTIONAL
5352	OMPT_STORE_RETURN_ADDRESS(gtid);
5353	#endif
5354	__kmpc_end_taskgroup(loc, gtid);
5355	}
5356	KA_TRACE(`20`, ("__kmp_taskloop(exit): T#%d\n", gtid));
5357	}
5358
5359	/!*
5360	@ingroup TASKING
5361	@param loc Source location information
5362	@param gtid Global thread ID
5363	@param task Task structure
5364	@param if_val Value of the if clause
5365	@param lb Pointer to loop lower bound in task structure
5366	@param ub Pointer to loop upper bound in task structure
5367	@param st Loop stride
5368	@param nogroup Flag, 1 if nogroup clause specified, 0 otherwise
5369	@param sched Schedule specified 0/1/2 for none/grainsize/num_tasks
5370	@param grainsize Schedule value if specified
5371	@param task_dup Tasks duplication routine
5372
5373	Execute the taskloop construct.
5374	*/
5375	void __kmpc_taskloop(ident_t loc, int* gtid, kmp_task_t task, int* if_val,
5376	kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, int nogroup,
5377	int sched, kmp_uint64 grainsize, void *task_dup) {
5378	__kmp_assert_valid_gtid(gtid);
5379	KA_TRACE(`20`, ("__kmpc_taskloop(enter): T#%d\n", gtid));
5380	__kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
5381	modifier: `0`, task_dup);
5382	KA_TRACE(`20`, ("__kmpc_taskloop(exit): T#%d\n", gtid));
5383	}
5384
5385	/!*
5386	@ingroup TASKING
5387	@param loc Source location information
5388	@param gtid Global thread ID
5389	@param task Task structure
5390	@param if_val Value of the if clause
5391	@param lb Pointer to loop lower bound in task structure
5392	@param ub Pointer to loop upper bound in task structure
5393	@param st Loop stride
5394	@param nogroup Flag, 1 if nogroup clause specified, 0 otherwise
5395	@param sched Schedule specified 0/1/2 for none/grainsize/num_tasks
5396	@param grainsize Schedule value if specified
5397	@param modifier Modifier 'strict' for sched, 1 if present, 0 otherwise
5398	@param task_dup Tasks duplication routine
5399
5400	Execute the taskloop construct.
5401	*/
5402	void __kmpc_taskloop_5(ident_t loc, int* gtid, kmp_task_t task, int* if_val,
5403	kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
5404	int nogroup, int sched, kmp_uint64 grainsize,
5405	int modifier, void *task_dup) {
5406	__kmp_assert_valid_gtid(gtid);
5407	KA_TRACE(`20`, ("__kmpc_taskloop_5(enter): T#%d\n", gtid));
5408	__kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
5409	modifier, task_dup);
5410	KA_TRACE(`20`, ("__kmpc_taskloop_5(exit): T#%d\n", gtid));
5411	}
5412
5413	/!*
5414	@ingroup TASKING
5415	@param gtid Global Thread ID of current thread
5416	@return Returns a pointer to the thread's current task async handle. If no task
5417	is present or gtid is invalid, returns NULL.
5418
5419	Acqurires a pointer to the target async handle from the current task.
5420	*/
5421	void **__kmpc_omp_get_target_async_handle_ptr(kmp_int32 gtid) {
5422	if (gtid == KMP_GTID_DNE)
5423	return NULL;
5424
5425	kmp_info_t *thread = __kmp_thread_from_gtid(gtid);
5426	kmp_taskdata_t *taskdata = thread->th.th_current_task;
5427
5428	if (!taskdata)
5429	return NULL;
5430
5431	return &taskdata->td_target_data.async_handle;
5432	}
5433
5434	/!*
5435	@ingroup TASKING
5436	@param gtid Global Thread ID of current thread
5437	@return Returns TRUE if the current task being executed of the given thread has
5438	a task team allocated to it. Otherwise, returns FALSE.
5439
5440	Checks if the current thread has a task team.
5441	*/
5442	bool __kmpc_omp_has_task_team(kmp_int32 gtid) {
5443	if (gtid == KMP_GTID_DNE)
5444	return FALSE;
5445
5446	kmp_info_t *thread = __kmp_thread_from_gtid(gtid);
5447	kmp_taskdata_t *taskdata = thread->th.th_current_task;
5448
5449	if (!taskdata)
5450	return FALSE;
5451
5452	return taskdata->td_task_team != NULL;
5453	}
5454
5455	#if OMPX_TASKGRAPH
5456	// __kmp_find_tdg: identify a TDG through its ID
5457	// tdg_id: ID of the TDG
5458	// returns: If a TDG corresponding to this ID is found and not
5459	// its initial state, return the pointer to it, otherwise nullptr
5460	static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id) {
5461	kmp_tdg_info_t res = nullptr*;
5462	if (__kmp_max_tdgs == `0`)
5463	return res;
5464
5465	if (__kmp_global_tdgs == NULL)
5466	__kmp_global_tdgs = (kmp_tdg_info_t **)__kmp_allocate(
5467	sizeof(kmp_tdg_info_t ) __kmp_max_tdgs);
5468
5469	if ((__kmp_global_tdgs[tdg_id]) &&
5470	(__kmp_global_tdgs[tdg_id]->tdg_status != KMP_TDG_NONE))
5471	res = __kmp_global_tdgs[tdg_id];
5472	return res;
5473	}
5474
5475	// __kmp_print_tdg_dot: prints the TDG to a dot file
5476	// tdg: ID of the TDG
5477	// gtid: Global Thread ID
5478	void __kmp_print_tdg_dot(kmp_tdg_info_t *tdg, kmp_int32 gtid) {
5479	kmp_int32 tdg_id = tdg->tdg_id;
5480	KA_TRACE(`10`, ("__kmp_print_tdg_dot(enter): T#%d tdg_id=%d \n", gtid, tdg_id));
5481
5482	char file_name[`20`];
5483	sprintf(file_name, "tdg_%d.dot", tdg_id);
5484	kmp_safe_raii_file_t tdg_file(file_name, "w");
5485
5486	kmp_int32 num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
5487	fprintf(tdg_file,
5488	"digraph TDG {\n"
5489	" compound=true\n"
5490	" subgraph cluster {\n"
5491	" label=TDG_%d\n",
5492	tdg_id);
5493	for (kmp_int32 i = `0`; i < num_tasks; i++) {
5494	fprintf(tdg_file, " %d[style=bold]\n", i);
5495	}
5496	fprintf(tdg_file, " }\n");
5497	for (kmp_int32 i = `0`; i < num_tasks; i++) {
5498	kmp_int32 nsuccessors = tdg->record_map[i].nsuccessors;
5499	kmp_int32 *successors = tdg->record_map[i].successors;
5500	if (nsuccessors > `0`) {
5501	for (kmp_int32 j = `0`; j < nsuccessors; j++)
5502	fprintf(tdg_file, " %d -> %d \n", i, successors[j]);
5503	}
5504	}
5505	fprintf(tdg_file, "}");
5506	KA_TRACE(`10`, ("__kmp_print_tdg_dot(exit): T#%d tdg_id=%d \n", gtid, tdg_id));
5507	}
5508
5509	// __kmp_exec_tdg: launch the execution of a previous
5510	// recorded TDG
5511	// gtid: Global Thread ID
5512	// tdg: ID of the TDG
5513	void __kmp_exec_tdg(kmp_int32 gtid, kmp_tdg_info_t *tdg) {
5514	KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_READY);
5515	KA_TRACE(`10`, ("__kmp_exec_tdg(enter): T#%d tdg_id=%d num_roots=%d\n", gtid,
5516	tdg->tdg_id, tdg->num_roots));
5517	kmp_node_info_t *this_record_map = tdg->record_map;
5518	kmp_int32 *this_root_tasks = tdg->root_tasks;
5519	kmp_int32 this_num_roots = tdg->num_roots;
5520	kmp_int32 this_num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
5521
5522	kmp_info_t *thread = __kmp_threads[gtid];
5523	kmp_taskdata_t *parent_task = thread->th.th_current_task;
5524
5525	if (tdg->rec_taskred_data) {
5526	__kmpc_taskred_init(gtid, tdg->rec_num_taskred, tdg->rec_taskred_data);
5527	}
5528
5529	for (kmp_int32 j = `0`; j < this_num_tasks; j++) {
5530	kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(this_record_map[j].task);
5531
5532	td->td_parent = parent_task;
5533	this_record_map[j].parent_task = parent_task;
5534
5535	kmp_taskgroup_t *parent_taskgroup =
5536	this_record_map[j].parent_task->td_taskgroup;
5537
5538	KMP_ATOMIC_ST_RLX(&this_record_map[j].npredecessors_counter,
5539	this_record_map[j].npredecessors);
5540	KMP_ATOMIC_INC(&this_record_map[j].parent_task->td_incomplete_child_tasks);
5541
5542	if (parent_taskgroup) {
5543	KMP_ATOMIC_INC(&parent_taskgroup->count);
5544	// The taskgroup is different so we must update it
5545	td->td_taskgroup = parent_taskgroup;
5546	} else if (td->td_taskgroup != nullptr) {
5547	// If the parent doesnt have a taskgroup, remove it from the task
5548	td->td_taskgroup = nullptr;
5549	}
5550	if (this_record_map[j].parent_task->td_flags.tasktype == TASK_EXPLICIT)
5551	KMP_ATOMIC_INC(&this_record_map[j].parent_task->td_allocated_child_tasks);
5552	}
5553
5554	for (kmp_int32 j = `0`; j < this_num_roots; ++j) {
5555	__kmp_omp_task(gtid, this_record_map[this_root_tasks[j]].task, true);
5556	}
5557	KA_TRACE(`10`, ("__kmp_exec_tdg(exit): T#%d tdg_id=%d num_roots=%d\n", gtid,
5558	tdg->tdg_id, tdg->num_roots));
5559	}
5560
5561	// __kmp_start_record: set up a TDG structure and turn the
5562	// recording flag to true
5563	// gtid: Global Thread ID of the encountering thread
5564	// input_flags: Flags associated with the TDG
5565	// tdg_id: ID of the TDG to record
5566	static inline void __kmp_start_record(kmp_int32 gtid,
5567	kmp_taskgraph_flags_t *flags,
5568	kmp_int32 tdg_id) {
5569	kmp_tdg_info_t *tdg =
5570	(kmp_tdg_info_t )__kmp_allocate(sizeof*(kmp_tdg_info_t));
5571	__kmp_global_tdgs[__kmp_curr_tdg_idx] = tdg;
5572	// Initializing the TDG structure
5573	tdg->tdg_id = tdg_id;
5574	tdg->map_size = INIT_MAPSIZE;
5575	tdg->num_roots = -`1`;
5576	tdg->root_tasks = nullptr;
5577	tdg->tdg_status = KMP_TDG_RECORDING;
5578	tdg->rec_num_taskred = `0`;
5579	tdg->rec_taskred_data = nullptr;
5580	KMP_ATOMIC_ST_RLX(&tdg->num_tasks, `0`);
5581
5582	// Initializing the list of nodes in this TDG
5583	kmp_node_info_t *this_record_map =
5584	(kmp_node_info_t )__kmp_allocate(INIT_MAPSIZE sizeof(kmp_node_info_t));
5585	for (kmp_int32 i = `0`; i < INIT_MAPSIZE; i++) {
5586	kmp_int32 *successorsList =
5587	(kmp_int32 )__kmp_allocate(__kmp_successors_size sizeof(kmp_int32));
5588	this_record_map[i].task = nullptr;
5589	this_record_map[i].successors = successorsList;
5590	this_record_map[i].nsuccessors = `0`;
5591	this_record_map[i].npredecessors = `0`;
5592	this_record_map[i].successors_size = __kmp_successors_size;
5593	KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter, `0`);
5594	}
5595
5596	__kmp_global_tdgs[__kmp_curr_tdg_idx]->record_map = this_record_map;
5597	}
5598
5599	// __kmpc_start_record_task: Wrapper around __kmp_start_record to mark
5600	// the beginning of the record process of a task region
5601	// loc_ref: Location of TDG, not used yet
5602	// gtid: Global Thread ID of the encountering thread
5603	// input_flags: Flags associated with the TDG
5604	// tdg_id: ID of the TDG to record, for now, incremental integer
5605	// returns: 1 if we record, otherwise, 0
5606	kmp_int32 __kmpc_start_record_task(ident_t *loc_ref, kmp_int32 gtid,
5607	kmp_int32 input_flags, kmp_int32 tdg_id) {
5608
5609	kmp_int32 res;
5610	kmp_taskgraph_flags_t flags = (kmp_taskgraph_flags_t )&input_flags;
5611	KA_TRACE(`10`,
5612	("__kmpc_start_record_task(enter): T#%d loc=%p flags=%d tdg_id=%d\n",
5613	gtid, loc_ref, input_flags, tdg_id));
5614
5615	if (__kmp_max_tdgs == `0`) {
5616	KA_TRACE(
5617	`10`,
5618	("__kmpc_start_record_task(abandon): T#%d loc=%p flags=%d tdg_id = %d, "
5619	"__kmp_max_tdgs = 0\n",
5620	gtid, loc_ref, input_flags, tdg_id));
5621	return `1`;
5622	}
5623
5624	__kmpc_taskgroup(loc_ref, gtid);
5625	if (kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id)) {
5626	// TODO: use re_record flag
5627	__kmp_exec_tdg(gtid, tdg);
5628	res = `0`;
5629	} else {
5630	__kmp_curr_tdg_idx = tdg_id;
5631	KMP_DEBUG_ASSERT(__kmp_curr_tdg_idx < __kmp_max_tdgs);
5632	__kmp_start_record(gtid, flags, tdg_id);
5633	__kmp_num_tdg++;
5634	res = `1`;
5635	}
5636	KA_TRACE(`10`, ("__kmpc_start_record_task(exit): T#%d TDG %d starts to %s\n",
5637	gtid, tdg_id, res ? "record" : "execute"));
5638	return res;
5639	}
5640
5641	// __kmp_end_record: set up a TDG after recording it
5642	// gtid: Global thread ID
5643	// tdg: Pointer to the TDG
5644	void __kmp_end_record(kmp_int32 gtid, kmp_tdg_info_t *tdg) {
5645	// Store roots
5646	kmp_node_info_t *this_record_map = tdg->record_map;
5647	kmp_int32 this_num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
5648	kmp_int32 *this_root_tasks =
5649	(kmp_int32 )__kmp_allocate(this_num_tasks sizeof(kmp_int32));
5650	kmp_int32 this_map_size = tdg->map_size;
5651	kmp_int32 this_num_roots = `0`;
5652	kmp_info_t *thread = __kmp_threads[gtid];
5653
5654	for (kmp_int32 i = `0`; i < this_num_tasks; i++) {
5655	if (this_record_map[i].npredecessors == `0`) {
5656	this_root_tasks[this_num_roots++] = i;
5657	}
5658	}
5659
5660	// Update with roots info and mapsize
5661	tdg->map_size = this_map_size;
5662	tdg->num_roots = this_num_roots;
5663	tdg->root_tasks = this_root_tasks;
5664	KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_RECORDING);
5665	tdg->tdg_status = KMP_TDG_READY;
5666
5667	if (thread->th.th_current_task->td_dephash) {
5668	__kmp_dephash_free(thread, thread->th.th_current_task->td_dephash);
5669	thread->th.th_current_task->td_dephash = NULL;
5670	}
5671
5672	// Reset predecessor counter
5673	for (kmp_int32 i = `0`; i < this_num_tasks; i++) {
5674	KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter,
5675	this_record_map[i].npredecessors);
5676	}
5677	KMP_ATOMIC_ST_RLX(&__kmp_tdg_task_id, `0`);
5678
5679	if (__kmp_tdg_dot)
5680	__kmp_print_tdg_dot(tdg, gtid);
5681	}
5682
5683	// __kmpc_end_record_task: wrapper around __kmp_end_record to mark
5684	// the end of recording phase
5685	//
5686	// loc_ref: Source location information
5687	// gtid: Global thread ID
5688	// input_flags: Flags attached to the graph
5689	// tdg_id: ID of the TDG just finished recording
5690	void __kmpc_end_record_task(ident_t *loc_ref, kmp_int32 gtid,
5691	kmp_int32 input_flags, kmp_int32 tdg_id) {
5692	kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id);
5693
5694	KA_TRACE(`10`, ("__kmpc_end_record_task(enter): T#%d loc=%p finishes recording"
5695	" tdg=%d with flags=%d\n",
5696	gtid, loc_ref, tdg_id, input_flags));
5697	if (__kmp_max_tdgs) {
5698	// TODO: use input_flags->nowait
5699	__kmpc_end_taskgroup(loc_ref, gtid);
5700	if (__kmp_tdg_is_recording(tdg->tdg_status))
5701	__kmp_end_record(gtid, tdg);
5702	}
5703	KA_TRACE(`10`, ("__kmpc_end_record_task(exit): T#%d loc=%p finished recording"
5704	" tdg=%d, its status is now READY\n",
5705	gtid, loc_ref, tdg_id));
5706	}
5707	#endif
5708

source code of openmp/runtime/src/kmp_tasking.cpp