kmp_dispatch.cpp source code [openmp/runtime/src/kmp_dispatch.cpp]

1	/*
2	* kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3	*/
4
5	//===----------------------------------------------------------------------===//
6	//
7	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8	// See https://llvm.org/LICENSE.txt for license information.
9	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10	//
11	//===----------------------------------------------------------------------===//
12
/* Dynamic scheduling initialization and dispatch.
 *
 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
 *       it may change values between parallel regions.  __kmp_max_nth
 *       is the largest value __kmp_nth may take, 1 is the smallest.
 */
19
20	#include "kmp.h"
21	#include "kmp_error.h"
22	#include "kmp_i18n.h"
23	#include "kmp_itt.h"
24	#include "kmp_stats.h"
25	#include "kmp_str.h"
26	#if KMP_USE_X87CONTROL
27	#include <float.h>
28	#endif
29	#include "kmp_lock.h"
30	#include "kmp_dispatch.h"
31	#if KMP_USE_HIER_SCHED
32	#include "kmp_dispatch_hier.h"
33	#endif
34
35	#if OMPT_SUPPORT
36	#include "ompt-specific.h"
37	#endif
38
/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
41
42	void __kmp_dispatch_deo_error(int gtid_ref, int* cid_ref, ident_t loc_ref) {
43	kmp_info_t *th;
44
45	KMP_DEBUG_ASSERT(gtid_ref);
46
47	if (__kmp_env_consistency_check) {
48	th = __kmp_threads[*gtid_ref];
49	if (th->th.th_root->r.r_active &&
50	(th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
51	#if KMP_USE_DYNAMIC_LOCK
52	__kmp_push_sync(gtid: *gtid_ref, ct: ct_ordered_in_pdo, ident: loc_ref, NULL, `0`);
53	#else
54	__kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
55	#endif
56	}
57	}
58	}
59
60	void __kmp_dispatch_dxo_error(int gtid_ref, int* cid_ref, ident_t loc_ref) {
61	kmp_info_t *th;
62
63	if (__kmp_env_consistency_check) {
64	th = __kmp_threads[*gtid_ref];
65	if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
66	__kmp_pop_sync(gtid: *gtid_ref, ct: ct_ordered_in_pdo, ident: loc_ref);
67	}
68	}
69	}
70
71	// Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
72	static inline int __kmp_get_monotonicity(ident_t loc, enum* sched_type schedule,
73	bool use_hier = false) {
74	// Pick up the nonmonotonic/monotonic bits from the scheduling type
75	// Nonmonotonic as default for dynamic schedule when no modifier is specified
76	int monotonicity = SCHEDULE_NONMONOTONIC;
77
78	// Let default be monotonic for executables
79	// compiled with OpenMP 4.5 or less compilers*
80	if (loc != NULL && loc->get_openmp_version() < `50`)
81	monotonicity = SCHEDULE_MONOTONIC;
82
83	if (use_hier \|\| __kmp_force_monotonic)
84	monotonicity = SCHEDULE_MONOTONIC;
85	else if (SCHEDULE_HAS_NONMONOTONIC(schedule))
86	monotonicity = SCHEDULE_NONMONOTONIC;
87	else if (SCHEDULE_HAS_MONOTONIC(schedule))
88	monotonicity = SCHEDULE_MONOTONIC;
89
90	return monotonicity;
91	}
92
93	#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
// Return floating point number rounded to two decimal points.
// The value is scaled by 100, biased by +0.5 and truncated toward zero, so
// the rounding is round-half-up for non-negative inputs only.
static inline float __kmp_round_2decimal_val(float num) {
  return (float)((int)(num * 100 + 0.5)) / 100;
}
// Round num to the nearest integer, with halves rounded away from zero
// (bias by +/-0.5 according to sign, then truncate).
static inline int __kmp_get_round_val(float num) {
  return (int)(num < 0 ? num - 0.5 : num + 0.5);
}
101	#endif
102
103	template <typename T>
104	inline void
105	__kmp_initialize_self_buffer(kmp_team_t *team, T id,
106	dispatch_private_info_template<T> *pr,
107	typename traits_t<T>::unsigned_t nchunks, T nproc,
108	typename traits_t<T>::unsigned_t &init,
109	T &small_chunk, T &extras, T &p_extra) {
110
111	#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
112	if (pr->flags.use_hybrid) {
113	kmp_info_t th = __kmp_threads[__kmp_gtid_from_tid(tid: (int*)id, team)];
114	kmp_hw_core_type_t type =
115	(kmp_hw_core_type_t)th->th.th_topology_attrs.core_type;
116	T pchunks = pr->u.p.pchunks;
117	T echunks = nchunks - pchunks;
118	T num_procs_with_pcore = pr->u.p.num_procs_with_pcore;
119	T num_procs_with_ecore = nproc - num_procs_with_pcore;
120	T first_thread_with_ecore = pr->u.p.first_thread_with_ecore;
121	T big_chunk =
122	pchunks / num_procs_with_pcore; // chunks per thread with p-core
123	small_chunk =
124	echunks / num_procs_with_ecore; // chunks per thread with e-core
125
126	extras =
127	(pchunks % num_procs_with_pcore) + (echunks % num_procs_with_ecore);
128
129	p_extra = (big_chunk - small_chunk);
130
131	if (type == KMP_HW_CORE_TYPE_CORE) {
132	if (id < first_thread_with_ecore) {
133	init = id * small_chunk + id * p_extra + (id < extras ? id : extras);
134	} else {
135	init = id * small_chunk + (id - num_procs_with_ecore) * p_extra +
136	(id < extras ? id : extras);
137	}
138	} else {
139	if (id == first_thread_with_ecore) {
140	init = id * small_chunk + id * p_extra + (id < extras ? id : extras);
141	} else {
142	init = id * small_chunk + first_thread_with_ecore * p_extra +
143	(id < extras ? id : extras);
144	}
145	}
146	p_extra = (type == KMP_HW_CORE_TYPE_CORE) ? p_extra : `0`;
147	return;
148	}
149	#endif
150
151	small_chunk = nchunks / nproc; // chunks per thread
152	extras = nchunks % nproc;
153	p_extra = `0`;
154	init = id * small_chunk + (id < extras ? id : extras);
155	}
156
157	#if KMP_STATIC_STEAL_ENABLED
enum { // values for steal_flag (possible states of private per-loop buffer)
  UNUSED = 0,
  CLAIMED = 1, // owner thread started initialization
  READY = 2, // available for stealing
  THIEF = 3 // finished by owner, or claimed by thief
  // possible state changes:
  // 0 -> 1 owner only, sync
  // 0 -> 3 thief only, sync
  // 1 -> 2 owner only, async
  // 2 -> 3 owner only, async
  // 3 -> 2 owner only, async
  // 3 -> 0 last thread finishing the loop, async
};
171	#endif
172
173	// Initialize a dispatch_private_info_template<T> buffer for a particular
174	// type of schedule,chunk. The loop description is found in lb (lower bound),
175	// ub (upper bound), and st (stride). nproc is the number of threads relevant
176	// to the scheduling (often the number of threads in a team, but not always if
177	// hierarchical scheduling is used). tid is the id of the thread calling
178	// the function within the group of nproc threads. It will have a value
179	// between 0 and nproc - 1. This is often just the thread id within a team, but
180	// is not necessarily the case when using hierarchical scheduling.
181	// loc is the source file location of the corresponding loop
182	// gtid is the global thread id
183	template <typename T>
184	void __kmp_dispatch_init_algorithm(ident_t loc, int* gtid,
185	dispatch_private_info_template<T> *pr,
186	enum sched_type schedule, T lb, T ub,
187	typename traits_t<T>::signed_t st,
188	#if USE_ITT_BUILD
189	kmp_uint64 *cur_chunk,
190	#endif
191	typename traits_t<T>::signed_t chunk,
192	T nproc, T tid) {
193	typedef typename traits_t<T>::unsigned_t UT;
194	typedef typename traits_t<T>::floating_t DBL;
195
196	int active;
197	T tc;
198	kmp_info_t *th;
199	kmp_team_t *team;
200	int monotonicity;
201	bool use_hier;
202
203	#ifdef KMP_DEBUG
204	typedef typename traits_t<T>::signed_t ST;
205	{
206	char *buff;
207	// create format specifiers before the debug output
208	buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
209	"pr:%%p lb:%%%s ub:%%%s st:%%%s "
210	"schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
211	traits_t<T>::spec, traits_t<T>::spec,
212	traits_t<ST>::spec, traits_t<ST>::spec,
213	traits_t<T>::spec, traits_t<T>::spec);
214	KD_TRACE(`10`, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
215	__kmp_str_free(str: &buff);
216	}
217	#endif
218	/ setup data /
219	th = __kmp_threads[gtid];
220	team = th->th.th_team;
221	active = !team->t.t_serialized;
222
223	#if USE_ITT_BUILD
224	int itt_need_metadata_reporting =
225	__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == `3` &&
226	KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
227	team->t.t_active_level == `1`;
228	#endif
229
230	#if KMP_USE_HIER_SCHED
231	use_hier = pr->flags.use_hier;
232	#else
233	use_hier = false;
234	#endif
235
236	/ Pick up the nonmonotonic/monotonic bits from the scheduling type /
237	monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
238	schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
239
240	/ Pick up the nomerge/ordered bits from the scheduling type /
241	if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
242	pr->flags.nomerge = TRUE;
243	schedule =
244	(enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
245	} else {
246	pr->flags.nomerge = FALSE;
247	}
248	pr->type_size = traits_t<T>::type_size; // remember the size of variables
249	if (kmp_ord_lower & schedule) {
250	pr->flags.ordered = TRUE;
251	schedule =
252	(enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
253	} else {
254	pr->flags.ordered = FALSE;
255	}
256	// Ordered overrides nonmonotonic
257	if (pr->flags.ordered) {
258	monotonicity = SCHEDULE_MONOTONIC;
259	}
260
261	if (schedule == kmp_sch_static) {
262	schedule = __kmp_static;
263	} else {
264	if (schedule == kmp_sch_runtime) {
265	// Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
266	// not specified)
267	schedule = team->t.t_sched.r_sched_type;
268	monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
269	schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
270	if (pr->flags.ordered) // correct monotonicity for ordered loop if needed
271	monotonicity = SCHEDULE_MONOTONIC;
272	// Detail the schedule if needed (global controls are differentiated
273	// appropriately)
274	if (schedule == kmp_sch_guided_chunked) {
275	schedule = __kmp_guided;
276	} else if (schedule == kmp_sch_static) {
277	schedule = __kmp_static;
278	}
279	// Use the chunk size specified by OMP_SCHEDULE (or default if not
280	// specified)
281	chunk = team->t.t_sched.chunk;
282	#if USE_ITT_BUILD
283	if (cur_chunk)
284	*cur_chunk = chunk;
285	#endif
286	#ifdef KMP_DEBUG
287	{
288	char *buff;
289	// create format specifiers before the debug output
290	buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
291	"schedule:%%d chunk:%%%s\n",
292	traits_t<ST>::spec);
293	KD_TRACE(`10`, (buff, gtid, schedule, chunk));
294	__kmp_str_free(str: &buff);
295	}
296	#endif
297	} else {
298	if (schedule == kmp_sch_guided_chunked) {
299	schedule = __kmp_guided;
300	}
301	if (chunk <= `0`) {
302	chunk = KMP_DEFAULT_CHUNK;
303	}
304	}
305
306	if (schedule == kmp_sch_auto) {
307	// mapping and differentiation: in the __kmp_do_serial_initialize()
308	schedule = __kmp_auto;
309	#ifdef KMP_DEBUG
310	{
311	char *buff;
312	// create format specifiers before the debug output
313	buff = __kmp_str_format(
314	"__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
315	"schedule:%%d chunk:%%%s\n",
316	traits_t<ST>::spec);
317	KD_TRACE(`10`, (buff, gtid, schedule, chunk));
318	__kmp_str_free(str: &buff);
319	}
320	#endif
321	}
322	#if KMP_STATIC_STEAL_ENABLED
323	// map nonmonotonic:dynamic to static steal
324	if (schedule == kmp_sch_dynamic_chunked) {
325	if (monotonicity == SCHEDULE_NONMONOTONIC)
326	schedule = kmp_sch_static_steal;
327	}
328	#endif
329	/ guided analytical not safe for too many threads /
330	if (schedule == kmp_sch_guided_analytical_chunked && nproc > `1` << `20`) {
331	schedule = kmp_sch_guided_iterative_chunked;
332	KMP_WARNING(DispatchManyThreads);
333	}
334	if (schedule == kmp_sch_runtime_simd) {
335	// compiler provides simd_width in the chunk parameter
336	schedule = team->t.t_sched.r_sched_type;
337	monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
338	schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
339	// Detail the schedule if needed (global controls are differentiated
340	// appropriately)
341	if (schedule == kmp_sch_static \|\| schedule == kmp_sch_auto \|\|
342	schedule == __kmp_static) {
343	schedule = kmp_sch_static_balanced_chunked;
344	} else {
345	if (schedule == kmp_sch_guided_chunked \|\| schedule == __kmp_guided) {
346	schedule = kmp_sch_guided_simd;
347	}
348	chunk = team->t.t_sched.chunk * chunk;
349	}
350	#if USE_ITT_BUILD
351	if (cur_chunk)
352	*cur_chunk = chunk;
353	#endif
354	#ifdef KMP_DEBUG
355	{
356	char *buff;
357	// create format specifiers before the debug output
358	buff = __kmp_str_format(
359	"__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
360	" chunk:%%%s\n",
361	traits_t<ST>::spec);
362	KD_TRACE(`10`, (buff, gtid, schedule, chunk));
363	__kmp_str_free(str: &buff);
364	}
365	#endif
366	}
367	pr->u.p.parm1 = chunk;
368	}
369	KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
370	"unknown scheduling type");
371
372	pr->u.p.count = `0`;
373
374	if (__kmp_env_consistency_check) {
375	if (st == `0`) {
376	__kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
377	(pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
378	}
379	}
380	// compute trip count
381	if (st == `1`) { // most common case
382	if (ub >= lb) {
383	tc = ub - lb + `1`;
384	} else { // ub < lb
385	tc = `0`; // zero-trip
386	}
387	} else if (st < `0`) {
388	if (lb >= ub) {
389	// AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
390	// where the division needs to be unsigned regardless of the result type
391	tc = (UT)(lb - ub) / (-st) + `1`;
392	} else { // lb < ub
393	tc = `0`; // zero-trip
394	}
395	} else { // st > 0
396	if (ub >= lb) {
397	// AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
398	// where the division needs to be unsigned regardless of the result type
399	tc = (UT)(ub - lb) / st + `1`;
400	} else { // ub < lb
401	tc = `0`; // zero-trip
402	}
403	}
404
405	#if KMP_STATS_ENABLED
406	if (KMP_MASTER_GTID(gtid)) {
407	KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
408	}
409	#endif
410
411	pr->u.p.lb = lb;
412	pr->u.p.ub = ub;
413	pr->u.p.st = st;
414	pr->u.p.tc = tc;
415
416	#if KMP_OS_WINDOWS
417	pr->u.p.last_upper = ub + st;
418	#endif /* KMP_OS_WINDOWS */
419
420	/ NOTE: only the active parallel region(s) has active ordered sections /
421
422	if (active) {
423	if (pr->flags.ordered) {
424	pr->ordered_bumped = `0`;
425	pr->u.p.ordered_lower = `1`;
426	pr->u.p.ordered_upper = `0`;
427	}
428	}
429
430	switch (schedule) {
431	#if KMP_STATIC_STEAL_ENABLED
432	case kmp_sch_static_steal: {
433	T ntc, init = `0`;
434
435	KD_TRACE(`100`,
436	("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
437	gtid));
438
439	ntc = (tc % chunk ? `1` : `0`) + tc / chunk;
440	if (nproc > `1` && ntc >= nproc) {
441	KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
442	T id = tid;
443	T small_chunk, extras, p_extra = `0`;
444	kmp_uint32 old = UNUSED;
445	int claimed = pr->steal_flag.compare_exchange_strong(old, CLAIMED);
446	if (traits_t<T>::type_size > `4`) {
447	// AC: TODO: check if 16-byte CAS available and use it to
448	// improve performance (probably wait for explicit request
449	// before spending time on this).
450	// For now use dynamically allocated per-private-buffer lock,
451	// free memory in __kmp_dispatch_next when status==0.
452	pr->u.p.steal_lock = (kmp_lock_t )__kmp_allocate(sizeof*(kmp_lock_t));
453	__kmp_init_lock(pr->u.p.steal_lock);
454	}
455
456	#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
457	// Iterations are divided in a 60/40 skewed distribution among CORE and
458	// ATOM processors for hybrid systems
459	bool use_hybrid = false;
460	kmp_hw_core_type_t core_type = KMP_HW_CORE_TYPE_UNKNOWN;
461	T first_thread_with_ecore = `0`;
462	T num_procs_with_pcore = `0`;
463	T num_procs_with_ecore = `0`;
464	T p_ntc = `0`, e_ntc = `0`;
465	if (__kmp_is_hybrid_cpu() && __kmp_affinity.type != affinity_none &&
466	__kmp_affinity.type != affinity_explicit) {
467	use_hybrid = true;
468	core_type = (kmp_hw_core_type_t)th->th.th_topology_attrs.core_type;
469	if (core_type != KMP_HW_CORE_TYPE_UNKNOWN &&
470	__kmp_first_osid_with_ecore > -`1`) {
471	for (int i = `0`; i < team->t.t_nproc; ++i) {
472	kmp_hw_core_type_t type = (kmp_hw_core_type_t)team->t.t_threads[i]
473	->th.th_topology_attrs.core_type;
474	int id = team->t.t_threads[i]->th.th_topology_ids.os_id;
475	if (id == __kmp_first_osid_with_ecore) {
476	first_thread_with_ecore =
477	team->t.t_threads[i]->th.th_info.ds.ds_tid;
478	}
479	if (type == KMP_HW_CORE_TYPE_CORE) {
480	num_procs_with_pcore++;
481	} else if (type == KMP_HW_CORE_TYPE_ATOM) {
482	num_procs_with_ecore++;
483	} else {
484	use_hybrid = false;
485	break;
486	}
487	}
488	}
489	if (num_procs_with_pcore > `0` && num_procs_with_ecore > `0`) {
490	float multiplier = `60.0` / `40.0`;
491	float p_ratio = (float)num_procs_with_pcore / nproc;
492	float e_ratio = (float)num_procs_with_ecore / nproc;
493	float e_multiplier =
494	(float)`1` /
495	(((multiplier * num_procs_with_pcore) / nproc) + e_ratio);
496	float p_multiplier = multiplier * e_multiplier;
497	p_ntc = __kmp_get_round_val(ntc * p_ratio * p_multiplier);
498	if ((int)p_ntc > (int)(ntc * p_ratio * p_multiplier))
499	e_ntc =
500	(int)(__kmp_round_2decimal_val(ntc * e_ratio * e_multiplier));
501	else
502	e_ntc = __kmp_get_round_val(ntc * e_ratio * e_multiplier);
503	KMP_DEBUG_ASSERT(ntc == p_ntc + e_ntc);
504
505	// Use regular static steal if not enough chunks for skewed
506	// distribution
507	use_hybrid = (use_hybrid && (p_ntc >= num_procs_with_pcore &&
508	e_ntc >= num_procs_with_ecore)
509	? true
510	: false);
511	} else {
512	use_hybrid = false;
513	}
514	}
515	pr->flags.use_hybrid = use_hybrid;
516	pr->u.p.pchunks = p_ntc;
517	pr->u.p.num_procs_with_pcore = num_procs_with_pcore;
518	pr->u.p.first_thread_with_ecore = first_thread_with_ecore;
519
520	if (use_hybrid) {
521	KMP_DEBUG_ASSERT(nproc == num_procs_with_pcore + num_procs_with_ecore);
522	T big_chunk = p_ntc / num_procs_with_pcore;
523	small_chunk = e_ntc / num_procs_with_ecore;
524
525	extras =
526	(p_ntc % num_procs_with_pcore) + (e_ntc % num_procs_with_ecore);
527
528	p_extra = (big_chunk - small_chunk);
529
530	if (core_type == KMP_HW_CORE_TYPE_CORE) {
531	if (id < first_thread_with_ecore) {
532	init =
533	id * small_chunk + id * p_extra + (id < extras ? id : extras);
534	} else {
535	init = id * small_chunk + (id - num_procs_with_ecore) * p_extra +
536	(id < extras ? id : extras);
537	}
538	} else {
539	if (id == first_thread_with_ecore) {
540	init =
541	id * small_chunk + id * p_extra + (id < extras ? id : extras);
542	} else {
543	init = id * small_chunk + first_thread_with_ecore * p_extra +
544	(id < extras ? id : extras);
545	}
546	}
547	p_extra = (core_type == KMP_HW_CORE_TYPE_CORE) ? p_extra : `0`;
548	} else
549	#endif
550	{
551	small_chunk = ntc / nproc;
552	extras = ntc % nproc;
553	init = id * small_chunk + (id < extras ? id : extras);
554	p_extra = `0`;
555	}
556	pr->u.p.count = init;
557	if (claimed) { // are we succeeded in claiming own buffer?
558	pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? `1` : `0`);
559	// Other threads will inspect steal_flag when searching for a victim.
560	// READY means other threads may steal from this thread from now on.
561	KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
562	} else {
563	// other thread has stolen whole our range
564	KMP_DEBUG_ASSERT(pr->steal_flag == THIEF);
565	pr->u.p.ub = init; // mark there is no iterations to work on
566	}
567	pr->u.p.parm2 = ntc; // save number of chunks
568	// parm3 is the number of times to attempt stealing which is
569	// nproc (just a heuristics, could be optimized later on).
570	pr->u.p.parm3 = nproc;
571	pr->u.p.parm4 = (id + `1`) % nproc; // remember neighbour tid
572	break;
573	} else {
574	/ too few chunks: switching to kmp_sch_dynamic_chunked /
575	schedule = kmp_sch_dynamic_chunked;
576	KD_TRACE(`100`, ("__kmp_dispatch_init_algorithm: T#%d switching to "
577	"kmp_sch_dynamic_chunked\n",
578	gtid));
579	goto dynamic_init;
580	break;
581	} // if
582	} // case
583	#endif
584	case kmp_sch_static_balanced: {
585	T init, limit;
586
587	KD_TRACE(
588	`100`,
589	("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
590	gtid));
591
592	if (nproc > `1`) {
593	T id = tid;
594
595	if (tc < nproc) {
596	if (id < tc) {
597	init = id;
598	limit = id;
599	pr->u.p.parm1 = (id == tc - `1`); / parm1 stores plastiter /*
600	} else {
601	pr->u.p.count = `1`; / means no more chunks to execute /
602	pr->u.p.parm1 = FALSE;
603	break;
604	}
605	} else {
606	T small_chunk = tc / nproc;
607	T extras = tc % nproc;
608	init = id * small_chunk + (id < extras ? id : extras);
609	limit = init + small_chunk - (id < extras ? `0` : `1`);
610	pr->u.p.parm1 = (id == nproc - `1`);
611	}
612	} else {
613	if (tc > `0`) {
614	init = `0`;
615	limit = tc - `1`;
616	pr->u.p.parm1 = TRUE;
617	} else {
618	// zero trip count
619	pr->u.p.count = `1`; / means no more chunks to execute /
620	pr->u.p.parm1 = FALSE;
621	break;
622	}
623	}
624	#if USE_ITT_BUILD
625	// Calculate chunk for metadata report
626	if (itt_need_metadata_reporting)
627	if (cur_chunk)
628	*cur_chunk = limit - init + `1`;
629	#endif
630	if (st == `1`) {
631	pr->u.p.lb = lb + init;
632	pr->u.p.ub = lb + limit;
633	} else {
634	// calculated upper bound, "ub" is user-defined upper bound
635	T ub_tmp = lb + limit * st;
636	pr->u.p.lb = lb + init * st;
637	// adjust upper bound to "ub" if needed, so that MS lastprivate will match
638	// it exactly
639	if (st > `0`) {
640	pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
641	} else {
642	pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
643	}
644	}
645	if (pr->flags.ordered) {
646	pr->u.p.ordered_lower = init;
647	pr->u.p.ordered_upper = limit;
648	}
649	break;
650	} // case
651	case kmp_sch_static_balanced_chunked: {
652	// similar to balanced, but chunk adjusted to multiple of simd width
653	T nth = nproc;
654	KD_TRACE(`100`, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
655	" -> falling-through to static_greedy\n",
656	gtid));
657	schedule = kmp_sch_static_greedy;
658	if (nth > `1`)
659	pr->u.p.parm1 = ((tc + nth - `1`) / nth + chunk - `1`) & ~(chunk - `1`);
660	else
661	pr->u.p.parm1 = tc;
662	break;
663	} // case
664	case kmp_sch_guided_simd:
665	case kmp_sch_guided_iterative_chunked: {
666	KD_TRACE(
667	`100`,
668	("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
669	" case\n",
670	gtid));
671
672	if (nproc > `1`) {
673	if ((`2L` * chunk + `1`) * nproc >= tc) {
674	/ chunk size too large, switch to dynamic /
675	schedule = kmp_sch_dynamic_chunked;
676	goto dynamic_init;
677	} else {
678	// when remaining iters become less than parm2 - switch to dynamic
679	pr->u.p.parm2 = guided_int_param * nproc * (chunk + `1`);
680	(double* *)&pr->u.p.parm3 =
681	guided_flt_param / (double)nproc; // may occupy parm3 and parm4
682	}
683	} else {
684	KD_TRACE(`100`, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
685	"kmp_sch_static_greedy\n",
686	gtid));
687	schedule = kmp_sch_static_greedy;
688	/ team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy /
689	KD_TRACE(
690	`100`,
691	("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
692	gtid));
693	pr->u.p.parm1 = tc;
694	} // if
695	} // case
696	break;
697	case kmp_sch_guided_analytical_chunked: {
698	KD_TRACE(`100`, ("__kmp_dispatch_init_algorithm: T#%d "
699	"kmp_sch_guided_analytical_chunked case\n",
700	gtid));
701
702	if (nproc > `1`) {
703	if ((`2L` * chunk + `1`) * nproc >= tc) {
704	/ chunk size too large, switch to dynamic /
705	schedule = kmp_sch_dynamic_chunked;
706	goto dynamic_init;
707	} else {
708	/ commonly used term: (2 nproc - 1)/(2 nproc) /
709	DBL x;
710
711	#if KMP_USE_X87CONTROL
712	/ Linux* OS already has 64-bit computation by default for long double,*
713	and on Windows OS on Intel(R) 64, /Qlong_double doesn't work. On*
714	Windows OS on IA-32 architecture, we need to set precision to 64-bit*
715	instead of the default 53-bit. Even though long double doesn't work
716	on Windows OS on Intel(R) 64, the resulting lack of precision is not*
717	expected to impact the correctness of the algorithm, but this has not
718	been mathematically proven. /*
719	// save original FPCW and set precision to 64-bit, as
720	// Windows OS on IA-32 architecture defaults to 53-bit*
721	unsigned int oldFpcw = _control87(`0`, `0`);
722	_control87(_PC_64, _MCW_PC); // 0,0x30000
723	#endif
724	/ value used for comparison in solver for cross-over point /
725	KMP_ASSERT(tc > `0`);
726	long double target = ((long double)chunk * `2` + `1`) * nproc / tc;
727
728	/ crossover point--chunk indexes equal to or greater than*
729	this point switch to dynamic-style scheduling /*
730	UT cross;
731
732	/ commonly used term: (2 nproc - 1)/(2 nproc) /
733	x = `1.0` - `0.5` / (double)nproc;
734
735	#ifdef KMP_DEBUG
736	{ // test natural alignment
737	struct _test_a {
738	char a;
739	union {
740	char b;
741	DBL d;
742	};
743	} t;
744	ptrdiff_t natural_alignment =
745	(ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)`1`;
746	//__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
747	// long)natural_alignment );
748	KMP_DEBUG_ASSERT(
749	(((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == `0`);
750	}
751	#endif // KMP_DEBUG
752
753	/ save the term in thread private dispatch structure /
754	(DBL )&pr->u.p.parm3 = x;
755
756	/ solve for the crossover point to the nearest integer i for which C_i*
757	<= chunk /*
758	{
759	UT left, right, mid;
760	long double p;
761
762	/ estimate initial upper and lower bound /
763
764	/ doesn't matter what value right is as long as it is positive, but*
765	it affects performance of the solver /*
766	right = `229`;
767	p = __kmp_pow<UT>(x, right);
768	if (p > target) {
769	do {
770	p *= p;
771	right <<= `1`;
772	} while (p > target && right < (`1` << `27`));
773	/ lower bound is previous (failed) estimate of upper bound /
774	left = right >> `1`;
775	} else {
776	left = `0`;
777	}
778
779	/ bisection root-finding method /
780	while (left + `1` < right) {
781	mid = (left + right) / `2`;
782	if (__kmp_pow<UT>(x, mid) > target) {
783	left = mid;
784	} else {
785	right = mid;
786	}
787	} // while
788	cross = right;
789	}
790	/ assert sanity of computed crossover point /
791	KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - `1`) > target &&
792	__kmp_pow<UT>(x, cross) <= target);
793
794	/ save the crossover point in thread private dispatch structure /
795	pr->u.p.parm2 = cross;
796
797	// C75803
798	#if ((KMP_OS_LINUX \|\| KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
799	#define GUIDED_ANALYTICAL_WORKAROUND ((DBL )&pr->u.p.parm3)
800	#else
801	#define GUIDED_ANALYTICAL_WORKAROUND (x)
802	#endif
803	/ dynamic-style scheduling offset /
804	pr->u.p.count = tc -
805	__kmp_dispatch_guided_remaining(
806	tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
807	cross * chunk;
808	#if KMP_USE_X87CONTROL
809	// restore FPCW
810	_control87(oldFpcw, _MCW_PC);
811	#endif
812	} // if
813	} else {
814	KD_TRACE(`100`, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
815	"kmp_sch_static_greedy\n",
816	gtid));
817	schedule = kmp_sch_static_greedy;
818	/ team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy /
819	pr->u.p.parm1 = tc;
820	} // if
821	} // case
822	break;
823	case kmp_sch_static_greedy:
824	KD_TRACE(
825	`100`,
826	("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
827	gtid));
828	pr->u.p.parm1 = (nproc > `1`) ? (tc + nproc - `1`) / nproc : tc;
829	break;
830	case kmp_sch_static_chunked:
831	case kmp_sch_dynamic_chunked:
832	dynamic_init:
833	if (tc == `0`)
834	break;
835	if (pr->u.p.parm1 <= `0`)
836	pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
837	else if (pr->u.p.parm1 > tc)
838	pr->u.p.parm1 = tc;
839	// Store the total number of chunks to prevent integer overflow during
840	// bounds calculations in the get next chunk routine.
841	pr->u.p.parm2 = (tc / pr->u.p.parm1) + (tc % pr->u.p.parm1 ? `1` : `0`);
842	KD_TRACE(`100`, ("__kmp_dispatch_init_algorithm: T#%d "
843	"kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
844	gtid));
845	break;
846	case kmp_sch_trapezoidal: {
847	/ TSS: trapezoid self-scheduling, minimum chunk_size = parm1 /
848
849	T parm1, parm2, parm3, parm4;
850	KD_TRACE(`100`,
851	("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
852	gtid));
853
854	parm1 = chunk;
855
856	/ F : size of the first cycle /
857	parm2 = (tc / (`2` * nproc));
858
859	if (parm2 < `1`) {
860	parm2 = `1`;
861	}
862
863	/ L : size of the last cycle. Make sure the last cycle is not larger*
864	than the first cycle. /*
865	if (parm1 < `1`) {
866	parm1 = `1`;
867	} else if (parm1 > parm2) {
868	parm1 = parm2;
869	}
870
871	/ N : number of cycles /
872	parm3 = (parm2 + parm1);
873	parm3 = (`2` * tc + parm3 - `1`) / parm3;
874
875	if (parm3 < `2`) {
876	parm3 = `2`;
877	}
878
879	/ sigma : decreasing incr of the trapezoid /
880	parm4 = (parm3 - `1`);
881	parm4 = (parm2 - parm1) / parm4;
882
883	// pointless check, because parm4 >= 0 always
884	// if ( parm4 < 0 ) {
885	// parm4 = 0;
886	//}
887
888	pr->u.p.parm1 = parm1;
889	pr->u.p.parm2 = parm2;
890	pr->u.p.parm3 = parm3;
891	pr->u.p.parm4 = parm4;
892	} // case
893	break;
894
895	default: {
896	__kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
897	KMP_HNT(GetNewerLibrary), // Hint
898	__kmp_msg_null // Variadic argument list terminator
899	);
900	} break;
901	} // switch
902	pr->schedule = schedule;
903	}
904
905	#if KMP_USE_HIER_SCHED
906	template <typename T>
907	inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
908	typename traits_t<T>::signed_t st);
909	template <>
910	inline void
911	__kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
912	kmp_int32 ub, kmp_int32 st) {
913	__kmp_dispatch_init_hierarchy<kmp_int32>(
914	loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
915	__kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
916	}
917	template <>
918	inline void
919	__kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
920	kmp_uint32 ub, kmp_int32 st) {
921	__kmp_dispatch_init_hierarchy<kmp_uint32>(
922	loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
923	__kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
924	}
925	template <>
926	inline void
927	__kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
928	kmp_int64 ub, kmp_int64 st) {
929	__kmp_dispatch_init_hierarchy<kmp_int64>(
930	loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
931	__kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
932	}
933	template <>
934	inline void
935	__kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
936	kmp_uint64 ub, kmp_int64 st) {
937	__kmp_dispatch_init_hierarchy<kmp_uint64>(
938	loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
939	__kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
940	}
941
942	// free all the hierarchy scheduling memory associated with the team
943	void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
944	int num_disp_buff = team->t.t_max_nproc > `1` ? __kmp_dispatch_num_buffers : `2`;
945	for (int i = `0`; i < num_disp_buff; ++i) {
946	// type does not matter here so use kmp_int32
947	auto sh =
948	reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
949	&team->t.t_disp_buffer[i]);
950	if (sh->hier) {
951	sh->hier->deallocate();
952	__kmp_free(sh->hier);
953	}
954	}
955	}
956	#endif
957
958	// UT - unsigned flavor of T, ST - signed flavor of T,
959	// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
960	template <typename T>
961	static void
962	__kmp_dispatch_init(ident_t loc, int* gtid, enum sched_type schedule, T lb,
963	T ub, typename traits_t<T>::signed_t st,
964	typename traits_t<T>::signed_t chunk, int push_ws) {
965	typedef typename traits_t<T>::unsigned_t UT;
966
967	int active;
968	kmp_info_t *th;
969	kmp_team_t *team;
970	kmp_uint32 my_buffer_index;
971	dispatch_private_info_template<T> *pr;
972	dispatch_shared_info_template<T> volatile *sh;
973
974	KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
975	sizeof(dispatch_private_info));
976	KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
977	sizeof(dispatch_shared_info));
978	__kmp_assert_valid_gtid(gtid);
979
980	if (!TCR_4(__kmp_init_parallel))
981	__kmp_parallel_initialize();
982
983	__kmp_resume_if_soft_paused();
984
985	#if INCLUDE_SSC_MARKS
986	SSC_MARK_DISPATCH_INIT();
987	#endif
988	#ifdef KMP_DEBUG
989	typedef typename traits_t<T>::signed_t ST;
990	{
991	char *buff;
992	// create format specifiers before the debug output
993	buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
994	"chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
995	traits_t<ST>::spec, traits_t<T>::spec,
996	traits_t<T>::spec, traits_t<ST>::spec);
997	KD_TRACE(`10`, (buff, gtid, schedule, chunk, lb, ub, st));
998	__kmp_str_free(str: &buff);
999	}
1000	#endif
1001	/ setup data /
1002	th = __kmp_threads[gtid];
1003	team = th->th.th_team;
1004	active = !team->t.t_serialized;
1005	th->th.th_ident = loc;
1006
1007	// Any half-decent optimizer will remove this test when the blocks are empty
1008	// since the macros expand to nothing
1009	// when statistics are disabled.
1010	if (schedule == __kmp_static) {
1011	KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
1012	} else {
1013	KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
1014	}
1015
1016	#if KMP_USE_HIER_SCHED
1017	// Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable
1018	// Hierarchical scheduling does not work with ordered, so if ordered is
1019	// detected, then revert back to threaded scheduling.
1020	bool ordered;
1021	enum sched_type my_sched = schedule;
1022	my_buffer_index = th->th.th_dispatch->th_disp_index;
1023	pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1024	&th->th.th_dispatch
1025	->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
1026	my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
1027	if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
1028	my_sched =
1029	(enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
1030	ordered = (kmp_ord_lower & my_sched);
1031	if (pr->flags.use_hier) {
1032	if (ordered) {
1033	KD_TRACE(`100`, ("__kmp_dispatch_init: T#%d ordered loop detected. "
1034	"Disabling hierarchical scheduling.\n",
1035	gtid));
1036	pr->flags.use_hier = FALSE;
1037	}
1038	}
1039	if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > `0`) {
1040	// Don't use hierarchical for ordered parallel loops and don't
1041	// use the runtime hierarchy if one was specified in the program
1042	if (!ordered && !pr->flags.use_hier)
1043	__kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
1044	}
1045	#endif // KMP_USE_HIER_SCHED
1046
1047	#if USE_ITT_BUILD
1048	kmp_uint64 cur_chunk = chunk;
1049	int itt_need_metadata_reporting =
1050	__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == `3` &&
1051	KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
1052	team->t.t_active_level == `1`;
1053	#endif
1054	if (!active) {
1055	pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1056	th->th.th_dispatch->th_disp_buffer); / top of the stack /
1057	} else {
1058	KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1059	&th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1060
1061	my_buffer_index = th->th.th_dispatch->th_disp_index++;
1062
1063	/ What happens when number of threads changes, need to resize buffer? /
1064	pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1065	&th->th.th_dispatch
1066	->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
1067	sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
1068	&team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
1069	KD_TRACE(`10`, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
1070	my_buffer_index));
1071	if (sh->buffer_index != my_buffer_index) { // too many loops in progress?
1072	KD_TRACE(`100`, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d"
1073	" sh->buffer_index:%d\n",
1074	gtid, my_buffer_index, sh->buffer_index));
1075	__kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
1076	__kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
1077	// Note: KMP_WAIT() cannot be used there: buffer index and
1078	// my_buffer_index are always* 32-bit integers.*
1079	KD_TRACE(`100`, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
1080	"sh->buffer_index:%d\n",
1081	gtid, my_buffer_index, sh->buffer_index));
1082	}
1083	}
1084
1085	__kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
1086	#if USE_ITT_BUILD
1087	&cur_chunk,
1088	#endif
1089	chunk, (T)th->th.th_team_nproc,
1090	(T)th->th.th_info.ds.ds_tid);
1091	if (active) {
1092	if (pr->flags.ordered == `0`) {
1093	th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
1094	th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
1095	} else {
1096	th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
1097	th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
1098	}
1099	th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
1100	th->th.th_dispatch->th_dispatch_sh_current =
1101	CCAST(dispatch_shared_info_t , (volatile* dispatch_shared_info_t *)sh);
1102	#if USE_ITT_BUILD
1103	if (pr->flags.ordered) {
1104	__kmp_itt_ordered_init(gtid);
1105	}
1106	// Report loop metadata
1107	if (itt_need_metadata_reporting) {
1108	// Only report metadata by primary thread of active team at level 1
1109	kmp_uint64 schedtype = `0`;
1110	switch (schedule) {
1111	case kmp_sch_static_chunked:
1112	case kmp_sch_static_balanced: // Chunk is calculated in the switch above
1113	break;
1114	case kmp_sch_static_greedy:
1115	cur_chunk = pr->u.p.parm1;
1116	break;
1117	case kmp_sch_dynamic_chunked:
1118	schedtype = `1`;
1119	break;
1120	case kmp_sch_guided_iterative_chunked:
1121	case kmp_sch_guided_analytical_chunked:
1122	case kmp_sch_guided_simd:
1123	schedtype = `2`;
1124	break;
1125	default:
1126	// Should we put this case under "static"?
1127	// case kmp_sch_static_steal:
1128	schedtype = `3`;
1129	break;
1130	}
1131	__kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
1132	}
1133	#if KMP_USE_HIER_SCHED
1134	if (pr->flags.use_hier) {
1135	pr->u.p.count = `0`;
1136	pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = `0`;
1137	}
1138	#endif // KMP_USER_HIER_SCHED
1139	#endif /* USE_ITT_BUILD */
1140	}
1141
1142	#ifdef KMP_DEBUG
1143	{
1144	char *buff;
1145	// create format specifiers before the debug output
1146	buff = __kmp_str_format(
1147	"__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
1148	"lb:%%%s ub:%%%s"
1149	" st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
1150	" parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1151	traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
1152	traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1153	traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
1154	traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
1155	KD_TRACE(`10`, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
1156	pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1157	pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1158	pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
1159	__kmp_str_free(str: &buff);
1160	}
1161	#endif
1162	#if OMPT_SUPPORT && OMPT_OPTIONAL
1163	if (ompt_enabled.ompt_callback_work) {
1164	ompt_team_info_t *team_info = __ompt_get_teaminfo(depth: `0`, NULL);
1165	ompt_task_info_t *task_info = __ompt_get_task_info_object(depth: `0`);
1166	ompt_callbacks.ompt_callback(ompt_callback_work)(
1167	ompt_get_work_schedule(pr->schedule), ompt_scope_begin,
1168	&(team_info->parallel_data), &(task_info->task_data), pr->u.p.tc,
1169	OMPT_LOAD_RETURN_ADDRESS(gtid));
1170	}
1171	#endif
1172	KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
1173	}
1174
1175	/ For ordered loops, either __kmp_dispatch_finish() should be called after*
1176	* every iteration, or __kmp_dispatch_finish_chunk() should be called after
1177	* every chunk of iterations. If the ordered section(s) were not executed
1178	* for this iteration (or every iteration in this chunk), we need to set the
1179	* ordered iteration counters so that the next thread can proceed. */
1180	template <typename UT>
1181	static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
1182	typedef typename traits_t<UT>::signed_t ST;
1183	__kmp_assert_valid_gtid(gtid);
1184	kmp_info_t *th = __kmp_threads[gtid];
1185
1186	KD_TRACE(`100`, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1187	if (!th->th.th_team->t.t_serialized) {
1188
1189	dispatch_private_info_template<UT> *pr =
1190	reinterpret_cast<dispatch_private_info_template<UT> *>(
1191	th->th.th_dispatch->th_dispatch_pr_current);
1192	dispatch_shared_info_template<UT> volatile *sh =
1193	reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1194	th->th.th_dispatch->th_dispatch_sh_current);
1195	KMP_DEBUG_ASSERT(pr);
1196	KMP_DEBUG_ASSERT(sh);
1197	KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1198	&th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1199
1200	if (pr->ordered_bumped) {
1201	KD_TRACE(
1202	`1000`,
1203	("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1204	gtid));
1205	pr->ordered_bumped = `0`;
1206	} else {
1207	UT lower = pr->u.p.ordered_lower;
1208
1209	#ifdef KMP_DEBUG
1210	{
1211	char *buff;
1212	// create format specifiers before the debug output
1213	buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1214	"ordered_iteration:%%%s lower:%%%s\n",
1215	traits_t<UT>::spec, traits_t<UT>::spec);
1216	KD_TRACE(`1000`, (buff, gtid, sh->u.s.ordered_iteration, lower));
1217	__kmp_str_free(str: &buff);
1218	}
1219	#endif
1220
1221	__kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1222	__kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1223	KMP_MB(); / is this necessary? /
1224	#ifdef KMP_DEBUG
1225	{
1226	char *buff;
1227	// create format specifiers before the debug output
1228	buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1229	"ordered_iteration:%%%s lower:%%%s\n",
1230	traits_t<UT>::spec, traits_t<UT>::spec);
1231	KD_TRACE(`1000`, (buff, gtid, sh->u.s.ordered_iteration, lower));
1232	__kmp_str_free(str: &buff);
1233	}
1234	#endif
1235
1236	test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1237	} // if
1238	} // if
1239	KD_TRACE(`100`, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1240	}
1241
1242	#ifdef KMP_GOMP_COMPAT
1243
1244	template <typename UT>
1245	static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1246	typedef typename traits_t<UT>::signed_t ST;
1247	__kmp_assert_valid_gtid(gtid);
1248	kmp_info_t *th = __kmp_threads[gtid];
1249
1250	KD_TRACE(`100`, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1251	if (!th->th.th_team->t.t_serialized) {
1252	dispatch_private_info_template<UT> *pr =
1253	reinterpret_cast<dispatch_private_info_template<UT> *>(
1254	th->th.th_dispatch->th_dispatch_pr_current);
1255	dispatch_shared_info_template<UT> volatile *sh =
1256	reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1257	th->th.th_dispatch->th_dispatch_sh_current);
1258	KMP_DEBUG_ASSERT(pr);
1259	KMP_DEBUG_ASSERT(sh);
1260	KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1261	&th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1262
1263	UT lower = pr->u.p.ordered_lower;
1264	UT upper = pr->u.p.ordered_upper;
1265	UT inc = upper - lower + `1`;
1266
1267	if (pr->ordered_bumped == inc) {
1268	KD_TRACE(
1269	`1000`,
1270	("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1271	gtid));
1272	pr->ordered_bumped = `0`;
1273	} else {
1274	inc -= pr->ordered_bumped;
1275
1276	#ifdef KMP_DEBUG
1277	{
1278	char *buff;
1279	// create format specifiers before the debug output
1280	buff = __kmp_str_format(
1281	"__kmp_dispatch_finish_chunk: T#%%d before wait: "
1282	"ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1283	traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1284	KD_TRACE(`1000`, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1285	__kmp_str_free(str: &buff);
1286	}
1287	#endif
1288
1289	__kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1290	__kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1291
1292	KMP_MB(); / is this necessary? /
1293	KD_TRACE(`1000`, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1294	"ordered_bumped to zero\n",
1295	gtid));
1296	pr->ordered_bumped = `0`;
1297	//!!!!! TODO check if the inc should be unsigned, or signed???
1298	#ifdef KMP_DEBUG
1299	{
1300	char *buff;
1301	// create format specifiers before the debug output
1302	buff = __kmp_str_format(
1303	"__kmp_dispatch_finish_chunk: T#%%d after wait: "
1304	"ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1305	traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1306	traits_t<UT>::spec);
1307	KD_TRACE(`1000`,
1308	(buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1309	__kmp_str_free(str: &buff);
1310	}
1311	#endif
1312
1313	test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1314	}
1315	// }
1316	}
1317	KD_TRACE(`100`, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1318	}
1319
1320	#endif /* KMP_GOMP_COMPAT */
1321
1322	template <typename T>
1323	int __kmp_dispatch_next_algorithm(int gtid,
1324	dispatch_private_info_template<T> *pr,
1325	dispatch_shared_info_template<T> volatile *sh,
1326	kmp_int32 p_last, T p_lb, T *p_ub,
1327	typename traits_t<T>::signed_t *p_st, T nproc,
1328	T tid) {
1329	typedef typename traits_t<T>::unsigned_t UT;
1330	typedef typename traits_t<T>::signed_t ST;
1331	typedef typename traits_t<T>::floating_t DBL;
1332	int status = `0`;
1333	bool last = false;
1334	T start;
1335	ST incr;
1336	UT limit, trip, init;
1337	kmp_info_t *th = __kmp_threads[gtid];
1338	kmp_team_t *team = th->th.th_team;
1339
1340	KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1341	&th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1342	KMP_DEBUG_ASSERT(pr);
1343	KMP_DEBUG_ASSERT(sh);
1344	KMP_DEBUG_ASSERT(tid >= `0` && tid < nproc);
1345	#ifdef KMP_DEBUG
1346	{
1347	char *buff;
1348	// create format specifiers before the debug output
1349	buff =
1350	__kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1351	"sh:%%p nproc:%%%s tid:%%%s\n",
1352	traits_t<T>::spec, traits_t<T>::spec);
1353	KD_TRACE(`10`, (buff, gtid, pr, sh, nproc, tid));
1354	__kmp_str_free(str: &buff);
1355	}
1356	#endif
1357
1358	// zero trip count
1359	if (pr->u.p.tc == `0`) {
1360	KD_TRACE(`10`,
1361	("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1362	"zero status:%d\n",
1363	gtid, status));
1364	return `0`;
1365	}
1366
1367	switch (pr->schedule) {
1368	#if KMP_STATIC_STEAL_ENABLED
1369	case kmp_sch_static_steal: {
1370	T chunk = pr->u.p.parm1;
1371	UT nchunks = pr->u.p.parm2;
1372	KD_TRACE(`100`,
1373	("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1374	gtid));
1375
1376	trip = pr->u.p.tc - `1`;
1377
1378	if (traits_t<T>::type_size > `4`) {
1379	// use lock for 8-byte induction variable.
1380	// TODO (optional): check presence and use 16-byte CAS
1381	kmp_lock_t *lck = pr->u.p.steal_lock;
1382	KMP_DEBUG_ASSERT(lck != NULL);
1383	if (pr->u.p.count < (UT)pr->u.p.ub) {
1384	KMP_DEBUG_ASSERT(pr->steal_flag == READY);
1385	__kmp_acquire_lock(lck, gtid);
1386	// try to get own chunk of iterations
1387	init = (pr->u.p.count)++;
1388	status = (init < (UT)pr->u.p.ub);
1389	__kmp_release_lock(lck, gtid);
1390	} else {
1391	status = `0`; // no own chunks
1392	}
1393	if (!status) { // try to steal
1394	kmp_lock_t lckv; // victim buffer's lock*
1395	T while_limit = pr->u.p.parm3;
1396	T while_index = `0`;
1397	int idx = (th->th.th_dispatch->th_disp_index - `1`) %
1398	__kmp_dispatch_num_buffers; // current loop index
1399	// note: victim thread can potentially execute another loop
1400	KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
1401	while ((!status) && (while_limit != ++while_index)) {
1402	dispatch_private_info_template<T> *v;
1403	T remaining;
1404	T victimId = pr->u.p.parm4;
1405	T oldVictimId = victimId ? victimId - `1` : nproc - `1`;
1406	v = reinterpret_cast<dispatch_private_info_template<T> *>(
1407	&team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1408	KMP_DEBUG_ASSERT(v);
1409	while ((v == pr \|\| KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
1410	oldVictimId != victimId) {
1411	victimId = (victimId + `1`) % nproc;
1412	v = reinterpret_cast<dispatch_private_info_template<T> *>(
1413	&team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1414	KMP_DEBUG_ASSERT(v);
1415	}
1416	if (v == pr \|\| KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
1417	continue; // try once more (nproc attempts in total)
1418	}
1419	if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
1420	kmp_uint32 old = UNUSED;
1421	// try to steal whole range from inactive victim
1422	status = v->steal_flag.compare_exchange_strong(old, THIEF);
1423	if (status) {
1424	// initialize self buffer with victim's whole range of chunks
1425	T id = victimId;
1426	T small_chunk = `0`, extras = `0`, p_extra = `0`;
1427	__kmp_initialize_self_buffer<T>(team, id, pr, nchunks, nproc,
1428	init, small_chunk, extras,
1429	p_extra);
1430	__kmp_acquire_lock(lck, gtid);
1431	pr->u.p.count = init + `1`; // exclude one we execute immediately
1432	pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? `1` : `0`);
1433	__kmp_release_lock(lck, gtid);
1434	pr->u.p.parm4 = (id + `1`) % nproc; // remember neighbour tid
1435	// no need to reinitialize other thread invariants: lb, st, etc.
1436	#ifdef KMP_DEBUG
1437	{
1438	char *buff;
1439	// create format specifiers before the debug output
1440	buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1441	"stolen chunks from T#%%d, "
1442	"count:%%%s ub:%%%s\n",
1443	traits_t<UT>::spec, traits_t<T>::spec);
1444	KD_TRACE(`10`, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
1445	__kmp_str_free(str: &buff);
1446	}
1447	#endif
1448	// activate non-empty buffer and let others steal from us
1449	if (pr->u.p.count < (UT)pr->u.p.ub)
1450	KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1451	break;
1452	}
1453	}
1454	if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY \|\|
1455	v->u.p.count >= (UT)v->u.p.ub) {
1456	pr->u.p.parm4 = (victimId + `1`) % nproc; // shift start victim tid
1457	continue; // no chunks to steal, try next victim
1458	}
1459	lckv = v->u.p.steal_lock;
1460	KMP_ASSERT(lckv != NULL);
1461	__kmp_acquire_lock(lck: lckv, gtid);
1462	limit = v->u.p.ub; // keep initial ub
1463	if (v->u.p.count >= limit) {
1464	__kmp_release_lock(lck: lckv, gtid);
1465	pr->u.p.parm4 = (victimId + `1`) % nproc; // shift start victim tid
1466	continue; // no chunks to steal, try next victim
1467	}
1468
1469	// stealing succeded, reduce victim's ub by 1/4 of undone chunks
1470	// TODO: is this heuristics good enough??
1471	remaining = limit - v->u.p.count;
1472	if (remaining > `7`) {
1473	// steal 1/4 of remaining
1474	KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> `2`);
1475	init = (v->u.p.ub -= (remaining >> `2`));
1476	} else {
1477	// steal 1 chunk of 1..7 remaining
1478	KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, `1`);
1479	init = (v->u.p.ub -= `1`);
1480	}
1481	__kmp_release_lock(lck: lckv, gtid);
1482	#ifdef KMP_DEBUG
1483	{
1484	char *buff;
1485	// create format specifiers before the debug output
1486	buff = __kmp_str_format(
1487	"__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1488	"count:%%%s ub:%%%s\n",
1489	traits_t<UT>::spec, traits_t<UT>::spec);
1490	KD_TRACE(`10`, (buff, gtid, victimId, init, limit));
1491	__kmp_str_free(str: &buff);
1492	}
1493	#endif
1494	KMP_DEBUG_ASSERT(init + `1` <= limit);
1495	pr->u.p.parm4 = victimId; // remember victim to steal from
1496	status = `1`;
1497	// now update own count and ub with stolen range excluding init chunk
1498	__kmp_acquire_lock(lck, gtid);
1499	pr->u.p.count = init + `1`;
1500	pr->u.p.ub = limit;
1501	__kmp_release_lock(lck, gtid);
1502	// activate non-empty buffer and let others steal from us
1503	if (init + `1` < limit)
1504	KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1505	} // while (search for victim)
1506	} // if (try to find victim and steal)
1507	} else {
1508	// 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1509	// as all operations on pair (count, ub) must be done atomically
1510	typedef union {
1511	struct {
1512	UT count;
1513	T ub;
1514	} p;
1515	kmp_int64 b;
1516	} union_i4;
1517	union_i4 vold, vnew;
1518	if (pr->u.p.count < (UT)pr->u.p.ub) {
1519	KMP_DEBUG_ASSERT(pr->steal_flag == READY);
1520	vold.b = (volatile* kmp_int64 *)(&pr->u.p.count);
1521	vnew.b = vold.b;
1522	vnew.p.count++; // get chunk from head of self range
1523	while (!KMP_COMPARE_AND_STORE_REL64(
1524	(volatile kmp_int64 *)&pr->u.p.count,
1525	VOLATILE_CAST(kmp_int64 ) & vold.b,
1526	VOLATILE_CAST(kmp_int64 ) & vnew.b)) {
1527	KMP_CPU_PAUSE();
1528	vold.b = (volatile* kmp_int64 *)(&pr->u.p.count);
1529	vnew.b = vold.b;
1530	vnew.p.count++;
1531	}
1532	init = vold.p.count;
1533	status = (init < (UT)vold.p.ub);
1534	} else {
1535	status = `0`; // no own chunks
1536	}
1537	if (!status) { // try to steal
1538	T while_limit = pr->u.p.parm3;
1539	T while_index = `0`;
1540	int idx = (th->th.th_dispatch->th_disp_index - `1`) %
1541	__kmp_dispatch_num_buffers; // current loop index
1542	// note: victim thread can potentially execute another loop
1543	KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
1544	while ((!status) && (while_limit != ++while_index)) {
1545	dispatch_private_info_template<T> *v;
1546	T remaining;
1547	T victimId = pr->u.p.parm4;
1548	T oldVictimId = victimId ? victimId - `1` : nproc - `1`;
1549	v = reinterpret_cast<dispatch_private_info_template<T> *>(
1550	&team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1551	KMP_DEBUG_ASSERT(v);
1552	while ((v == pr \|\| KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
1553	oldVictimId != victimId) {
1554	victimId = (victimId + `1`) % nproc;
1555	v = reinterpret_cast<dispatch_private_info_template<T> *>(
1556	&team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1557	KMP_DEBUG_ASSERT(v);
1558	}
1559	if (v == pr \|\| KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
1560	continue; // try once more (nproc attempts in total)
1561	}
1562	if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
1563	kmp_uint32 old = UNUSED;
1564	// try to steal whole range from inactive victim
1565	status = v->steal_flag.compare_exchange_strong(old, THIEF);
1566	if (status) {
1567	// initialize self buffer with victim's whole range of chunks
1568	T id = victimId;
1569	T small_chunk = `0`, extras = `0`, p_extra = `0`;
1570	__kmp_initialize_self_buffer<T>(team, id, pr, nchunks, nproc,
1571	init, small_chunk, extras,
1572	p_extra);
1573	vnew.p.count = init + `1`;
1574	vnew.p.ub = init + small_chunk + p_extra + (id < extras ? `1` : `0`);
1575	// write pair (count, ub) at once atomically
1576	#if KMP_ARCH_X86
1577	KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vnew.b);
1578	#else
1579	(volatile* kmp_int64 *)(&pr->u.p.count) = vnew.b;
1580	#endif
1581	pr->u.p.parm4 = (id + `1`) % nproc; // remember neighbour tid
1582	// no need to initialize other thread invariants: lb, st, etc.
1583	#ifdef KMP_DEBUG
1584	{
1585	char *buff;
1586	// create format specifiers before the debug output
1587	buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1588	"stolen chunks from T#%%d, "
1589	"count:%%%s ub:%%%s\n",
1590	traits_t<UT>::spec, traits_t<T>::spec);
1591	KD_TRACE(`10`, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
1592	__kmp_str_free(str: &buff);
1593	}
1594	#endif
1595	// activate non-empty buffer and let others steal from us
1596	if (pr->u.p.count < (UT)pr->u.p.ub)
1597	KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1598	break;
1599	}
1600	}
1601	while (`1`) { // CAS loop with check if victim still has enough chunks
1602	// many threads may be stealing concurrently from same victim
1603	vold.b = (volatile* kmp_int64 *)(&v->u.p.count);
1604	if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY \|\|
1605	vold.p.count >= (UT)vold.p.ub) {
1606	pr->u.p.parm4 = (victimId + `1`) % nproc; // shift start victim id
1607	break; // no chunks to steal, try next victim
1608	}
1609	vnew.b = vold.b;
1610	remaining = vold.p.ub - vold.p.count;
1611	// try to steal 1/4 of remaining
1612	// TODO: is this heuristics good enough??
1613	if (remaining > `7`) {
1614	vnew.p.ub -= remaining >> `2`; // steal from tail of victim's range
1615	} else {
1616	vnew.p.ub -= `1`; // steal 1 chunk of 1..7 remaining
1617	}
1618	KMP_DEBUG_ASSERT(vnew.p.ub * (UT)chunk <= trip);
1619	if (KMP_COMPARE_AND_STORE_REL64(
1620	(volatile kmp_int64 *)&v->u.p.count,
1621	VOLATILE_CAST(kmp_int64 ) & vold.b,
1622	VOLATILE_CAST(kmp_int64 ) & vnew.b)) {
1623	// stealing succedded
1624	#ifdef KMP_DEBUG
1625	{
1626	char *buff;
1627	// create format specifiers before the debug output
1628	buff = __kmp_str_format(
1629	"__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1630	"count:%%%s ub:%%%s\n",
1631	traits_t<T>::spec, traits_t<T>::spec);
1632	KD_TRACE(`10`, (buff, gtid, victimId, vnew.p.ub, vold.p.ub));
1633	__kmp_str_free(str: &buff);
1634	}
1635	#endif
1636	KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1637	vold.p.ub - vnew.p.ub);
1638	status = `1`;
1639	pr->u.p.parm4 = victimId; // keep victim id
1640	// now update own count and ub
1641	init = vnew.p.ub;
1642	vold.p.count = init + `1`;
1643	#if KMP_ARCH_X86
1644	KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1645	#else
1646	(volatile* kmp_int64 *)(&pr->u.p.count) = vold.b;
1647	#endif
1648	// activate non-empty buffer and let others steal from us
1649	if (vold.p.count < (UT)vold.p.ub)
1650	KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1651	break;
1652	} // if (check CAS result)
1653	KMP_CPU_PAUSE(); // CAS failed, repeatedly attempt
1654	} // while (try to steal from particular victim)
1655	} // while (search for victim)
1656	} // if (try to find victim and steal)
1657	} // if (4-byte induction variable)
1658	if (!status) {
1659	*p_lb = `0`;
1660	*p_ub = `0`;
1661	if (p_st != NULL)
1662	*p_st = `0`;
1663	} else {
1664	start = pr->u.p.lb;
1665	init *= chunk;
1666	limit = chunk + init - `1`;
1667	incr = pr->u.p.st;
1668	KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, `1`);
1669
1670	KMP_DEBUG_ASSERT(init <= trip);
1671	// keep track of done chunks for possible early exit from stealing
1672	// TODO: count executed chunks locally with rare update of shared location
1673	// test_then_inc<ST>((volatile ST )&sh->u.s.iteration);*
1674	if ((last = (limit >= trip)) != `0`)
1675	limit = trip;
1676	if (p_st != NULL)
1677	*p_st = incr;
1678
1679	if (incr == `1`) {
1680	*p_lb = start + init;
1681	*p_ub = start + limit;
1682	} else {
1683	p_lb = start + init incr;
1684	p_ub = start + limit incr;
1685	}
1686	} // if
1687	break;
1688	} // case
1689	#endif // KMP_STATIC_STEAL_ENABLED
1690	case kmp_sch_static_balanced: {
1691	KD_TRACE(
1692	`10`,
1693	("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1694	gtid));
1695	/ check if thread has any iteration to do /
1696	if ((status = !pr->u.p.count) != `0`) {
1697	pr->u.p.count = `1`;
1698	*p_lb = pr->u.p.lb;
1699	*p_ub = pr->u.p.ub;
1700	last = (pr->u.p.parm1 != `0`);
1701	if (p_st != NULL)
1702	*p_st = pr->u.p.st;
1703	} else { / no iterations to do /
1704	pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1705	}
1706	} // case
1707	break;
1708	case kmp_sch_static_greedy: / original code for kmp_sch_static_greedy was*
1709	merged here /*
1710	case kmp_sch_static_chunked: {
1711	T parm1;
1712
1713	KD_TRACE(`100`, ("__kmp_dispatch_next_algorithm: T#%d "
1714	"kmp_sch_static_[affinity\|chunked] case\n",
1715	gtid));
1716	parm1 = pr->u.p.parm1;
1717
1718	trip = pr->u.p.tc - `1`;
1719	init = parm1 * (pr->u.p.count + tid);
1720
1721	if ((status = (init <= trip)) != `0`) {
1722	start = pr->u.p.lb;
1723	incr = pr->u.p.st;
1724	limit = parm1 + init - `1`;
1725
1726	if ((last = (limit >= trip)) != `0`)
1727	limit = trip;
1728
1729	if (p_st != NULL)
1730	*p_st = incr;
1731
1732	pr->u.p.count += nproc;
1733
1734	if (incr == `1`) {
1735	*p_lb = start + init;
1736	*p_ub = start + limit;
1737	} else {
1738	p_lb = start + init incr;
1739	p_ub = start + limit incr;
1740	}
1741
1742	if (pr->flags.ordered) {
1743	pr->u.p.ordered_lower = init;
1744	pr->u.p.ordered_upper = limit;
1745	} // if
1746	} // if
1747	} // case
1748	break;
1749
1750	case kmp_sch_dynamic_chunked: {
1751	UT chunk_number;
1752	UT chunk_size = pr->u.p.parm1;
1753	UT nchunks = pr->u.p.parm2;
1754
1755	KD_TRACE(
1756	`100`,
1757	("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1758	gtid));
1759
1760	chunk_number = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1761	status = (chunk_number < nchunks);
1762	if (!status) {
1763	*p_lb = `0`;
1764	*p_ub = `0`;
1765	if (p_st != NULL)
1766	*p_st = `0`;
1767	} else {
1768	init = chunk_size * chunk_number;
1769	trip = pr->u.p.tc - `1`;
1770	start = pr->u.p.lb;
1771	incr = pr->u.p.st;
1772
1773	if ((last = (trip - init < (UT)chunk_size)))
1774	limit = trip;
1775	else
1776	limit = chunk_size + init - `1`;
1777
1778	if (p_st != NULL)
1779	*p_st = incr;
1780
1781	if (incr == `1`) {
1782	*p_lb = start + init;
1783	*p_ub = start + limit;
1784	} else {
1785	p_lb = start + init incr;
1786	p_ub = start + limit incr;
1787	}
1788
1789	if (pr->flags.ordered) {
1790	pr->u.p.ordered_lower = init;
1791	pr->u.p.ordered_upper = limit;
1792	} // if
1793	} // if
1794	} // case
1795	break;
1796
1797	case kmp_sch_guided_iterative_chunked: {
1798	T chunkspec = pr->u.p.parm1;
1799	KD_TRACE(`100`, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1800	"iterative case\n",
1801	gtid));
1802	trip = pr->u.p.tc;
1803	// Start atomic part of calculations
1804	while (`1`) {
1805	ST remaining; // signed, because can be < 0
1806	init = sh->u.s.iteration; // shared value
1807	remaining = trip - init;
1808	if (remaining <= `0`) { // AC: need to compare with 0 first
1809	// nothing to do, don't try atomic op
1810	status = `0`;
1811	break;
1812	}
1813	if ((T)remaining <
1814	pr->u.p.parm2) { // compare with Knproc(chunk+1), K=2 by default
1815	// use dynamic-style schedule
1816	// atomically increment iterations, get old value
1817	init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1818	(ST)chunkspec);
1819	remaining = trip - init;
1820	if (remaining <= `0`) {
1821	status = `0`; // all iterations got by other threads
1822	} else {
1823	// got some iterations to work on
1824	status = `1`;
1825	if ((T)remaining > chunkspec) {
1826	limit = init + chunkspec - `1`;
1827	} else {
1828	last = true; // the last chunk
1829	limit = init + remaining - `1`;
1830	} // if
1831	} // if
1832	break;
1833	} // if
1834	limit = init + (UT)((double)remaining *
1835	(double* )&pr->u.p.parm3); // divide by Knproc
1836	if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1837	(ST)init, (ST)limit)) {
1838	// CAS was successful, chunk obtained
1839	status = `1`;
1840	--limit;
1841	break;
1842	} // if
1843	} // while
1844	if (status != `0`) {
1845	start = pr->u.p.lb;
1846	incr = pr->u.p.st;
1847	if (p_st != NULL)
1848	*p_st = incr;
1849	p_lb = start + init incr;
1850	p_ub = start + limit incr;
1851	if (pr->flags.ordered) {
1852	pr->u.p.ordered_lower = init;
1853	pr->u.p.ordered_upper = limit;
1854	} // if
1855	} else {
1856	*p_lb = `0`;
1857	*p_ub = `0`;
1858	if (p_st != NULL)
1859	*p_st = `0`;
1860	} // if
1861	} // case
1862	break;
1863
1864	case kmp_sch_guided_simd: {
1865	// same as iterative but curr-chunk adjusted to be multiple of given
1866	// chunk
1867	T chunk = pr->u.p.parm1;
1868	KD_TRACE(`100`,
1869	("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1870	gtid));
1871	trip = pr->u.p.tc;
1872	// Start atomic part of calculations
1873	while (`1`) {
1874	ST remaining; // signed, because can be < 0
1875	init = sh->u.s.iteration; // shared value
1876	remaining = trip - init;
1877	if (remaining <= `0`) { // AC: need to compare with 0 first
1878	status = `0`; // nothing to do, don't try atomic op
1879	break;
1880	}
1881	KMP_DEBUG_ASSERT(chunk && init % chunk == `0`);
1882	// compare with Knproc(chunk+1), K=2 by default
1883	if ((T)remaining < pr->u.p.parm2) {
1884	// use dynamic-style schedule
1885	// atomically increment iterations, get old value
1886	init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1887	(ST)chunk);
1888	remaining = trip - init;
1889	if (remaining <= `0`) {
1890	status = `0`; // all iterations got by other threads
1891	} else {
1892	// got some iterations to work on
1893	status = `1`;
1894	if ((T)remaining > chunk) {
1895	limit = init + chunk - `1`;
1896	} else {
1897	last = true; // the last chunk
1898	limit = init + remaining - `1`;
1899	} // if
1900	} // if
1901	break;
1902	} // if
1903	// divide by Knproc*
1904	UT span;
1905	__kmp_type_convert((double)remaining * ((double* *)&pr->u.p.parm3),
1906	&span);
1907	UT rem = span % chunk;
1908	if (rem) // adjust so that span%chunk == 0
1909	span += chunk - rem;
1910	limit = init + span;
1911	if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1912	(ST)init, (ST)limit)) {
1913	// CAS was successful, chunk obtained
1914	status = `1`;
1915	--limit;
1916	break;
1917	} // if
1918	} // while
1919	if (status != `0`) {
1920	start = pr->u.p.lb;
1921	incr = pr->u.p.st;
1922	if (p_st != NULL)
1923	*p_st = incr;
1924	p_lb = start + init incr;
1925	p_ub = start + limit incr;
1926	if (pr->flags.ordered) {
1927	pr->u.p.ordered_lower = init;
1928	pr->u.p.ordered_upper = limit;
1929	} // if
1930	} else {
1931	*p_lb = `0`;
1932	*p_ub = `0`;
1933	if (p_st != NULL)
1934	*p_st = `0`;
1935	} // if
1936	} // case
1937	break;
1938
1939	case kmp_sch_guided_analytical_chunked: {
1940	T chunkspec = pr->u.p.parm1;
1941	UT chunkIdx;
1942	#if KMP_USE_X87CONTROL
1943	/ for storing original FPCW value for Windows* OS on*
1944	IA-32 architecture 8-byte version /*
1945	unsigned int oldFpcw;
1946	unsigned int fpcwSet = `0`;
1947	#endif
1948	KD_TRACE(`100`, ("__kmp_dispatch_next_algorithm: T#%d "
1949	"kmp_sch_guided_analytical_chunked case\n",
1950	gtid));
1951
1952	trip = pr->u.p.tc;
1953
1954	KMP_DEBUG_ASSERT(nproc > `1`);
1955	KMP_DEBUG_ASSERT((`2UL` * chunkspec + `1`) * (UT)nproc < trip);
1956
1957	while (`1`) { / this while loop is a safeguard against unexpected zero*
1958	chunk sizes /*
1959	chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1960	if (chunkIdx >= (UT)pr->u.p.parm2) {
1961	--trip;
1962	/ use dynamic-style scheduling /
1963	init = chunkIdx * chunkspec + pr->u.p.count;
1964	/ need to verify init > 0 in case of overflow in the above*
1965	* calculation */
1966	if ((status = (init > `0` && init <= trip)) != `0`) {
1967	limit = init + chunkspec - `1`;
1968
1969	if ((last = (limit >= trip)) != `0`)
1970	limit = trip;
1971	}
1972	break;
1973	} else {
1974	/ use exponential-style scheduling /
1975	/ The following check is to workaround the lack of long double precision on*
1976	Windows OS.*
1977	This check works around the possible effect that init != 0 for chunkIdx == 0.
1978	*/
1979	#if KMP_USE_X87CONTROL
1980	/ If we haven't already done so, save original*
1981	FPCW and set precision to 64-bit, as Windows OS*
1982	on IA-32 architecture defaults to 53-bit /*
1983	if (!fpcwSet) {
1984	oldFpcw = _control87(`0`, `0`);
1985	_control87(_PC_64, _MCW_PC);
1986	fpcwSet = `0x30000`;
1987	}
1988	#endif
1989	if (chunkIdx) {
1990	init = __kmp_dispatch_guided_remaining<T>(
1991	trip, (DBL )&pr->u.p.parm3, chunkIdx);
1992	KMP_DEBUG_ASSERT(init);
1993	init = trip - init;
1994	} else
1995	init = `0`;
1996	limit = trip - __kmp_dispatch_guided_remaining<T>(
1997	trip, (DBL )&pr->u.p.parm3, chunkIdx + `1`);
1998	KMP_ASSERT(init <= limit);
1999	if (init < limit) {
2000	KMP_DEBUG_ASSERT(limit <= trip);
2001	--limit;
2002	status = `1`;
2003	break;
2004	} // if
2005	} // if
2006	} // while (1)
2007	#if KMP_USE_X87CONTROL
2008	/ restore FPCW if necessary*
2009	AC: check fpcwSet flag first because oldFpcw can be uninitialized here
2010	*/
2011	if (fpcwSet && (oldFpcw & fpcwSet))
2012	_control87(oldFpcw, _MCW_PC);
2013	#endif
2014	if (status != `0`) {
2015	start = pr->u.p.lb;
2016	incr = pr->u.p.st;
2017	if (p_st != NULL)
2018	*p_st = incr;
2019	p_lb = start + init incr;
2020	p_ub = start + limit incr;
2021	if (pr->flags.ordered) {
2022	pr->u.p.ordered_lower = init;
2023	pr->u.p.ordered_upper = limit;
2024	}
2025	} else {
2026	*p_lb = `0`;
2027	*p_ub = `0`;
2028	if (p_st != NULL)
2029	*p_st = `0`;
2030	}
2031	} // case
2032	break;
2033
2034	case kmp_sch_trapezoidal: {
2035	UT index;
2036	T parm2 = pr->u.p.parm2;
2037	T parm3 = pr->u.p.parm3;
2038	T parm4 = pr->u.p.parm4;
2039	KD_TRACE(`100`,
2040	("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
2041	gtid));
2042
2043	index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
2044
2045	init = (index * ((`2` * parm2) - (index - `1`) * parm4)) / `2`;
2046	trip = pr->u.p.tc - `1`;
2047
2048	if ((status = ((T)index < parm3 && init <= trip)) == `0`) {
2049	*p_lb = `0`;
2050	*p_ub = `0`;
2051	if (p_st != NULL)
2052	*p_st = `0`;
2053	} else {
2054	start = pr->u.p.lb;
2055	limit = ((index + `1`) * (`2` * parm2 - index * parm4)) / `2` - `1`;
2056	incr = pr->u.p.st;
2057
2058	if ((last = (limit >= trip)) != `0`)
2059	limit = trip;
2060
2061	if (p_st != NULL)
2062	*p_st = incr;
2063
2064	if (incr == `1`) {
2065	*p_lb = start + init;
2066	*p_ub = start + limit;
2067	} else {
2068	p_lb = start + init incr;
2069	p_ub = start + limit incr;
2070	}
2071
2072	if (pr->flags.ordered) {
2073	pr->u.p.ordered_lower = init;
2074	pr->u.p.ordered_upper = limit;
2075	} // if
2076	} // if
2077	} // case
2078	break;
2079	default: {
2080	status = `0`; // to avoid complaints on uninitialized variable use
2081	__kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
2082	KMP_HNT(GetNewerLibrary), // Hint
2083	__kmp_msg_null // Variadic argument list terminator
2084	);
2085	} break;
2086	} // switch
2087	if (p_last)
2088	*p_last = last;
2089	#ifdef KMP_DEBUG
2090	if (pr->flags.ordered) {
2091	char *buff;
2092	// create format specifiers before the debug output
2093	buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
2094	"ordered_lower:%%%s ordered_upper:%%%s\n",
2095	traits_t<UT>::spec, traits_t<UT>::spec);
2096	KD_TRACE(`1000`, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
2097	__kmp_str_free(str: &buff);
2098	}
2099	{
2100	char *buff;
2101	// create format specifiers before the debug output
2102	buff = __kmp_str_format(
2103	"__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
2104	"p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
2105	traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2106	KMP_DEBUG_ASSERT(p_last);
2107	KMP_DEBUG_ASSERT(p_st);
2108	KD_TRACE(`10`, (buff, gtid, status, p_last, p_lb, p_ub, p_st));
2109	__kmp_str_free(str: &buff);
2110	}
2111	#endif
2112	return status;
2113	}
2114
/* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
   work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
   is not called. */
#if OMPT_SUPPORT && OMPT_OPTIONAL
// Both macros expand inside __kmp_dispatch_next() and rely on its locals
// (`status`, `pr`, `codeptr`). They are wrapped in do { } while (0) so that
// `MACRO;` is always exactly one statement, even as the body of an unbraced
// if/else.

// Report the end of the worksharing loop to the OMPT work callback once the
// loop has no more iterations (status == 0).
#define OMPT_LOOP_END                                                          \
  do {                                                                         \
    if (status == 0) {                                                         \
      if (ompt_enabled.ompt_callback_work) {                                   \
        ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);            \
        ompt_task_info_t *task_info = __ompt_get_task_info_object(0);          \
        ompt_callbacks.ompt_callback(ompt_callback_work)(                      \
            ompt_get_work_schedule(pr->schedule), ompt_scope_end,              \
            &(team_info->parallel_data), &(task_info->task_data), 0,           \
            codeptr);                                                          \
      }                                                                        \
    }                                                                          \
  } while (0)
// Report the chunk [lb, ub] with stride st to the OMPT dispatch callback, but
// only when a chunk was actually handed out (status != 0).
#define OMPT_LOOP_DISPATCH(lb, ub, st, status)                                 \
  do {                                                                         \
    if (ompt_enabled.ompt_callback_dispatch && (status)) {                     \
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
      ompt_dispatch_chunk_t chunk;                                             \
      ompt_data_t instance = ompt_data_none;                                   \
      OMPT_GET_DISPATCH_CHUNK(chunk, (lb), (ub), (st));                        \
      instance.ptr = &chunk;                                                   \
      ompt_callbacks.ompt_callback(ompt_callback_dispatch)(                    \
          &(team_info->parallel_data), &(task_info->task_data),                \
          ompt_dispatch_ws_loop_chunk, instance);                              \
    }                                                                          \
  } while (0)
// TODO: implement count
#else
#define OMPT_LOOP_END // no-op
#define OMPT_LOOP_DISPATCH(lb, ub, st, status) // no-op
#endif
2146
#if KMP_STATS_ENABLED
// Statistics epilogue for __kmp_dispatch_next(). Expands inside that function
// and uses its `status`, `pr`, `p_lb` and `p_ub`. It counts how many
// iterations the chunk just returned contains (0 when no chunk was returned
// or the bounds are inverted for the stride direction) and, when the loop is
// finished, pops the partitioned timer.
#define KMP_STATS_LOOP_END                                                     \
  {                                                                            \
    const kmp_int64 stats_lo = (kmp_int64)(*p_lb);                             \
    const kmp_int64 stats_hi = (kmp_int64)(*p_ub);                             \
    const kmp_int64 stats_st = (kmp_int64)(pr->u.p.st);                        \
    kmp_int64 stats_iters = 0;                                                 \
    if (status == 0) {                                                         \
      KMP_POP_PARTITIONED_TIMER();                                             \
    } else if (stats_st == 1) {                                                \
      if (stats_hi >= stats_lo)                                                \
        stats_iters = stats_hi - stats_lo + 1;                                 \
    } else if (stats_st < 0) {                                                 \
      if (stats_lo >= stats_hi)                                                \
        stats_iters = (stats_lo - stats_hi) / (-stats_st) + 1;                 \
    } else {                                                                   \
      if (stats_hi >= stats_lo)                                                \
        stats_iters = (stats_hi - stats_lo) / stats_st + 1;                    \
    }                                                                          \
    KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, stats_iters);                 \
  }
#else
#define KMP_STATS_LOOP_END /* Nothing */
#endif
2178
2179	template <typename T>
2180	static int __kmp_dispatch_next(ident_t loc, int* gtid, kmp_int32 *p_last,
2181	T p_lb, T p_ub,
2182	typename traits_t<T>::signed_t *p_st
2183	#if OMPT_SUPPORT && OMPT_OPTIONAL
2184	,
2185	void *codeptr
2186	#endif
2187	) {
2188
2189	typedef typename traits_t<T>::unsigned_t UT;
2190	typedef typename traits_t<T>::signed_t ST;
2191	// This is potentially slightly misleading, schedule(runtime) will appear here
2192	// even if the actual runtime schedule is static. (Which points out a
2193	// disadvantage of schedule(runtime): even when static scheduling is used it
2194	// costs more than a compile time choice to use static scheduling would.)
2195	KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
2196
2197	int status;
2198	dispatch_private_info_template<T> *pr;
2199	__kmp_assert_valid_gtid(gtid);
2200	kmp_info_t *th = __kmp_threads[gtid];
2201	kmp_team_t *team = th->th.th_team;
2202
2203	KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
2204	KD_TRACE(
2205	`1000`,
2206	("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
2207	gtid, p_lb, p_ub, p_st, p_last));
2208
2209	if (team->t.t_serialized) {
2210	/ NOTE: serialize this dispatch because we are not at the active level /
2211	pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2212	th->th.th_dispatch->th_disp_buffer); / top of the stack /
2213	KMP_DEBUG_ASSERT(pr);
2214
2215	if ((status = (pr->u.p.tc != `0`)) == `0`) {
2216	*p_lb = `0`;
2217	*p_ub = `0`;
2218	// if ( p_last != NULL )
2219	// p_last = 0;*
2220	if (p_st != NULL)
2221	*p_st = `0`;
2222	if (__kmp_env_consistency_check) {
2223	if (pr->pushed_ws != ct_none) {
2224	pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2225	}
2226	}
2227	} else if (pr->flags.nomerge) {
2228	kmp_int32 last;
2229	T start;
2230	UT limit, trip, init;
2231	ST incr;
2232	T chunk = pr->u.p.parm1;
2233
2234	KD_TRACE(`100`, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
2235	gtid));
2236
2237	init = chunk * pr->u.p.count++;
2238	trip = pr->u.p.tc - `1`;
2239
2240	if ((status = (init <= trip)) == `0`) {
2241	*p_lb = `0`;
2242	*p_ub = `0`;
2243	// if ( p_last != NULL )
2244	// p_last = 0;*
2245	if (p_st != NULL)
2246	*p_st = `0`;
2247	if (__kmp_env_consistency_check) {
2248	if (pr->pushed_ws != ct_none) {
2249	pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2250	}
2251	}
2252	} else {
2253	start = pr->u.p.lb;
2254	limit = chunk + init - `1`;
2255	incr = pr->u.p.st;
2256
2257	if ((last = (limit >= trip)) != `0`) {
2258	limit = trip;
2259	#if KMP_OS_WINDOWS
2260	pr->u.p.last_upper = pr->u.p.ub;
2261	#endif /* KMP_OS_WINDOWS */
2262	}
2263	if (p_last != NULL)
2264	*p_last = last;
2265	if (p_st != NULL)
2266	*p_st = incr;
2267	if (incr == `1`) {
2268	*p_lb = start + init;
2269	*p_ub = start + limit;
2270	} else {
2271	p_lb = start + init incr;
2272	p_ub = start + limit incr;
2273	}
2274
2275	if (pr->flags.ordered) {
2276	pr->u.p.ordered_lower = init;
2277	pr->u.p.ordered_upper = limit;
2278	#ifdef KMP_DEBUG
2279	{
2280	char *buff;
2281	// create format specifiers before the debug output
2282	buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2283	"ordered_lower:%%%s ordered_upper:%%%s\n",
2284	traits_t<UT>::spec, traits_t<UT>::spec);
2285	KD_TRACE(`1000`, (buff, gtid, pr->u.p.ordered_lower,
2286	pr->u.p.ordered_upper));
2287	__kmp_str_free(str: &buff);
2288	}
2289	#endif
2290	} // if
2291	} // if
2292	} else {
2293	pr->u.p.tc = `0`;
2294	*p_lb = pr->u.p.lb;
2295	*p_ub = pr->u.p.ub;
2296	#if KMP_OS_WINDOWS
2297	pr->u.p.last_upper = *p_ub;
2298	#endif /* KMP_OS_WINDOWS */
2299	if (p_last != NULL)
2300	*p_last = TRUE;
2301	if (p_st != NULL)
2302	*p_st = pr->u.p.st;
2303	} // if
2304	#ifdef KMP_DEBUG
2305	{
2306	char *buff;
2307	// create format specifiers before the debug output
2308	buff = __kmp_str_format(
2309	"__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
2310	"p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
2311	traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2312	KD_TRACE(`10`, (buff, gtid, p_lb, p_ub, *p_st, p_last,
2313	(p_last ? *p_last : `0`), status));
2314	__kmp_str_free(str: &buff);
2315	}
2316	#endif
2317	#if INCLUDE_SSC_MARKS
2318	SSC_MARK_DISPATCH_NEXT();
2319	#endif
2320	OMPT_LOOP_DISPATCH(p_lb, p_ub, pr->u.p.st, status);
2321	OMPT_LOOP_END;
2322	KMP_STATS_LOOP_END;
2323	return status;
2324	} else {
2325	kmp_int32 last = `0`;
2326	dispatch_shared_info_template<T> volatile *sh;
2327
2328	KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2329	&th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2330
2331	pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2332	th->th.th_dispatch->th_dispatch_pr_current);
2333	KMP_DEBUG_ASSERT(pr);
2334	sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2335	th->th.th_dispatch->th_dispatch_sh_current);
2336	KMP_DEBUG_ASSERT(sh);
2337
2338	#if KMP_USE_HIER_SCHED
2339	if (pr->flags.use_hier)
2340	status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2341	else
2342	#endif // KMP_USE_HIER_SCHED
2343	status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2344	p_st, th->th.th_team_nproc,
2345	th->th.th_info.ds.ds_tid);
2346	// status == 0: no more iterations to execute
2347	if (status == `0`) {
2348	ST num_done;
2349	num_done = test_then_inc<ST>(&sh->u.s.num_done);
2350	#ifdef KMP_DEBUG
2351	{
2352	char *buff;
2353	// create format specifiers before the debug output
2354	buff = __kmp_str_format(
2355	"__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2356	traits_t<ST>::spec);
2357	KD_TRACE(`10`, (buff, gtid, sh->u.s.num_done));
2358	__kmp_str_free(str: &buff);
2359	}
2360	#endif
2361
2362	#if KMP_USE_HIER_SCHED
2363	pr->flags.use_hier = FALSE;
2364	#endif
2365	if (num_done == th->th.th_team_nproc - `1`) {
2366	#if KMP_STATIC_STEAL_ENABLED
2367	if (pr->schedule == kmp_sch_static_steal) {
2368	int i;
2369	int idx = (th->th.th_dispatch->th_disp_index - `1`) %
2370	__kmp_dispatch_num_buffers; // current loop index
2371	// loop complete, safe to destroy locks used for stealing
2372	for (i = `0`; i < th->th.th_team_nproc; ++i) {
2373	dispatch_private_info_template<T> *buf =
2374	reinterpret_cast<dispatch_private_info_template<T> *>(
2375	&team->t.t_dispatch[i].th_disp_buffer[idx]);
2376	KMP_ASSERT(buf->steal_flag == THIEF); // buffer must be inactive
2377	KMP_ATOMIC_ST_RLX(&buf->steal_flag, UNUSED);
2378	if (traits_t<T>::type_size > `4`) {
2379	// destroy locks used for stealing
2380	kmp_lock_t *lck = buf->u.p.steal_lock;
2381	KMP_ASSERT(lck != NULL);
2382	__kmp_destroy_lock(lck);
2383	__kmp_free(lck);
2384	buf->u.p.steal_lock = NULL;
2385	}
2386	}
2387	}
2388	#endif
2389	/ NOTE: release shared buffer to be reused /
2390
2391	KMP_MB(); / Flush all pending memory write invalidates. /
2392
2393	sh->u.s.num_done = `0`;
2394	sh->u.s.iteration = `0`;
2395
2396	/ TODO replace with general release procedure? /
2397	if (pr->flags.ordered) {
2398	sh->u.s.ordered_iteration = `0`;
2399	}
2400
2401	KMP_MB(); / Flush all pending memory write invalidates. /
2402
2403	sh->buffer_index += __kmp_dispatch_num_buffers;
2404	KD_TRACE(`100`, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2405	gtid, sh->buffer_index));
2406
2407	KMP_MB(); / Flush all pending memory write invalidates. /
2408
2409	} // if
2410	if (__kmp_env_consistency_check) {
2411	if (pr->pushed_ws != ct_none) {
2412	pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2413	}
2414	}
2415
2416	th->th.th_dispatch->th_deo_fcn = NULL;
2417	th->th.th_dispatch->th_dxo_fcn = NULL;
2418	th->th.th_dispatch->th_dispatch_sh_current = NULL;
2419	th->th.th_dispatch->th_dispatch_pr_current = NULL;
2420	} // if (status == 0)
2421	#if KMP_OS_WINDOWS
2422	else if (last) {
2423	pr->u.p.last_upper = pr->u.p.ub;
2424	}
2425	#endif /* KMP_OS_WINDOWS */
2426	if (p_last != NULL && status != `0`)
2427	*p_last = last;
2428	} // if
2429
2430	#ifdef KMP_DEBUG
2431	{
2432	char *buff;
2433	// create format specifiers before the debug output
2434	buff = __kmp_str_format(
2435	"__kmp_dispatch_next: T#%%d normal case: "
2436	"p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2437	traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2438	KD_TRACE(`10`, (buff, gtid, p_lb, p_ub, p_st ? *p_st : `0`, p_last,
2439	(p_last ? *p_last : `0`), status));
2440	__kmp_str_free(str: &buff);
2441	}
2442	#endif
2443	#if INCLUDE_SSC_MARKS
2444	SSC_MARK_DISPATCH_NEXT();
2445	#endif
2446	OMPT_LOOP_DISPATCH(p_lb, p_ub, pr->u.p.st, status);
2447	OMPT_LOOP_END;
2448	KMP_STATS_LOOP_END;
2449	return status;
2450	}
2451
2452	/!*
2453	@ingroup WORK_SHARING
2454	@param loc source location information
2455	@param global_tid global thread number
2456	@return Zero if the parallel region is not active and this thread should execute
2457	all sections, non-zero otherwise.
2458
2459	Beginning of sections construct.
2460	There are no implicit barriers in the "sections" calls, rather the compiler
2461	should introduce an explicit barrier if it is required.
2462
2463	This implementation is based on __kmp_dispatch_init, using same constructs for
2464	shared data (we can't have sections nested directly in omp for loop, there
2465	should be a parallel region in between)
2466	*/
2467	kmp_int32 __kmpc_sections_init(ident_t *loc, kmp_int32 gtid) {
2468
2469	int active;
2470	kmp_info_t *th;
2471	kmp_team_t *team;
2472	kmp_uint32 my_buffer_index;
2473	dispatch_shared_info_template<kmp_int32> volatile *sh;
2474
2475	KMP_DEBUG_ASSERT(__kmp_init_serial);
2476
2477	if (!TCR_4(__kmp_init_parallel))
2478	__kmp_parallel_initialize();
2479	__kmp_resume_if_soft_paused();
2480
2481	/ setup data /
2482	th = __kmp_threads[gtid];
2483	team = th->th.th_team;
2484	active = !team->t.t_serialized;
2485	th->th.th_ident = loc;
2486
2487	KMP_COUNT_BLOCK(OMP_SECTIONS);
2488	KD_TRACE(`10`, ("__kmpc_sections: called by T#%d\n", gtid));
2489
2490	if (active) {
2491	// Setup sections in the same way as dynamic scheduled loops.
2492	// We need one shared data: which section is to execute next.
2493	// (in case parallel is not active, all sections will be executed on the
2494	// same thread)
2495	KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2496	&th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2497
2498	my_buffer_index = th->th.th_dispatch->th_disp_index++;
2499
2500	// reuse shared data structures from dynamic sched loops:
2501	sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
2502	&team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
2503	KD_TRACE(`10`, ("__kmpc_sections_init: T#%d my_buffer_index:%d\n", gtid,
2504	my_buffer_index));
2505
2506	th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
2507	th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
2508
2509	KD_TRACE(`100`, ("__kmpc_sections_init: T#%d before wait: my_buffer_index:%d "
2510	"sh->buffer_index:%d\n",
2511	gtid, my_buffer_index, sh->buffer_index));
2512	__kmp_wait<kmp_uint32>(spinner: &sh->buffer_index, checker: my_buffer_index,
2513	pred: __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
2514	// Note: KMP_WAIT() cannot be used there: buffer index and
2515	// my_buffer_index are always* 32-bit integers.*
2516	KMP_MB();
2517	KD_TRACE(`100`, ("__kmpc_sections_init: T#%d after wait: my_buffer_index:%d "
2518	"sh->buffer_index:%d\n",
2519	gtid, my_buffer_index, sh->buffer_index));
2520
2521	th->th.th_dispatch->th_dispatch_pr_current =
2522	nullptr; // sections construct doesn't need private data
2523	th->th.th_dispatch->th_dispatch_sh_current =
2524	CCAST(dispatch_shared_info_t , (volatile* dispatch_shared_info_t *)sh);
2525	}
2526
2527	#if OMPT_SUPPORT && OMPT_OPTIONAL
2528	if (ompt_enabled.ompt_callback_work) {
2529	ompt_team_info_t *team_info = __ompt_get_teaminfo(depth: `0`, NULL);
2530	ompt_task_info_t *task_info = __ompt_get_task_info_object(depth: `0`);
2531	ompt_callbacks.ompt_callback(ompt_callback_work)(
2532	ompt_work_sections, ompt_scope_begin, &(team_info->parallel_data),
2533	&(task_info->task_data), `0`, OMPT_GET_RETURN_ADDRESS(`0`));
2534	}
2535	#endif
2536	KMP_PUSH_PARTITIONED_TIMER(OMP_sections);
2537
2538	return active;
2539	}
2540
2541	/!*
2542	@ingroup WORK_SHARING
2543	@param loc source location information
2544	@param global_tid global thread number
2545	@param numberOfSections number of sections in the 'sections' construct
2546	@return unsigned [from 0 to n) - number (id) of the section to execute next on
2547	this thread. n (or any other number not in range) - nothing to execute on this
2548	thread
2549	*/
2550
2551	kmp_int32 __kmpc_next_section(ident_t *loc, kmp_int32 gtid,
2552	kmp_int32 numberOfSections) {
2553
2554	KMP_TIME_PARTITIONED_BLOCK(OMP_sections_overhead);
2555
2556	kmp_info_t *th = __kmp_threads[gtid];
2557	#ifdef KMP_DEBUG
2558	kmp_team_t *team = th->th.th_team;
2559	#endif
2560
2561	KD_TRACE(`1000`, ("__kmp_dispatch_next: T#%d; number of sections:%d\n", gtid,
2562	numberOfSections));
2563
2564	// For serialized case we should not call this function:
2565	KMP_DEBUG_ASSERT(!team->t.t_serialized);
2566
2567	dispatch_shared_info_template<kmp_int32> volatile *sh;
2568
2569	KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2570	&th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2571
2572	KMP_DEBUG_ASSERT(!(th->th.th_dispatch->th_dispatch_pr_current));
2573	sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
2574	th->th.th_dispatch->th_dispatch_sh_current);
2575	KMP_DEBUG_ASSERT(sh);
2576
2577	kmp_int32 sectionIndex = `0`;
2578	bool moreSectionsToExecute = true;
2579
2580	// Find section to execute:
2581	sectionIndex = test_then_inc<kmp_int32>(p: (kmp_int32 *)&sh->u.s.iteration);
2582	if (sectionIndex >= numberOfSections) {
2583	moreSectionsToExecute = false;
2584	}
2585
2586	// status == 0: no more sections to execute;
2587	// OMPTODO: __kmpc_end_sections could be bypassed?
2588	if (!moreSectionsToExecute) {
2589	kmp_int32 num_done;
2590
2591	num_done = test_then_inc<kmp_int32>(p: (kmp_int32 *)(&sh->u.s.num_done));
2592
2593	if (num_done == th->th.th_team_nproc - `1`) {
2594	/ NOTE: release this buffer to be reused /
2595
2596	KMP_MB(); / Flush all pending memory write invalidates. /
2597
2598	sh->u.s.num_done = `0`;
2599	sh->u.s.iteration = `0`;
2600
2601	KMP_MB(); / Flush all pending memory write invalidates. /
2602
2603	sh->buffer_index += __kmp_dispatch_num_buffers;
2604	KD_TRACE(`100`, ("__kmpc_next_section: T#%d change buffer_index:%d\n", gtid,
2605	sh->buffer_index));
2606
2607	KMP_MB(); / Flush all pending memory write invalidates. /
2608
2609	} // if
2610
2611	th->th.th_dispatch->th_deo_fcn = NULL;
2612	th->th.th_dispatch->th_dxo_fcn = NULL;
2613	th->th.th_dispatch->th_dispatch_sh_current = NULL;
2614	th->th.th_dispatch->th_dispatch_pr_current = NULL;
2615
2616	#if OMPT_SUPPORT && OMPT_OPTIONAL
2617	if (ompt_enabled.ompt_callback_dispatch) {
2618	ompt_team_info_t *team_info = __ompt_get_teaminfo(depth: `0`, NULL);
2619	ompt_task_info_t *task_info = __ompt_get_task_info_object(depth: `0`);
2620	ompt_data_t instance = ompt_data_none;
2621	instance.ptr = OMPT_GET_RETURN_ADDRESS(`0`);
2622	ompt_callbacks.ompt_callback(ompt_callback_dispatch)(
2623	&(team_info->parallel_data), &(task_info->task_data),
2624	ompt_dispatch_section, instance);
2625	}
2626	#endif
2627	}
2628
2629	return sectionIndex;
2630	}
2631
2632	/!*
2633	@ingroup WORK_SHARING
2634	@param loc source location information
2635	@param global_tid global thread number
2636
2637	End of "sections" construct.
2638	Don't need to wait here: barrier is added separately when needed.
2639	*/
2640	void __kmpc_end_sections(ident_t *loc, kmp_int32 gtid) {
2641
2642	kmp_info_t *th = __kmp_threads[gtid];
2643	int active = !th->th.th_team->t.t_serialized;
2644
2645	KD_TRACE(`100`, ("__kmpc_end_sections: T#%d called\n", gtid));
2646
2647	if (!active) {
2648	// In active case call finalization is done in __kmpc_next_section
2649	#if OMPT_SUPPORT && OMPT_OPTIONAL
2650	if (ompt_enabled.ompt_callback_work) {
2651	ompt_team_info_t *team_info = __ompt_get_teaminfo(depth: `0`, NULL);
2652	ompt_task_info_t *task_info = __ompt_get_task_info_object(depth: `0`);
2653	ompt_callbacks.ompt_callback(ompt_callback_work)(
2654	ompt_work_sections, ompt_scope_end, &(team_info->parallel_data),
2655	&(task_info->task_data), `0`, OMPT_GET_RETURN_ADDRESS(`0`));
2656	}
2657	#endif
2658	}
2659
2660	KMP_POP_PARTITIONED_TIMER();
2661	KD_TRACE(`100`, ("__kmpc_end_sections: T#%d returned\n", gtid));
2662	}
2663
2664	template <typename T>
2665	static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2666	kmp_int32 plastiter, T plower, T *pupper,
2667	typename traits_t<T>::signed_t incr) {
2668	typedef typename traits_t<T>::unsigned_t UT;
2669	kmp_uint32 team_id;
2670	kmp_uint32 nteams;
2671	UT trip_count;
2672	kmp_team_t *team;
2673	kmp_info_t *th;
2674
2675	KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2676	KE_TRACE(`10`, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2677	#ifdef KMP_DEBUG
2678	typedef typename traits_t<T>::signed_t ST;
2679	{
2680	char *buff;
2681	// create format specifiers before the debug output
2682	buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2683	"iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2684	traits_t<T>::spec, traits_t<T>::spec,
2685	traits_t<ST>::spec, traits_t<T>::spec);
2686	KD_TRACE(`100`, (buff, gtid, plastiter, plower, *pupper, incr));
2687	__kmp_str_free(str: &buff);
2688	}
2689	#endif
2690
2691	if (__kmp_env_consistency_check) {
2692	if (incr == `0`) {
2693	__kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2694	loc);
2695	}
2696	if (incr > `0` ? (pupper < plower) : (plower < pupper)) {
2697	// The loop is illegal.
2698	// Some zero-trip loops maintained by compiler, e.g.:
2699	// for(i=10;i<0;++i) // lower >= upper - run-time check
2700	// for(i=0;i>10;--i) // lower <= upper - run-time check
2701	// for(i=0;i>10;++i) // incr > 0 - compile-time check
2702	// for(i=10;i<0;--i) // incr < 0 - compile-time check
2703	// Compiler does not check the following illegal loops:
2704	// for(i=0;i<10;i+=incr) // where incr<0
2705	// for(i=10;i>0;i-=incr) // where incr<0
2706	__kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2707	}
2708	}
2709	__kmp_assert_valid_gtid(gtid);
2710	th = __kmp_threads[gtid];
2711	team = th->th.th_team;
2712	KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2713	nteams = th->th.th_teams_size.nteams;
2714	team_id = team->t.t_master_tid;
2715	KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2716
2717	// compute global trip count
2718	if (incr == `1`) {
2719	trip_count = pupper - plower + `1`;
2720	} else if (incr == -`1`) {
2721	trip_count = plower - pupper + `1`;
2722	} else if (incr > `0`) {
2723	// upper-lower can exceed the limit of signed type
2724	trip_count = (UT)(pupper - plower) / incr + `1`;
2725	} else {
2726	trip_count = (UT)(plower - pupper) / (-incr) + `1`;
2727	}
2728
2729	if (trip_count <= nteams) {
2730	KMP_DEBUG_ASSERT(
2731	__kmp_static == kmp_sch_static_greedy \|\|
2732	__kmp_static ==
2733	kmp_sch_static_balanced); // Unknown static scheduling type.
2734	// only some teams get single iteration, others get nothing
2735	if (team_id < trip_count) {
2736	pupper = plower = plower + team_id incr;
2737	} else {
2738	plower = pupper + incr; // zero-trip loop
2739	}
2740	if (plastiter != NULL)
2741	*plastiter = (team_id == trip_count - `1`);
2742	} else {
2743	if (__kmp_static == kmp_sch_static_balanced) {
2744	UT chunk = trip_count / nteams;
2745	UT extras = trip_count % nteams;
2746	*plower +=
2747	incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2748	pupper = plower + chunk * incr - (team_id < extras ? `0` : incr);
2749	if (plastiter != NULL)
2750	*plastiter = (team_id == nteams - `1`);
2751	} else {
2752	T chunk_inc_count =
2753	(trip_count / nteams + ((trip_count % nteams) ? `1` : `0`)) * incr;
2754	T upper = *pupper;
2755	KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2756	// Unknown static scheduling type.
2757	plower += team_id chunk_inc_count;
2758	pupper = plower + chunk_inc_count - incr;
2759	// Check/correct bounds if needed
2760	if (incr > `0`) {
2761	if (pupper < plower)
2762	*pupper = traits_t<T>::max_value;
2763	if (plastiter != NULL)
2764	plastiter = plower <= upper && *pupper > upper - incr;
2765	if (*pupper > upper)
2766	pupper = upper; // tracker C73258*
2767	} else {
2768	if (pupper > plower)
2769	*pupper = traits_t<T>::min_value;
2770	if (plastiter != NULL)
2771	plastiter = plower >= upper && *pupper < upper - incr;
2772	if (*pupper < upper)
2773	pupper = upper; // tracker C73258*
2774	}
2775	}
2776	}
2777	}
2778
//-----------------------------------------------------------------------------
// Dispatch routines
//    Transfer call to template< type T >
//    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
//                         T lb, T ub, ST st, ST chunk )
2784	extern "C" {
2785
2786	/!*
2787	@ingroup WORK_SHARING
2788	@{
2789	@param loc Source location
2790	@param gtid Global thread id
2791	@param schedule Schedule type
2792	@param lb Lower bound
2793	@param ub Upper bound
2794	@param st Step (or increment if you prefer)
2795	@param chunk The chunk size to block with
2796
2797	This function prepares the runtime to start a dynamically scheduled for loop,
2798	saving the loop arguments.
2799	These functions are all identical apart from the types of the arguments.
2800	*/
2801
2802	void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2803	enum sched_type schedule, kmp_int32 lb,
2804	kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2805	KMP_DEBUG_ASSERT(__kmp_init_serial);
2806	#if OMPT_SUPPORT && OMPT_OPTIONAL
2807	OMPT_STORE_RETURN_ADDRESS(gtid);
2808	#endif
2809	__kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, push_ws: true);
2810	}
2811	/!*
2812	See @ref __kmpc_dispatch_init_4
2813	*/
2814	void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2815	enum sched_type schedule, kmp_uint32 lb,
2816	kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2817	KMP_DEBUG_ASSERT(__kmp_init_serial);
2818	#if OMPT_SUPPORT && OMPT_OPTIONAL
2819	OMPT_STORE_RETURN_ADDRESS(gtid);
2820	#endif
2821	__kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, push_ws: true);
2822	}
2823
2824	/!*
2825	See @ref __kmpc_dispatch_init_4
2826	*/
2827	void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2828	enum sched_type schedule, kmp_int64 lb,
2829	kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2830	KMP_DEBUG_ASSERT(__kmp_init_serial);
2831	#if OMPT_SUPPORT && OMPT_OPTIONAL
2832	OMPT_STORE_RETURN_ADDRESS(gtid);
2833	#endif
2834	__kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, push_ws: true);
2835	}
2836
2837	/!*
2838	See @ref __kmpc_dispatch_init_4
2839	*/
2840	void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2841	enum sched_type schedule, kmp_uint64 lb,
2842	kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2843	KMP_DEBUG_ASSERT(__kmp_init_serial);
2844	#if OMPT_SUPPORT && OMPT_OPTIONAL
2845	OMPT_STORE_RETURN_ADDRESS(gtid);
2846	#endif
2847	__kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, push_ws: true);
2848	}
2849
2850	/!*
2851	See @ref __kmpc_dispatch_init_4
2852
2853	Difference from __kmpc_dispatch_init set of functions is these functions
2854	are called for composite distribute parallel for construct. Thus before
2855	regular iterations dispatching we need to calc per-team iteration space.
2856
2857	These functions are all identical apart from the types of the arguments.
2858	*/
2859	void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2860	enum sched_type schedule, kmp_int32 *p_last,
2861	kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2862	kmp_int32 chunk) {
2863	KMP_DEBUG_ASSERT(__kmp_init_serial);
2864	#if OMPT_SUPPORT && OMPT_OPTIONAL
2865	OMPT_STORE_RETURN_ADDRESS(gtid);
2866	#endif
2867	__kmp_dist_get_bounds<kmp_int32>(loc, gtid, plastiter: p_last, plower: &lb, pupper: &ub, incr: st);
2868	__kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, push_ws: true);
2869	}
2870
2871	void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2872	enum sched_type schedule, kmp_int32 *p_last,
2873	kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2874	kmp_int32 chunk) {
2875	KMP_DEBUG_ASSERT(__kmp_init_serial);
2876	#if OMPT_SUPPORT && OMPT_OPTIONAL
2877	OMPT_STORE_RETURN_ADDRESS(gtid);
2878	#endif
2879	__kmp_dist_get_bounds<kmp_uint32>(loc, gtid, plastiter: p_last, plower: &lb, pupper: &ub, incr: st);
2880	__kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, push_ws: true);
2881	}
2882
2883	void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2884	enum sched_type schedule, kmp_int32 *p_last,
2885	kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2886	kmp_int64 chunk) {
2887	KMP_DEBUG_ASSERT(__kmp_init_serial);
2888	#if OMPT_SUPPORT && OMPT_OPTIONAL
2889	OMPT_STORE_RETURN_ADDRESS(gtid);
2890	#endif
2891	__kmp_dist_get_bounds<kmp_int64>(loc, gtid, plastiter: p_last, plower: &lb, pupper: &ub, incr: st);
2892	__kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, push_ws: true);
2893	}
2894
2895	void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2896	enum sched_type schedule, kmp_int32 *p_last,
2897	kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2898	kmp_int64 chunk) {
2899	KMP_DEBUG_ASSERT(__kmp_init_serial);
2900	#if OMPT_SUPPORT && OMPT_OPTIONAL
2901	OMPT_STORE_RETURN_ADDRESS(gtid);
2902	#endif
2903	__kmp_dist_get_bounds<kmp_uint64>(loc, gtid, plastiter: p_last, plower: &lb, pupper: &ub, incr: st);
2904	__kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, push_ws: true);
2905	}
2906
2907	/!*
2908	@param loc Source code location
2909	@param gtid Global thread id
2910	@param p_last Pointer to a flag set to one if this is the last chunk or zero
2911	otherwise
2912	@param p_lb Pointer to the lower bound for the next chunk of work
2913	@param p_ub Pointer to the upper bound for the next chunk of work
2914	@param p_st Pointer to the stride for the next chunk of work
2915	@return one if there is work to be done, zero otherwise
2916
2917	Get the next dynamically allocated chunk of work for this thread.
2918	If there is no more work, then the lb,ub and stride need not be modified.
2919	*/
2920	int __kmpc_dispatch_next_4(ident_t loc, kmp_int32 gtid, kmp_int32 p_last,
2921	kmp_int32 p_lb, kmp_int32 p_ub, kmp_int32 *p_st) {
2922	#if OMPT_SUPPORT && OMPT_OPTIONAL
2923	OMPT_STORE_RETURN_ADDRESS(gtid);
2924	#endif
2925	return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2926	#if OMPT_SUPPORT && OMPT_OPTIONAL
2927	,
2928	OMPT_LOAD_RETURN_ADDRESS(gtid)
2929	#endif
2930	);
2931	}
2932
2933	/!*
2934	See @ref __kmpc_dispatch_next_4
2935	*/
2936	int __kmpc_dispatch_next_4u(ident_t loc, kmp_int32 gtid, kmp_int32 p_last,
2937	kmp_uint32 p_lb, kmp_uint32 p_ub,
2938	kmp_int32 *p_st) {
2939	#if OMPT_SUPPORT && OMPT_OPTIONAL
2940	OMPT_STORE_RETURN_ADDRESS(gtid);
2941	#endif
2942	return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2943	#if OMPT_SUPPORT && OMPT_OPTIONAL
2944	,
2945	OMPT_LOAD_RETURN_ADDRESS(gtid)
2946	#endif
2947	);
2948	}
2949
2950	/!*
2951	See @ref __kmpc_dispatch_next_4
2952	*/
2953	int __kmpc_dispatch_next_8(ident_t loc, kmp_int32 gtid, kmp_int32 p_last,
2954	kmp_int64 p_lb, kmp_int64 p_ub, kmp_int64 *p_st) {
2955	#if OMPT_SUPPORT && OMPT_OPTIONAL
2956	OMPT_STORE_RETURN_ADDRESS(gtid);
2957	#endif
2958	return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2959	#if OMPT_SUPPORT && OMPT_OPTIONAL
2960	,
2961	OMPT_LOAD_RETURN_ADDRESS(gtid)
2962	#endif
2963	);
2964	}
2965
2966	/!*
2967	See @ref __kmpc_dispatch_next_4
2968	*/
2969	int __kmpc_dispatch_next_8u(ident_t loc, kmp_int32 gtid, kmp_int32 p_last,
2970	kmp_uint64 p_lb, kmp_uint64 p_ub,
2971	kmp_int64 *p_st) {
2972	#if OMPT_SUPPORT && OMPT_OPTIONAL
2973	OMPT_STORE_RETURN_ADDRESS(gtid);
2974	#endif
2975	return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2976	#if OMPT_SUPPORT && OMPT_OPTIONAL
2977	,
2978	OMPT_LOAD_RETURN_ADDRESS(gtid)
2979	#endif
2980	);
2981	}
2982
2983	/!*
2984	@param loc Source code location
2985	@param gtid Global thread id
2986
2987	Mark the end of a dynamic loop.
2988	*/
2989	void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2990	__kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2991	}
2992
2993	/!*
2994	See @ref __kmpc_dispatch_fini_4
2995	*/
2996	void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2997	__kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2998	}
2999
3000	/!*
3001	See @ref __kmpc_dispatch_fini_4
3002	*/
3003	void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
3004	__kmp_dispatch_finish<kmp_uint32>(gtid, loc);
3005	}
3006
3007	/!*
3008	See @ref __kmpc_dispatch_fini_4
3009	*/
3010	void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
3011	__kmp_dispatch_finish<kmp_uint64>(gtid, loc);
3012	}
3013
3014	/!*
3015	See @ref __kmpc_dispatch_deinit
3016	*/
3017	void __kmpc_dispatch_deinit(ident_t *loc, kmp_int32 gtid) {}
/*! @} */
3019
3020	//-----------------------------------------------------------------------------
3021	// Non-template routines from kmp_dispatch.cpp used in other sources
3022
3023	kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
3024	return value == checker;
3025	}
3026
3027	kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
3028	return value != checker;
3029	}
3030
3031	kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
3032	return value < checker;
3033	}
3034
3035	kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
3036	return value >= checker;
3037	}
3038
3039	kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
3040	return value <= checker;
3041	}
3042
3043	kmp_uint32
3044	__kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
3045	kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
3046	void obj // Higher-level synchronization object, or NULL.*
3047	) {
3048	// note: we may not belong to a team at this point
3049	volatile kmp_uint32 *spin = spinner;
3050	kmp_uint32 check = checker;
3051	kmp_uint32 spins;
3052	kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
3053	kmp_uint32 r;
3054	kmp_uint64 time;
3055
3056	KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
3057	KMP_INIT_YIELD(spins);
3058	KMP_INIT_BACKOFF(time);
3059	// main wait spin loop
3060	while (!f(r = TCR_4(*spin), check)) {
3061	KMP_FSYNC_SPIN_PREPARE(obj);
3062	/ GEH - remove this since it was accidentally introduced when kmp_wait was*
3063	split. It causes problems with infinite recursion because of exit lock /*
3064	/ if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)*
3065	__kmp_abort_thread(); /*
3066	KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
3067	}
3068	KMP_FSYNC_SPIN_ACQUIRED(obj);
3069	return r;
3070	}
3071
3072	void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
3073	kmp_uint32 (pred)(void* *, kmp_uint32),
3074	void obj // Higher-level synchronization object, or NULL.*
3075	) {
3076	// note: we may not belong to a team at this point
3077	void *spin = spinner;
3078	kmp_uint32 check = checker;
3079	kmp_uint32 spins;
3080	kmp_uint32 (f)(void* *, kmp_uint32) = pred;
3081	kmp_uint64 time;
3082
3083	KMP_FSYNC_SPIN_INIT(obj, spin);
3084	KMP_INIT_YIELD(spins);
3085	KMP_INIT_BACKOFF(time);
3086	// main wait spin loop
3087	while (!f(spin, check)) {
3088	KMP_FSYNC_SPIN_PREPARE(obj);
3089	/ if we have waited a bit, or are noversubscribed, yield /
3090	/ pause is in the following code /
3091	KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
3092	}
3093	KMP_FSYNC_SPIN_ACQUIRED(obj);
3094	}
3095
3096	} // extern "C"
3097
3098	#ifdef KMP_GOMP_COMPAT
3099
3100	void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
3101	enum sched_type schedule, kmp_int32 lb,
3102	kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
3103	int push_ws) {
3104	__kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
3105	push_ws);
3106	}
3107
3108	void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
3109	enum sched_type schedule, kmp_uint32 lb,
3110	kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
3111	int push_ws) {
3112	__kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
3113	push_ws);
3114	}
3115
3116	void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
3117	enum sched_type schedule, kmp_int64 lb,
3118	kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
3119	int push_ws) {
3120	__kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
3121	push_ws);
3122	}
3123
3124	void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
3125	enum sched_type schedule, kmp_uint64 lb,
3126	kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
3127	int push_ws) {
3128	__kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
3129	push_ws);
3130	}
3131
3132	void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
3133	__kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
3134	}
3135
3136	void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
3137	__kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
3138	}
3139
3140	void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
3141	__kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
3142	}
3143
3144	void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
3145	__kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
3146	}
3147
3148	#endif /* KMP_GOMP_COMPAT */
3149
/* ------------------------------------------------------------------------ */
3151

source code of openmp/runtime/src/kmp_dispatch.cpp