kmp_collapse.cpp source code [openmp/runtime/src/kmp_collapse.cpp]

1	/*
2	* kmp_collapse.cpp -- loop collapse feature
3	*/
4
5	//===----------------------------------------------------------------------===//
6	//
7	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8	// See https://llvm.org/LICENSE.txt for license information.
9	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10	//
11	//===----------------------------------------------------------------------===//
12
13	#include "kmp.h"
14	#include "kmp_error.h"
15	#include "kmp_i18n.h"
16	#include "kmp_itt.h"
17	#include "kmp_stats.h"
18	#include "kmp_str.h"
19	#include "kmp_collapse.h"
20
21	#if OMPT_SUPPORT
22	#include "ompt-specific.h"
23	#endif
24
25	// OMPTODO: different style of comments (see kmp_sched)
26	// OMPTODO: OMPT/OMPD
27
// Local absolute-value helper; avoids inadvertently using a library-based abs.
// Returns -val when val compares less than zero, otherwise val unchanged.
// Note: the negated minimum value of a signed type is not representable in
// that type, so callers must not pass it.
template <typename T> T __kmp_abs(const T val) {
  return (val < 0) ? -val : val;
}
32	kmp_uint32 __kmp_abs(const kmp_uint32 val) { return val; }
33	kmp_uint64 __kmp_abs(const kmp_uint64 val) { return val; }
34
35	//----------------------------------------------------------------------------
36	// Common functions for working with rectangular and non-rectangular loops
37	//----------------------------------------------------------------------------
38
// Sign of val as an int: 1 if val > 0, -1 if val < 0, 0 if val == 0.
template <typename T> int __kmp_sign(T val) {
  return (T(0) < val) - (val < T(0));
}
42
43	template <typename T> class CollapseAllocator {
44	typedef T *pT;
45
46	private:
47	static const size_t allocaSize = `32`; // size limit for stack allocations
48	// (8 bytes x 4 nested loops)
49	char stackAlloc[allocaSize];
50	static constexpr size_t maxElemCount = allocaSize / sizeof(T);
51	pT pTAlloc;
52
53	public:
54	CollapseAllocator(size_t n) : pTAlloc(reinterpret_cast<pT>(stackAlloc)) {
55	if (n > maxElemCount) {
56	pTAlloc = reinterpret_cast<pT>(__kmp_allocate(n * sizeof(T)));
57	}
58	}
59	~CollapseAllocator() {
60	if (pTAlloc != reinterpret_cast<pT>(stackAlloc)) {
61	__kmp_free(pTAlloc);
62	}
63	}
64	T &operator[](int index) { return pTAlloc[index]; }
65	operator const pT() { return pTAlloc; }
66	};
67
68	//----------Loop canonicalization---------------------------------------------
69
70	// For loop nest (any shape):
71	// convert != to < or >;
72	// switch from using < or > to <= or >=.
73	// "bounds" array has to be allocated per thread.
74	// All other internal functions will work only with canonicalized loops.
75	template <typename T>
76	void kmp_canonicalize_one_loop_XX(
77	ident_t *loc,
78	/in/out/ bounds_infoXX_template<T> *bounds) {
79
80	if (__kmp_env_consistency_check) {
81	if (bounds->step == `0`) {
82	__kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
83	loc);
84	}
85	}
86
87	if (bounds->comparison == comparison_t::comp_not_eq) {
88	// We can convert this to < or >, depends on the sign of the step:
89	if (bounds->step > `0`) {
90	bounds->comparison = comparison_t::comp_less;
91	} else {
92	bounds->comparison = comparison_t::comp_greater;
93	}
94	}
95
96	if (bounds->comparison == comparison_t::comp_less) {
97	// Note: ub0 can be unsigned. Should be Ok to hit overflow here,
98	// because ub0 + ub1j should be still positive (otherwise loop was not*
99	// well formed)
100	bounds->ub0 -= `1`;
101	bounds->comparison = comparison_t::comp_less_or_eq;
102	} else if (bounds->comparison == comparison_t::comp_greater) {
103	bounds->ub0 += `1`;
104	bounds->comparison = comparison_t::comp_greater_or_eq;
105	}
106	}
107
108	// Canonicalize loop nest. original_bounds_nest is an array of length n.
109	void kmp_canonicalize_loop_nest(ident_t *loc,
110	/in/out/ bounds_info_t *original_bounds_nest,
111	kmp_index_t n) {
112
113	for (kmp_index_t ind = `0`; ind < n; ++ind) {
114	auto bounds = &(original_bounds_nest[ind]);
115
116	switch (bounds->loop_type) {
117	case loop_type_t::loop_type_int32:
118	kmp_canonicalize_one_loop_XX<kmp_int32>(
119	loc,
120	/in/out/ bounds: (bounds_infoXX_template<kmp_int32> *)(bounds));
121	break;
122	case loop_type_t::loop_type_uint32:
123	kmp_canonicalize_one_loop_XX<kmp_uint32>(
124	loc,
125	/in/out/ bounds: (bounds_infoXX_template<kmp_uint32> *)(bounds));
126	break;
127	case loop_type_t::loop_type_int64:
128	kmp_canonicalize_one_loop_XX<kmp_int64>(
129	loc,
130	/in/out/ bounds: (bounds_infoXX_template<kmp_int64> *)(bounds));
131	break;
132	case loop_type_t::loop_type_uint64:
133	kmp_canonicalize_one_loop_XX<kmp_uint64>(
134	loc,
135	/in/out/ bounds: (bounds_infoXX_template<kmp_uint64> *)(bounds));
136	break;
137	default:
138	KMP_ASSERT(false);
139	}
140	}
141	}
142
143	//----------Calculating trip count on one level-------------------------------
144
145	// Calculate trip count on this loop level.
146	// We do this either for a rectangular loop nest,
147	// or after an adjustment bringing the loops to a parallelepiped shape.
148	// This number should not depend on the value of outer IV
149	// even if the formular has lb1 and ub1.
150	// Note: for non-rectangular loops don't use span for this, it's too big.
151
152	template <typename T>
153	kmp_loop_nest_iv_t kmp_calculate_trip_count_XX(
154	/in/out/ bounds_infoXX_template<T> *bounds) {
155
156	if (bounds->comparison == comparison_t::comp_less_or_eq) {
157	if (bounds->ub0 < bounds->lb0) {
158	// Note: after this we don't need to calculate inner loops,
159	// but that should be an edge case:
160	bounds->trip_count = `0`;
161	} else {
162	// ub - lb may exceed signed type range; we need to cast to
163	// kmp_loop_nest_iv_t anyway
164	bounds->trip_count =
165	static_cast<kmp_loop_nest_iv_t>(bounds->ub0 - bounds->lb0) /
166	__kmp_abs(bounds->step) +
167	`1`;
168	}
169	} else if (bounds->comparison == comparison_t::comp_greater_or_eq) {
170	if (bounds->lb0 < bounds->ub0) {
171	// Note: after this we don't need to calculate inner loops,
172	// but that should be an edge case:
173	bounds->trip_count = `0`;
174	} else {
175	// lb - ub may exceed signed type range; we need to cast to
176	// kmp_loop_nest_iv_t anyway
177	bounds->trip_count =
178	static_cast<kmp_loop_nest_iv_t>(bounds->lb0 - bounds->ub0) /
179	__kmp_abs(bounds->step) +
180	`1`;
181	}
182	} else {
183	KMP_ASSERT(false);
184	}
185	return bounds->trip_count;
186	}
187
188	// Calculate trip count on this loop level.
189	kmp_loop_nest_iv_t kmp_calculate_trip_count(/in/out/ bounds_info_t *bounds) {
190
191	kmp_loop_nest_iv_t trip_count = `0`;
192
193	switch (bounds->loop_type) {
194	case loop_type_t::loop_type_int32:
195	trip_count = kmp_calculate_trip_count_XX<kmp_int32>(
196	/in/out/ bounds: (bounds_infoXX_template<kmp_int32> *)(bounds));
197	break;
198	case loop_type_t::loop_type_uint32:
199	trip_count = kmp_calculate_trip_count_XX<kmp_uint32>(
200	/in/out/ bounds: (bounds_infoXX_template<kmp_uint32> *)(bounds));
201	break;
202	case loop_type_t::loop_type_int64:
203	trip_count = kmp_calculate_trip_count_XX<kmp_int64>(
204	/in/out/ bounds: (bounds_infoXX_template<kmp_int64> *)(bounds));
205	break;
206	case loop_type_t::loop_type_uint64:
207	trip_count = kmp_calculate_trip_count_XX<kmp_uint64>(
208	/in/out/ bounds: (bounds_infoXX_template<kmp_uint64> *)(bounds));
209	break;
210	default:
211	KMP_ASSERT(false);
212	}
213
214	return trip_count;
215	}
216
217	//----------Trim original iv according to its type----------------------------
218
219	// Trim original iv according to its type.
220	// Return kmp_uint64 value which can be easily used in all internal calculations
221	// And can be statically cast back to original type in user code.
222	kmp_uint64 kmp_fix_iv(loop_type_t loop_iv_type, kmp_uint64 original_iv) {
223	kmp_uint64 res = `0`;
224
225	switch (loop_iv_type) {
226	case loop_type_t::loop_type_int8:
227	res = static_cast<kmp_uint64>(static_cast<kmp_int8>(original_iv));
228	break;
229	case loop_type_t::loop_type_uint8:
230	res = static_cast<kmp_uint64>(static_cast<kmp_uint8>(original_iv));
231	break;
232	case loop_type_t::loop_type_int16:
233	res = static_cast<kmp_uint64>(static_cast<kmp_int16>(original_iv));
234	break;
235	case loop_type_t::loop_type_uint16:
236	res = static_cast<kmp_uint64>(static_cast<kmp_uint16>(original_iv));
237	break;
238	case loop_type_t::loop_type_int32:
239	res = static_cast<kmp_uint64>(static_cast<kmp_int32>(original_iv));
240	break;
241	case loop_type_t::loop_type_uint32:
242	res = static_cast<kmp_uint64>(static_cast<kmp_uint32>(original_iv));
243	break;
244	case loop_type_t::loop_type_int64:
245	res = static_cast<kmp_uint64>(static_cast<kmp_int64>(original_iv));
246	break;
247	case loop_type_t::loop_type_uint64:
248	res = static_cast<kmp_uint64>(original_iv);
249	break;
250	default:
251	KMP_ASSERT(false);
252	}
253
254	return res;
255	}
256
257	//----------Compare two IVs (remember they have a type)-----------------------
258
259	bool kmp_ivs_eq(loop_type_t loop_iv_type, kmp_uint64 original_iv1,
260	kmp_uint64 original_iv2) {
261	bool res = false;
262
263	switch (loop_iv_type) {
264	case loop_type_t::loop_type_int8:
265	res = static_cast<kmp_int8>(original_iv1) ==
266	static_cast<kmp_int8>(original_iv2);
267	break;
268	case loop_type_t::loop_type_uint8:
269	res = static_cast<kmp_uint8>(original_iv1) ==
270	static_cast<kmp_uint8>(original_iv2);
271	break;
272	case loop_type_t::loop_type_int16:
273	res = static_cast<kmp_int16>(original_iv1) ==
274	static_cast<kmp_int16>(original_iv2);
275	break;
276	case loop_type_t::loop_type_uint16:
277	res = static_cast<kmp_uint16>(original_iv1) ==
278	static_cast<kmp_uint16>(original_iv2);
279	break;
280	case loop_type_t::loop_type_int32:
281	res = static_cast<kmp_int32>(original_iv1) ==
282	static_cast<kmp_int32>(original_iv2);
283	break;
284	case loop_type_t::loop_type_uint32:
285	res = static_cast<kmp_uint32>(original_iv1) ==
286	static_cast<kmp_uint32>(original_iv2);
287	break;
288	case loop_type_t::loop_type_int64:
289	res = static_cast<kmp_int64>(original_iv1) ==
290	static_cast<kmp_int64>(original_iv2);
291	break;
292	case loop_type_t::loop_type_uint64:
293	res = static_cast<kmp_uint64>(original_iv1) ==
294	static_cast<kmp_uint64>(original_iv2);
295	break;
296	default:
297	KMP_ASSERT(false);
298	}
299
300	return res;
301	}
302
303	//----------Calculate original iv on one level--------------------------------
304
305	// Return true if the point fits into upper bounds on this level,
306	// false otherwise
307	template <typename T>
308	bool kmp_iv_is_in_upper_bound_XX(const bounds_infoXX_template<T> *bounds,
309	const kmp_point_t original_ivs,
310	kmp_index_t ind) {
311
312	T iv = static_cast<T>(original_ivs[ind]);
313	T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);
314
315	if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
316	(iv > (bounds->ub0 + bounds->ub1 * outer_iv))) \|\|
317	((bounds->comparison == comparison_t::comp_greater_or_eq) &&
318	(iv < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
319	// The calculated point is outside of loop upper boundary:
320	return false;
321	}
322
323	return true;
324	}
325
326	// Calculate one iv corresponding to iteration on the level ind.
327	// Return true if it fits into lower-upper bounds on this level
328	// (if not, we need to re-calculate)
329	template <typename T>
330	bool kmp_calc_one_iv_XX(const bounds_infoXX_template<T> *bounds,
331	/in/out/ kmp_point_t original_ivs,
332	const kmp_iterations_t iterations, kmp_index_t ind,
333	bool start_with_lower_bound, bool checkBounds) {
334
335	kmp_uint64 temp = `0`;
336	T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);
337
338	if (start_with_lower_bound) {
339	// we moved to the next iteration on one of outer loops, should start
340	// with the lower bound here:
341	temp = bounds->lb0 + bounds->lb1 * outer_iv;
342	} else {
343	auto iteration = iterations[ind];
344	temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration * bounds->step;
345	}
346
347	// Now trim original iv according to its type:
348	original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
349
350	if (checkBounds) {
351	return kmp_iv_is_in_upper_bound_XX(bounds, original_ivs, ind);
352	} else {
353	return true;
354	}
355	}
356
357	bool kmp_calc_one_iv(const bounds_info_t *bounds,
358	/in/out/ kmp_point_t original_ivs,
359	const kmp_iterations_t iterations, kmp_index_t ind,
360	bool start_with_lower_bound, bool checkBounds) {
361
362	switch (bounds->loop_type) {
363	case loop_type_t::loop_type_int32:
364	return kmp_calc_one_iv_XX<kmp_int32>(
365	bounds: (bounds_infoXX_template<kmp_int32> *)(bounds),
366	/in/out/ original_ivs, iterations, ind, start_with_lower_bound,
367	checkBounds);
368	break;
369	case loop_type_t::loop_type_uint32:
370	return kmp_calc_one_iv_XX<kmp_uint32>(
371	bounds: (bounds_infoXX_template<kmp_uint32> *)(bounds),
372	/in/out/ original_ivs, iterations, ind, start_with_lower_bound,
373	checkBounds);
374	break;
375	case loop_type_t::loop_type_int64:
376	return kmp_calc_one_iv_XX<kmp_int64>(
377	bounds: (bounds_infoXX_template<kmp_int64> *)(bounds),
378	/in/out/ original_ivs, iterations, ind, start_with_lower_bound,
379	checkBounds);
380	break;
381	case loop_type_t::loop_type_uint64:
382	return kmp_calc_one_iv_XX<kmp_uint64>(
383	bounds: (bounds_infoXX_template<kmp_uint64> *)(bounds),
384	/in/out/ original_ivs, iterations, ind, start_with_lower_bound,
385	checkBounds);
386	break;
387	default:
388	KMP_ASSERT(false);
389	return false;
390	}
391	}
392
393	//----------Calculate original iv on one level for rectangular loop nest------
394
395	// Calculate one iv corresponding to iteration on the level ind.
396	// Return true if it fits into lower-upper bounds on this level
397	// (if not, we need to re-calculate)
398	template <typename T>
399	void kmp_calc_one_iv_rectang_XX(const bounds_infoXX_template<T> *bounds,
400	/in/out/ kmp_uint64 *original_ivs,
401	const kmp_iterations_t iterations,
402	kmp_index_t ind) {
403
404	auto iteration = iterations[ind];
405
406	kmp_uint64 temp =
407	bounds->lb0 +
408	bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) +
409	iteration * bounds->step;
410
411	// Now trim original iv according to its type:
412	original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
413	}
414
415	void kmp_calc_one_iv_rectang(const bounds_info_t *bounds,
416	/in/out/ kmp_uint64 *original_ivs,
417	const kmp_iterations_t iterations,
418	kmp_index_t ind) {
419
420	switch (bounds->loop_type) {
421	case loop_type_t::loop_type_int32:
422	kmp_calc_one_iv_rectang_XX<kmp_int32>(
423	bounds: (bounds_infoXX_template<kmp_int32> *)(bounds),
424	/in/out/ original_ivs, iterations, ind);
425	break;
426	case loop_type_t::loop_type_uint32:
427	kmp_calc_one_iv_rectang_XX<kmp_uint32>(
428	bounds: (bounds_infoXX_template<kmp_uint32> *)(bounds),
429	/in/out/ original_ivs, iterations, ind);
430	break;
431	case loop_type_t::loop_type_int64:
432	kmp_calc_one_iv_rectang_XX<kmp_int64>(
433	bounds: (bounds_infoXX_template<kmp_int64> *)(bounds),
434	/in/out/ original_ivs, iterations, ind);
435	break;
436	case loop_type_t::loop_type_uint64:
437	kmp_calc_one_iv_rectang_XX<kmp_uint64>(
438	bounds: (bounds_infoXX_template<kmp_uint64> *)(bounds),
439	/in/out/ original_ivs, iterations, ind);
440	break;
441	default:
442	KMP_ASSERT(false);
443	}
444	}
445
446	//----------------------------------------------------------------------------
447	// Rectangular loop nest
448	//----------------------------------------------------------------------------
449
450	//----------Canonicalize loop nest and calculate trip count-------------------
451
452	// Canonicalize loop nest and calculate overall trip count.
453	// "bounds_nest" has to be allocated per thread.
454	// API will modify original bounds_nest array to bring it to a canonical form
455	// (only <= and >=, no !=, <, >). If the original loop nest was already in a
456	// canonical form there will be no changes to bounds in bounds_nest array
457	// (only trip counts will be calculated).
458	// Returns trip count of overall space.
459	extern "C" kmp_loop_nest_iv_t
460	__kmpc_process_loop_nest_rectang(ident_t *loc, kmp_int32 gtid,
461	/in/out/ bounds_info_t *original_bounds_nest,
462	kmp_index_t n) {
463
464	kmp_canonicalize_loop_nest(loc, /in/out/ original_bounds_nest, n);
465
466	kmp_loop_nest_iv_t total = `1`;
467
468	for (kmp_index_t ind = `0`; ind < n; ++ind) {
469	auto bounds = &(original_bounds_nest[ind]);
470
471	kmp_loop_nest_iv_t trip_count = kmp_calculate_trip_count(/in/out/ bounds);
472	total *= trip_count;
473	}
474
475	return total;
476	}
477
478	//----------Calculate old induction variables---------------------------------
479
480	// Calculate old induction variables corresponding to overall new_iv.
481	// Note: original IV will be returned as if it had kmp_uint64 type,
482	// will have to be converted to original type in user code.
483	// Note: trip counts should be already calculated by
484	// __kmpc_process_loop_nest_rectang.
485	// OMPTODO: special case 2, 3 nested loops: either do different
486	// interface without array or possibly template this over n
487	extern "C" void
488	__kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv,
489	const bounds_info_t *original_bounds_nest,
490	/out/ kmp_uint64 *original_ivs,
491	kmp_index_t n) {
492
493	CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
494
495	// First, calc corresponding iteration in every original loop:
496	for (kmp_index_t ind = n; ind > `0`;) {
497	--ind;
498	auto bounds = &(original_bounds_nest[ind]);
499
500	// should be optimized to OPDIVREM:
501	auto temp = new_iv / bounds->trip_count;
502	auto iteration = new_iv % bounds->trip_count;
503	new_iv = temp;
504
505	iterations [ind] = iteration;
506	}
507	KMP_ASSERT(new_iv == `0`);
508
509	for (kmp_index_t ind = `0`; ind < n; ++ind) {
510	auto bounds = &(original_bounds_nest[ind]);
511
512	kmp_calc_one_iv_rectang(bounds, /in/out/ original_ivs, iterations, ind);
513	}
514	}
515
516	//----------------------------------------------------------------------------
517	// Non-rectangular loop nest
518	//----------------------------------------------------------------------------
519
520	//----------Calculate maximum possible span of iv values on one level---------
521
522	// Calculate span for IV on this loop level for "<=" case.
523	// Note: it's for <= on this loop nest level, so lower bound should be smallest
524	// value, upper bound should be the biggest value. If the loop won't execute,
525	// 'smallest' may be bigger than 'biggest', but we'd better not switch them
526	// around.
527	template <typename T>
528	void kmp_calc_span_lessoreq_XX(
529	/ in/out/ bounds_info_internalXX_template<T> *bounds,
530	/ in/out/ bounds_info_internal_t *bounds_nest) {
531
532	typedef typename traits_t<T>::unsigned_t UT;
533	// typedef typename traits_t<T>::signed_t ST;
534
535	// typedef typename big_span_t span_t;
536	typedef T span_t;
537
538	auto &bbounds = bounds->b;
539
540	if ((bbounds.lb1 != `0`) \|\| (bbounds.ub1 != `0`)) {
541	// This dimention depends on one of previous ones; can't be the outermost
542	// one.
543	bounds_info_internalXX_template<T> *previous =
544	reinterpret_cast<bounds_info_internalXX_template<T> *>(
545	&(bounds_nest[bbounds.outer_iv]));
546
547	// OMPTODO: assert that T is compatible with loop variable type on
548	// 'previous' loop
549
550	{
551	span_t bound_candidate1 =
552	bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
553	span_t bound_candidate2 =
554	bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
555	if (bound_candidate1 < bound_candidate2) {
556	bounds->span_smallest = bound_candidate1;
557	} else {
558	bounds->span_smallest = bound_candidate2;
559	}
560	}
561
562	{
563	// We can't adjust the upper bound with respect to step, because
564	// lower bound might be off after adjustments
565
566	span_t bound_candidate1 =
567	bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
568	span_t bound_candidate2 =
569	bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
570	if (bound_candidate1 < bound_candidate2) {
571	bounds->span_biggest = bound_candidate2;
572	} else {
573	bounds->span_biggest = bound_candidate1;
574	}
575	}
576	} else {
577	// Rectangular:
578	bounds->span_smallest = bbounds.lb0;
579	bounds->span_biggest = bbounds.ub0;
580	}
581	if (!bounds->loop_bounds_adjusted) {
582	// Here it's safe to reduce the space to the multiply of step.
583	// OMPTODO: check if the formular is correct.
584	// Also check if it would be safe to do this if we didn't adjust left side.
585	bounds->span_biggest -=
586	(static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?
587	}
588	}
589
590	// Calculate span for IV on this loop level for ">=" case.
591	template <typename T>
592	void kmp_calc_span_greateroreq_XX(
593	/ in/out/ bounds_info_internalXX_template<T> *bounds,
594	/ in/out/ bounds_info_internal_t *bounds_nest) {
595
596	typedef typename traits_t<T>::unsigned_t UT;
597	// typedef typename traits_t<T>::signed_t ST;
598
599	// typedef typename big_span_t span_t;
600	typedef T span_t;
601
602	auto &bbounds = bounds->b;
603
604	if ((bbounds.lb1 != `0`) \|\| (bbounds.ub1 != `0`)) {
605	// This dimention depends on one of previous ones; can't be the outermost
606	// one.
607	bounds_info_internalXX_template<T> *previous =
608	reinterpret_cast<bounds_info_internalXX_template<T> *>(
609	&(bounds_nest[bbounds.outer_iv]));
610
611	// OMPTODO: assert that T is compatible with loop variable type on
612	// 'previous' loop
613
614	{
615	span_t bound_candidate1 =
616	bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
617	span_t bound_candidate2 =
618	bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
619	if (bound_candidate1 >= bound_candidate2) {
620	bounds->span_smallest = bound_candidate1;
621	} else {
622	bounds->span_smallest = bound_candidate2;
623	}
624	}
625
626	{
627	// We can't adjust the upper bound with respect to step, because
628	// lower bound might be off after adjustments
629
630	span_t bound_candidate1 =
631	bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
632	span_t bound_candidate2 =
633	bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
634	if (bound_candidate1 >= bound_candidate2) {
635	bounds->span_biggest = bound_candidate2;
636	} else {
637	bounds->span_biggest = bound_candidate1;
638	}
639	}
640
641	} else {
642	// Rectangular:
643	bounds->span_biggest = bbounds.lb0;
644	bounds->span_smallest = bbounds.ub0;
645	}
646	if (!bounds->loop_bounds_adjusted) {
647	// Here it's safe to reduce the space to the multiply of step.
648	// OMPTODO: check if the formular is correct.
649	// Also check if it would be safe to do this if we didn't adjust left side.
650	bounds->span_biggest -=
651	(static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?
652	}
653	}
654
655	// Calculate maximum possible span for IV on this loop level.
656	template <typename T>
657	void kmp_calc_span_XX(
658	/ in/out/ bounds_info_internalXX_template<T> *bounds,
659	/ in/out/ bounds_info_internal_t *bounds_nest) {
660
661	if (bounds->b.comparison == comparison_t::comp_less_or_eq) {
662	kmp_calc_span_lessoreq_XX(/ in/out/ bounds, / in/out/ bounds_nest);
663	} else {
664	KMP_ASSERT(bounds->b.comparison == comparison_t::comp_greater_or_eq);
665	kmp_calc_span_greateroreq_XX(/ in/out/ bounds, / in/out/ bounds_nest);
666	}
667	}
668
669	//----------All initial processing of the loop nest---------------------------
670
671	// Calculate new bounds for this loop level.
672	// To be able to work with the nest we need to get it to a parallelepiped shape.
673	// We need to stay in the original range of values, so that there will be no
674	// overflow, for that we'll adjust both upper and lower bounds as needed.
675	template <typename T>
676	void kmp_calc_new_bounds_XX(
677	/ in/out/ bounds_info_internalXX_template<T> *bounds,
678	/ in/out/ bounds_info_internal_t *bounds_nest) {
679
680	auto &bbounds = bounds->b;
681
682	if (bbounds.lb1 == bbounds.ub1) {
683	// Already parallel, no need to adjust:
684	bounds->loop_bounds_adjusted = false;
685	} else {
686	bounds->loop_bounds_adjusted = true;
687
688	T old_lb1 = bbounds.lb1;
689	T old_ub1 = bbounds.ub1;
690
691	if (__kmp_sign(old_lb1) != __kmp_sign(old_ub1)) {
692	// With this shape we can adjust to a rectangle:
693	bbounds.lb1 = `0`;
694	bbounds.ub1 = `0`;
695	} else {
696	// get upper and lower bounds to be parallel
697	// with values in the old range.
698	// Note: abs didn't work here.
699	if (((old_lb1 < `0`) && (old_lb1 < old_ub1)) \|\|
700	((old_lb1 > `0`) && (old_lb1 > old_ub1))) {
701	bbounds.lb1 = old_ub1;
702	} else {
703	bbounds.ub1 = old_lb1;
704	}
705	}
706
707	// Now need to adjust lb0, ub0, otherwise in some cases space will shrink.
708	// The idea here that for this IV we are now getting the same span
709	// irrespective of the previous IV value.
710	bounds_info_internalXX_template<T> *previous =
711	reinterpret_cast<bounds_info_internalXX_template<T> *>(
712	&bounds_nest[bbounds.outer_iv]);
713
714	if (bbounds.comparison == comparison_t::comp_less_or_eq) {
715	if (old_lb1 < bbounds.lb1) {
716	KMP_ASSERT(old_lb1 < `0`);
717	// The length is good on outer_iv biggest number,
718	// can use it to find where to move the lower bound:
719
720	T sub = (bbounds.lb1 - old_lb1) * previous->span_biggest;
721	bbounds.lb0 -= sub; // OMPTODO: what if it'll go out of unsigned space?
722	// e.g. it was 0?? (same below)
723	} else if (old_lb1 > bbounds.lb1) {
724	// still need to move lower bound:
725	T add = (old_lb1 - bbounds.lb1) * previous->span_smallest;
726	bbounds.lb0 += add;
727	}
728
729	if (old_ub1 > bbounds.ub1) {
730	KMP_ASSERT(old_ub1 > `0`);
731	// The length is good on outer_iv biggest number,
732	// can use it to find where to move upper bound:
733
734	T add = (old_ub1 - bbounds.ub1) * previous->span_biggest;
735	bbounds.ub0 += add;
736	} else if (old_ub1 < bbounds.ub1) {
737	// still need to move upper bound:
738	T sub = (bbounds.ub1 - old_ub1) * previous->span_smallest;
739	bbounds.ub0 -= sub;
740	}
741	} else {
742	KMP_ASSERT(bbounds.comparison == comparison_t::comp_greater_or_eq);
743	if (old_lb1 < bbounds.lb1) {
744	KMP_ASSERT(old_lb1 < `0`);
745	T sub = (bbounds.lb1 - old_lb1) * previous->span_smallest;
746	bbounds.lb0 -= sub;
747	} else if (old_lb1 > bbounds.lb1) {
748	T add = (old_lb1 - bbounds.lb1) * previous->span_biggest;
749	bbounds.lb0 += add;
750	}
751
752	if (old_ub1 > bbounds.ub1) {
753	KMP_ASSERT(old_ub1 > `0`);
754	T add = (old_ub1 - bbounds.ub1) * previous->span_smallest;
755	bbounds.ub0 += add;
756	} else if (old_ub1 < bbounds.ub1) {
757	T sub = (bbounds.ub1 - old_ub1) * previous->span_biggest;
758	bbounds.ub0 -= sub;
759	}
760	}
761	}
762	}
763
764	// Do all processing for one canonicalized loop in the nest
765	// (assuming that outer loops already were processed):
766	template <typename T>
767	kmp_loop_nest_iv_t kmp_process_one_loop_XX(
768	/ in/out/ bounds_info_internalXX_template<T> *bounds,
769	/in/out/ bounds_info_internal_t *bounds_nest) {
770
771	kmp_calc_new_bounds_XX(/ in/out/ bounds, / in/out/ bounds_nest);
772	kmp_calc_span_XX(/ in/out/ bounds, / in/out/ bounds_nest);
773	return kmp_calculate_trip_count_XX(/in/out/ &(bounds->b));
774	}
775
776	// Non-rectangular loop nest, canonicalized to use <= or >=.
777	// Process loop nest to have a parallelepiped shape,
778	// calculate biggest spans for IV's on all levels and calculate overall trip
779	// count. "bounds_nest" has to be allocated per thread.
780	// Returns overall trip count (for adjusted space).
781	kmp_loop_nest_iv_t kmp_process_loop_nest(
782	/in/out/ bounds_info_internal_t *bounds_nest, kmp_index_t n) {
783
784	kmp_loop_nest_iv_t total = `1`;
785
786	for (kmp_index_t ind = `0`; ind < n; ++ind) {
787	auto bounds = &(bounds_nest[ind]);
788	kmp_loop_nest_iv_t trip_count = `0`;
789
790	switch (bounds->b.loop_type) {
791	case loop_type_t::loop_type_int32:
792	trip_count = kmp_process_one_loop_XX<kmp_int32>(
793	/in/out/ bounds: (bounds_info_internalXX_template<kmp_int32> *)(bounds),
794	/in/out/ bounds_nest);
795	break;
796	case loop_type_t::loop_type_uint32:
797	trip_count = kmp_process_one_loop_XX<kmp_uint32>(
798	/in/out/ bounds: (bounds_info_internalXX_template<kmp_uint32> *)(bounds),
799	/in/out/ bounds_nest);
800	break;
801	case loop_type_t::loop_type_int64:
802	trip_count = kmp_process_one_loop_XX<kmp_int64>(
803	/in/out/ bounds: (bounds_info_internalXX_template<kmp_int64> *)(bounds),
804	/in/out/ bounds_nest);
805	break;
806	case loop_type_t::loop_type_uint64:
807	trip_count = kmp_process_one_loop_XX<kmp_uint64>(
808	/in/out/ bounds: (bounds_info_internalXX_template<kmp_uint64> *)(bounds),
809	/in/out/ bounds_nest);
810	break;
811	default:
812	KMP_ASSERT(false);
813	}
814	total *= trip_count;
815	}
816
817	return total;
818	}
819
820	//----------Calculate iterations (in the original or updated space)-----------
821
822	// Calculate number of iterations in original or updated space resulting in
823	// original_ivs[ind] (only on this level, non-negative)
824	// (not counting initial iteration)
825	template <typename T>
826	kmp_loop_nest_iv_t
827	kmp_calc_number_of_iterations_XX(const bounds_infoXX_template<T> *bounds,
828	const kmp_point_t original_ivs,
829	kmp_index_t ind) {
830
831	kmp_loop_nest_iv_t iterations = `0`;
832
833	if (bounds->comparison == comparison_t::comp_less_or_eq) {
834	iterations =
835	(static_cast<T>(original_ivs[ind]) - bounds->lb0 -
836	bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv])) /
837	__kmp_abs(bounds->step);
838	} else {
839	KMP_DEBUG_ASSERT(bounds->comparison == comparison_t::comp_greater_or_eq);
840	iterations = (bounds->lb0 +
841	bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) -
842	static_cast<T>(original_ivs[ind])) /
843	__kmp_abs(bounds->step);
844	}
845
846	return iterations;
847	}
848
849	// Calculate number of iterations in the original or updated space resulting in
850	// original_ivs[ind] (only on this level, non-negative)
851	kmp_loop_nest_iv_t kmp_calc_number_of_iterations(const bounds_info_t *bounds,
852	const kmp_point_t original_ivs,
853	kmp_index_t ind) {
854
855	switch (bounds->loop_type) {
856	case loop_type_t::loop_type_int32:
857	return kmp_calc_number_of_iterations_XX<kmp_int32>(
858	bounds: (bounds_infoXX_template<kmp_int32> *)(bounds), original_ivs, ind);
859	break;
860	case loop_type_t::loop_type_uint32:
861	return kmp_calc_number_of_iterations_XX<kmp_uint32>(
862	bounds: (bounds_infoXX_template<kmp_uint32> *)(bounds), original_ivs, ind);
863	break;
864	case loop_type_t::loop_type_int64:
865	return kmp_calc_number_of_iterations_XX<kmp_int64>(
866	bounds: (bounds_infoXX_template<kmp_int64> *)(bounds), original_ivs, ind);
867	break;
868	case loop_type_t::loop_type_uint64:
869	return kmp_calc_number_of_iterations_XX<kmp_uint64>(
870	bounds: (bounds_infoXX_template<kmp_uint64> *)(bounds), original_ivs, ind);
871	break;
872	default:
873	KMP_ASSERT(false);
874	return `0`;
875	}
876	}
877
878	//----------Calculate new iv corresponding to original ivs--------------------
879
880	// We got a point in the original loop nest.
881	// Take updated bounds and calculate what new_iv will correspond to this point.
882	// When we are getting original IVs from new_iv, we have to adjust to fit into
883	// original loops bounds. Getting new_iv for the adjusted original IVs will help
884	// with making more chunks non-empty.
885	kmp_loop_nest_iv_t
886	kmp_calc_new_iv_from_original_ivs(const bounds_info_internal_t *bounds_nest,
887	const kmp_point_t original_ivs,
888	kmp_index_t n) {
889
890	kmp_loop_nest_iv_t new_iv = `0`;
891
892	for (kmp_index_t ind = `0`; ind < n; ++ind) {
893	auto bounds = &(bounds_nest[ind].b);
894
895	new_iv = new_iv * bounds->trip_count +
896	kmp_calc_number_of_iterations(bounds, original_ivs, ind);
897	}
898
899	return new_iv;
900	}
901
902	//----------Calculate original ivs for provided iterations--------------------
903
904	// Calculate original IVs for provided iterations, assuming iterations are
905	// calculated in the original space.
906	// Loop nest is in canonical form (with <= / >=).
907	bool kmp_calc_original_ivs_from_iterations(
908	const bounds_info_t *original_bounds_nest, kmp_index_t n,
909	/in/out/ kmp_point_t original_ivs,
910	/in/out/ kmp_iterations_t iterations, kmp_index_t ind) {
911
912	kmp_index_t lengthened_ind = n;
913
914	for (; ind < n;) {
915	auto bounds = &(original_bounds_nest[ind]);
916	bool good = kmp_calc_one_iv(bounds, /in/out/ original_ivs, iterations,
917	ind, start_with_lower_bound: (lengthened_ind < ind), checkBounds: true);
918
919	if (!good) {
920	// The calculated iv value is too big (or too small for >=):
921	if (ind == `0`) {
922	// Space is empty:
923	return false;
924	} else {
925	// Go to next iteration on the outer loop:
926	--ind;
927	++iterations[ind];
928	lengthened_ind = ind;
929	for (kmp_index_t i = ind + `1`; i < n; ++i) {
930	iterations[i] = `0`;
931	}
932	continue;
933	}
934	}
935	++ind;
936	}
937
938	return true;
939	}
940
941	//----------Calculate original ivs for the beginning of the loop nest---------
942
943	// Calculate IVs for the beginning of the loop nest.
944	// Note: lower bounds of all loops may not work -
945	// if on some of the iterations of the outer loops inner loops are empty.
946	// Loop nest is in canonical form (with <= / >=).
947	bool kmp_calc_original_ivs_for_start(const bounds_info_t *original_bounds_nest,
948	kmp_index_t n,
949	/out/ kmp_point_t original_ivs) {
950
951	// Iterations in the original space, multiplied by step:
952	CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
953	for (kmp_index_t ind = n; ind > `0`;) {
954	--ind;
955	iterations [ind] = `0`;
956	}
957
958	// Now calculate the point:
959	bool b = kmp_calc_original_ivs_from_iterations(original_bounds_nest, n,
960	/in/out/ original_ivs,
961	/in/out/ iterations, ind: `0`);
962	return b;
963	}
964
965	//----------Calculate next point in the original loop space-------------------
966
967	// From current set of original IVs calculate next point.
968	// Return false if there is no next point in the loop bounds.
969	bool kmp_calc_next_original_ivs(const bounds_info_t *original_bounds_nest,
970	kmp_index_t n, const kmp_point_t original_ivs,
971	/out/ kmp_point_t next_original_ivs) {
972	// Iterations in the original space, multiplied by step (so can be negative):
973	CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
974	// First, calc corresponding iteration in every original loop:
975	for (kmp_index_t ind = `0`; ind < n; ++ind) {
976	auto bounds = &(original_bounds_nest[ind]);
977	iterations [ind] = kmp_calc_number_of_iterations(bounds, original_ivs, ind);
978	}
979
980	for (kmp_index_t ind = `0`; ind < n; ++ind) {
981	next_original_ivs[ind] = original_ivs[ind];
982	}
983
984	// Next add one step to the iterations on the inner-most level, and see if we
985	// need to move up the nest:
986	kmp_index_t ind = n - `1`;
987	++iterations [ind];
988
989	bool b = kmp_calc_original_ivs_from_iterations(
990	original_bounds_nest, n, /in/out/ original_ivs: next_original_ivs, iterations, ind);
991
992	return b;
993	}
994
995	//----------Calculate chunk end in the original loop space--------------------
996
997	// For one level calculate old induction variable corresponding to overall
998	// new_iv for the chunk end.
999	// Return true if it fits into upper bound on this level
1000	// (if not, we need to re-calculate)
1001	template <typename T>
1002	bool kmp_calc_one_iv_for_chunk_end_XX(
1003	const bounds_infoXX_template<T> *bounds,
1004	const bounds_infoXX_template<T> *updated_bounds,
1005	/in/out/ kmp_point_t original_ivs, const kmp_iterations_t iterations,
1006	kmp_index_t ind, bool start_with_lower_bound, bool compare_with_start,
1007	const kmp_point_t original_ivs_start) {
1008
1009	// typedef std::conditional<std::is_signed<T>::value, kmp_int64, kmp_uint64>
1010	// big_span_t;
1011
1012	// OMPTODO: is it good enough, or do we need ST or do we need big_span_t?
1013	T temp = `0`;
1014
1015	T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);
1016
1017	if (start_with_lower_bound) {
1018	// we moved to the next iteration on one of outer loops, may as well use
1019	// the lower bound here:
1020	temp = bounds->lb0 + bounds->lb1 * outer_iv;
1021	} else {
1022	// Start in expanded space, but:
1023	// - we need to hit original space lower bound, so need to account for
1024	// that
1025	// - we have to go into original space, even if that means adding more
1026	// iterations than was planned
1027	// - we have to go past (or equal to) previous point (which is the chunk
1028	// starting point)
1029
1030	auto iteration = iterations[ind];
1031
1032	auto step = bounds->step;
1033
1034	// In case of >= it's negative:
1035	auto accountForStep =
1036	((bounds->lb0 + bounds->lb1 * outer_iv) -
1037	(updated_bounds->lb0 + updated_bounds->lb1 * outer_iv)) %
1038	step;
1039
1040	temp = updated_bounds->lb0 + updated_bounds->lb1 * outer_iv +
1041	accountForStep + iteration * step;
1042
1043	if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
1044	(temp < (bounds->lb0 + bounds->lb1 * outer_iv))) \|\|
1045	((bounds->comparison == comparison_t::comp_greater_or_eq) &&
1046	(temp > (bounds->lb0 + bounds->lb1 * outer_iv)))) {
1047	// Too small (or too big), didn't reach the original lower bound. Use
1048	// heuristic:
1049	temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration / `2` * step;
1050	}
1051
1052	if (compare_with_start) {
1053
1054	T start = static_cast<T>(original_ivs_start[ind]);
1055
1056	temp = kmp_fix_iv(bounds->loop_iv_type, temp);
1057
1058	// On all previous levels start of the chunk is same as the end, need to
1059	// be really careful here:
1060	if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
1061	(temp < start)) \|\|
1062	((bounds->comparison == comparison_t::comp_greater_or_eq) &&
1063	(temp > start))) {
1064	// End of the chunk can't be smaller (for >= bigger) than it's start.
1065	// Use heuristic:
1066	temp = start + iteration / `4` * step;
1067	}
1068	}
1069	}
1070
1071	original_ivs[ind] = temp = kmp_fix_iv(bounds->loop_iv_type, temp);
1072
1073	if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
1074	(temp > (bounds->ub0 + bounds->ub1 * outer_iv))) \|\|
1075	((bounds->comparison == comparison_t::comp_greater_or_eq) &&
1076	(temp < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
1077	// Too big (or too small for >=).
1078	return false;
1079	}
1080
1081	return true;
1082	}
1083
1084	// For one level calculate old induction variable corresponding to overall
1085	// new_iv for the chunk end.
1086	bool kmp_calc_one_iv_for_chunk_end(const bounds_info_t *bounds,
1087	const bounds_info_t *updated_bounds,
1088	/in/out/ kmp_point_t original_ivs,
1089	const kmp_iterations_t iterations,
1090	kmp_index_t ind, bool start_with_lower_bound,
1091	bool compare_with_start,
1092	const kmp_point_t original_ivs_start) {
1093
1094	switch (bounds->loop_type) {
1095	case loop_type_t::loop_type_int32:
1096	return kmp_calc_one_iv_for_chunk_end_XX<kmp_int32>(
1097	bounds: (bounds_infoXX_template<kmp_int32> *)(bounds),
1098	updated_bounds: (bounds_infoXX_template<kmp_int32> *)(updated_bounds),
1099	/in/out/
1100	original_ivs, iterations, ind, start_with_lower_bound,
1101	compare_with_start, original_ivs_start);
1102	break;
1103	case loop_type_t::loop_type_uint32:
1104	return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint32>(
1105	bounds: (bounds_infoXX_template<kmp_uint32> *)(bounds),
1106	updated_bounds: (bounds_infoXX_template<kmp_uint32> *)(updated_bounds),
1107	/in/out/
1108	original_ivs, iterations, ind, start_with_lower_bound,
1109	compare_with_start, original_ivs_start);
1110	break;
1111	case loop_type_t::loop_type_int64:
1112	return kmp_calc_one_iv_for_chunk_end_XX<kmp_int64>(
1113	bounds: (bounds_infoXX_template<kmp_int64> *)(bounds),
1114	updated_bounds: (bounds_infoXX_template<kmp_int64> *)(updated_bounds),
1115	/in/out/
1116	original_ivs, iterations, ind, start_with_lower_bound,
1117	compare_with_start, original_ivs_start);
1118	break;
1119	case loop_type_t::loop_type_uint64:
1120	return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint64>(
1121	bounds: (bounds_infoXX_template<kmp_uint64> *)(bounds),
1122	updated_bounds: (bounds_infoXX_template<kmp_uint64> *)(updated_bounds),
1123	/in/out/
1124	original_ivs, iterations, ind, start_with_lower_bound,
1125	compare_with_start, original_ivs_start);
1126	break;
1127	default:
1128	KMP_ASSERT(false);
1129	return false;
1130	}
1131	}
1132
1133	// Calculate old induction variables corresponding to overall new_iv for the
1134	// chunk end. If due to space extension we are getting old IVs outside of the
1135	// boundaries, bring them into the boundaries. Need to do this in the runtime,
1136	// esp. on the lower bounds side. When getting result need to make sure that the
1137	// new chunk starts at next position to old chunk, not overlaps with it (this is
1138	// done elsewhere), and need to make sure end of the chunk is further than the
1139	// beginning of the chunk. We don't need an exact ending point here, just
1140	// something more-or-less close to the desired chunk length, bigger is fine
1141	// (smaller would be fine, but we risk going into infinite loop, so do smaller
1142	// only at the very end of the space). result: false if could not find the
1143	// ending point in the original loop space. In this case the caller can use
1144	// original upper bounds as the end of the chunk. Chunk won't be empty, because
1145	// it'll have at least the starting point, which is by construction in the
1146	// original space.
1147	bool kmp_calc_original_ivs_for_chunk_end(
1148	const bounds_info_t *original_bounds_nest, kmp_index_t n,
1149	const bounds_info_internal_t *updated_bounds_nest,
1150	const kmp_point_t original_ivs_start, kmp_loop_nest_iv_t new_iv,
1151	/out/ kmp_point_t original_ivs) {
1152
1153	// Iterations in the expanded space:
1154	CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
1155	// First, calc corresponding iteration in every modified loop:
1156	for (kmp_index_t ind = n; ind > `0`;) {
1157	--ind;
1158	auto &updated_bounds = updated_bounds_nest[ind];
1159
1160	// should be optimized to OPDIVREM:
1161	auto new_ind = new_iv / updated_bounds.b.trip_count;
1162	auto iteration = new_iv % updated_bounds.b.trip_count;
1163
1164	new_iv = new_ind;
1165	iterations [ind] = iteration;
1166	}
1167	KMP_DEBUG_ASSERT(new_iv == `0`);
1168
1169	kmp_index_t lengthened_ind = n;
1170	kmp_index_t equal_ind = -`1`;
1171
1172	// Next calculate the point, but in original loop nest.
1173	for (kmp_index_t ind = `0`; ind < n;) {
1174	auto bounds = &(original_bounds_nest[ind]);
1175	auto updated_bounds = &(updated_bounds_nest[ind].b);
1176
1177	bool good = kmp_calc_one_iv_for_chunk_end(
1178	bounds, updated_bounds,
1179	/in/out/ original_ivs, iterations, ind, start_with_lower_bound: (lengthened_ind < ind),
1180	compare_with_start: (equal_ind >= ind - `1`), original_ivs_start);
1181
1182	if (!good) {
1183	// Too big (or too small for >=).
1184	if (ind == `0`) {
1185	// Need to reduce to the end.
1186	return false;
1187	} else {
1188	// Go to next iteration on outer loop:
1189	--ind;
1190	++(iterations [ind]);
1191	lengthened_ind = ind;
1192	if (equal_ind >= lengthened_ind) {
1193	// We've changed the number of iterations here,
1194	// can't be same anymore:
1195	equal_ind = lengthened_ind - `1`;
1196	}
1197	for (kmp_index_t i = ind + `1`; i < n; ++i) {
1198	iterations [i] = `0`;
1199	}
1200	continue;
1201	}
1202	}
1203
1204	if ((equal_ind == ind - `1`) &&
1205	(kmp_ivs_eq(loop_iv_type: bounds->loop_iv_type, original_iv1: original_ivs[ind],
1206	original_iv2: original_ivs_start[ind]))) {
1207	equal_ind = ind;
1208	} else if ((equal_ind > ind - `1`) &&
1209	!(kmp_ivs_eq(loop_iv_type: bounds->loop_iv_type, original_iv1: original_ivs[ind],
1210	original_iv2: original_ivs_start[ind]))) {
1211	equal_ind = ind - `1`;
1212	}
1213	++ind;
1214	}
1215
1216	return true;
1217	}
1218
1219	//----------Calculate upper bounds for the last chunk-------------------------
1220
1221	// Calculate one upper bound for the end.
1222	template <typename T>
1223	void kmp_calc_one_iv_end_XX(const bounds_infoXX_template<T> *bounds,
1224	/in/out/ kmp_point_t original_ivs,
1225	kmp_index_t ind) {
1226
1227	T temp = bounds->ub0 +
1228	bounds->ub1 * static_cast<T>(original_ivs[bounds->outer_iv]);
1229
1230	original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
1231	}
1232
1233	void kmp_calc_one_iv_end(const bounds_info_t *bounds,
1234	/in/out/ kmp_point_t original_ivs, kmp_index_t ind) {
1235
1236	switch (bounds->loop_type) {
1237	default:
1238	KMP_ASSERT(false);
1239	break;
1240	case loop_type_t::loop_type_int32:
1241	kmp_calc_one_iv_end_XX<kmp_int32>(
1242	bounds: (bounds_infoXX_template<kmp_int32> *)(bounds),
1243	/in/out/ original_ivs, ind);
1244	break;
1245	case loop_type_t::loop_type_uint32:
1246	kmp_calc_one_iv_end_XX<kmp_uint32>(
1247	bounds: (bounds_infoXX_template<kmp_uint32> *)(bounds),
1248	/in/out/ original_ivs, ind);
1249	break;
1250	case loop_type_t::loop_type_int64:
1251	kmp_calc_one_iv_end_XX<kmp_int64>(
1252	bounds: (bounds_infoXX_template<kmp_int64> *)(bounds),
1253	/in/out/ original_ivs, ind);
1254	break;
1255	case loop_type_t::loop_type_uint64:
1256	kmp_calc_one_iv_end_XX<kmp_uint64>(
1257	bounds: (bounds_infoXX_template<kmp_uint64> *)(bounds),
1258	/in/out/ original_ivs, ind);
1259	break;
1260	}
1261	}
1262
1263	// Calculate upper bounds for the last loop iteration. Just use original upper
1264	// bounds (adjusted when canonicalized to use <= / >=). No need to check that
1265	// this point is in the original space (it's likely not)
1266	void kmp_calc_original_ivs_for_end(
1267	const bounds_info_t *const original_bounds_nest, kmp_index_t n,
1268	/out/ kmp_point_t original_ivs) {
1269	for (kmp_index_t ind = `0`; ind < n; ++ind) {
1270	auto bounds = &(original_bounds_nest[ind]);
1271	kmp_calc_one_iv_end(bounds, /in/out/ original_ivs, ind);
1272	}
1273	}
1274
1275	/**************************************************************************
1276	* Identify nested loop structure - loops come in the canonical form
1277	* Lower triangle matrix: i = 0; i <= N; i++ {0,0}:{N,0}
1278	* j = 0; j <= 0/-1+1*i; j++ {0,0}:{0/-1,1}
1279	* Upper Triangle matrix
1280	* i = 0; i <= N; i++ {0,0}:{N,0}
1281	* j = 0+1*i; j <= N; j++ {0,1}:{N,0}
1282	* ************************************************************************/
1283	nested_loop_type_t
1284	kmp_identify_nested_loop_structure(/in/ bounds_info_t *original_bounds_nest,
1285	/in/ kmp_index_t n) {
1286	// only 2-level nested loops are supported
1287	if (n != `2`) {
1288	return nested_loop_type_unkown;
1289	}
1290	// loops must be canonical
1291	KMP_ASSERT(
1292	(original_bounds_nest[`0`].comparison == comparison_t::comp_less_or_eq) &&
1293	(original_bounds_nest[`1`].comparison == comparison_t::comp_less_or_eq));
1294	// check outer loop bounds: for triangular need to be {0,0}:{N,0}
1295	kmp_uint64 outer_lb0_u64 = kmp_fix_iv(loop_iv_type: original_bounds_nest[`0`].loop_iv_type,
1296	original_iv: original_bounds_nest[`0`].lb0_u64);
1297	kmp_uint64 outer_ub0_u64 = kmp_fix_iv(loop_iv_type: original_bounds_nest[`0`].loop_iv_type,
1298	original_iv: original_bounds_nest[`0`].ub0_u64);
1299	kmp_uint64 outer_lb1_u64 = kmp_fix_iv(loop_iv_type: original_bounds_nest[`0`].loop_iv_type,
1300	original_iv: original_bounds_nest[`0`].lb1_u64);
1301	kmp_uint64 outer_ub1_u64 = kmp_fix_iv(loop_iv_type: original_bounds_nest[`0`].loop_iv_type,
1302	original_iv: original_bounds_nest[`0`].ub1_u64);
1303	if (outer_lb0_u64 != `0` \|\| outer_lb1_u64 != `0` \|\| outer_ub1_u64 != `0`) {
1304	return nested_loop_type_unkown;
1305	}
1306	// check inner bounds to determine triangle type
1307	kmp_uint64 inner_lb0_u64 = kmp_fix_iv(loop_iv_type: original_bounds_nest[`1`].loop_iv_type,
1308	original_iv: original_bounds_nest[`1`].lb0_u64);
1309	kmp_uint64 inner_ub0_u64 = kmp_fix_iv(loop_iv_type: original_bounds_nest[`1`].loop_iv_type,
1310	original_iv: original_bounds_nest[`1`].ub0_u64);
1311	kmp_uint64 inner_lb1_u64 = kmp_fix_iv(loop_iv_type: original_bounds_nest[`1`].loop_iv_type,
1312	original_iv: original_bounds_nest[`1`].lb1_u64);
1313	kmp_uint64 inner_ub1_u64 = kmp_fix_iv(loop_iv_type: original_bounds_nest[`1`].loop_iv_type,
1314	original_iv: original_bounds_nest[`1`].ub1_u64);
1315	// lower triangle loop inner bounds need to be {0,0}:{0/-1,1}
1316	if (inner_lb0_u64 == `0` && inner_lb1_u64 == `0` &&
1317	(inner_ub0_u64 == `0` \|\| inner_ub0_u64 == -`1`) && inner_ub1_u64 == `1`) {
1318	return nested_loop_type_lower_triangular_matrix;
1319	}
1320	// upper triangle loop inner bounds need to be {0,1}:{N,0}
1321	if (inner_lb0_u64 == `0` && inner_lb1_u64 == `1` &&
1322	inner_ub0_u64 == outer_ub0_u64 && inner_ub1_u64 == `0`) {
1323	return nested_loop_type_upper_triangular_matrix;
1324	}
1325	return nested_loop_type_unkown;
1326	}
1327
/**************************************************************************
 * SQRT Approximation: https://math.mit.edu/~stevenj/18.335/newton-sqrt.pdf
 * Newton iteration started from x itself, so for x >= 1 the approximation
 * approaches sqrt(x) from above and the result is never below sqrt(x)
 * (up to floating point rounding).
 * The iteration stops as soon as two consecutive approximations differ by
 * no more than level_of_precision (0.1).
 * x == 0 is answered directly with 0: the iteration would otherwise divide
 * 0 by 0 and return NaN.
 * ************************************************************************/
#define level_of_precision 0.1
double sqrt_newton_approx(/*in*/ kmp_uint64 x) {
  if (x == 0) {
    // sqrt(0) == 0; the first Newton step below would compute 0 / 0.
    return 0.;
  }
  double sqrt_old = 0.;
  double sqrt_new = (double)x;
  do {
    sqrt_old = sqrt_new;
    // Newton step for f(s) = s^2 - x:
    sqrt_new = (sqrt_old + x / sqrt_old) / 2;
  } while ((sqrt_old - sqrt_new) > level_of_precision);
  return sqrt_new;
}
1343
1344	/**************************************************************************
1345	* Handle lower triangle matrix in the canonical form
1346	* i = 0; i <= N; i++ {0,0}:{N,0}
1347	* j = 0; j <= 0/-1 + 1*i; j++ {0,0}:{0/-1,1}
1348	* ************************************************************************/
1349	void kmp_handle_lower_triangle_matrix(
1350	/in/ kmp_uint32 nth,
1351	/in/ kmp_uint32 tid,
1352	/in / kmp_index_t n,
1353	/in/out/ bounds_info_t *original_bounds_nest,
1354	/out/ bounds_info_t *chunk_bounds_nest) {
1355
1356	// transfer loop types from the original loop to the chunks
1357	for (kmp_index_t i = `0`; i < n; ++i) {
1358	chunk_bounds_nest[i] = original_bounds_nest[i];
1359	}
1360	// cleanup iv variables
1361	kmp_uint64 outer_ub0 = kmp_fix_iv(loop_iv_type: original_bounds_nest[`0`].loop_iv_type,
1362	original_iv: original_bounds_nest[`0`].ub0_u64);
1363	kmp_uint64 outer_lb0 = kmp_fix_iv(loop_iv_type: original_bounds_nest[`0`].loop_iv_type,
1364	original_iv: original_bounds_nest[`0`].lb0_u64);
1365	kmp_uint64 inner_ub0 = kmp_fix_iv(loop_iv_type: original_bounds_nest[`1`].loop_iv_type,
1366	original_iv: original_bounds_nest[`1`].ub0_u64);
1367	// calculate the chunk's lower and upper bounds
1368	// the total number of iterations in the loop is the sum of the arithmetic
1369	// progression from the outer lower to outer upper bound (inclusive since the
1370	// loop is canonical) note that less_than inner loops (inner_ub0 = -1)
1371	// effectively make the progression 1-based making N = (outer_ub0 - inner_lb0
1372	// + 1) -> N - 1
1373	kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + `1`) + inner_ub0;
1374	kmp_uint64 iter_total = outer_iters * (outer_iters + `1`) / `2`;
1375	// the current thread's number of iterations:
1376	// each thread gets an equal number of iterations: total number of iterations
1377	// divided by the number of threads plus, if there's a remainder,
1378	// the first threads with the number up to the remainder get an additional
1379	// iteration each to cover it
1380	kmp_uint64 iter_current =
1381	iter_total / nth + ((tid < (iter_total % nth)) ? `1` : `0`);
1382	// cumulative number of iterations executed by all the previous threads:
1383	// threads with the tid below the remainder will have (iter_total/nth+1)
1384	// elements, and so will all threads before them so the cumulative number of
1385	// iterations executed by the all previous will be the current thread's number
1386	// of iterations multiplied by the number of previous threads which is equal
1387	// to the current thread's tid; threads with the number equal or above the
1388	// remainder will have (iter_total/nth) elements so the cumulative number of
1389	// iterations previously executed is its number of iterations multipled by the
1390	// number of previous threads which is again equal to the current thread's tid
1391	// PLUS all the remainder iterations that will have been executed by the
1392	// previous threads
1393	kmp_uint64 iter_before_current =
1394	tid * iter_current + ((tid < iter_total % nth) ? `0` : (iter_total % nth));
1395	// cumulative number of iterations executed with the current thread is
1396	// the cumulative number executed before it plus its own
1397	kmp_uint64 iter_with_current = iter_before_current + iter_current;
1398	// calculate the outer loop lower bound (lbo) which is the max outer iv value
1399	// that gives the number of iterations that is equal or just below the total
1400	// number of iterations executed by the previous threads, for less_than
1401	// (1-based) inner loops (inner_ub0 == -1) it will be i.e.
1402	// lbo(lbo-1)/2<=iter_before_current => lbo^2-lbo-2iter_before_current<=0
1403	// for less_than_equal (0-based) inner loops (inner_ub == 0) it will be:
1404	// i.e. lbo(lbo+1)/2<=iter_before_current =>*
1405	// lbo^2+lbo-2iter_before_current<=0 both cases can be handled similarily*
1406	// using a parameter to control the equation sign
1407	kmp_int64 inner_adjustment = `1` + `2` * inner_ub0;
1408	kmp_uint64 lower_bound_outer =
1409	(kmp_uint64)(sqrt_newton_approx(x: inner_adjustment * inner_adjustment +
1410	`8` * iter_before_current) +
1411	inner_adjustment) /
1412	`2` -
1413	inner_adjustment;
1414	// calculate the inner loop lower bound which is the remaining number of
1415	// iterations required to hit the total number of iterations executed by the
1416	// previous threads giving the starting point of this thread
1417	kmp_uint64 lower_bound_inner =
1418	iter_before_current -
1419	((lower_bound_outer + inner_adjustment) * lower_bound_outer) / `2`;
1420	// calculate the outer loop upper bound using the same approach as for the
1421	// inner bound except using the total number of iterations executed with the
1422	// current thread
1423	kmp_uint64 upper_bound_outer =
1424	(kmp_uint64)(sqrt_newton_approx(x: inner_adjustment * inner_adjustment +
1425	`8` * iter_with_current) +
1426	inner_adjustment) /
1427	`2` -
1428	inner_adjustment;
1429	// calculate the inner loop upper bound which is the remaining number of
1430	// iterations required to hit the total number of iterations executed after
1431	// the current thread giving the starting point of the next thread
1432	kmp_uint64 upper_bound_inner =
1433	iter_with_current -
1434	((upper_bound_outer + inner_adjustment) * upper_bound_outer) / `2`;
1435	// adjust the upper bounds down by 1 element to point at the last iteration of
1436	// the current thread the first iteration of the next thread
1437	if (upper_bound_inner == `0`) {
1438	// {n,0} => {n-1,n-1}
1439	upper_bound_outer -= `1`;
1440	upper_bound_inner = upper_bound_outer;
1441	} else {
1442	// {n,m} => {n,m-1} (m!=0)
1443	upper_bound_inner -= `1`;
1444	}
1445
1446	// assign the values, zeroing out lb1 and ub1 values since the iteration space
1447	// is now one-dimensional
1448	chunk_bounds_nest[`0`].lb0_u64 = lower_bound_outer;
1449	chunk_bounds_nest[`1`].lb0_u64 = lower_bound_inner;
1450	chunk_bounds_nest[`0`].ub0_u64 = upper_bound_outer;
1451	chunk_bounds_nest[`1`].ub0_u64 = upper_bound_inner;
1452	chunk_bounds_nest[`0`].lb1_u64 = `0`;
1453	chunk_bounds_nest[`0`].ub1_u64 = `0`;
1454	chunk_bounds_nest[`1`].lb1_u64 = `0`;
1455	chunk_bounds_nest[`1`].ub1_u64 = `0`;
1456
1457	#if 0
1458	printf("tid/nth = %d/%d : From [%llu, %llu] To [%llu, %llu] : Chunks %llu/%llu\n",
1459	tid, nth, chunk_bounds_nest[`0`].lb0_u64, chunk_bounds_nest[`1`].lb0_u64,
1460	chunk_bounds_nest[`0`].ub0_u64, chunk_bounds_nest[`1`].ub0_u64, iter_current, iter_total);
1461	#endif
1462	}
1463
1464	/**************************************************************************
1465	* Handle upper triangle matrix in the canonical form
1466	* i = 0; i <= N; i++ {0,0}:{N,0}
1467	* j = 0+1*i; j <= N; j++ {0,1}:{N,0}
1468	* ************************************************************************/
1469	void kmp_handle_upper_triangle_matrix(
1470	/in/ kmp_uint32 nth,
1471	/in/ kmp_uint32 tid,
1472	/in / kmp_index_t n,
1473	/in/out/ bounds_info_t *original_bounds_nest,
1474	/out/ bounds_info_t *chunk_bounds_nest) {
1475
1476	// transfer loop types from the original loop to the chunks
1477	for (kmp_index_t i = `0`; i < n; ++i) {
1478	chunk_bounds_nest[i] = original_bounds_nest[i];
1479	}
1480	// cleanup iv variables
1481	kmp_uint64 outer_ub0 = kmp_fix_iv(loop_iv_type: original_bounds_nest[`0`].loop_iv_type,
1482	original_iv: original_bounds_nest[`0`].ub0_u64);
1483	kmp_uint64 outer_lb0 = kmp_fix_iv(loop_iv_type: original_bounds_nest[`0`].loop_iv_type,
1484	original_iv: original_bounds_nest[`0`].lb0_u64);
1485	[[maybe_unused]] kmp_uint64 inner_ub0 = kmp_fix_iv(
1486	loop_iv_type: original_bounds_nest[`1`].loop_iv_type, original_iv: original_bounds_nest[`1`].ub0_u64);
1487	// calculate the chunk's lower and upper bounds
1488	// the total number of iterations in the loop is the sum of the arithmetic
1489	// progression from the outer lower to outer upper bound (inclusive since the
1490	// loop is canonical) note that less_than inner loops (inner_ub0 = -1)
1491	// effectively make the progression 1-based making N = (outer_ub0 - inner_lb0
1492	// + 1) -> N - 1
1493	kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + `1`);
1494	kmp_uint64 iter_total = outer_iters * (outer_iters + `1`) / `2`;
1495	// the current thread's number of iterations:
1496	// each thread gets an equal number of iterations: total number of iterations
1497	// divided by the number of threads plus, if there's a remainder,
1498	// the first threads with the number up to the remainder get an additional
1499	// iteration each to cover it
1500	kmp_uint64 iter_current =
1501	iter_total / nth + ((tid < (iter_total % nth)) ? `1` : `0`);
1502	// cumulative number of iterations executed by all the previous threads:
1503	// threads with the tid below the remainder will have (iter_total/nth+1)
1504	// elements, and so will all threads before them so the cumulative number of
1505	// iterations executed by the all previous will be the current thread's number
1506	// of iterations multiplied by the number of previous threads which is equal
1507	// to the current thread's tid; threads with the number equal or above the
1508	// remainder will have (iter_total/nth) elements so the cumulative number of
1509	// iterations previously executed is its number of iterations multipled by the
1510	// number of previous threads which is again equal to the current thread's tid
1511	// PLUS all the remainder iterations that will have been executed by the
1512	// previous threads
1513	kmp_uint64 iter_before_current =
1514	tid * iter_current + ((tid < iter_total % nth) ? `0` : (iter_total % nth));
1515	// cumulative number of iterations executed with the current thread is
1516	// the cumulative number executed before it plus its own
1517	kmp_uint64 iter_with_current = iter_before_current + iter_current;
1518	// calculate the outer loop lower bound (lbo) which is the max outer iv value
1519	// that gives the number of iterations that is equal or just below the total
1520	// number of iterations executed by the previous threads:
1521	// lbo(lbo+1)/2<=iter_before_current =>*
1522	// lbo^2+lbo-2iter_before_current<=0*
1523	kmp_uint64 lower_bound_outer =
1524	(kmp_uint64)(sqrt_newton_approx(x: `1` + `8` * iter_before_current) + `1`) / `2` - `1`;
1525	// calculate the inner loop lower bound which is the remaining number of
1526	// iterations required to hit the total number of iterations executed by the
1527	// previous threads giving the starting point of this thread
1528	kmp_uint64 lower_bound_inner =
1529	iter_before_current - ((lower_bound_outer + `1`) * lower_bound_outer) / `2`;
1530	// calculate the outer loop upper bound using the same approach as for the
1531	// inner bound except using the total number of iterations executed with the
1532	// current thread
1533	kmp_uint64 upper_bound_outer =
1534	(kmp_uint64)(sqrt_newton_approx(x: `1` + `8` * iter_with_current) + `1`) / `2` - `1`;
1535	// calculate the inner loop upper bound which is the remaining number of
1536	// iterations required to hit the total number of iterations executed after
1537	// the current thread giving the starting point of the next thread
1538	kmp_uint64 upper_bound_inner =
1539	iter_with_current - ((upper_bound_outer + `1`) * upper_bound_outer) / `2`;
1540	// adjust the upper bounds down by 1 element to point at the last iteration of
1541	// the current thread the first iteration of the next thread
1542	if (upper_bound_inner == `0`) {
1543	// {n,0} => {n-1,n-1}
1544	upper_bound_outer -= `1`;
1545	upper_bound_inner = upper_bound_outer;
1546	} else {
1547	// {n,m} => {n,m-1} (m!=0)
1548	upper_bound_inner -= `1`;
1549	}
1550
1551	// assign the values, zeroing out lb1 and ub1 values since the iteration space
1552	// is now one-dimensional
1553	chunk_bounds_nest[`0`].lb0_u64 = (outer_iters - `1`) - upper_bound_outer;
1554	chunk_bounds_nest[`1`].lb0_u64 = (outer_iters - `1`) - upper_bound_inner;
1555	chunk_bounds_nest[`0`].ub0_u64 = (outer_iters - `1`) - lower_bound_outer;
1556	chunk_bounds_nest[`1`].ub0_u64 = (outer_iters - `1`) - lower_bound_inner;
1557	chunk_bounds_nest[`0`].lb1_u64 = `0`;
1558	chunk_bounds_nest[`0`].ub1_u64 = `0`;
1559	chunk_bounds_nest[`1`].lb1_u64 = `0`;
1560	chunk_bounds_nest[`1`].ub1_u64 = `0`;
1561
1562	#if 0
1563	printf("tid/nth = %d/%d : From [%llu, %llu] To [%llu, %llu] : Chunks %llu/%llu\n",
1564	tid, nth, chunk_bounds_nest[`0`].lb0_u64, chunk_bounds_nest[`1`].lb0_u64,
1565	chunk_bounds_nest[`0`].ub0_u64, chunk_bounds_nest[`1`].ub0_u64, iter_current, iter_total);
1566	#endif
1567	}
1568	//----------Init API for non-rectangular loops--------------------------------
1569
1570	// Init API for collapsed loops (static, no chunks defined).
1571	// "bounds_nest" has to be allocated per thread.
1572	// API will modify original bounds_nest array to bring it to a canonical form
1573	// (only <= and >=, no !=, <, >). If the original loop nest was already in a
1574	// canonical form there will be no changes to bounds in bounds_nest array
1575	// (only trip counts will be calculated). Internally API will expand the space
1576	// to parallelogram/parallelepiped, calculate total, calculate bounds for the
1577	// chunks in terms of the new IV, re-calc them in terms of old IVs (especially
1578	// important on the left side, to hit the lower bounds and not step over), and
1579	// pick the correct chunk for this thread (so it will calculate chunks up to the
1580	// needed one). It could be optimized to calculate just this chunk, potentially
1581	// a bit less well distributed among threads. It is designed to make sure that
1582	// threads will receive predictable chunks, deterministically (so that next nest
1583	// of loops with similar characteristics will get exactly same chunks on same
1584	// threads).
1585	// Current contract: chunk_bounds_nest has only lb0 and ub0,
1586	// lb1 and ub1 are set to 0 and can be ignored. (This may change in the future).
1587	extern "C" kmp_int32
1588	__kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
1589	/in/out/ bounds_info_t *original_bounds_nest,
1590	/out/ bounds_info_t *chunk_bounds_nest,
1591	kmp_index_t n, /out/ kmp_int32 *plastiter) {
1592
1593	KMP_DEBUG_ASSERT(plastiter && original_bounds_nest);
1594	KE_TRACE(`10`, ("__kmpc_for_collapsed_init called (%d)\n", gtid));
1595
1596	if (__kmp_env_consistency_check) {
1597	__kmp_push_workshare(gtid, ct: ct_pdo, ident: loc);
1598	}
1599
1600	kmp_canonicalize_loop_nest(loc, /in/out/ original_bounds_nest, n);
1601
1602	CollapseAllocator<bounds_info_internal_t> updated_bounds_nest(n);
1603
1604	for (kmp_index_t i = `0`; i < n; ++i) {
1605	updated_bounds_nest [i].b = original_bounds_nest[i];
1606	}
1607
1608	kmp_loop_nest_iv_t total =
1609	kmp_process_loop_nest(/in/out/ bounds_nest: updated_bounds_nest, n);
1610
1611	if (plastiter != NULL) {
1612	*plastiter = FALSE;
1613	}
1614
1615	if (total == `0`) {
1616	// Loop won't execute:
1617	return FALSE;
1618	}
1619
1620	// OMPTODO: DISTRIBUTE is not supported yet
1621	__kmp_assert_valid_gtid(gtid);
1622	kmp_uint32 tid = __kmp_tid_from_gtid(gtid);
1623
1624	kmp_info_t *th = __kmp_threads[gtid];
1625	kmp_team_t *team = th->th.th_team;
1626	kmp_uint32 nth = team->t.t_nproc; // Number of threads
1627
1628	KMP_DEBUG_ASSERT(tid < nth);
1629
1630	// Handle special cases
1631	nested_loop_type_t loop_type =
1632	kmp_identify_nested_loop_structure(original_bounds_nest, n);
1633	if (loop_type == nested_loop_type_lower_triangular_matrix) {
1634	kmp_handle_lower_triangle_matrix(nth, tid, n, original_bounds_nest,
1635	chunk_bounds_nest);
1636	return TRUE;
1637	} else if (loop_type == nested_loop_type_upper_triangular_matrix) {
1638	kmp_handle_upper_triangle_matrix(nth, tid, n, original_bounds_nest,
1639	chunk_bounds_nest);
1640	return TRUE;
1641	}
1642
1643	CollapseAllocator<kmp_uint64> original_ivs_start(n);
1644
1645	if (!kmp_calc_original_ivs_for_start(original_bounds_nest, n,
1646	/out/ original_ivs: original_ivs_start)) {
1647	// Loop won't execute:
1648	return FALSE;
1649	}
1650
1651	// Not doing this optimization for one thread:
1652	// (1) more to test
1653	// (2) without it current contract that chunk_bounds_nest has only lb0 and
1654	// ub0, lb1 and ub1 are set to 0 and can be ignored.
1655	// if (nth == 1) {
1656	// // One thread:
1657	// // Copy all info from original_bounds_nest, it'll be good enough.
1658
1659	// for (kmp_index_t i = 0; i < n; ++i) {
1660	// chunk_bounds_nest[i] = original_bounds_nest[i];
1661	// }
1662
1663	// if (plastiter != NULL) {
1664	// plastiter = TRUE;*
1665	// }
1666	// return TRUE;
1667	//}
1668
1669	kmp_loop_nest_iv_t new_iv = kmp_calc_new_iv_from_original_ivs(
1670	bounds_nest: updated_bounds_nest, original_ivs: original_ivs_start, n);
1671
1672	bool last_iter = false;
1673
1674	for (; nth > `0`;) {
1675	// We could calculate chunk size once, but this is to compensate that the
1676	// original space is not parallelepiped and some threads can be left
1677	// without work:
1678	KMP_DEBUG_ASSERT(total >= new_iv);
1679
1680	kmp_loop_nest_iv_t total_left = total - new_iv;
1681	kmp_loop_nest_iv_t chunk_size = total_left / nth;
1682	kmp_loop_nest_iv_t remainder = total_left % nth;
1683
1684	kmp_loop_nest_iv_t curr_chunk_size = chunk_size;
1685
1686	if (remainder > `0`) {
1687	++curr_chunk_size;
1688	--remainder;
1689	}
1690
1691	#if defined(KMP_DEBUG)
1692	kmp_loop_nest_iv_t new_iv_for_start = new_iv;
1693	#endif
1694
1695	if (curr_chunk_size > `1`) {
1696	new_iv += curr_chunk_size - `1`;
1697	}
1698
1699	CollapseAllocator<kmp_uint64> original_ivs_end(n);
1700	if ((nth == `1`) \|\| (new_iv >= total - `1`)) {
1701	// Do this one till the end - just in case we miscalculated
1702	// and either too much is left to process or new_iv is a bit too big:
1703	kmp_calc_original_ivs_for_end(original_bounds_nest, n,
1704	/out/ original_ivs: original_ivs_end);
1705
1706	last_iter = true;
1707	} else {
1708	// Note: here we make sure it's past (or equal to) the previous point.
1709	if (!kmp_calc_original_ivs_for_chunk_end(original_bounds_nest, n,
1710	updated_bounds_nest,
1711	original_ivs_start, new_iv,
1712	/out/ original_ivs: original_ivs_end)) {
1713	// We could not find the ending point, use the original upper bounds:
1714	kmp_calc_original_ivs_for_end(original_bounds_nest, n,
1715	/out/ original_ivs: original_ivs_end);
1716
1717	last_iter = true;
1718	}
1719	}
1720
1721	#if defined(KMP_DEBUG)
1722	auto new_iv_for_end = kmp_calc_new_iv_from_original_ivs(
1723	bounds_nest: updated_bounds_nest, original_ivs: original_ivs_end, n);
1724	KMP_DEBUG_ASSERT(new_iv_for_end >= new_iv_for_start);
1725	#endif
1726
1727	if (last_iter && (tid != `0`)) {
1728	// We are done, this was last chunk, but no chunk for current thread was
1729	// found:
1730	return FALSE;
1731	}
1732
1733	if (tid == `0`) {
1734	// We found the chunk for this thread, now we need to check if it's the
1735	// last chunk or not:
1736
1737	CollapseAllocator<kmp_uint64> original_ivs_next_start(n);
1738	if (last_iter \|\|
1739	!kmp_calc_next_original_ivs(original_bounds_nest, n, original_ivs: original_ivs_end,
1740	/out/ next_original_ivs: original_ivs_next_start)) {
1741	// no more loop iterations left to process,
1742	// this means that currently found chunk is the last chunk:
1743	if (plastiter != NULL) {
1744	*plastiter = TRUE;
1745	}
1746	}
1747
1748	// Fill in chunk bounds:
1749	for (kmp_index_t i = `0`; i < n; ++i) {
1750	chunk_bounds_nest[i] =
1751	original_bounds_nest[i]; // To fill in types, etc. - optional
1752	chunk_bounds_nest[i].lb0_u64 = original_ivs_start [i];
1753	chunk_bounds_nest[i].lb1_u64 = `0`;
1754
1755	chunk_bounds_nest[i].ub0_u64 = original_ivs_end [i];
1756	chunk_bounds_nest[i].ub1_u64 = `0`;
1757	}
1758
1759	return TRUE;
1760	}
1761
1762	--tid;
1763	--nth;
1764
1765	bool next_chunk = kmp_calc_next_original_ivs(
1766	original_bounds_nest, n, original_ivs: original_ivs_end, /out/ next_original_ivs: original_ivs_start);
1767	if (!next_chunk) {
1768	// no more loop iterations to process,
1769	// the prevoius chunk was the last chunk
1770	break;
1771	}
1772
1773	// original_ivs_start is next to previous chunk original_ivs_end,
1774	// we need to start new chunk here, so chunks will be one after another
1775	// without any gap or overlap:
1776	new_iv = kmp_calc_new_iv_from_original_ivs(bounds_nest: updated_bounds_nest,
1777	original_ivs: original_ivs_start, n);
1778	}
1779
1780	return FALSE;
1781	}
1782

source code of openmp/runtime/src/kmp_collapse.cpp