//===----- Workshare.cpp - OpenMP workshare implementation ------ C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of the KMPC interface
// for the loop construct plus other worksharing constructs that use the same
// interface as loops.
//
//===----------------------------------------------------------------------===//

#include "Workshare.h"
#include "Debug.h"
#include "DeviceTypes.h"
#include "DeviceUtils.h"
#include "Interface.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"

using namespace ompx;

// TODO:
struct DynamicScheduleTracker {
  int64_t Chunk;
  int64_t LoopUpperBound;
  int64_t NextLowerBound;
  int64_t Stride;
  kmp_sched_t ScheduleType;
  DynamicScheduleTracker *NextDST;
};

#define ASSERT0(...)

// used by the library for the interface with the app
#define DISPATCH_FINISHED 0
#define DISPATCH_NOTFINISHED 1

// used by dynamic scheduling
#define FINISHED 0
#define NOT_FINISHED 1
#define LAST_CHUNK 2

// TODO: This variable is a hack inherited from the old runtime.
[[clang::loader_uninitialized]] static Local<uint64_t> Cnt;

template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
  ////////////////////////////////////////////////////////////////////////////////
  // Loop with static scheduling with chunk

  // Generic implementation of OMP loop scheduling with static policy
  /*! \brief Calculate initial bounds for static loop and stride
   *  @param[in] loc location in code of the call (not used here)
   *  @param[in] global_tid global thread id
   *  @param[in] schedtype type of scheduling (see omptarget-nvptx.h)
   *  @param[in] plastiter pointer to last iteration
   *  @param[in,out] pointer to loop lower bound. It will contain the value of
   *  the lower bound of the first chunk
   *  @param[in,out] pointer to loop upper bound. It will contain the value of
   *  the upper bound of the first chunk
   *  @param[in,out] pointer to loop stride. It will contain the value of the
   *  stride between two successive chunks executed by the same thread
   *  @param[in] loop increment bump
   *  @param[in] chunk size
   */

  // helper function for static chunk
  static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride, ST chunk,
                             T entityId, T numberOfEntities) {
    // Each thread executes multiple chunks, all of the same size, except
    // the last one.
    // Distance between two successive chunks:
    stride = numberOfEntities * chunk;
    lb = lb + entityId * chunk;
    T inputUb = ub;
    ub = lb + chunk - 1; // Clang uses i <= ub
    // Say ub' is the beginning of the last chunk. Then whoever has a
    // lower bound plus a multiple of the increment equal to ub' is
    // the last one.
    T beginningLastChunk = inputUb - (inputUb % chunk);
    last = ((beginningLastChunk - lb) % stride) == 0;
  }
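
  // Illustrative example (not part of the runtime logic): with lb = 0,
  // ub = 99, chunk = 10, numberOfEntities = 4 and entityId = 1, the entity
  // gets stride = 40 and its first chunk is [10, 19]; it later also executes
  // [50, 59] and [90, 99]. The last chunk begins at 90 and
  // (90 - 10) % 40 == 0, so this entity is marked as executing the last
  // iteration.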

  ////////////////////////////////////////////////////////////////////////////////
  // Loop with static scheduling without chunk

  // helper function for static no chunk
  static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride, ST &chunk,
                               T entityId, T numberOfEntities) {
    // No chunk size specified. Each thread or warp gets at most one
    // chunk; chunks are all of nearly equal size.
    T loopSize = ub - lb + 1;

    chunk = loopSize / numberOfEntities;
    T leftOver = loopSize - chunk * numberOfEntities;

    if (entityId < leftOver) {
      chunk++;
      lb = lb + entityId * chunk;
    } else {
      lb = lb + entityId * chunk + leftOver;
    }

    T inputUb = ub;
    ub = lb + chunk - 1; // Clang uses i <= ub
    last = lb <= inputUb && inputUb <= ub;
    stride = loopSize; // make sure we only do 1 chunk per warp
  }
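
  // Illustrative example (not part of the runtime logic): with lb = 0,
  // ub = 9 (10 iterations) and numberOfEntities = 4, the base chunk is
  // 10 / 4 = 2 with a leftover of 2. Entities 0 and 1 get chunk 3 and the
  // bounds [0, 2] and [3, 5]; entities 2 and 3 keep chunk 2 and get [6, 7]
  // and [8, 9]. Entity 3 covers the original upper bound and is therefore
  // marked last.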

  ////////////////////////////////////////////////////////////////////////////////
  // Support for Static Init

  static void for_static_init(int32_t, int32_t schedtype, int32_t *plastiter,
                              T *plower, T *pupper, ST *pstride, ST chunk,
                              bool IsSPMDExecutionMode) {
    int32_t gtid = omp_get_thread_num();
    int numberOfActiveOMPThreads = omp_get_num_threads();

    // All threads in excess of the maximum requested do not execute the loop.
    ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads,
            "current thread is not needed here; error");

    // copy
    int lastiter = 0;
    T lb = *plower;
    T ub = *pupper;
    ST stride = *pstride;

    // init
    switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) {
    case kmp_sched_static_chunk: {
      if (chunk > 0) {
        ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
                       numberOfActiveOMPThreads);
        break;
      }
      [[fallthrough]];
    } // note: if chunk <= 0, use nochunk
    case kmp_sched_static_balanced_chunk: {
      if (chunk > 0) {
        // round up to make sure the chunk is enough to cover all iterations
        T tripCount = ub - lb + 1; // +1 because ub is inclusive
        T span = (tripCount + numberOfActiveOMPThreads - 1) /
                 numberOfActiveOMPThreads;
        // perform chunk adjustment
        chunk = (span + chunk - 1) & ~(chunk - 1);

        ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
        T oldUb = ub;
        ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
                       numberOfActiveOMPThreads);
        if (ub > oldUb)
          ub = oldUb;
        break;
      }
      [[fallthrough]];
    } // note: if chunk <= 0, use nochunk
    case kmp_sched_static_nochunk: {
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid,
                       numberOfActiveOMPThreads);
      break;
    }
    case kmp_sched_distr_static_chunk: {
      if (chunk > 0) {
        ForStaticChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(),
                       omp_get_num_teams());
        break;
      }
      [[fallthrough]];
    } // note: if chunk <= 0, use nochunk
    case kmp_sched_distr_static_nochunk: {
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(),
                       omp_get_num_teams());
      break;
    }
    case kmp_sched_distr_static_chunk_sched_static_chunkone: {
      ForStaticChunk(lastiter, lb, ub, stride, chunk,
                     numberOfActiveOMPThreads * omp_get_team_num() + gtid,
                     omp_get_num_teams() * numberOfActiveOMPThreads);
      break;
    }
    default: {
      // ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype);
      ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
                     numberOfActiveOMPThreads);
      break;
    }
    }
    // copy back
    *plastiter = lastiter;
    *plower = lb;
    *pupper = ub;
    *pstride = stride;
  }

  ////////////////////////////////////////////////////////////////////////////////
  // Support for dispatch Init

  static int OrderedSchedule(kmp_sched_t schedule) {
    return schedule >= kmp_sched_ordered_first &&
           schedule <= kmp_sched_ordered_last;
  }

  static void dispatch_init(IdentTy *loc, int32_t threadId,
                            kmp_sched_t schedule, T lb, T ub, ST st, ST chunk,
                            DynamicScheduleTracker *DST) {
    int tid = mapping::getThreadIdInBlock();
    T tnum = omp_get_num_threads();
    T tripCount = ub - lb + 1; // +1 because ub is inclusive
    ASSERT0(LT_FUSSY, threadId < tnum,
            "current thread is not needed here; error");

    /* Currently just ignore the monotonic and non-monotonic modifiers
     * (the compiler isn't producing them yet anyway).
     * When it is, we'll want to look at them somewhere here and use that
     * information to add to our schedule choice. We shouldn't need to pass
     * them on; they merely affect which schedule we can legally choose for
     * various dynamic cases (in particular, whether or not a stealing scheme
     * is legal).
     */
    schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);

    // Process schedule.
    if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) {
      if (OrderedSchedule(schedule))
        __kmpc_barrier(loc, threadId);
      schedule = kmp_sched_static_chunk;
      chunk = tripCount; // one thread gets the whole loop
    } else if (schedule == kmp_sched_runtime) {
      // process runtime
      omp_sched_t rtSched;
      int ChunkInt;
      omp_get_schedule(&rtSched, &ChunkInt);
      chunk = ChunkInt;
      switch (rtSched) {
      case omp_sched_static: {
        if (chunk > 0)
          schedule = kmp_sched_static_chunk;
        else
          schedule = kmp_sched_static_nochunk;
        break;
      }
      case omp_sched_auto: {
        schedule = kmp_sched_static_chunk;
        chunk = 1;
        break;
      }
      case omp_sched_dynamic:
      case omp_sched_guided: {
        schedule = kmp_sched_dynamic;
        break;
      }
      }
    } else if (schedule == kmp_sched_auto) {
      schedule = kmp_sched_static_chunk;
      chunk = 1;
    } else {
      // ASSERT(LT_FUSSY,
      //        schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
      //        "unknown schedule %d & chunk %lld\n", (int)schedule,
      //        (long long)chunk);
    }

    // init schedules
    if (schedule == kmp_sched_static_chunk) {
      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
      // save sched state
      DST->ScheduleType = schedule;
      // save ub
      DST->LoopUpperBound = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      // save computed params
      DST->Chunk = chunk;
      DST->NextLowerBound = lb;
      DST->Stride = stride;
    } else if (schedule == kmp_sched_static_balanced_chunk) {
      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
      // save sched state
      DST->ScheduleType = schedule;
      // save ub
      DST->LoopUpperBound = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      // round up to make sure the chunk is enough to cover all iterations
      T span = (tripCount + tnum - 1) / tnum;
      // perform chunk adjustment
      chunk = (span + chunk - 1) & ~(chunk - 1);

      T oldUb = ub;
      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
      if (ub > oldUb)
        ub = oldUb;
      // save computed params
      DST->Chunk = chunk;
      DST->NextLowerBound = lb;
      DST->Stride = stride;
    } else if (schedule == kmp_sched_static_nochunk) {
      ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
      // save sched state
      DST->ScheduleType = schedule;
      // save ub
      DST->LoopUpperBound = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      // save computed params
      DST->Chunk = chunk;
      DST->NextLowerBound = lb;
      DST->Stride = stride;
    } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
      // save data
      DST->ScheduleType = schedule;
      if (chunk < 1)
        chunk = 1;
      DST->Chunk = chunk;
      DST->LoopUpperBound = ub;
      DST->NextLowerBound = lb;
      __kmpc_barrier(loc, threadId);
      if (tid == 0) {
        Cnt = 0;
        fence::team(atomic::seq_cst);
      }
      __kmpc_barrier(loc, threadId);
    }
  }

  ////////////////////////////////////////////////////////////////////////////////
  // Support for dispatch next

  static uint64_t NextIter() {
    __kmpc_impl_lanemask_t active = mapping::activemask();
    uint32_t leader = utils::ffs(active) - 1;
    uint32_t change = utils::popc(active);
    __kmpc_impl_lanemask_t lane_mask_lt = mapping::lanemaskLT();
    unsigned int rank = utils::popc(active & lane_mask_lt);
    uint64_t warp_res = 0;
    if (rank == 0) {
      warp_res = atomic::add(&Cnt, change, atomic::seq_cst);
    }
    warp_res = utils::shuffle(active, warp_res, leader, mapping::getWarpSize());
    return warp_res + rank;
  }
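
  // Illustrative example (not part of the runtime logic): if 8 lanes of a
  // warp call NextIter() together, the lowest-numbered active lane performs a
  // single atomic::add(&Cnt, 8, ...), the pre-increment value is shuffled to
  // the whole group, and each lane returns that value plus its rank. The 8
  // lanes thus obtain 8 consecutive iteration numbers while issuing only one
  // atomic operation for the warp.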

  static int DynamicNextChunk(T &lb, T &ub, T chunkSize, T loopLowerBound,
                              T loopUpperBound) {
    T N = NextIter();
    lb = loopLowerBound + N * chunkSize;
    ub = lb + chunkSize - 1; // Clang uses i <= ub

    // 3 result cases:
    //  a. lb and ub < loopUpperBound --> NOT_FINISHED
    //  b. lb < loopUpperBound and ub >= loopUpperBound: last chunk -->
    //     LAST_CHUNK
    //  c. lb and ub >= loopUpperBound: empty chunk --> FINISHED
    // a.
    if (lb <= loopUpperBound && ub < loopUpperBound) {
      return NOT_FINISHED;
    }
    // b.
    if (lb <= loopUpperBound) {
      ub = loopUpperBound;
      return LAST_CHUNK;
    }
    // c. if we are here, we are in case 'c'
    lb = loopUpperBound + 2;
    ub = loopUpperBound + 1;
    return FINISHED;
  }
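
  // Illustrative example (not part of the runtime logic): with
  // loopLowerBound = 0, loopUpperBound = 9 and chunkSize = 4, the chunks
  // handed out are [0, 3] (NOT_FINISHED), [4, 7] (NOT_FINISHED), then [8, 11]
  // clipped to [8, 9] (LAST_CHUNK); any later request yields an empty chunk
  // (FINISHED).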

  static int dispatch_next(IdentTy *loc, int32_t gtid, int32_t *plast,
                           T *plower, T *pupper, ST *pstride,
                           DynamicScheduleTracker *DST) {
    // ID of a thread in its own warp; automatically selects thread or warp ID
    // based on the selected implementation.
    ASSERT0(LT_FUSSY, gtid < omp_get_num_threads(),
            "current thread is not needed here; error");
    // retrieve schedule
    kmp_sched_t schedule = DST->ScheduleType;

    // xxx reduce to one
    if (schedule == kmp_sched_static_chunk ||
        schedule == kmp_sched_static_nochunk) {
      T myLb = DST->NextLowerBound;
      T ub = DST->LoopUpperBound;
      // finished?
      if (myLb > ub) {
        return DISPATCH_FINISHED;
      }
      // not finished, save current bounds
      ST chunk = DST->Chunk;
      *plower = myLb;
      T myUb = myLb + chunk - 1; // Clang uses i <= ub
      if (myUb > ub)
        myUb = ub;
      *pupper = myUb;
      *plast = (int32_t)(myUb == ub);

      // increment next lower bound by the stride
      ST stride = DST->Stride;
      DST->NextLowerBound = myLb + stride;
      return DISPATCH_NOTFINISHED;
    }
    ASSERT0(LT_FUSSY,
            schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
            "bad sched");
    T myLb, myUb;
    int finished = DynamicNextChunk(myLb, myUb, DST->Chunk, DST->NextLowerBound,
                                    DST->LoopUpperBound);

    if (finished == FINISHED)
      return DISPATCH_FINISHED;

    // not finished (either not finished or last chunk)
    *plast = (int32_t)(finished == LAST_CHUNK);
    *plower = myLb;
    *pupper = myUb;
    *pstride = 1;

    return DISPATCH_NOTFINISHED;
  }

  static void dispatch_fini() {
    // nothing
  }

  ////////////////////////////////////////////////////////////////////////////////
  // end of template class that encapsulates all the helper functions
  ////////////////////////////////////////////////////////////////////////////////
};

////////////////////////////////////////////////////////////////////////////////
// KMP interface implementation (dyn loops)
////////////////////////////////////////////////////////////////////////////////

// TODO: Expand the dispatch API to take a DST pointer which can then be
// allocated properly without malloc.
// For now, each team will contain an LDS pointer (ThreadDST) to a global array
// of references to the DST structs allocated (in global memory) for each thread
// in the team. The global memory array is allocated during the init phase if it
// was not allocated already and will be deallocated when the dispatch phase
// ends:
//
//  __kmpc_dispatch_init
//
//  ** Dispatch loop **
//
//  __kmpc_dispatch_deinit
//
[[clang::loader_uninitialized]] static Local<DynamicScheduleTracker **>
    ThreadDST;

// Create a new DST, link the current one, and define the new as current.
static DynamicScheduleTracker *pushDST() {
  int32_t ThreadIndex = mapping::getThreadIdInBlock();
  // Each block will allocate an array of pointers to DST structs. The array is
  // equal in length to the number of threads in that block.
  if (!ThreadDST) {
    // Allocate global memory array of pointers to DST structs:
    if (mapping::isMainThreadInGenericMode() || ThreadIndex == 0)
      ThreadDST = static_cast<DynamicScheduleTracker **>(
          memory::allocGlobal(mapping::getNumberOfThreadsInBlock() *
                                  sizeof(DynamicScheduleTracker *),
                              "new ThreadDST array"));
    synchronize::threads(atomic::seq_cst);

    // Initialize the array pointers:
    ThreadDST[ThreadIndex] = nullptr;
  }

  // Create a DST struct for the current thread:
  DynamicScheduleTracker *NewDST = static_cast<DynamicScheduleTracker *>(
      memory::allocGlobal(sizeof(DynamicScheduleTracker), "new DST"));
  *NewDST = DynamicScheduleTracker({0});

  // Add the new DST struct to the array of DST structs:
  NewDST->NextDST = ThreadDST[ThreadIndex];
  ThreadDST[ThreadIndex] = NewDST;
  return NewDST;
}

// Return the current DST.
static DynamicScheduleTracker *peekDST() {
  return ThreadDST[mapping::getThreadIdInBlock()];
}

// Pop the current DST and restore the last one.
static void popDST() {
  int32_t ThreadIndex = mapping::getThreadIdInBlock();
  DynamicScheduleTracker *CurrentDST = ThreadDST[ThreadIndex];
  DynamicScheduleTracker *OldDST = CurrentDST->NextDST;
  memory::freeGlobal(CurrentDST, "remove DST");
  ThreadDST[ThreadIndex] = OldDST;

  // Check if we need to deallocate the global array. Ensure all threads
  // in the block have finished deallocating the individual DSTs.
  synchronize::threads(atomic::seq_cst);
  if (!ThreadDST[ThreadIndex] && !ThreadIndex) {
    memory::freeGlobal(ThreadDST, "remove ThreadDST array");
    ThreadDST = nullptr;
  }
  synchronize::threads(atomic::seq_cst);
}

void workshare::init(bool IsSPMD) {
  if (mapping::isInitialThreadInLevel0(IsSPMD))
    ThreadDST = nullptr;
}

extern "C" {

// init
void __kmpc_dispatch_init_4(IdentTy *loc, int32_t tid, int32_t schedule,
                            int32_t lb, int32_t ub, int32_t st, int32_t chunk) {
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

void __kmpc_dispatch_init_4u(IdentTy *loc, int32_t tid, int32_t schedule,
                             uint32_t lb, uint32_t ub, int32_t st,
                             int32_t chunk) {
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

void __kmpc_dispatch_init_8(IdentTy *loc, int32_t tid, int32_t schedule,
                            int64_t lb, int64_t ub, int64_t st, int64_t chunk) {
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

void __kmpc_dispatch_init_8u(IdentTy *loc, int32_t tid, int32_t schedule,
                             uint64_t lb, uint64_t ub, int64_t st,
                             int64_t chunk) {
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

// next
int __kmpc_dispatch_next_4(IdentTy *loc, int32_t tid, int32_t *p_last,
                           int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

int __kmpc_dispatch_next_4u(IdentTy *loc, int32_t tid, int32_t *p_last,
                            uint32_t *p_lb, uint32_t *p_ub, int32_t *p_st) {
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

int __kmpc_dispatch_next_8(IdentTy *loc, int32_t tid, int32_t *p_last,
                           int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

int __kmpc_dispatch_next_8u(IdentTy *loc, int32_t tid, int32_t *p_last,
                            uint64_t *p_lb, uint64_t *p_ub, int64_t *p_st) {
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

// fini
void __kmpc_dispatch_fini_4(IdentTy *loc, int32_t tid) {
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini();
}

void __kmpc_dispatch_fini_4u(IdentTy *loc, int32_t tid) {
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini();
}

void __kmpc_dispatch_fini_8(IdentTy *loc, int32_t tid) {
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini();
}

void __kmpc_dispatch_fini_8u(IdentTy *loc, int32_t tid) {
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini();
}

// deinit
void __kmpc_dispatch_deinit(IdentTy *loc, int32_t tid) { popDST(); }
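
// For reference, a compiler-lowered `#pragma omp for schedule(dynamic, C)`
// over N iterations is expected to drive this interface roughly as in the
// following sketch (hypothetical lowering for the 32-bit signed case; the
// exact code clang emits may differ). `Dyn` stands for the dynamic schedule
// constant (kmp_sched_dynamic in this file's kmp_sched_t terms), and the
// upper bound is inclusive, matching the "Clang uses i <= ub" convention
// above:
//
//   __kmpc_dispatch_init_4(loc, tid, Dyn, /*lb=*/0, /*ub=*/N - 1, /*st=*/1,
//                          /*chunk=*/C);
//   int32_t last, lb, ub, st;
//   while (__kmpc_dispatch_next_4(loc, tid, &last, &lb, &ub, &st))
//     for (int32_t i = lb; i <= ub; ++i)
//       body(i);
//   __kmpc_dispatch_deinit(loc, tid);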

////////////////////////////////////////////////////////////////////////////////
// KMP interface implementation (static loops)
////////////////////////////////////////////////////////////////////////////////

void __kmpc_for_static_init_4(IdentTy *loc, int32_t global_tid,
                              int32_t schedtype, int32_t *plastiter,
                              int32_t *plower, int32_t *pupper,
                              int32_t *pstride, int32_t incr, int32_t chunk) {
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_init_4u(IdentTy *loc, int32_t global_tid,
                               int32_t schedtype, int32_t *plastiter,
                               uint32_t *plower, uint32_t *pupper,
                               int32_t *pstride, int32_t incr, int32_t chunk) {
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_init_8(IdentTy *loc, int32_t global_tid,
                              int32_t schedtype, int32_t *plastiter,
                              int64_t *plower, int64_t *pupper,
                              int64_t *pstride, int64_t incr, int64_t chunk) {
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_init_8u(IdentTy *loc, int32_t global_tid,
                               int32_t schedtype, int32_t *plastiter,
                               uint64_t *plower, uint64_t *pupper,
                               int64_t *pstride, int64_t incr, int64_t chunk) {
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}
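
// For reference, a compiler-lowered `#pragma omp for schedule(static, C)` over
// N iterations is expected to use the entry points above roughly as in the
// following sketch (hypothetical lowering for the 32-bit signed case; the
// exact code clang emits may differ). `Sched` stands for the static-chunked
// schedule constant (kmp_sched_static_chunk in this file's kmp_sched_t terms);
// the runtime rewrites lb/ub/stride in place and the generated code then
// strides over its chunks:
//
//   int32_t last = 0, lb = 0, ub = N - 1, stride = 1;
//   __kmpc_for_static_init_4(loc, tid, Sched, &last, &lb, &ub, &stride,
//                            /*incr=*/1, /*chunk=*/C);
//   for (int32_t lo = lb; lo <= N - 1; lo += stride)
//     for (int32_t i = lo; i <= lo + C - 1 && i <= N - 1; ++i)
//       body(i);
//   __kmpc_for_static_fini(loc, tid);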

void __kmpc_distribute_static_init_4(IdentTy *loc, int32_t global_tid,
                                     int32_t schedtype, int32_t *plastiter,
                                     int32_t *plower, int32_t *pupper,
                                     int32_t *pstride, int32_t incr,
                                     int32_t chunk) {
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_distribute_static_init_4u(IdentTy *loc, int32_t global_tid,
                                      int32_t schedtype, int32_t *plastiter,
                                      uint32_t *plower, uint32_t *pupper,
                                      int32_t *pstride, int32_t incr,
                                      int32_t chunk) {
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_distribute_static_init_8(IdentTy *loc, int32_t global_tid,
                                     int32_t schedtype, int32_t *plastiter,
                                     int64_t *plower, int64_t *pupper,
                                     int64_t *pstride, int64_t incr,
                                     int64_t chunk) {
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_distribute_static_init_8u(IdentTy *loc, int32_t global_tid,
                                      int32_t schedtype, int32_t *plastiter,
                                      uint64_t *plower, uint64_t *pupper,
                                      int64_t *pstride, int64_t incr,
                                      int64_t chunk) {
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {}

void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {}
}

namespace ompx {

/// Helper class to hide the generic loop nest and provide the template argument
/// throughout.
template <typename Ty> class StaticLoopChunker {

  /// Generic loop nest that handles block and/or thread distribution in the
  /// absence of user specified chunk sizes. This implicitly picks a block chunk
  /// size equal to the number of threads in the block and a thread chunk size
  /// equal to one. In contrast to the chunked version, we can get away with a
  /// single loop in this case.
  static void NormalizedLoopNestNoChunk(void (*LoopBody)(Ty, void *), void *Arg,
                                        Ty NumBlocks, Ty BId, Ty NumThreads,
                                        Ty TId, Ty NumIters,
                                        bool OneIterationPerThread) {
    Ty KernelIteration = NumBlocks * NumThreads;

    // Start index in the normalized space.
    Ty IV = BId * NumThreads + TId;
    ASSERT(IV >= 0, "Bad index");

    // Cover the entire iteration space; assumptions in the caller might allow
    // us to simplify this loop to a conditional.
    if (IV < NumIters) {
      do {

        // Execute the loop body.
        LoopBody(IV, Arg);

        // Every thread executed one block and thread chunk now.
        IV += KernelIteration;

        if (OneIterationPerThread)
          return;

      } while (IV < NumIters);
    }
  }
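
  // Illustrative example (not part of the runtime logic): with NumBlocks = 2,
  // NumThreads = 4 and NumIters = 10, KernelIteration is 8. Thread 0 of
  // block 0 starts at IV = 0 and later executes IV = 8, thread 1 of block 0
  // executes IV = 1 and 9, while the threads of block 1 start at IV = 4..7
  // and find their next candidates (12..15) already out of range. Every
  // iteration 0..9 is therefore executed exactly once.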

  /// Generic loop nest that handles block and/or thread distribution in the
  /// presence of user specified chunk sizes (for at least one of them).
  static void NormalizedLoopNestChunked(void (*LoopBody)(Ty, void *), void *Arg,
                                        Ty BlockChunk, Ty NumBlocks, Ty BId,
                                        Ty ThreadChunk, Ty NumThreads, Ty TId,
                                        Ty NumIters,
                                        bool OneIterationPerThread) {
    Ty KernelIteration = NumBlocks * BlockChunk;

    // Start index in the chunked space.
    Ty IV = BId * BlockChunk + TId;
    ASSERT(IV >= 0, "Bad index");

    // Cover the entire iteration space; assumptions in the caller might allow
    // us to simplify this loop to a conditional.
    do {

      Ty BlockChunkLeft =
          BlockChunk >= TId * ThreadChunk ? BlockChunk - TId * ThreadChunk : 0;
      Ty ThreadChunkLeft =
          ThreadChunk <= BlockChunkLeft ? ThreadChunk : BlockChunkLeft;

      while (ThreadChunkLeft--) {

        // Given the blocking it's hard to keep track of what to execute.
        if (IV >= NumIters)
          return;

        // Execute the loop body.
        LoopBody(IV, Arg);

        if (OneIterationPerThread)
          return;

        ++IV;
      }

      IV += KernelIteration;

    } while (IV < NumIters);
  }

public:
  /// Worksharing `for`-loop.
  static void For(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
                  Ty NumIters, Ty NumThreads, Ty ThreadChunk) {
    ASSERT(NumIters >= 0, "Bad iteration count");
    ASSERT(ThreadChunk >= 0, "Bad thread chunk size");

    // All threads need to participate, but we don't know if we are in a
    // parallel at all or if the user might have used a `num_threads` clause
    // on the parallel and reduced the number compared to the block size.
    // Since nested parallels are possible too, we need to get the thread id
    // from the `omp` getter and not the mapping directly.
    Ty TId = omp_get_thread_num();

    // There are no blocks involved here.
    Ty BlockChunk = 0;
    Ty NumBlocks = 1;
    Ty BId = 0;

    // If the thread chunk is not specified we pick a default now.
    if (ThreadChunk == 0)
      ThreadChunk = 1;

    // If we know we have more threads than iterations we can indicate that to
    // avoid an outer loop.
    bool OneIterationPerThread = false;
    if (config::getAssumeThreadsOversubscription()) {
      ASSERT(NumThreads >= NumIters, "Broken assumption");
      OneIterationPerThread = true;
    }

    if (ThreadChunk != 1)
      NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
                                ThreadChunk, NumThreads, TId, NumIters,
                                OneIterationPerThread);
    else
      NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
                                NumIters, OneIterationPerThread);
  }

  /// Worksharing `distribute`-loop.
  static void Distribute(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
                         Ty NumIters, Ty BlockChunk) {
    ASSERT(icv::Level == 0, "Bad distribute");
    ASSERT(icv::ActiveLevel == 0, "Bad distribute");
    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
    ASSERT(state::ParallelTeamSize == 1, "Bad distribute");

    ASSERT(NumIters >= 0, "Bad iteration count");
    ASSERT(BlockChunk >= 0, "Bad block chunk size");

    // There are no threads involved here.
    Ty ThreadChunk = 0;
    Ty NumThreads = 1;
    Ty TId = 0;

    // All teams need to participate.
    Ty NumBlocks = mapping::getNumberOfBlocksInKernel();
    Ty BId = mapping::getBlockIdInKernel();

    // If the block chunk is not specified we pick a default now.
    if (BlockChunk == 0)
      BlockChunk = NumThreads;

    // If we know we have more blocks than iterations we can indicate that to
    // avoid an outer loop.
    bool OneIterationPerThread = false;
    if (config::getAssumeTeamsOversubscription()) {
      ASSERT(NumBlocks >= NumIters, "Broken assumption");
      OneIterationPerThread = true;
    }

    if (BlockChunk != NumThreads)
      NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
                                ThreadChunk, NumThreads, TId, NumIters,
                                OneIterationPerThread);
    else
      NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
                                NumIters, OneIterationPerThread);

    ASSERT(icv::Level == 0, "Bad distribute");
    ASSERT(icv::ActiveLevel == 0, "Bad distribute");
    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
    ASSERT(state::ParallelTeamSize == 1, "Bad distribute");
  }

  /// Worksharing `distribute parallel for`-loop.
  static void DistributeFor(IdentTy *Loc, void (*LoopBody)(Ty, void *),
                            void *Arg, Ty NumIters, Ty NumThreads,
                            Ty BlockChunk, Ty ThreadChunk) {
    ASSERT(icv::Level == 1, "Bad distribute");
    ASSERT(icv::ActiveLevel == 1, "Bad distribute");
    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");

    ASSERT(NumIters >= 0, "Bad iteration count");
    ASSERT(BlockChunk >= 0, "Bad block chunk size");
    ASSERT(ThreadChunk >= 0, "Bad thread chunk size");

    // All threads need to participate, but the user might have used a
    // `num_threads` clause on the parallel and reduced the number compared to
    // the block size.
    Ty TId = mapping::getThreadIdInBlock();

    // All teams need to participate.
    Ty NumBlocks = mapping::getNumberOfBlocksInKernel();
    Ty BId = mapping::getBlockIdInKernel();

    // If the block chunk is not specified we pick a default now.
    if (BlockChunk == 0)
      BlockChunk = NumThreads;

    // If the thread chunk is not specified we pick a default now.
    if (ThreadChunk == 0)
      ThreadChunk = 1;

    // If we know we have more threads (across all blocks) than iterations we
    // can indicate that to avoid an outer loop.
    bool OneIterationPerThread = false;
    if (config::getAssumeTeamsOversubscription() &&
        config::getAssumeThreadsOversubscription()) {
      OneIterationPerThread = true;
      ASSERT(NumBlocks * NumThreads >= NumIters, "Broken assumption");
    }

    if (BlockChunk != NumThreads || ThreadChunk != 1)
      NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
                                ThreadChunk, NumThreads, TId, NumIters,
                                OneIterationPerThread);
    else
      NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
                                NumIters, OneIterationPerThread);

    ASSERT(icv::Level == 1, "Bad distribute");
    ASSERT(icv::ActiveLevel == 1, "Bad distribute");
    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
  }
};

} // namespace ompx

#define OMP_LOOP_ENTRY(BW, TY)                                                 \
  [[gnu::flatten, clang::always_inline]] void                                  \
  __kmpc_distribute_for_static_loop##BW(                                       \
      IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters,           \
      TY num_threads, TY block_chunk, TY thread_chunk) {                       \
    ompx::StaticLoopChunker<TY>::DistributeFor(                                \
        loc, fn, arg, num_iters, num_threads, block_chunk, thread_chunk);      \
  }                                                                            \
  [[gnu::flatten, clang::always_inline]] void                                  \
  __kmpc_distribute_static_loop##BW(IdentTy *loc, void (*fn)(TY, void *),      \
                                    void *arg, TY num_iters,                   \
                                    TY block_chunk) {                          \
    ompx::StaticLoopChunker<TY>::Distribute(loc, fn, arg, num_iters,           \
                                            block_chunk);                      \
  }                                                                            \
  [[gnu::flatten, clang::always_inline]] void __kmpc_for_static_loop##BW(      \
      IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters,           \
      TY num_threads, TY thread_chunk) {                                       \
    ompx::StaticLoopChunker<TY>::For(loc, fn, arg, num_iters, num_threads,     \
                                     thread_chunk);                            \
  }
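
// For illustration, `OMP_LOOP_ENTRY(_4, int32_t)` below expands to the three
// entry points __kmpc_distribute_for_static_loop_4,
// __kmpc_distribute_static_loop_4 and __kmpc_for_static_loop_4, each of which
// forwards its arguments to the corresponding ompx::StaticLoopChunker<int32_t>
// method (DistributeFor, Distribute and For, respectively).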

extern "C" {
OMP_LOOP_ENTRY(_4, int32_t)
OMP_LOOP_ENTRY(_4u, uint32_t)
OMP_LOOP_ENTRY(_8, int64_t)
OMP_LOOP_ENTRY(_8u, uint64_t)
}