//===----- Workshare.cpp - OpenMP workshare implementation ------ C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of the KMPC interface
// for the loop construct plus other worksharing constructs that use the same
// interface as loops.
//
//===----------------------------------------------------------------------===//

#include "Workshare.h"
#include "Debug.h"
#include "DeviceTypes.h"
#include "DeviceUtils.h"
#include "Interface.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"

using namespace ompx;

// TODO:
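// Per-thread bookkeeping for one dynamically scheduled loop. Trackers form a
// per-thread stack linked through NextDST so that nested dispatch regions each
// get their own state; see pushDST/peekDST/popDST below.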
struct DynamicScheduleTracker {
  int64_t Chunk;
  int64_t LoopUpperBound;
  int64_t NextLowerBound;
  int64_t Stride;
  kmp_sched_t ScheduleType;
  DynamicScheduleTracker *NextDST;
};

#define ASSERT0(...)

// used by the library for the interface with the app
#define DISPATCH_FINISHED 0
#define DISPATCH_NOTFINISHED 1

// used by dynamic scheduling
#define FINISHED 0
#define NOT_FINISHED 1
#define LAST_CHUNK 2
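// These three values are the possible results of DynamicNextChunk below: the
// iteration space is exhausted, a regular chunk was handed out, or the final
// (possibly shortened) chunk was handed out.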

// TODO: This variable is a hack inherited from the old runtime.
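// Cnt is the team-wide counter of already dispatched iterations for the
// dynamic/guided schedules; it is reset in dispatch_init and advanced
// atomically in NextIter.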
[[clang::loader_uninitialized]] static Local<uint64_t> Cnt;

template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
  ////////////////////////////////////////////////////////////////////////////////
  // Loop with static scheduling with chunk

  // Generic implementation of OMP loop scheduling with static policy
  /*! \brief Calculate initial bounds for static loop and stride
   * @param[in] loc location in code of the call (not used here)
   * @param[in] global_tid global thread id
   * @param[in] schedtype type of scheduling (see omptarget-nvptx.h)
   * @param[in] plastiter pointer to last iteration
   * @param[in,out] pointer to loop lower bound. It will contain the value of
   * the lower bound of the first chunk
   * @param[in,out] pointer to loop upper bound. It will contain the value of
   * the upper bound of the first chunk
   * @param[in,out] pointer to loop stride. It will contain the value of the
   * stride between two successive chunks executed by the same thread
   * @param[in] loop increment bump
   * @param[in] chunk size
   */

  // helper function for static chunk
  static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride, ST chunk,
                             T entityId, T numberOfEntities) {
    // each thread executes multiple chunks all of the same size, except
    // the last one
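    // For illustration (hypothetical numbers): with lb=0, ub=99, chunk=10 and
    // 4 threads, thread 1 gets the chunk [10,19] and, advancing by the stride
    // of 40, later also [50,59] and [90,99].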
    // distance between two successive chunks
    stride = numberOfEntities * chunk;
    lb = lb + entityId * chunk;
    T inputUb = ub;
    ub = lb + chunk - 1; // Clang uses i <= ub
    // Say ub' is the beginning of the last chunk. Then whoever has a
    // lower bound plus a multiple of the increment equal to ub' is
    // the last one.
    T beginningLastChunk = inputUb - (inputUb % chunk);
    last = ((beginningLastChunk - lb) % stride) == 0;
  }

  ////////////////////////////////////////////////////////////////////////////////
  // Loop with static scheduling without chunk

  // helper function for static no chunk
  static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride, ST &chunk,
                               T entityId, T numberOfEntities) {
    // No chunk size specified. Each thread or warp gets at most one
    // chunk; all chunks are almost equal in size
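    // For illustration (hypothetical numbers): with lb=0, ub=99 and 8 threads,
    // threads 0-3 receive 13 iterations each and threads 4-7 receive 12;
    // thread 7 covers [88,99] and is marked as executing the last iteration.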
    T loopSize = ub - lb + 1;

    chunk = loopSize / numberOfEntities;
    T leftOver = loopSize - chunk * numberOfEntities;

    if (entityId < leftOver) {
      chunk++;
      lb = lb + entityId * chunk;
    } else {
      lb = lb + entityId * chunk + leftOver;
    }

    T inputUb = ub;
    ub = lb + chunk - 1; // Clang uses i <= ub
    last = lb <= inputUb && inputUb <= ub;
    stride = loopSize; // make sure we only do 1 chunk per warp
  }

  ////////////////////////////////////////////////////////////////////////////////
  // Support for Static Init

  static void for_static_init(int32_t, int32_t schedtype, int32_t *plastiter,
                              T *plower, T *pupper, ST *pstride, ST chunk,
                              bool IsSPMDExecutionMode) {
    int32_t gtid = omp_get_thread_num();
    int numberOfActiveOMPThreads = omp_get_num_threads();

    // All warps that are in excess of the maximum requested do not
    // execute the loop
    ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads,
            "current thread is not needed here; error");

    // copy
    int lastiter = 0;
    T lb = *plower;
    T ub = *pupper;
    ST stride = *pstride;

    // init
    switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) {
    case kmp_sched_static_chunk: {
      if (chunk > 0) {
        ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
                       numberOfActiveOMPThreads);
        break;
      }
      [[fallthrough]];
    } // note: if chunk <= 0, use nochunk
    case kmp_sched_static_balanced_chunk: {
      if (chunk > 0) {
        // round up to make sure the chunk is enough to cover all iterations
        T tripCount = ub - lb + 1; // +1 because ub is inclusive
        T span = (tripCount + numberOfActiveOMPThreads - 1) /
                 numberOfActiveOMPThreads;
        // perform chunk adjustment
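        // The expression below rounds span up to the next multiple of chunk;
        // note that this bit trick is only exact when chunk is a power of two.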
        chunk = (span + chunk - 1) & ~(chunk - 1);

        ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
        T oldUb = ub;
        ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
                       numberOfActiveOMPThreads);
        if (ub > oldUb)
          ub = oldUb;
        break;
      }
      [[fallthrough]];
    } // note: if chunk <= 0, use nochunk
    case kmp_sched_static_nochunk: {
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid,
                       numberOfActiveOMPThreads);
      break;
    }
    case kmp_sched_distr_static_chunk: {
      if (chunk > 0) {
        ForStaticChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(),
                       omp_get_num_teams());
        break;
      }
      [[fallthrough]];
    } // note: if chunk <= 0, use nochunk
    case kmp_sched_distr_static_nochunk: {
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(),
                       omp_get_num_teams());
      break;
    }
    case kmp_sched_distr_static_chunk_sched_static_chunkone: {
      ForStaticChunk(lastiter, lb, ub, stride, chunk,
                     numberOfActiveOMPThreads * omp_get_team_num() + gtid,
                     omp_get_num_teams() * numberOfActiveOMPThreads);
      break;
    }
    default: {
      // ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype);
      ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
                     numberOfActiveOMPThreads);
      break;
    }
    }
    // copy back
    *plastiter = lastiter;
    *plower = lb;
    *pupper = ub;
    *pstride = stride;
  }

  ////////////////////////////////////////////////////////////////////////////////
  // Support for dispatch Init

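  // OrderedSchedule relies on the ordered kinds forming a contiguous range of
  // kmp_sched_t delimited by kmp_sched_ordered_first and
  // kmp_sched_ordered_last.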
  static int OrderedSchedule(kmp_sched_t schedule) {
    return schedule >= kmp_sched_ordered_first &&
           schedule <= kmp_sched_ordered_last;
  }

  static void dispatch_init(IdentTy *loc, int32_t threadId,
                            kmp_sched_t schedule, T lb, T ub, ST st, ST chunk,
                            DynamicScheduleTracker *DST) {
    int tid = mapping::getThreadIdInBlock();
    T tnum = omp_get_num_threads();
    T tripCount = ub - lb + 1; // +1 because ub is inclusive
    ASSERT0(LT_FUSSY, threadId < tnum,
            "current thread is not needed here; error");

    /* Currently just ignore the monotonic and non-monotonic modifiers
     * (the compiler isn't producing them yet anyway).
     * When it is, we'll want to look at them somewhere here and use that
     * information to add to our schedule choice. We shouldn't need to pass
     * them on, they merely affect which schedule we can legally choose for
     * various dynamic cases. (In particular, whether or not a stealing scheme
     * is legal).
     */
    schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);

    // Process schedule.
    if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) {
      if (OrderedSchedule(schedule))
        __kmpc_barrier(loc, threadId);
      schedule = kmp_sched_static_chunk;
      chunk = tripCount; // one thread gets the whole loop
    } else if (schedule == kmp_sched_runtime) {
      // process runtime
      omp_sched_t rtSched;
      int ChunkInt;
      omp_get_schedule(&rtSched, &ChunkInt);
      chunk = ChunkInt;
      switch (rtSched) {
      case omp_sched_static: {
        if (chunk > 0)
          schedule = kmp_sched_static_chunk;
        else
          schedule = kmp_sched_static_nochunk;
        break;
      }
      case omp_sched_auto: {
        schedule = kmp_sched_static_chunk;
        chunk = 1;
        break;
      }
      case omp_sched_dynamic:
      case omp_sched_guided: {
        schedule = kmp_sched_dynamic;
        break;
      }
      }
    } else if (schedule == kmp_sched_auto) {
      schedule = kmp_sched_static_chunk;
      chunk = 1;
    } else {
      // ASSERT(LT_FUSSY,
      //        schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
      //        "unknown schedule %d & chunk %lld\n", (int)schedule,
      //        (long long)chunk);
    }

    // init schedules
    if (schedule == kmp_sched_static_chunk) {
      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
      // save sched state
      DST->ScheduleType = schedule;
      // save ub
      DST->LoopUpperBound = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      // save computed params
      DST->Chunk = chunk;
      DST->NextLowerBound = lb;
      DST->Stride = stride;
    } else if (schedule == kmp_sched_static_balanced_chunk) {
      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
      // save sched state
      DST->ScheduleType = schedule;
      // save ub
      DST->LoopUpperBound = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      // round up to make sure the chunk is enough to cover all iterations
      T span = (tripCount + tnum - 1) / tnum;
      // perform chunk adjustment
      chunk = (span + chunk - 1) & ~(chunk - 1);

      T oldUb = ub;
      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
      if (ub > oldUb)
        ub = oldUb;
      // save computed params
      DST->Chunk = chunk;
      DST->NextLowerBound = lb;
      DST->Stride = stride;
    } else if (schedule == kmp_sched_static_nochunk) {
      ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
      // save sched state
      DST->ScheduleType = schedule;
      // save ub
      DST->LoopUpperBound = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      // save computed params
      DST->Chunk = chunk;
      DST->NextLowerBound = lb;
      DST->Stride = stride;
    } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
      // save data
      DST->ScheduleType = schedule;
      if (chunk < 1)
        chunk = 1;
      DST->Chunk = chunk;
      DST->LoopUpperBound = ub;
      DST->NextLowerBound = lb;
      __kmpc_barrier(loc, threadId);
      if (tid == 0) {
        Cnt = 0;
        fence::team(atomic::seq_cst);
      }
      __kmpc_barrier(loc, threadId);
    }
  }

  ////////////////////////////////////////////////////////////////////////////////
  // Support for dispatch next

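  // Grab the next batch of iteration numbers for the calling warp with a
  // single atomic: the first active lane adds the number of active lanes to
  // Cnt and broadcasts the previous value; each lane then adds its rank among
  // the active lanes. E.g., with 7 active lanes and Cnt == 40, the lanes
  // obtain the iteration numbers 40 through 46.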
  static uint64_t NextIter() {
    __kmpc_impl_lanemask_t active = mapping::activemask();
    uint32_t leader = utils::ffs(active) - 1;
    uint32_t change = utils::popc(active);
    __kmpc_impl_lanemask_t lane_mask_lt = mapping::lanemaskLT();
    unsigned int rank = utils::popc(active & lane_mask_lt);
    uint64_t warp_res = 0;
    if (rank == 0) {
      warp_res = atomic::add(&Cnt, change, atomic::seq_cst);
    }
    warp_res = utils::shuffle(active, warp_res, leader, mapping::getWarpSize());
    return warp_res + rank;
  }

  static int DynamicNextChunk(T &lb, T &ub, T chunkSize, T loopLowerBound,
                              T loopUpperBound) {
    T N = NextIter();
    lb = loopLowerBound + N * chunkSize;
    ub = lb + chunkSize - 1; // Clang uses i <= ub

    // 3 result cases:
    //  a. lb and ub < loopUpperBound --> NOT_FINISHED
    //  b. lb < loopUpperBound and ub >= loopUpperBound: last chunk -->
    //     LAST_CHUNK
    //  c. lb and ub >= loopUpperBound: empty chunk --> FINISHED
    // a.
    if (lb <= loopUpperBound && ub < loopUpperBound) {
      return NOT_FINISHED;
    }
    // b.
    if (lb <= loopUpperBound) {
      ub = loopUpperBound;
      return LAST_CHUNK;
    }
    // c. if we are here, we are in case 'c'
    lb = loopUpperBound + 2;
    ub = loopUpperBound + 1;
    return FINISHED;
  }

  static int dispatch_next(IdentTy *loc, int32_t gtid, int32_t *plast,
                           T *plower, T *pupper, ST *pstride,
                           DynamicScheduleTracker *DST) {
    // ID of a thread in its own warp

    // automatically selects thread or warp ID based on selected implementation
    ASSERT0(LT_FUSSY, gtid < omp_get_num_threads(),
            "current thread is not needed here; error");
    // retrieve schedule
    kmp_sched_t schedule = DST->ScheduleType;

    // xxx reduce to one
    if (schedule == kmp_sched_static_chunk ||
        schedule == kmp_sched_static_nochunk) {
      T myLb = DST->NextLowerBound;
      T ub = DST->LoopUpperBound;
      // finished?
      if (myLb > ub) {
        return DISPATCH_FINISHED;
      }
      // not finished, save current bounds
      ST chunk = DST->Chunk;
      *plower = myLb;
      T myUb = myLb + chunk - 1; // Clang uses i <= ub
      if (myUb > ub)
        myUb = ub;
      *pupper = myUb;
      *plast = (int32_t)(myUb == ub);

      // increment next lower bound by the stride
      ST stride = DST->Stride;
      DST->NextLowerBound = myLb + stride;
      return DISPATCH_NOTFINISHED;
    }
    ASSERT0(LT_FUSSY,
            schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
            "bad sched");
    T myLb, myUb;
    int finished = DynamicNextChunk(myLb, myUb, DST->Chunk, DST->NextLowerBound,
                                    DST->LoopUpperBound);

    if (finished == FINISHED)
      return DISPATCH_FINISHED;

    // not finished (either not finished or last chunk)
    *plast = (int32_t)(finished == LAST_CHUNK);
    *plower = myLb;
    *pupper = myUb;
    *pstride = 1;

    return DISPATCH_NOTFINISHED;
  }

  static void dispatch_fini() {
    // nothing
  }

  ////////////////////////////////////////////////////////////////////////////////
  // end of template class that encapsulates all the helper functions
  ////////////////////////////////////////////////////////////////////////////////
};

////////////////////////////////////////////////////////////////////////////////
// KMP interface implementation (dyn loops)
////////////////////////////////////////////////////////////////////////////////

// TODO: Expand the dispatch API to take a DST pointer which can then be
// allocated properly without malloc.
// For now, each team will contain an LDS pointer (ThreadDST) to a global array
// of references to the DST structs allocated (in global memory) for each thread
// in the team. The global memory array is allocated during the init phase if it
// was not allocated already and will be deallocated when the dispatch phase
// ends:
//
// __kmpc_dispatch_init
//
// ** Dispatch loop **
//
// __kmpc_dispatch_deinit
//
[[clang::loader_uninitialized]] static Local<DynamicScheduleTracker **>
    ThreadDST;

// Create a new DST, link the current one, and define the new as current.
static DynamicScheduleTracker *pushDST() {
  int32_t ThreadIndex = mapping::getThreadIdInBlock();
  // Each block will allocate an array of pointers to DST structs. The array is
  // equal in length to the number of threads in that block.
  if (!ThreadDST) {
    // Allocate global memory array of pointers to DST structs:
    if (mapping::isMainThreadInGenericMode() || ThreadIndex == 0)
      ThreadDST = static_cast<DynamicScheduleTracker **>(
          memory::allocGlobal(mapping::getNumberOfThreadsInBlock() *
                                  sizeof(DynamicScheduleTracker *),
                              "new ThreadDST array"));
    synchronize::threads(atomic::seq_cst);

    // Initialize the array pointers:
    ThreadDST[ThreadIndex] = nullptr;
  }

  // Create a DST struct for the current thread:
  DynamicScheduleTracker *NewDST = static_cast<DynamicScheduleTracker *>(
      memory::allocGlobal(sizeof(DynamicScheduleTracker), "new DST"));
  *NewDST = DynamicScheduleTracker({0});

  // Add the new DST struct to the array of DST structs:
  NewDST->NextDST = ThreadDST[ThreadIndex];
  ThreadDST[ThreadIndex] = NewDST;
  return NewDST;
}

// Return the current DST.
static DynamicScheduleTracker *peekDST() {
  return ThreadDST[mapping::getThreadIdInBlock()];
}

// Pop the current DST and restore the last one.
static void popDST() {
  int32_t ThreadIndex = mapping::getThreadIdInBlock();
  DynamicScheduleTracker *CurrentDST = ThreadDST[ThreadIndex];
  DynamicScheduleTracker *OldDST = CurrentDST->NextDST;
  memory::freeGlobal(CurrentDST, "remove DST");
  ThreadDST[ThreadIndex] = OldDST;

  // Check if we need to deallocate the global array. Ensure all threads
  // in the block have finished deallocating the individual DSTs.
  synchronize::threads(atomic::seq_cst);
  if (!ThreadDST[ThreadIndex] && !ThreadIndex) {
    memory::freeGlobal(ThreadDST, "remove ThreadDST array");
    ThreadDST = nullptr;
  }
  synchronize::threads(atomic::seq_cst);
}

void workshare::init(bool IsSPMD) {
  if (mapping::isInitialThreadInLevel0(IsSPMD))
    ThreadDST = nullptr;
}

extern "C" {

// init
void __kmpc_dispatch_init_4(IdentTy *loc, int32_t tid, int32_t schedule,
                            int32_t lb, int32_t ub, int32_t st, int32_t chunk) {
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

void __kmpc_dispatch_init_4u(IdentTy *loc, int32_t tid, int32_t schedule,
                             uint32_t lb, uint32_t ub, int32_t st,
                             int32_t chunk) {
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

void __kmpc_dispatch_init_8(IdentTy *loc, int32_t tid, int32_t schedule,
                            int64_t lb, int64_t ub, int64_t st, int64_t chunk) {
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

void __kmpc_dispatch_init_8u(IdentTy *loc, int32_t tid, int32_t schedule,
                             uint64_t lb, uint64_t ub, int64_t st,
                             int64_t chunk) {
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

// next
int __kmpc_dispatch_next_4(IdentTy *loc, int32_t tid, int32_t *p_last,
                           int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

int __kmpc_dispatch_next_4u(IdentTy *loc, int32_t tid, int32_t *p_last,
                            uint32_t *p_lb, uint32_t *p_ub, int32_t *p_st) {
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

int __kmpc_dispatch_next_8(IdentTy *loc, int32_t tid, int32_t *p_last,
                           int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

int __kmpc_dispatch_next_8u(IdentTy *loc, int32_t tid, int32_t *p_last,
                            uint64_t *p_lb, uint64_t *p_ub, int64_t *p_st) {
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

// fini
void __kmpc_dispatch_fini_4(IdentTy *loc, int32_t tid) {
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini();
}

void __kmpc_dispatch_fini_4u(IdentTy *loc, int32_t tid) {
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini();
}

void __kmpc_dispatch_fini_8(IdentTy *loc, int32_t tid) {
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini();
}

void __kmpc_dispatch_fini_8u(IdentTy *loc, int32_t tid) {
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini();
}

// deinit
void __kmpc_dispatch_deinit(IdentTy *loc, int32_t tid) { popDST(); }

////////////////////////////////////////////////////////////////////////////////
// KMP interface implementation (static loops)
////////////////////////////////////////////////////////////////////////////////

void __kmpc_for_static_init_4(IdentTy *loc, int32_t global_tid,
                              int32_t schedtype, int32_t *plastiter,
                              int32_t *plower, int32_t *pupper,
                              int32_t *pstride, int32_t incr, int32_t chunk) {
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_init_4u(IdentTy *loc, int32_t global_tid,
                               int32_t schedtype, int32_t *plastiter,
                               uint32_t *plower, uint32_t *pupper,
                               int32_t *pstride, int32_t incr, int32_t chunk) {
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_init_8(IdentTy *loc, int32_t global_tid,
                              int32_t schedtype, int32_t *plastiter,
                              int64_t *plower, int64_t *pupper,
                              int64_t *pstride, int64_t incr, int64_t chunk) {
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_init_8u(IdentTy *loc, int32_t global_tid,
                               int32_t schedtype, int32_t *plastiter,
                               uint64_t *plower, uint64_t *pupper,
                               int64_t *pstride, int64_t incr, int64_t chunk) {
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_distribute_static_init_4(IdentTy *loc, int32_t global_tid,
                                     int32_t schedtype, int32_t *plastiter,
                                     int32_t *plower, int32_t *pupper,
                                     int32_t *pstride, int32_t incr,
                                     int32_t chunk) {
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_distribute_static_init_4u(IdentTy *loc, int32_t global_tid,
                                      int32_t schedtype, int32_t *plastiter,
                                      uint32_t *plower, uint32_t *pupper,
                                      int32_t *pstride, int32_t incr,
                                      int32_t chunk) {
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_distribute_static_init_8(IdentTy *loc, int32_t global_tid,
                                     int32_t schedtype, int32_t *plastiter,
                                     int64_t *plower, int64_t *pupper,
                                     int64_t *pstride, int64_t incr,
                                     int64_t chunk) {
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_distribute_static_init_8u(IdentTy *loc, int32_t global_tid,
                                      int32_t schedtype, int32_t *plastiter,
                                      uint64_t *plower, uint64_t *pupper,
                                      int64_t *pstride, int64_t incr,
                                      int64_t chunk) {
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {}

void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {}
}

namespace ompx {

/// Helper class to hide the generic loop nest and provide the template argument
/// throughout.
template <typename Ty> class StaticLoopChunker {

  /// Generic loop nest that handles block and/or thread distribution in the
  /// absence of user-specified chunk sizes. This implicitly picks a block chunk
  /// size equal to the number of threads in the block and a thread chunk size
  /// equal to one. In contrast to the chunked version we can get away with a
  /// single loop in this case.
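  /// For illustration (hypothetical numbers): with 2 blocks of 128 threads and
  /// 1000 iterations, thread TId of block BId starts at IV = BId * 128 + TId
  /// and advances by 256 as long as IV < 1000.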
  static void NormalizedLoopNestNoChunk(void (*LoopBody)(Ty, void *), void *Arg,
                                        Ty NumBlocks, Ty BId, Ty NumThreads,
                                        Ty TId, Ty NumIters,
                                        bool OneIterationPerThread) {
    Ty KernelIteration = NumBlocks * NumThreads;

    // Start index in the normalized space.
    Ty IV = BId * NumThreads + TId;
    ASSERT(IV >= 0, "Bad index");

    // Cover the entire iteration space; assumptions in the caller might allow
    // us to simplify this loop to a conditional.
    if (IV < NumIters) {
      do {

        // Execute the loop body.
        LoopBody(IV, Arg);

        // Every thread executed one block and thread chunk now.
        IV += KernelIteration;

        if (OneIterationPerThread)
          return;

      } while (IV < NumIters);
    }
  }

  /// Generic loop nest that handles block and/or thread distribution in the
  /// presence of user-specified chunk sizes (for at least one of them).
  static void NormalizedLoopNestChunked(void (*LoopBody)(Ty, void *), void *Arg,
                                        Ty BlockChunk, Ty NumBlocks, Ty BId,
                                        Ty ThreadChunk, Ty NumThreads, Ty TId,
                                        Ty NumIters,
                                        bool OneIterationPerThread) {
    Ty KernelIteration = NumBlocks * BlockChunk;

    // Start index in the chunked space.
    Ty IV = BId * BlockChunk + TId;
    ASSERT(IV >= 0, "Bad index");

    // Cover the entire iteration space; assumptions in the caller might allow
    // us to simplify this loop to a conditional.
    do {

      Ty BlockChunkLeft =
          BlockChunk >= TId * ThreadChunk ? BlockChunk - TId * ThreadChunk : 0;
      Ty ThreadChunkLeft =
          ThreadChunk <= BlockChunkLeft ? ThreadChunk : BlockChunkLeft;
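      // Each thread thus executes at most ThreadChunk consecutive iterations
      // of the current block chunk, clamped to whatever remains of that chunk
      // after the thread's offset.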

      while (ThreadChunkLeft--) {

        // Given the blocking it's hard to keep track of what to execute.
        if (IV >= NumIters)
          return;

        // Execute the loop body.
        LoopBody(IV, Arg);

        if (OneIterationPerThread)
          return;

        ++IV;
      }

      IV += KernelIteration;

    } while (IV < NumIters);
  }

public:
  /// Worksharing `for`-loop.
  static void For(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
                  Ty NumIters, Ty NumThreads, Ty ThreadChunk) {
    ASSERT(NumIters >= 0, "Bad iteration count");
    ASSERT(ThreadChunk >= 0, "Bad thread chunk");

    // All threads need to participate, but we don't know if we are in a
    // parallel at all or if the user might have used a `num_threads` clause
    // on the parallel and reduced the number compared to the block size.
    // Since nested parallels are possible too, we need to get the thread id
    // from the `omp` getter and not the mapping directly.
    Ty TId = omp_get_thread_num();

    // There are no blocks involved here.
    Ty BlockChunk = 0;
    Ty NumBlocks = 1;
    Ty BId = 0;

    // If the thread chunk is not specified we pick a default now.
    if (ThreadChunk == 0)
      ThreadChunk = 1;

    // If we know we have more threads than iterations we can indicate that to
    // avoid an outer loop.
    bool OneIterationPerThread = false;
    if (config::getAssumeThreadsOversubscription()) {
      ASSERT(NumThreads >= NumIters, "Broken assumption");
      OneIterationPerThread = true;
    }

    if (ThreadChunk != 1)
      NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
                                ThreadChunk, NumThreads, TId, NumIters,
                                OneIterationPerThread);
    else
      NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
                                NumIters, OneIterationPerThread);
  }

  /// Worksharing `distribute`-loop.
  static void Distribute(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
                         Ty NumIters, Ty BlockChunk) {
    ASSERT(icv::Level == 0, "Bad distribute");
    ASSERT(icv::ActiveLevel == 0, "Bad distribute");
    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
    ASSERT(state::ParallelTeamSize == 1, "Bad distribute");

    ASSERT(NumIters >= 0, "Bad iteration count");
    ASSERT(BlockChunk >= 0, "Bad block chunk");

    // There are no threads involved here.
    Ty ThreadChunk = 0;
    Ty NumThreads = 1;
    Ty TId = 0;

    // All teams need to participate.
    Ty NumBlocks = mapping::getNumberOfBlocksInKernel();
    Ty BId = mapping::getBlockIdInKernel();

    // If the block chunk is not specified we pick a default now.
    if (BlockChunk == 0)
      BlockChunk = NumThreads;
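    // NumThreads is 1 here, so the default block chunk selects the unchunked
    // nest below, in which block BId simply strides over the iteration space
    // by NumBlocks.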

    // If we know we have more blocks than iterations we can indicate that to
    // avoid an outer loop.
    bool OneIterationPerThread = false;
    if (config::getAssumeTeamsOversubscription()) {
      ASSERT(NumBlocks >= NumIters, "Broken assumption");
      OneIterationPerThread = true;
    }

    if (BlockChunk != NumThreads)
      NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
                                ThreadChunk, NumThreads, TId, NumIters,
                                OneIterationPerThread);
    else
      NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
                                NumIters, OneIterationPerThread);

    ASSERT(icv::Level == 0, "Bad distribute");
    ASSERT(icv::ActiveLevel == 0, "Bad distribute");
    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
    ASSERT(state::ParallelTeamSize == 1, "Bad distribute");
  }

  /// Worksharing `distribute parallel for`-loop.
  static void DistributeFor(IdentTy *Loc, void (*LoopBody)(Ty, void *),
                            void *Arg, Ty NumIters, Ty NumThreads,
                            Ty BlockChunk, Ty ThreadChunk) {
    ASSERT(icv::Level == 1, "Bad distribute");
    ASSERT(icv::ActiveLevel == 1, "Bad distribute");
    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");

    ASSERT(NumIters >= 0, "Bad iteration count");
    ASSERT(BlockChunk >= 0, "Bad block chunk");
    ASSERT(ThreadChunk >= 0, "Bad thread chunk");

    // All threads need to participate, but the user might have used a
    // `num_threads` clause on the parallel and reduced the number compared to
    // the block size.
    Ty TId = mapping::getThreadIdInBlock();

    // All teams need to participate.
    Ty NumBlocks = mapping::getNumberOfBlocksInKernel();
    Ty BId = mapping::getBlockIdInKernel();

    // If the block chunk is not specified we pick a default now.
    if (BlockChunk == 0)
      BlockChunk = NumThreads;

    // If the thread chunk is not specified we pick a default now.
    if (ThreadChunk == 0)
      ThreadChunk = 1;

    // If we know we have more threads (across all blocks) than iterations we
    // can indicate that to avoid an outer loop.
    bool OneIterationPerThread = false;
    if (config::getAssumeTeamsOversubscription() &&
        config::getAssumeThreadsOversubscription()) {
      OneIterationPerThread = true;
      ASSERT(NumBlocks * NumThreads >= NumIters, "Broken assumption");
    }

    if (BlockChunk != NumThreads || ThreadChunk != 1)
      NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
                                ThreadChunk, NumThreads, TId, NumIters,
                                OneIterationPerThread);
    else
      NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
                                NumIters, OneIterationPerThread);

    ASSERT(icv::Level == 1, "Bad distribute");
    ASSERT(icv::ActiveLevel == 1, "Bad distribute");
    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
  }
};

} // namespace ompx

#define OMP_LOOP_ENTRY(BW, TY) \
  [[gnu::flatten, clang::always_inline]] void \
      __kmpc_distribute_for_static_loop##BW( \
          IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \
          TY num_threads, TY block_chunk, TY thread_chunk) { \
    ompx::StaticLoopChunker<TY>::DistributeFor( \
        loc, fn, arg, num_iters, num_threads, block_chunk, thread_chunk); \
  } \
  [[gnu::flatten, clang::always_inline]] void \
      __kmpc_distribute_static_loop##BW(IdentTy *loc, void (*fn)(TY, void *), \
                                        void *arg, TY num_iters, \
                                        TY block_chunk) { \
    ompx::StaticLoopChunker<TY>::Distribute(loc, fn, arg, num_iters, \
                                            block_chunk); \
  } \
  [[gnu::flatten, clang::always_inline]] void __kmpc_for_static_loop##BW( \
      IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \
      TY num_threads, TY thread_chunk) { \
    ompx::StaticLoopChunker<TY>::For(loc, fn, arg, num_iters, num_threads, \
                                     thread_chunk); \
  }
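// For example, OMP_LOOP_ENTRY(_4, int32_t) below expands to the int32_t entry
// points __kmpc_distribute_for_static_loop_4, __kmpc_distribute_static_loop_4
// and __kmpc_for_static_loop_4.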

extern "C" {
OMP_LOOP_ENTRY(_4, int32_t)
OMP_LOOP_ENTRY(_4u, uint32_t)
OMP_LOOP_ENTRY(_8, int64_t)
OMP_LOOP_ENTRY(_8u, uint64_t)
}