//===----- Workshare.cpp - OpenMP workshare implementation ------ C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of the KMPC interface
// for the loop construct plus other worksharing constructs that use the same
// interface as loops.
//
//===----------------------------------------------------------------------===//

#include "Workshare.h"
#include "Debug.h"
#include "DeviceTypes.h"
#include "DeviceUtils.h"
#include "Interface.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"

using namespace ompx;

// TODO:
struct DynamicScheduleTracker {
  int64_t Chunk;
  int64_t LoopUpperBound;
  int64_t NextLowerBound;
  int64_t Stride;
  kmp_sched_t ScheduleType;
  DynamicScheduleTracker *NextDST;
};

#define ASSERT0(...)

// used by the library for the interface with the app
#define DISPATCH_FINISHED 0
#define DISPATCH_NOTFINISHED 1

// used by dynamic scheduling
#define FINISHED 0
#define NOT_FINISHED 1
#define LAST_CHUNK 2

// TODO: This variable is a hack inherited from the old runtime.
[[clang::loader_uninitialized]] static Local<uint64_t> Cnt;

template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
  ////////////////////////////////////////////////////////////////////////////////
  // Loop with static scheduling with chunk

  // Generic implementation of OMP loop scheduling with static policy
  /*! \brief Calculate initial bounds for static loop and stride
   *  @param[in] loc location in code of the call (not used here)
   *  @param[in] global_tid global thread id
   *  @param[in] schedtype type of scheduling (see omptarget-nvptx.h)
   *  @param[in] plastiter pointer to last iteration
   *  @param[in,out] pointer to loop lower bound. It will contain the value of
   *  the lower bound of the first chunk
   *  @param[in,out] pointer to loop upper bound. It will contain the value of
   *  the upper bound of the first chunk
   *  @param[in,out] pointer to loop stride. It will contain the value of the
   *  stride between two successive chunks executed by the same thread
   *  @param[in] loop increment bump
   *  @param[in] chunk size
   */

  // helper function for static chunk
  static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride, ST chunk,
                             T entityId, T numberOfEntities) {
    // Each thread executes multiple chunks, all of the same size, except
    // the last one.
    // Distance between two successive chunks:
    stride = numberOfEntities * chunk;
    lb = lb + entityId * chunk;
    T inputUb = ub;
    ub = lb + chunk - 1; // Clang uses i <= ub
    // Say ub' is the beginning of the last chunk. Then whoever has a
    // lower bound plus a multiple of the increment equal to ub' is
    // the last one.
    T beginningLastChunk = inputUb - (inputUb % chunk);
    last = ((beginningLastChunk - lb) % stride) == 0;
  }
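
  // Illustrative example (not part of the runtime logic): with lb = 0,
  // ub = 99, chunk = 10, numberOfEntities = 4 and entityId = 1, the entity
  // gets stride = 40 and its first chunk is [10, 19]; it later also executes
  // [50, 59] and [90, 99]. The last chunk begins at 90 and
  // (90 - 10) % 40 == 0, so this entity is marked as executing the last
  // iteration.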

  ////////////////////////////////////////////////////////////////////////////////
  // Loop with static scheduling without chunk

  // helper function for static no chunk
  static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride, ST &chunk,
                               T entityId, T numberOfEntities) {
    // No chunk size specified. Each thread or warp gets at most one
    // chunk; chunks are all of nearly equal size.
    T loopSize = ub - lb + 1;

    chunk = loopSize / numberOfEntities;
    T leftOver = loopSize - chunk * numberOfEntities;

    if (entityId < leftOver) {
      chunk++;
      lb = lb + entityId * chunk;
    } else {
      lb = lb + entityId * chunk + leftOver;
    }

    T inputUb = ub;
    ub = lb + chunk - 1; // Clang uses i <= ub
    last = lb <= inputUb && inputUb <= ub;
    stride = loopSize; // make sure we only do 1 chunk per warp
  }
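
  // Illustrative example (not part of the runtime logic): with lb = 0,
  // ub = 9 (10 iterations) and numberOfEntities = 4, the base chunk is
  // 10 / 4 = 2 with a leftover of 2. Entities 0 and 1 get chunk 3 and the
  // bounds [0, 2] and [3, 5]; entities 2 and 3 keep chunk 2 and get [6, 7]
  // and [8, 9]. Entity 3 covers the original upper bound and is therefore
  // marked last.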

  ////////////////////////////////////////////////////////////////////////////////
  // Support for Static Init

  static void for_static_init(int32_t, int32_t schedtype, int32_t *plastiter,
                              T *plower, T *pupper, ST *pstride, ST chunk,
                              bool IsSPMDExecutionMode) {
    int32_t gtid = omp_get_thread_num();
    int numberOfActiveOMPThreads = omp_get_num_threads();

    // All threads in excess of the maximum requested do not execute the loop.
    ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads,
            "current thread is not needed here; error");

    // copy
    int lastiter = 0;
    T lb = *plower;
    T ub = *pupper;
    ST stride = *pstride;

    // init
    switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) {
    case kmp_sched_static_chunk: {
      if (chunk > 0) {
        ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
                       numberOfActiveOMPThreads);
        break;
      }
      [[fallthrough]];
    } // note: if chunk <= 0, use nochunk
    case kmp_sched_static_balanced_chunk: {
      if (chunk > 0) {
        // round up to make sure the chunk is enough to cover all iterations
        T tripCount = ub - lb + 1; // +1 because ub is inclusive
        T span = (tripCount + numberOfActiveOMPThreads - 1) /
                 numberOfActiveOMPThreads;
        // perform chunk adjustment
        chunk = (span + chunk - 1) & ~(chunk - 1);

        ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
        T oldUb = ub;
        ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
                       numberOfActiveOMPThreads);
        if (ub > oldUb)
          ub = oldUb;
        break;
      }
      [[fallthrough]];
    } // note: if chunk <= 0, use nochunk
    case kmp_sched_static_nochunk: {
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid,
                       numberOfActiveOMPThreads);
      break;
    }
    case kmp_sched_distr_static_chunk: {
      if (chunk > 0) {
        ForStaticChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(),
                       omp_get_num_teams());
        break;
      }
      [[fallthrough]];
    } // note: if chunk <= 0, use nochunk
    case kmp_sched_distr_static_nochunk: {
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(),
                       omp_get_num_teams());
      break;
    }
    case kmp_sched_distr_static_chunk_sched_static_chunkone: {
      ForStaticChunk(lastiter, lb, ub, stride, chunk,
                     numberOfActiveOMPThreads * omp_get_team_num() + gtid,
                     omp_get_num_teams() * numberOfActiveOMPThreads);
      break;
    }
    default: {
      // ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype);
      ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
                     numberOfActiveOMPThreads);
      break;
    }
    }
    // copy back
    *plastiter = lastiter;
    *plower = lb;
    *pupper = ub;
    *pstride = stride;
  }

  ////////////////////////////////////////////////////////////////////////////////
  // Support for dispatch Init

  static int OrderedSchedule(kmp_sched_t schedule) {
    return schedule >= kmp_sched_ordered_first &&
           schedule <= kmp_sched_ordered_last;
  }

  static void dispatch_init(IdentTy *loc, int32_t threadId,
                            kmp_sched_t schedule, T lb, T ub, ST st, ST chunk,
                            DynamicScheduleTracker *DST) {
    int tid = mapping::getThreadIdInBlock();
    T tnum = omp_get_num_threads();
    T tripCount = ub - lb + 1; // +1 because ub is inclusive
    ASSERT0(LT_FUSSY, threadId < tnum,
            "current thread is not needed here; error");

    /* Currently just ignore the monotonic and non-monotonic modifiers
     * (the compiler isn't producing them yet anyway).
     * When it is, we'll want to look at them somewhere here and use that
     * information to add to our schedule choice. We shouldn't need to pass
     * them on; they merely affect which schedule we can legally choose for
     * various dynamic cases (in particular, whether or not a stealing scheme
     * is legal).
     */
    schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);

    // Process schedule.
    if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) {
      if (OrderedSchedule(schedule))
        __kmpc_barrier(loc, threadId);
      schedule = kmp_sched_static_chunk;
      chunk = tripCount; // one thread gets the whole loop
    } else if (schedule == kmp_sched_runtime) {
      // process runtime
      omp_sched_t rtSched;
      int ChunkInt;
      omp_get_schedule(&rtSched, &ChunkInt);
      chunk = ChunkInt;
      switch (rtSched) {
      case omp_sched_static: {
        if (chunk > 0)
          schedule = kmp_sched_static_chunk;
        else
          schedule = kmp_sched_static_nochunk;
        break;
      }
      case omp_sched_auto: {
        schedule = kmp_sched_static_chunk;
        chunk = 1;
        break;
      }
      case omp_sched_dynamic:
      case omp_sched_guided: {
        schedule = kmp_sched_dynamic;
        break;
      }
      }
    } else if (schedule == kmp_sched_auto) {
      schedule = kmp_sched_static_chunk;
      chunk = 1;
    } else {
      // ASSERT(LT_FUSSY,
      //        schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
      //        "unknown schedule %d & chunk %lld\n", (int)schedule,
      //        (long long)chunk);
    }

    // init schedules
    if (schedule == kmp_sched_static_chunk) {
      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
      // save sched state
      DST->ScheduleType = schedule;
      // save ub
      DST->LoopUpperBound = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      // save computed params
      DST->Chunk = chunk;
      DST->NextLowerBound = lb;
      DST->Stride = stride;
    } else if (schedule == kmp_sched_static_balanced_chunk) {
      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
      // save sched state
      DST->ScheduleType = schedule;
      // save ub
      DST->LoopUpperBound = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      // round up to make sure the chunk is enough to cover all iterations
      T span = (tripCount + tnum - 1) / tnum;
      // perform chunk adjustment
      chunk = (span + chunk - 1) & ~(chunk - 1);

      T oldUb = ub;
      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
      if (ub > oldUb)
        ub = oldUb;
      // save computed params
      DST->Chunk = chunk;
      DST->NextLowerBound = lb;
      DST->Stride = stride;
    } else if (schedule == kmp_sched_static_nochunk) {
      ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
      // save sched state
      DST->ScheduleType = schedule;
      // save ub
      DST->LoopUpperBound = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      // save computed params
      DST->Chunk = chunk;
      DST->NextLowerBound = lb;
      DST->Stride = stride;
    } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
      // save data
      DST->ScheduleType = schedule;
      if (chunk < 1)
        chunk = 1;
      DST->Chunk = chunk;
      DST->LoopUpperBound = ub;
      DST->NextLowerBound = lb;
      __kmpc_barrier(loc, threadId);
      if (tid == 0) {
        Cnt = 0;
        fence::team(atomic::seq_cst);
      }
      __kmpc_barrier(loc, threadId);
    }
  }

  ////////////////////////////////////////////////////////////////////////////////
  // Support for dispatch next

  static uint64_t NextIter() {
    __kmpc_impl_lanemask_t active = mapping::activemask();
    uint32_t leader = utils::ffs(active) - 1;
    uint32_t change = utils::popc(active);
    __kmpc_impl_lanemask_t lane_mask_lt = mapping::lanemaskLT();
    unsigned int rank = utils::popc(active & lane_mask_lt);
    uint64_t warp_res = 0;
    if (rank == 0) {
      warp_res = atomic::add(&Cnt, change, atomic::seq_cst);
    }
    warp_res = utils::shuffle(active, warp_res, leader, mapping::getWarpSize());
    return warp_res + rank;
  }
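
  // Illustrative example (not part of the runtime logic): if 8 lanes of a
  // warp call NextIter() together, the lowest-numbered active lane performs a
  // single atomic::add(&Cnt, 8, ...), the pre-increment value is shuffled to
  // the whole group, and each lane returns that value plus its rank. The 8
  // lanes thus obtain 8 consecutive iteration numbers while issuing only one
  // atomic operation for the warp.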

  static int DynamicNextChunk(T &lb, T &ub, T chunkSize, T loopLowerBound,
                              T loopUpperBound) {
    T N = NextIter();
    lb = loopLowerBound + N * chunkSize;
    ub = lb + chunkSize - 1; // Clang uses i <= ub

    // 3 result cases:
    //  a. lb and ub < loopUpperBound --> NOT_FINISHED
    //  b. lb < loopUpperBound and ub >= loopUpperBound: last chunk -->
    //     LAST_CHUNK
    //  c. lb and ub >= loopUpperBound: empty chunk --> FINISHED
    // a.
    if (lb <= loopUpperBound && ub < loopUpperBound) {
      return NOT_FINISHED;
    }
    // b.
    if (lb <= loopUpperBound) {
      ub = loopUpperBound;
      return LAST_CHUNK;
    }
    // c. if we are here, we are in case 'c'
    lb = loopUpperBound + 2;
    ub = loopUpperBound + 1;
    return FINISHED;
  }
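
  // Illustrative example (not part of the runtime logic): with
  // loopLowerBound = 0, loopUpperBound = 9 and chunkSize = 4, the chunks
  // handed out are [0, 3] (NOT_FINISHED), [4, 7] (NOT_FINISHED), then [8, 11]
  // clipped to [8, 9] (LAST_CHUNK); any later request yields an empty chunk
  // (FINISHED).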

  static int dispatch_next(IdentTy *loc, int32_t gtid, int32_t *plast,
                           T *plower, T *pupper, ST *pstride,
                           DynamicScheduleTracker *DST) {
    // ID of a thread in its own warp; automatically selects thread or warp ID
    // based on the selected implementation.
    ASSERT0(LT_FUSSY, gtid < omp_get_num_threads(),
            "current thread is not needed here; error");
    // retrieve schedule
    kmp_sched_t schedule = DST->ScheduleType;

    // xxx reduce to one
    if (schedule == kmp_sched_static_chunk ||
        schedule == kmp_sched_static_nochunk) {
      T myLb = DST->NextLowerBound;
      T ub = DST->LoopUpperBound;
      // finished?
      if (myLb > ub) {
        return DISPATCH_FINISHED;
      }
      // not finished, save current bounds
      ST chunk = DST->Chunk;
      *plower = myLb;
      T myUb = myLb + chunk - 1; // Clang uses i <= ub
      if (myUb > ub)
        myUb = ub;
      *pupper = myUb;
      *plast = (int32_t)(myUb == ub);

      // increment next lower bound by the stride
      ST stride = DST->Stride;
      DST->NextLowerBound = myLb + stride;
      return DISPATCH_NOTFINISHED;
    }
    ASSERT0(LT_FUSSY,
            schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
            "bad sched");
    T myLb, myUb;
    int finished = DynamicNextChunk(myLb, myUb, DST->Chunk, DST->NextLowerBound,
                                    DST->LoopUpperBound);

    if (finished == FINISHED)
      return DISPATCH_FINISHED;

    // not finished (either not finished or last chunk)
    *plast = (int32_t)(finished == LAST_CHUNK);
    *plower = myLb;
    *pupper = myUb;
    *pstride = 1;

    return DISPATCH_NOTFINISHED;
  }

  static void dispatch_fini() {
    // nothing
  }

  ////////////////////////////////////////////////////////////////////////////////
  // end of template class that encapsulates all the helper functions
  ////////////////////////////////////////////////////////////////////////////////
};

////////////////////////////////////////////////////////////////////////////////
// KMP interface implementation (dyn loops)
////////////////////////////////////////////////////////////////////////////////

// TODO: Expand the dispatch API to take a DST pointer which can then be
// allocated properly without malloc.
// For now, each team will contain an LDS pointer (ThreadDST) to a global array
// of references to the DST structs allocated (in global memory) for each thread
// in the team. The global memory array is allocated during the init phase if it
// was not allocated already and will be deallocated when the dispatch phase
// ends:
//
//  __kmpc_dispatch_init
//
//  ** Dispatch loop **
//
//  __kmpc_dispatch_deinit
//
[[clang::loader_uninitialized]] static Local<DynamicScheduleTracker **>
    ThreadDST;

// Create a new DST, link the current one, and define the new as current.
static DynamicScheduleTracker *pushDST() {
  int32_t ThreadIndex = mapping::getThreadIdInBlock();
  // Each block will allocate an array of pointers to DST structs. The array is
  // equal in length to the number of threads in that block.
  if (!ThreadDST) {
    // Allocate global memory array of pointers to DST structs:
    if (mapping::isMainThreadInGenericMode() || ThreadIndex == 0)
      ThreadDST = static_cast<DynamicScheduleTracker **>(
          memory::allocGlobal(mapping::getNumberOfThreadsInBlock() *
                                  sizeof(DynamicScheduleTracker *),
                              "new ThreadDST array"));
    synchronize::threads(atomic::seq_cst);

    // Initialize the array pointers:
    ThreadDST[ThreadIndex] = nullptr;
  }

  // Create a DST struct for the current thread:
  DynamicScheduleTracker *NewDST = static_cast<DynamicScheduleTracker *>(
      memory::allocGlobal(sizeof(DynamicScheduleTracker), "new DST"));
  *NewDST = DynamicScheduleTracker({0});

  // Add the new DST struct to the array of DST structs:
  NewDST->NextDST = ThreadDST[ThreadIndex];
  ThreadDST[ThreadIndex] = NewDST;
  return NewDST;
}

// Return the current DST.
static DynamicScheduleTracker *peekDST() {
  return ThreadDST[mapping::getThreadIdInBlock()];
}

// Pop the current DST and restore the last one.
static void popDST() {
  int32_t ThreadIndex = mapping::getThreadIdInBlock();
  DynamicScheduleTracker *CurrentDST = ThreadDST[ThreadIndex];
  DynamicScheduleTracker *OldDST = CurrentDST->NextDST;
  memory::freeGlobal(CurrentDST, "remove DST");
  ThreadDST[ThreadIndex] = OldDST;

  // Check if we need to deallocate the global array. Ensure all threads
  // in the block have finished deallocating the individual DSTs.
  synchronize::threads(atomic::seq_cst);
  if (!ThreadDST[ThreadIndex] && !ThreadIndex) {
    memory::freeGlobal(ThreadDST, "remove ThreadDST array");
    ThreadDST = nullptr;
  }
  synchronize::threads(atomic::seq_cst);
}

void workshare::init(bool IsSPMD) {
  if (mapping::isInitialThreadInLevel0(IsSPMD))
    ThreadDST = nullptr;
}

extern "C" {

// init
void __kmpc_dispatch_init_4(IdentTy *loc, int32_t tid, int32_t schedule,
                            int32_t lb, int32_t ub, int32_t st, int32_t chunk) {
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

void __kmpc_dispatch_init_4u(IdentTy *loc, int32_t tid, int32_t schedule,
                             uint32_t lb, uint32_t ub, int32_t st,
                             int32_t chunk) {
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

void __kmpc_dispatch_init_8(IdentTy *loc, int32_t tid, int32_t schedule,
                            int64_t lb, int64_t ub, int64_t st, int64_t chunk) {
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

void __kmpc_dispatch_init_8u(IdentTy *loc, int32_t tid, int32_t schedule,
                             uint64_t lb, uint64_t ub, int64_t st,
                             int64_t chunk) {
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

// next
int __kmpc_dispatch_next_4(IdentTy *loc, int32_t tid, int32_t *p_last,
                           int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

int __kmpc_dispatch_next_4u(IdentTy *loc, int32_t tid, int32_t *p_last,
                            uint32_t *p_lb, uint32_t *p_ub, int32_t *p_st) {
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

int __kmpc_dispatch_next_8(IdentTy *loc, int32_t tid, int32_t *p_last,
                           int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

int __kmpc_dispatch_next_8u(IdentTy *loc, int32_t tid, int32_t *p_last,
                            uint64_t *p_lb, uint64_t *p_ub, int64_t *p_st) {
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

// fini
void __kmpc_dispatch_fini_4(IdentTy *loc, int32_t tid) {
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini();
}

void __kmpc_dispatch_fini_4u(IdentTy *loc, int32_t tid) {
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini();
}

void __kmpc_dispatch_fini_8(IdentTy *loc, int32_t tid) {
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini();
}

void __kmpc_dispatch_fini_8u(IdentTy *loc, int32_t tid) {
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini();
}

// deinit
void __kmpc_dispatch_deinit(IdentTy *loc, int32_t tid) { popDST(); }
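
// For reference, a compiler-lowered `#pragma omp for schedule(dynamic, C)`
// over N iterations is expected to drive this interface roughly as in the
// following sketch (hypothetical lowering for the 32-bit signed case; the
// exact code clang emits may differ). `Dyn` stands for the dynamic schedule
// constant (kmp_sched_dynamic in this file's kmp_sched_t terms), and the
// upper bound is inclusive, matching the "Clang uses i <= ub" convention
// above:
//
//   __kmpc_dispatch_init_4(loc, tid, Dyn, /*lb=*/0, /*ub=*/N - 1, /*st=*/1,
//                          /*chunk=*/C);
//   int32_t last, lb, ub, st;
//   while (__kmpc_dispatch_next_4(loc, tid, &last, &lb, &ub, &st))
//     for (int32_t i = lb; i <= ub; ++i)
//       body(i);
//   __kmpc_dispatch_deinit(loc, tid);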

////////////////////////////////////////////////////////////////////////////////
// KMP interface implementation (static loops)
////////////////////////////////////////////////////////////////////////////////

void __kmpc_for_static_init_4(IdentTy *loc, int32_t global_tid,
                              int32_t schedtype, int32_t *plastiter,
                              int32_t *plower, int32_t *pupper,
                              int32_t *pstride, int32_t incr, int32_t chunk) {
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_init_4u(IdentTy *loc, int32_t global_tid,
                               int32_t schedtype, int32_t *plastiter,
                               uint32_t *plower, uint32_t *pupper,
                               int32_t *pstride, int32_t incr, int32_t chunk) {
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_init_8(IdentTy *loc, int32_t global_tid,
                              int32_t schedtype, int32_t *plastiter,
                              int64_t *plower, int64_t *pupper,
                              int64_t *pstride, int64_t incr, int64_t chunk) {
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_init_8u(IdentTy *loc, int32_t global_tid,
                               int32_t schedtype, int32_t *plastiter,
                               uint64_t *plower, uint64_t *pupper,
                               int64_t *pstride, int64_t incr, int64_t chunk) {
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}
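
// For reference, a compiler-lowered `#pragma omp for schedule(static, C)` over
// N iterations is expected to use the entry points above roughly as in the
// following sketch (hypothetical lowering for the 32-bit signed case; the
// exact code clang emits may differ). `Sched` stands for the static-chunked
// schedule constant (kmp_sched_static_chunk in this file's kmp_sched_t terms);
// the runtime rewrites lb/ub/stride in place and the generated code then
// strides over its chunks:
//
//   int32_t last = 0, lb = 0, ub = N - 1, stride = 1;
//   __kmpc_for_static_init_4(loc, tid, Sched, &last, &lb, &ub, &stride,
//                            /*incr=*/1, /*chunk=*/C);
//   for (int32_t lo = lb; lo <= N - 1; lo += stride)
//     for (int32_t i = lo; i <= lo + C - 1 && i <= N - 1; ++i)
//       body(i);
//   __kmpc_for_static_fini(loc, tid);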

void __kmpc_distribute_static_init_4(IdentTy *loc, int32_t global_tid,
                                     int32_t schedtype, int32_t *plastiter,
                                     int32_t *plower, int32_t *pupper,
                                     int32_t *pstride, int32_t incr,
                                     int32_t chunk) {
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_distribute_static_init_4u(IdentTy *loc, int32_t global_tid,
                                      int32_t schedtype, int32_t *plastiter,
                                      uint32_t *plower, uint32_t *pupper,
                                      int32_t *pstride, int32_t incr,
                                      int32_t chunk) {
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_distribute_static_init_8(IdentTy *loc, int32_t global_tid,
                                     int32_t schedtype, int32_t *plastiter,
                                     int64_t *plower, int64_t *pupper,
                                     int64_t *pstride, int64_t incr,
                                     int64_t chunk) {
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_distribute_static_init_8u(IdentTy *loc, int32_t global_tid,
                                      int32_t schedtype, int32_t *plastiter,
                                      uint64_t *plower, uint64_t *pupper,
                                      int64_t *pstride, int64_t incr,
                                      int64_t chunk) {
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {}

void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {}
}

namespace ompx {

/// Helper class to hide the generic loop nest and provide the template argument
/// throughout.
template <typename Ty> class StaticLoopChunker {

  /// Generic loop nest that handles block and/or thread distribution in the
  /// absence of user specified chunk sizes. This implicitly picks a block chunk
  /// size equal to the number of threads in the block and a thread chunk size
  /// equal to one. In contrast to the chunked version, we can get away with a
  /// single loop in this case.
  static void NormalizedLoopNestNoChunk(void (*LoopBody)(Ty, void *), void *Arg,
                                        Ty NumBlocks, Ty BId, Ty NumThreads,
                                        Ty TId, Ty NumIters,
                                        bool OneIterationPerThread) {
    Ty KernelIteration = NumBlocks * NumThreads;

    // Start index in the normalized space.
    Ty IV = BId * NumThreads + TId;
    ASSERT(IV >= 0, "Bad index");

    // Cover the entire iteration space; assumptions in the caller might allow
    // us to simplify this loop to a conditional.
    if (IV < NumIters) {
      do {

        // Execute the loop body.
        LoopBody(IV, Arg);

        // Every thread executed one block and thread chunk now.
        IV += KernelIteration;

        if (OneIterationPerThread)
          return;

      } while (IV < NumIters);
    }
  }
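
  // Illustrative example (not part of the runtime logic): with NumBlocks = 2,
  // NumThreads = 4 and NumIters = 10, KernelIteration is 8. Thread 0 of
  // block 0 starts at IV = 0 and later executes IV = 8, thread 1 of block 0
  // executes IV = 1 and 9, while the threads of block 1 start at IV = 4..7
  // and find their next candidates (12..15) already out of range. Every
  // iteration 0..9 is therefore executed exactly once.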

  /// Generic loop nest that handles block and/or thread distribution in the
  /// presence of user specified chunk sizes (for at least one of them).
  static void NormalizedLoopNestChunked(void (*LoopBody)(Ty, void *), void *Arg,
                                        Ty BlockChunk, Ty NumBlocks, Ty BId,
                                        Ty ThreadChunk, Ty NumThreads, Ty TId,
                                        Ty NumIters,
                                        bool OneIterationPerThread) {
    Ty KernelIteration = NumBlocks * BlockChunk;

    // Start index in the chunked space.
    Ty IV = BId * BlockChunk + TId;
    ASSERT(IV >= 0, "Bad index");

    // Cover the entire iteration space; assumptions in the caller might allow
    // us to simplify this loop to a conditional.
    do {

      Ty BlockChunkLeft =
          BlockChunk >= TId * ThreadChunk ? BlockChunk - TId * ThreadChunk : 0;
      Ty ThreadChunkLeft =
          ThreadChunk <= BlockChunkLeft ? ThreadChunk : BlockChunkLeft;

      while (ThreadChunkLeft--) {

        // Given the blocking it's hard to keep track of what to execute.
        if (IV >= NumIters)
          return;

        // Execute the loop body.
        LoopBody(IV, Arg);

        if (OneIterationPerThread)
          return;

        ++IV;
      }

      IV += KernelIteration;

    } while (IV < NumIters);
  }

public:
  /// Worksharing `for`-loop.
  static void For(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
                  Ty NumIters, Ty NumThreads, Ty ThreadChunk) {
    ASSERT(NumIters >= 0, "Bad iteration count");
    ASSERT(ThreadChunk >= 0, "Bad thread chunk size");

    // All threads need to participate, but we don't know if we are in a
    // parallel at all or if the user might have used a `num_threads` clause
    // on the parallel and reduced the number compared to the block size.
    // Since nested parallels are possible too, we need to get the thread id
    // from the `omp` getter and not the mapping directly.
    Ty TId = omp_get_thread_num();

    // There are no blocks involved here.
    Ty BlockChunk = 0;
    Ty NumBlocks = 1;
    Ty BId = 0;

    // If the thread chunk is not specified we pick a default now.
    if (ThreadChunk == 0)
      ThreadChunk = 1;

    // If we know we have more threads than iterations we can indicate that to
    // avoid an outer loop.
    bool OneIterationPerThread = false;
    if (config::getAssumeThreadsOversubscription()) {
      ASSERT(NumThreads >= NumIters, "Broken assumption");
      OneIterationPerThread = true;
    }

    if (ThreadChunk != 1)
      NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
                                ThreadChunk, NumThreads, TId, NumIters,
                                OneIterationPerThread);
    else
      NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
                                NumIters, OneIterationPerThread);
  }

  /// Worksharing `distribute`-loop.
  static void Distribute(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
                         Ty NumIters, Ty BlockChunk) {
    ASSERT(icv::Level == 0, "Bad distribute");
    ASSERT(icv::ActiveLevel == 0, "Bad distribute");
    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
    ASSERT(state::ParallelTeamSize == 1, "Bad distribute");

    ASSERT(NumIters >= 0, "Bad iteration count");
    ASSERT(BlockChunk >= 0, "Bad block chunk size");

    // There are no threads involved here.
    Ty ThreadChunk = 0;
    Ty NumThreads = 1;
    Ty TId = 0;

    // All teams need to participate.
    Ty NumBlocks = mapping::getNumberOfBlocksInKernel();
    Ty BId = mapping::getBlockIdInKernel();

    // If the block chunk is not specified we pick a default now.
    if (BlockChunk == 0)
      BlockChunk = NumThreads;

    // If we know we have more blocks than iterations we can indicate that to
    // avoid an outer loop.
    bool OneIterationPerThread = false;
    if (config::getAssumeTeamsOversubscription()) {
      ASSERT(NumBlocks >= NumIters, "Broken assumption");
      OneIterationPerThread = true;
    }

    if (BlockChunk != NumThreads)
      NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
                                ThreadChunk, NumThreads, TId, NumIters,
                                OneIterationPerThread);
    else
      NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
                                NumIters, OneIterationPerThread);

    ASSERT(icv::Level == 0, "Bad distribute");
    ASSERT(icv::ActiveLevel == 0, "Bad distribute");
    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
    ASSERT(state::ParallelTeamSize == 1, "Bad distribute");
  }

  /// Worksharing `distribute parallel for`-loop.
  static void DistributeFor(IdentTy *Loc, void (*LoopBody)(Ty, void *),
                            void *Arg, Ty NumIters, Ty NumThreads,
                            Ty BlockChunk, Ty ThreadChunk) {
    ASSERT(icv::Level == 1, "Bad distribute");
    ASSERT(icv::ActiveLevel == 1, "Bad distribute");
    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");

    ASSERT(NumIters >= 0, "Bad iteration count");
    ASSERT(BlockChunk >= 0, "Bad block chunk size");
    ASSERT(ThreadChunk >= 0, "Bad thread chunk size");

    // All threads need to participate, but the user might have used a
    // `num_threads` clause on the parallel and reduced the number compared to
    // the block size.
    Ty TId = mapping::getThreadIdInBlock();

    // All teams need to participate.
    Ty NumBlocks = mapping::getNumberOfBlocksInKernel();
    Ty BId = mapping::getBlockIdInKernel();

    // If the block chunk is not specified we pick a default now.
    if (BlockChunk == 0)
      BlockChunk = NumThreads;

    // If the thread chunk is not specified we pick a default now.
    if (ThreadChunk == 0)
      ThreadChunk = 1;

    // If we know we have more threads (across all blocks) than iterations we
    // can indicate that to avoid an outer loop.
    bool OneIterationPerThread = false;
    if (config::getAssumeTeamsOversubscription() &&
        config::getAssumeThreadsOversubscription()) {
      OneIterationPerThread = true;
      ASSERT(NumBlocks * NumThreads >= NumIters, "Broken assumption");
    }

    if (BlockChunk != NumThreads || ThreadChunk != 1)
      NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
                                ThreadChunk, NumThreads, TId, NumIters,
                                OneIterationPerThread);
    else
      NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
                                NumIters, OneIterationPerThread);

    ASSERT(icv::Level == 1, "Bad distribute");
    ASSERT(icv::ActiveLevel == 1, "Bad distribute");
    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
  }
};

} // namespace ompx

#define OMP_LOOP_ENTRY(BW, TY)                                                 \
  [[gnu::flatten, clang::always_inline]] void                                  \
  __kmpc_distribute_for_static_loop##BW(                                       \
      IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters,           \
      TY num_threads, TY block_chunk, TY thread_chunk) {                       \
    ompx::StaticLoopChunker<TY>::DistributeFor(                                \
        loc, fn, arg, num_iters, num_threads, block_chunk, thread_chunk);      \
  }                                                                            \
  [[gnu::flatten, clang::always_inline]] void                                  \
  __kmpc_distribute_static_loop##BW(IdentTy *loc, void (*fn)(TY, void *),      \
                                    void *arg, TY num_iters,                   \
                                    TY block_chunk) {                          \
    ompx::StaticLoopChunker<TY>::Distribute(loc, fn, arg, num_iters,           \
                                            block_chunk);                      \
  }                                                                            \
  [[gnu::flatten, clang::always_inline]] void __kmpc_for_static_loop##BW(      \
      IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters,           \
      TY num_threads, TY thread_chunk) {                                       \
    ompx::StaticLoopChunker<TY>::For(loc, fn, arg, num_iters, num_threads,     \
                                     thread_chunk);                            \
  }
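
// For illustration, `OMP_LOOP_ENTRY(_4, int32_t)` below expands to the three
// entry points __kmpc_distribute_for_static_loop_4,
// __kmpc_distribute_static_loop_4 and __kmpc_for_static_loop_4, each of which
// forwards its arguments to the corresponding ompx::StaticLoopChunker<int32_t>
// method (DistributeFor, Distribute and For, respectively).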

extern "C" {
OMP_LOOP_ENTRY(_4, int32_t)
OMP_LOOP_ENTRY(_4u, uint32_t)
OMP_LOOP_ENTRY(_8, int64_t)
OMP_LOOP_ENTRY(_8u, uint64_t)
}