//===----- Workshare.cpp - OpenMP workshare implementation ------ C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of the KMPC interface
// for the loop construct plus other worksharing constructs that use the same
// interface as loops.
//
//===----------------------------------------------------------------------===//

#include "Workshare.h"
#include "Debug.h"
#include "DeviceTypes.h"
#include "DeviceUtils.h"
#include "Interface.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"

using namespace ompx;

// TODO:
// Tracks the state of one dynamic/guided schedule for a single thread.
// Instances form a per-thread stack linked through NextDST (see pushDST and
// popDST below).
struct DynamicScheduleTracker {
  int64_t Chunk;
  int64_t LoopUpperBound;
  int64_t NextLowerBound;
  int64_t Stride;
  kmp_sched_t ScheduleType;
  DynamicScheduleTracker *NextDST;
};

#define ASSERT0(...)

// used by the library for the interface with the app
#define DISPATCH_FINISHED 0
#define DISPATCH_NOTFINISHED 1

// used by dynamic scheduling
#define FINISHED 0
#define NOT_FINISHED 1
#define LAST_CHUNK 2

// TODO: This variable is a hack inherited from the old runtime. It is the
// team-wide iteration counter handed out by NextIter for dynamic and guided
// schedules.
[[clang::loader_uninitialized]] static Local<uint64_t> Cnt;
template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
  //////////////////////////////////////////////////////////////////////////////
  // Loop with static scheduling with chunk

  // Generic implementation of OMP loop scheduling with static policy
  /*! \brief Calculate the initial bounds and stride for a statically
   * scheduled loop
   * @param[in] loc location in code of the call (not used here)
   * @param[in] global_tid global thread id
   * @param[in] schedtype type of scheduling (see omptarget-nvptx.h)
   * @param[in] plastiter pointer to the last-iteration flag
   * @param[in,out] plower pointer to the loop lower bound; on return it holds
   * the lower bound of the first chunk
   * @param[in,out] pupper pointer to the loop upper bound; on return it holds
   * the upper bound of the first chunk
   * @param[in,out] pstride pointer to the loop stride; on return it holds the
   * stride between two successive chunks executed by the same thread
   * @param[in] incr loop increment bump
   * @param[in] chunk chunk size
   */

  // helper function for static chunk
  static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride, ST chunk,
                             T entityId, T numberOfEntities) {
    // Each thread executes multiple chunks, all of the same size except
    // possibly the last one.
    // Distance between two successive chunks:
    stride = numberOfEntities * chunk;
    lb = lb + entityId * chunk;
    T inputUb = ub;
    ub = lb + chunk - 1; // Clang uses i <= ub
    // Say ub' is the beginning of the last chunk. Then whoever has a
    // lower bound plus a multiple of the stride equal to ub' executes
    // the last chunk.
    T beginningLastChunk = inputUb - (inputUb % chunk);
    last = ((beginningLastChunk - lb) % stride) == 0;
  }
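
  // Worked example (illustrative numbers only, not tied to any particular
  // caller): with lb = 0, ub = 99, chunk = 10 and numberOfEntities = 4,
  // entity 1 gets lb = 10, ub = 19 and stride = 40, i.e. it later covers the
  // chunks starting at 10, 50 and 90. Since 90 is the beginning of the last
  // chunk, `last` is set for entity 1.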

  //////////////////////////////////////////////////////////////////////////////
  // Loop with static scheduling without chunk

  // helper function for static no chunk
  static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride, ST &chunk,
                               T entityId, T numberOfEntities) {
    // No chunk size specified. Each thread or warp gets at most one
    // chunk; all chunks are nearly equal in size.
    T loopSize = ub - lb + 1;

    chunk = loopSize / numberOfEntities;
    T leftOver = loopSize - chunk * numberOfEntities;

    if (entityId < leftOver) {
      chunk++;
      lb = lb + entityId * chunk;
    } else {
      lb = lb + entityId * chunk + leftOver;
    }

    T inputUb = ub;
    ub = lb + chunk - 1; // Clang uses i <= ub
    last = lb <= inputUb && inputUb <= ub;
    stride = loopSize; // make sure we only do 1 chunk per entity
  }
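
  // Worked example (illustrative numbers only): with lb = 0, ub = 102
  // (103 iterations) and numberOfEntities = 4, the first three entities get
  // 26 iterations each ([0,25], [26,51], [52,77]) and entity 3 gets the
  // remaining 25 ([78,102]); entity 3 is marked as executing the last
  // iteration.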

  //////////////////////////////////////////////////////////////////////////////
  // Support for Static Init

  static void for_static_init(int32_t, int32_t schedtype, int32_t *plastiter,
                              T *plower, T *pupper, ST *pstride, ST chunk,
                              bool IsSPMDExecutionMode) {
    int32_t gtid = omp_get_thread_num();
    int numberOfActiveOMPThreads = omp_get_num_threads();

    // All threads in excess of the maximum requested do not execute the loop.
    ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads,
            "current thread is not needed here; error");

    // copy
    int lastiter = 0;
    T lb = *plower;
    T ub = *pupper;
    ST stride = *pstride;

    // init
    switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) {
    case kmp_sched_static_chunk: {
      if (chunk > 0) {
        ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
                       numberOfActiveOMPThreads);
        break;
      }
      [[fallthrough]];
    } // note: if chunk <= 0, use nochunk
    case kmp_sched_static_balanced_chunk: {
      if (chunk > 0) {
        // round up to make sure the chunk is enough to cover all iterations
        T tripCount = ub - lb + 1; // +1 because ub is inclusive
        T span = (tripCount + numberOfActiveOMPThreads - 1) /
                 numberOfActiveOMPThreads;
        // perform chunk adjustment
        chunk = (span + chunk - 1) & ~(chunk - 1);

        ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
        T oldUb = ub;
        ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
                       numberOfActiveOMPThreads);
        if (ub > oldUb)
          ub = oldUb;
        break;
      }
      [[fallthrough]];
    } // note: if chunk <= 0, use nochunk
    case kmp_sched_static_nochunk: {
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid,
                       numberOfActiveOMPThreads);
      break;
    }
    case kmp_sched_distr_static_chunk: {
      if (chunk > 0) {
        ForStaticChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(),
                       omp_get_num_teams());
        break;
      }
      [[fallthrough]];
    } // note: if chunk <= 0, use nochunk
    case kmp_sched_distr_static_nochunk: {
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(),
                       omp_get_num_teams());
      break;
    }
    case kmp_sched_distr_static_chunk_sched_static_chunkone: {
      ForStaticChunk(lastiter, lb, ub, stride, chunk,
                     numberOfActiveOMPThreads * omp_get_team_num() + gtid,
                     omp_get_num_teams() * numberOfActiveOMPThreads);
      break;
    }
    default: {
      // ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype);
      ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
                     numberOfActiveOMPThreads);
      break;
    }
    }
    // copy back
    *plastiter = lastiter;
    *plower = lb;
    *pupper = ub;
    *pstride = stride;
  }

  //////////////////////////////////////////////////////////////////////////////
  // Support for dispatch Init

  static int OrderedSchedule(kmp_sched_t schedule) {
    return schedule >= kmp_sched_ordered_first &&
           schedule <= kmp_sched_ordered_last;
  }

  static void dispatch_init(IdentTy *loc, int32_t threadId,
                            kmp_sched_t schedule, T lb, T ub, ST st, ST chunk,
                            DynamicScheduleTracker *DST) {
    int tid = mapping::getThreadIdInBlock();
    T tnum = omp_get_num_threads();
    T tripCount = ub - lb + 1; // +1 because ub is inclusive
    ASSERT0(LT_FUSSY, threadId < tnum,
            "current thread is not needed here; error");

    /* Currently just ignore the monotonic and non-monotonic modifiers
     * (the compiler isn't producing them yet anyway).
     * When it is we'll want to look at them somewhere here and use that
     * information to add to our schedule choice. We shouldn't need to pass
     * them on, they merely affect which schedule we can legally choose for
     * various dynamic cases. (In particular, whether or not a stealing scheme
     * is legal).
     */
    schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);

    // Process schedule.
    if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) {
      if (OrderedSchedule(schedule))
        __kmpc_barrier(loc, threadId);
      schedule = kmp_sched_static_chunk;
      chunk = tripCount; // one thread gets the whole loop
    } else if (schedule == kmp_sched_runtime) {
      // process runtime
      omp_sched_t rtSched;
      int ChunkInt;
      omp_get_schedule(&rtSched, &ChunkInt);
      chunk = ChunkInt;
      switch (rtSched) {
      case omp_sched_static: {
        if (chunk > 0)
          schedule = kmp_sched_static_chunk;
        else
          schedule = kmp_sched_static_nochunk;
        break;
      }
      case omp_sched_auto: {
        schedule = kmp_sched_static_chunk;
        chunk = 1;
        break;
      }
      case omp_sched_dynamic:
      case omp_sched_guided: {
        schedule = kmp_sched_dynamic;
        break;
      }
      }
    } else if (schedule == kmp_sched_auto) {
      schedule = kmp_sched_static_chunk;
      chunk = 1;
    } else {
      // ASSERT(LT_FUSSY,
      //        schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
      //        "unknown schedule %d & chunk %lld\n", (int)schedule,
      //        (long long)chunk);
    }

    // init schedules
    if (schedule == kmp_sched_static_chunk) {
      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
      // save sched state
      DST->ScheduleType = schedule;
      // save ub
      DST->LoopUpperBound = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      // save computed params
      DST->Chunk = chunk;
      DST->NextLowerBound = lb;
      DST->Stride = stride;
    } else if (schedule == kmp_sched_static_balanced_chunk) {
      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
      // save sched state
      DST->ScheduleType = schedule;
      // save ub
      DST->LoopUpperBound = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      // round up to make sure the chunk is enough to cover all iterations
      T span = (tripCount + tnum - 1) / tnum;
      // perform chunk adjustment
      chunk = (span + chunk - 1) & ~(chunk - 1);

      T oldUb = ub;
      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
      if (ub > oldUb)
        ub = oldUb;
      // save computed params
      DST->Chunk = chunk;
      DST->NextLowerBound = lb;
      DST->Stride = stride;
    } else if (schedule == kmp_sched_static_nochunk) {
      ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
      // save sched state
      DST->ScheduleType = schedule;
      // save ub
      DST->LoopUpperBound = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      // save computed params
      DST->Chunk = chunk;
      DST->NextLowerBound = lb;
      DST->Stride = stride;
    } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
      // save data
      DST->ScheduleType = schedule;
      if (chunk < 1)
        chunk = 1;
      DST->Chunk = chunk;
      DST->LoopUpperBound = ub;
      DST->NextLowerBound = lb;
      __kmpc_barrier(loc, threadId);
      if (tid == 0) {
        Cnt = 0;
        fence::team(atomic::seq_cst);
      }
      __kmpc_barrier(loc, threadId);
    }
  }

  //////////////////////////////////////////////////////////////////////////////
  // Support for dispatch next

  static uint64_t NextIter() {
    __kmpc_impl_lanemask_t active = mapping::activemask();
    uint32_t leader = utils::ffs(active) - 1;
    uint32_t change = utils::popc(active);
    __kmpc_impl_lanemask_t lane_mask_lt = mapping::lanemaskLT();
    unsigned int rank = utils::popc(active & lane_mask_lt);
    uint64_t warp_res = 0;
    if (rank == 0) {
      warp_res = atomic::add(&Cnt, change, atomic::seq_cst);
    }
    warp_res = utils::shuffle(active, warp_res, leader, mapping::getWarpSize());
    return warp_res + rank;
  }
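
  // Illustrative behavior (assuming a full 32-lane warp calls NextIter while
  // Cnt == 100): the lowest active lane atomically adds 32 to Cnt, the old
  // value 100 is broadcast to all lanes via the shuffle, and lane r returns
  // 100 + r. Each lane therefore claims a distinct iteration index with a
  // single atomic operation per warp.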

  static int DynamicNextChunk(T &lb, T &ub, T chunkSize, T loopLowerBound,
                              T loopUpperBound) {
    T N = NextIter();
    lb = loopLowerBound + N * chunkSize;
    ub = lb + chunkSize - 1; // Clang uses i <= ub

    // 3 result cases:
    //  a. lb and ub < loopUpperBound --> NOT_FINISHED
    //  b. lb < loopUpperBound and ub >= loopUpperBound: last chunk -->
    //     NOT_FINISHED
    //  c. lb and ub >= loopUpperBound: empty chunk --> FINISHED
    // a.
    if (lb <= loopUpperBound && ub < loopUpperBound) {
      return NOT_FINISHED;
    }
    // b.
    if (lb <= loopUpperBound) {
      ub = loopUpperBound;
      return LAST_CHUNK;
    }
    // c. if we are here, we are in case 'c'
    lb = loopUpperBound + 2;
    ub = loopUpperBound + 1;
    return FINISHED;
  }
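
  // Worked example (illustrative numbers only): with chunkSize = 4,
  // loopLowerBound = 0 and loopUpperBound = 9, the call that draws N = 2
  // returns lb = 8, ub = 9 (clamped) and LAST_CHUNK, while a call that draws
  // N = 3 computes lb = 12 > 9 and returns FINISHED with an empty range.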

  static int dispatch_next(IdentTy *loc, int32_t gtid, int32_t *plast,
                           T *plower, T *pupper, ST *pstride,
                           DynamicScheduleTracker *DST) {
    // ID of a thread in its own warp

    // automatically selects thread or warp ID based on selected implementation
    ASSERT0(LT_FUSSY, gtid < omp_get_num_threads(),
            "current thread is not needed here; error");
    // retrieve schedule
    kmp_sched_t schedule = DST->ScheduleType;

    // xxx reduce to one
    if (schedule == kmp_sched_static_chunk ||
        schedule == kmp_sched_static_nochunk) {
      T myLb = DST->NextLowerBound;
      T ub = DST->LoopUpperBound;
      // finished?
      if (myLb > ub) {
        return DISPATCH_FINISHED;
      }
      // not finished, save current bounds
      ST chunk = DST->Chunk;
      *plower = myLb;
      T myUb = myLb + chunk - 1; // Clang uses i <= ub
      if (myUb > ub)
        myUb = ub;
      *pupper = myUb;
      *plast = (int32_t)(myUb == ub);

      // increment next lower bound by the stride
      ST stride = DST->Stride;
      DST->NextLowerBound = myLb + stride;
      return DISPATCH_NOTFINISHED;
    }
    ASSERT0(LT_FUSSY,
            schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
            "bad sched");
    T myLb, myUb;
    int finished = DynamicNextChunk(myLb, myUb, DST->Chunk, DST->NextLowerBound,
                                    DST->LoopUpperBound);

    if (finished == FINISHED)
      return DISPATCH_FINISHED;

    // not finished (either not finished or last chunk)
    *plast = (int32_t)(finished == LAST_CHUNK);
    *plower = myLb;
    *pupper = myUb;
    *pstride = 1;

    return DISPATCH_NOTFINISHED;
  }

  static void dispatch_fini() {
    // nothing
  }

  //////////////////////////////////////////////////////////////////////////////
  // end of template class that encapsulates all the helper functions
  //////////////////////////////////////////////////////////////////////////////
};

////////////////////////////////////////////////////////////////////////////////
// KMP interface implementation (dyn loops)
////////////////////////////////////////////////////////////////////////////////

// TODO: Expand the dispatch API to take a DST pointer which can then be
// allocated properly without malloc.
// For now, each team will contain an LDS pointer (ThreadDST) to a global array
// of references to the DST structs allocated (in global memory) for each thread
// in the team. The global memory array is allocated during the init phase if it
// was not allocated already and will be deallocated when the dispatch phase
// ends:
//
//  __kmpc_dispatch_init
//
//  ** Dispatch loop **
//
//  __kmpc_dispatch_deinit
//
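//
// As a rough, illustrative sketch only (the exact code Clang emits differs),
// a `schedule(dynamic)` worksharing loop is expected to drive this interface
// along the lines of:
//
//   __kmpc_dispatch_init_4(loc, tid, schedule, lb, ub, stride, chunk);
//   while (__kmpc_dispatch_next_4(loc, tid, &last, &lb, &ub, &stride))
//     for (int32_t i = lb; i <= ub; ++i)
//       body(i);
//   __kmpc_dispatch_deinit(loc, tid);
//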
[[clang::loader_uninitialized]] static Local<DynamicScheduleTracker **>
    ThreadDST;

// Create a new DST, link the current one, and define the new as current.
static DynamicScheduleTracker *pushDST() {
  int32_t ThreadIndex = mapping::getThreadIdInBlock();
  // Each block will allocate an array of pointers to DST structs. The array is
  // equal in length to the number of threads in that block.
  if (!ThreadDST) {
    // Allocate global memory array of pointers to DST structs:
    if (mapping::isMainThreadInGenericMode() || ThreadIndex == 0)
      ThreadDST = static_cast<DynamicScheduleTracker **>(
          memory::allocGlobal(mapping::getNumberOfThreadsInBlock() *
                                  sizeof(DynamicScheduleTracker *),
                              "new ThreadDST array"));
    synchronize::threads(atomic::seq_cst);

    // Initialize the array pointers:
    ThreadDST[ThreadIndex] = nullptr;
  }

  // Create a DST struct for the current thread:
  DynamicScheduleTracker *NewDST = static_cast<DynamicScheduleTracker *>(
      memory::allocGlobal(sizeof(DynamicScheduleTracker), "new DST"));
  *NewDST = DynamicScheduleTracker({0});

  // Add the new DST struct to the array of DST structs:
  NewDST->NextDST = ThreadDST[ThreadIndex];
  ThreadDST[ThreadIndex] = NewDST;
  return NewDST;
}

// Return the current DST.
static DynamicScheduleTracker *peekDST() {
  return ThreadDST[mapping::getThreadIdInBlock()];
}

// Pop the current DST and restore the last one.
static void popDST() {
  int32_t ThreadIndex = mapping::getThreadIdInBlock();
  DynamicScheduleTracker *CurrentDST = ThreadDST[ThreadIndex];
  DynamicScheduleTracker *OldDST = CurrentDST->NextDST;
  memory::freeGlobal(CurrentDST, "remove DST");
  ThreadDST[ThreadIndex] = OldDST;

  // Check if we need to deallocate the global array. Ensure all threads
  // in the block have finished deallocating the individual DSTs.
  synchronize::threads(atomic::seq_cst);
  if (!ThreadDST[ThreadIndex] && !ThreadIndex) {
    memory::freeGlobal(ThreadDST, "remove ThreadDST array");
    ThreadDST = nullptr;
  }
  synchronize::threads(atomic::seq_cst);
}

void workshare::init(bool IsSPMD) {
  if (mapping::isInitialThreadInLevel0(IsSPMD))
    ThreadDST = nullptr;
}

extern "C" {

// init
void __kmpc_dispatch_init_4(IdentTy *loc, int32_t tid, int32_t schedule,
                            int32_t lb, int32_t ub, int32_t st, int32_t chunk) {
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

void __kmpc_dispatch_init_4u(IdentTy *loc, int32_t tid, int32_t schedule,
                             uint32_t lb, uint32_t ub, int32_t st,
                             int32_t chunk) {
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

void __kmpc_dispatch_init_8(IdentTy *loc, int32_t tid, int32_t schedule,
                            int64_t lb, int64_t ub, int64_t st, int64_t chunk) {
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

void __kmpc_dispatch_init_8u(IdentTy *loc, int32_t tid, int32_t schedule,
                             uint64_t lb, uint64_t ub, int64_t st,
                             int64_t chunk) {
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

// next
int __kmpc_dispatch_next_4(IdentTy *loc, int32_t tid, int32_t *p_last,
                           int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

int __kmpc_dispatch_next_4u(IdentTy *loc, int32_t tid, int32_t *p_last,
                            uint32_t *p_lb, uint32_t *p_ub, int32_t *p_st) {
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

int __kmpc_dispatch_next_8(IdentTy *loc, int32_t tid, int32_t *p_last,
                           int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

int __kmpc_dispatch_next_8u(IdentTy *loc, int32_t tid, int32_t *p_last,
                            uint64_t *p_lb, uint64_t *p_ub, int64_t *p_st) {
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

// fini
void __kmpc_dispatch_fini_4(IdentTy *loc, int32_t tid) {
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini();
}

void __kmpc_dispatch_fini_4u(IdentTy *loc, int32_t tid) {
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini();
}

void __kmpc_dispatch_fini_8(IdentTy *loc, int32_t tid) {
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini();
}

void __kmpc_dispatch_fini_8u(IdentTy *loc, int32_t tid) {
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini();
}

// deinit
void __kmpc_dispatch_deinit(IdentTy *loc, int32_t tid) { popDST(); }

////////////////////////////////////////////////////////////////////////////////
// KMP interface implementation (static loops)
////////////////////////////////////////////////////////////////////////////////

void __kmpc_for_static_init_4(IdentTy *loc, int32_t global_tid,
                              int32_t schedtype, int32_t *plastiter,
                              int32_t *plower, int32_t *pupper,
                              int32_t *pstride, int32_t incr, int32_t chunk) {
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_init_4u(IdentTy *loc, int32_t global_tid,
                               int32_t schedtype, int32_t *plastiter,
                               uint32_t *plower, uint32_t *pupper,
                               int32_t *pstride, int32_t incr, int32_t chunk) {
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_init_8(IdentTy *loc, int32_t global_tid,
                              int32_t schedtype, int32_t *plastiter,
                              int64_t *plower, int64_t *pupper,
                              int64_t *pstride, int64_t incr, int64_t chunk) {
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_init_8u(IdentTy *loc, int32_t global_tid,
                               int32_t schedtype, int32_t *plastiter,
                               uint64_t *plower, uint64_t *pupper,
                               int64_t *pstride, int64_t incr, int64_t chunk) {
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}
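
// Rough, illustrative sketch only (the exact code Clang emits differs): a
// statically scheduled `#pragma omp for` over iterations 0..N-1 is expected
// to use these entry points along the lines of:
//
//   int32_t last, lb = 0, ub = N - 1, stride = 1;
//   __kmpc_for_static_init_4(loc, tid, /*schedtype=*/kmp_sched_static_nochunk,
//                            &last, &lb, &ub, &stride, /*incr=*/1,
//                            /*chunk=*/1);
//   for (int32_t i = lb; i <= ub; ++i)
//     body(i);
//   __kmpc_for_static_fini(loc, tid);
//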

void __kmpc_distribute_static_init_4(IdentTy *loc, int32_t global_tid,
                                     int32_t schedtype, int32_t *plastiter,
                                     int32_t *plower, int32_t *pupper,
                                     int32_t *pstride, int32_t incr,
                                     int32_t chunk) {
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_distribute_static_init_4u(IdentTy *loc, int32_t global_tid,
                                      int32_t schedtype, int32_t *plastiter,
                                      uint32_t *plower, uint32_t *pupper,
                                      int32_t *pstride, int32_t incr,
                                      int32_t chunk) {
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_distribute_static_init_8(IdentTy *loc, int32_t global_tid,
                                     int32_t schedtype, int32_t *plastiter,
                                     int64_t *plower, int64_t *pupper,
                                     int64_t *pstride, int64_t incr,
                                     int64_t chunk) {
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_distribute_static_init_8u(IdentTy *loc, int32_t global_tid,
                                      int32_t schedtype, int32_t *plastiter,
                                      uint64_t *plower, uint64_t *pupper,
                                      int64_t *pstride, int64_t incr,
                                      int64_t chunk) {
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {}

void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {}
}

namespace ompx {

/// Helper class to hide the generic loop nest and provide the template
/// argument throughout.
template <typename Ty> class StaticLoopChunker {

  /// Generic loop nest that handles block and/or thread distribution in the
  /// absence of user specified chunk sizes. This implicitly picks a block
  /// chunk size equal to the number of threads in the block and a thread
  /// chunk size equal to one. In contrast to the chunked version we can get
  /// away with a single loop in this case.
  static void NormalizedLoopNestNoChunk(void (*LoopBody)(Ty, void *), void *Arg,
                                        Ty NumBlocks, Ty BId, Ty NumThreads,
                                        Ty TId, Ty NumIters,
                                        bool OneIterationPerThread) {
    Ty KernelIteration = NumBlocks * NumThreads;

    // Start index in the normalized space.
    Ty IV = BId * NumThreads + TId;
    ASSERT(IV >= 0, "Bad index");

    // Cover the entire iteration space; assumptions in the caller might allow
    // this loop to be simplified to a conditional.
    if (IV < NumIters) {
      do {

        // Execute the loop body.
        LoopBody(IV, Arg);

        // Every thread executed one block and thread chunk now.
        IV += KernelIteration;

        if (OneIterationPerThread)
          return;

      } while (IV < NumIters);
    }
  }
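
  // Worked example (illustrative numbers only): with NumBlocks = 2,
  // NumThreads = 3 and NumIters = 10, the thread TId = 1 in block BId = 0
  // starts at IV = 1 and then advances by KernelIteration = 6, so it executes
  // iterations 1 and 7 (unless OneIterationPerThread is set, in which case it
  // stops after iteration 1).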

  /// Generic loop nest that handles block and/or thread distribution in the
  /// presence of user specified chunk sizes (for at least one of them).
  static void NormalizedLoopNestChunked(void (*LoopBody)(Ty, void *), void *Arg,
                                        Ty BlockChunk, Ty NumBlocks, Ty BId,
                                        Ty ThreadChunk, Ty NumThreads, Ty TId,
                                        Ty NumIters,
                                        bool OneIterationPerThread) {
    Ty KernelIteration = NumBlocks * BlockChunk;

    // Start index in the chunked space.
    Ty IV = BId * BlockChunk + TId;
    ASSERT(IV >= 0, "Bad index");

    // Cover the entire iteration space; assumptions in the caller might allow
    // this loop to be simplified to a conditional.
    do {

      Ty BlockChunkLeft =
          BlockChunk >= TId * ThreadChunk ? BlockChunk - TId * ThreadChunk : 0;
      Ty ThreadChunkLeft =
          ThreadChunk <= BlockChunkLeft ? ThreadChunk : BlockChunkLeft;

      while (ThreadChunkLeft--) {

        // Given the blocking it's hard to keep track of what to execute.
        if (IV >= NumIters)
          return;

        // Execute the loop body.
        LoopBody(IV, Arg);

        if (OneIterationPerThread)
          return;

        ++IV;
      }

      IV += KernelIteration;

    } while (IV < NumIters);
  }

public:
  /// Worksharing `for`-loop.
  static void For(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
                  Ty NumIters, Ty NumThreads, Ty ThreadChunk) {
    ASSERT(NumIters >= 0, "Bad iteration count");
    ASSERT(ThreadChunk >= 0, "Bad thread count");

    // All threads need to participate, but we don't know if we are in a
    // parallel at all or if the user might have used a `num_threads` clause
    // on the parallel and reduced the number compared to the block size.
    // Since nested parallels are possible too we need to get the thread id
    // from the `omp` getter and not the mapping directly.
    Ty TId = omp_get_thread_num();

    // There are no blocks involved here.
    Ty BlockChunk = 0;
    Ty NumBlocks = 1;
    Ty BId = 0;

    // If the thread chunk is not specified we pick a default now.
    if (ThreadChunk == 0)
      ThreadChunk = 1;

    // If we know we have more threads than iterations we can indicate that to
    // avoid an outer loop.
    bool OneIterationPerThread = false;
    if (config::getAssumeThreadsOversubscription()) {
      ASSERT(NumThreads >= NumIters, "Broken assumption");
      OneIterationPerThread = true;
    }

    if (ThreadChunk != 1)
      NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
                                ThreadChunk, NumThreads, TId, NumIters,
                                OneIterationPerThread);
    else
      NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
                                NumIters, OneIterationPerThread);
  }

  /// Worksharing `distribute`-loop.
  static void Distribute(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
                         Ty NumIters, Ty BlockChunk) {
    ASSERT(icv::Level == 0, "Bad distribute");
    ASSERT(icv::ActiveLevel == 0, "Bad distribute");
    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
    ASSERT(state::ParallelTeamSize == 1, "Bad distribute");

    ASSERT(NumIters >= 0, "Bad iteration count");
    ASSERT(BlockChunk >= 0, "Bad block count");

    // There are no threads involved here.
    Ty ThreadChunk = 0;
    Ty NumThreads = 1;
    Ty TId = 0;

    // All teams need to participate.
    Ty NumBlocks = mapping::getNumberOfBlocksInKernel();
    Ty BId = mapping::getBlockIdInKernel();

    // If the block chunk is not specified we pick a default now.
    if (BlockChunk == 0)
      BlockChunk = NumThreads;

    // If we know we have more blocks than iterations we can indicate that to
    // avoid an outer loop.
    bool OneIterationPerThread = false;
    if (config::getAssumeTeamsOversubscription()) {
      ASSERT(NumBlocks >= NumIters, "Broken assumption");
      OneIterationPerThread = true;
    }

    if (BlockChunk != NumThreads)
      NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
                                ThreadChunk, NumThreads, TId, NumIters,
                                OneIterationPerThread);
    else
      NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
                                NumIters, OneIterationPerThread);

    ASSERT(icv::Level == 0, "Bad distribute");
    ASSERT(icv::ActiveLevel == 0, "Bad distribute");
    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
    ASSERT(state::ParallelTeamSize == 1, "Bad distribute");
  }

  /// Worksharing `distribute parallel for`-loop.
  static void DistributeFor(IdentTy *Loc, void (*LoopBody)(Ty, void *),
                            void *Arg, Ty NumIters, Ty NumThreads,
                            Ty BlockChunk, Ty ThreadChunk) {
    ASSERT(icv::Level == 1, "Bad distribute");
    ASSERT(icv::ActiveLevel == 1, "Bad distribute");
    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");

    ASSERT(NumIters >= 0, "Bad iteration count");
    ASSERT(BlockChunk >= 0, "Bad block count");
    ASSERT(ThreadChunk >= 0, "Bad thread count");

    // All threads need to participate but the user might have used a
    // `num_threads` clause on the parallel and reduced the number compared to
    // the block size.
    Ty TId = mapping::getThreadIdInBlock();

    // All teams need to participate.
    Ty NumBlocks = mapping::getNumberOfBlocksInKernel();
    Ty BId = mapping::getBlockIdInKernel();

    // If the block chunk is not specified we pick a default now.
    if (BlockChunk == 0)
      BlockChunk = NumThreads;

    // If the thread chunk is not specified we pick a default now.
    if (ThreadChunk == 0)
      ThreadChunk = 1;

    // If we know we have more threads (across all blocks) than iterations we
    // can indicate that to avoid an outer loop.
    bool OneIterationPerThread = false;
    if (config::getAssumeTeamsOversubscription() &&
        config::getAssumeThreadsOversubscription()) {
      OneIterationPerThread = true;
      ASSERT(NumBlocks * NumThreads >= NumIters, "Broken assumption");
    }

    if (BlockChunk != NumThreads || ThreadChunk != 1)
      NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
                                ThreadChunk, NumThreads, TId, NumIters,
                                OneIterationPerThread);
    else
      NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
                                NumIters, OneIterationPerThread);

    ASSERT(icv::Level == 1, "Bad distribute");
    ASSERT(icv::ActiveLevel == 1, "Bad distribute");
    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
  }
};

} // namespace ompx

#define OMP_LOOP_ENTRY(BW, TY) \
  [[gnu::flatten, clang::always_inline]] void \
      __kmpc_distribute_for_static_loop##BW( \
          IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \
          TY num_threads, TY block_chunk, TY thread_chunk) { \
    ompx::StaticLoopChunker<TY>::DistributeFor( \
        loc, fn, arg, num_iters, num_threads, block_chunk, thread_chunk); \
  } \
  [[gnu::flatten, clang::always_inline]] void \
      __kmpc_distribute_static_loop##BW(IdentTy *loc, void (*fn)(TY, void *), \
                                        void *arg, TY num_iters, \
                                        TY block_chunk) { \
    ompx::StaticLoopChunker<TY>::Distribute(loc, fn, arg, num_iters, \
                                            block_chunk); \
  } \
  [[gnu::flatten, clang::always_inline]] void __kmpc_for_static_loop##BW( \
      IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \
      TY num_threads, TY thread_chunk) { \
    ompx::StaticLoopChunker<TY>::For(loc, fn, arg, num_iters, num_threads, \
                                     thread_chunk); \
  }
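
// Rough, illustrative sketch only (assumed shape, not the exact code Clang
// emits): the loop body is outlined into a callback taking the logical
// iteration index and a pointer to the captured variables, e.g. for a
// `for`-loop over 0..N-1:
//
//   static void body(int32_t iv, void *captures) { /* user loop body */ }
//   ...
//   __kmpc_for_static_loop_4(loc, body, &captures, /*num_iters=*/N,
//                            omp_get_num_threads(), /*thread_chunk=*/0);
//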

extern "C" {
OMP_LOOP_ENTRY(_4, int32_t)
OMP_LOOP_ENTRY(_4u, uint32_t)
OMP_LOOP_ENTRY(_8, int64_t)
OMP_LOOP_ENTRY(_8u, uint64_t)
}