//===---- Parallelism.cpp - OpenMP GPU parallel implementation ---- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Parallel implementation on the GPU. Here is the pattern:
//
//    while (not finished) {
//
//      if (master) {
//        sequential code, decide which parallel loop to do, or if finished
//        __kmpc_kernel_prepare_parallel() // exec by master only
//      }
//      syncthreads // A
//      __kmpc_kernel_parallel() // exec by all
//      if (this thread is included in the parallel) {
//        switch () for all parallel loops
//        __kmpc_kernel_end_parallel() // exec only by threads in parallel
//      }
//    }
//
// The reason we don't exec end_parallel for the threads not included in the
// parallel loop is that for each barrier in the parallel region, these
// non-included threads will cycle through the syncthreads A. Thus they must
// preserve their current threadId, which is larger than the number of threads
// in the team.
//
// To make a long story short...
//
//===----------------------------------------------------------------------===//
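// For reference, a worker-side loop consistent with the pattern above might
// look roughly like the sketch below. This is illustrative only: the actual
// state machine lives in the kernel init path, and the exact signature cast
// applied to WorkFn is an assumption here.
//
//   ParallelRegionFnTy WorkFn = nullptr;
//   do {
//     synchronize::threads(atomic::seq_cst);            // syncthreads A
//     bool IsActive = __kmpc_kernel_parallel(&WorkFn);
//     if (!WorkFn)                                      // termination signal
//       break;
//     if (IsActive) {
//       ((void (*)(uint32_t, uint32_t))WorkFn)(0, mapping::getThreadIdInBlock());
//       __kmpc_kernel_end_parallel();
//     }
//     synchronize::threads(atomic::seq_cst);
//   } while (true);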

#include "Debug.h"
#include "DeviceTypes.h"
#include "DeviceUtils.h"
#include "Interface.h"
#include "LibC.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"

using namespace ompx;

namespace {

uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
  uint32_t NThreadsICV =
      NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
  uint32_t NumThreads = mapping::getMaxTeamThreads();

  if (NThreadsICV != 0 && NThreadsICV < NumThreads)
    NumThreads = NThreadsICV;

  // SPMD mode allows any number of threads; for generic mode we round down to
  // a multiple of WARPSIZE since it is legal to do so in OpenMP.
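  // For example, assuming a 32-lane warp: a num_threads(70) clause yields 64
  // threads in generic mode, while num_threads(20) yields a single thread.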
  if (mapping::isSPMDMode())
    return NumThreads;

  if (NumThreads < mapping::getWarpSize())
    NumThreads = 1;
  else
    NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));

  return NumThreads;
}

// Invoke an outlined parallel function unwrapping arguments (up to 32).
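// A representative generated case (illustrative only; the real entries live
// in generated_microtask_cases.gen) would look like:
//   case 2:
//     ((void (*)(int32_t *, int32_t *, void *, void *))fn)(
//         &global_tid, &bound_tid, args[0], args[1]);
//     break;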
[[clang::always_inline]] void invokeMicrotask(int32_t global_tid,
                                              int32_t bound_tid, void *fn,
                                              void **args, int64_t nargs) {
  switch (nargs) {
#include "generated_microtask_cases.gen"
  default:
    printf("Too many arguments in kmp_invoke_microtask, aborting execution.\n");
    __builtin_trap();
  }
}

} // namespace

extern "C" {

[[clang::always_inline]] void __kmpc_parallel_spmd(IdentTy *ident,
                                                   int32_t num_threads,
                                                   void *fn, void **args,
                                                   const int64_t nargs) {
  uint32_t TId = mapping::getThreadIdInBlock();
  uint32_t NumThreads = determineNumberOfThreads(num_threads);
  uint32_t PTeamSize =
      NumThreads == mapping::getMaxTeamThreads() ? 0 : NumThreads;
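  // A parallel team size of 0 encodes "all threads in the team"; see the
  // `!PTeamSize || TId < PTeamSize` check before the microtask invocation.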
  // Avoid the race between the read of the `icv::Level` above and the write
  // below by synchronizing all threads here.
  synchronize::threadsAligned(atomic::seq_cst);
  {
    // Note that the order here is important. `icv::Level` has to be updated
    // last or the other updates will cause a thread specific state to be
    // created.
    state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
                                          1u, TId == 0, ident,
                                          /*ForceTeamState=*/true);
    state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0, ident,
                                     /*ForceTeamState=*/true);
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0, ident,
                               /*ForceTeamState=*/true);

    // Synchronize all threads after the main thread (TId == 0) set up the
    // team state properly.
    synchronize::threadsAligned(atomic::acq_rel);

    state::ParallelTeamSize.assert_eq(PTeamSize, ident,
                                      /*ForceTeamState=*/true);
    icv::ActiveLevel.assert_eq(1u, ident, /*ForceTeamState=*/true);
    icv::Level.assert_eq(1u, ident, /*ForceTeamState=*/true);

    // Ensure we synchronize before we run user code to avoid invalidating the
    // assumptions above.
    synchronize::threadsAligned(atomic::relaxed);

    if (!PTeamSize || TId < PTeamSize)
      invokeMicrotask(TId, 0, fn, args, nargs);

    // Synchronize all threads at the end of a parallel region.
    synchronize::threadsAligned(atomic::seq_cst);
  }

  // Synchronize all threads to make sure every thread exits the scope above;
  // otherwise the following assertions and the assumption in
  // __kmpc_target_deinit may not hold.
  synchronize::threadsAligned(atomic::acq_rel);

  state::ParallelTeamSize.assert_eq(1u, ident, /*ForceTeamState=*/true);
  icv::ActiveLevel.assert_eq(0u, ident, /*ForceTeamState=*/true);
  icv::Level.assert_eq(0u, ident, /*ForceTeamState=*/true);

  // Ensure we synchronize to create an aligned region around the assumptions.
  synchronize::threadsAligned(atomic::relaxed);
}

[[clang::always_inline]] void
__kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
                   int32_t num_threads, int proc_bind, void *fn,
                   void *wrapper_fn, void **args, int64_t nargs) {
  uint32_t TId = mapping::getThreadIdInBlock();

  // Assert the parallelism level is zero if disabled by the user.
  ASSERT((config::mayUseNestedParallelism() || icv::Level == 0),
         "nested parallelism while disabled");

  // Handle the serialized case first, same for SPMD/non-SPMD:
  //  1) if-clause(0)
  //  2) parallel in task or other thread state inducing construct
  //  3) nested parallel regions
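  // For example (illustrative): `#pragma omp parallel if(0)` hits case 1, and
  // a `parallel` nested inside another active parallel region hits case 3.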
  if (OMP_UNLIKELY(!if_expr || state::HasThreadState ||
                   (config::mayUseNestedParallelism() && icv::Level))) {
    state::DateEnvironmentRAII DERAII(ident);
    ++icv::Level;
    invokeMicrotask(TId, 0, fn, args, nargs);
    return;
  }

  // From this point forward we know that there is no thread state used.
  ASSERT(state::HasThreadState == false, nullptr);

  if (mapping::isSPMDMode()) {
    // This was moved to its own routine so it could be called directly in
    // certain situations to avoid the resource consumption of unused logic
    // in parallel_51.
    __kmpc_parallel_spmd(ident, num_threads, fn, args, nargs);

    return;
  }

  uint32_t NumThreads = determineNumberOfThreads(num_threads);
  uint32_t MaxTeamThreads = mapping::getMaxTeamThreads();
  uint32_t PTeamSize = NumThreads == MaxTeamThreads ? 0 : NumThreads;

  // We do *not* create a new data environment because all threads in the team
  // that are active are now running this parallel region. They share the
  // TeamState, which has an increased level-var and potentially an
  // active-level set, but they do not have individual ThreadStates yet. If
  // they ever modify the ICVs beyond this point a ThreadState will be
  // allocated.
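  // For example (an assumption for illustration): a call such as
  // omp_set_num_threads(N) inside the region modifies the nthreads-var ICV
  // and would therefore trigger the allocation of a ThreadState.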

  bool IsActiveParallelRegion = NumThreads > 1;
  if (!IsActiveParallelRegion) {
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident);
    invokeMicrotask(TId, 0, fn, args, nargs);
    return;
  }

  void **GlobalArgs = nullptr;
  if (nargs) {
    __kmpc_begin_sharing_variables(&GlobalArgs, nargs);
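    // Unrolled copy for the common case of up to 16 arguments; larger counts
    // fall back to the loop in the `default` case below.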
    switch (nargs) {
    default:
      for (int I = 0; I < nargs; I++)
        GlobalArgs[I] = args[I];
      break;
    case 16:
      GlobalArgs[15] = args[15];
      [[fallthrough]];
    case 15:
      GlobalArgs[14] = args[14];
      [[fallthrough]];
    case 14:
      GlobalArgs[13] = args[13];
      [[fallthrough]];
    case 13:
      GlobalArgs[12] = args[12];
      [[fallthrough]];
    case 12:
      GlobalArgs[11] = args[11];
      [[fallthrough]];
    case 11:
      GlobalArgs[10] = args[10];
      [[fallthrough]];
    case 10:
      GlobalArgs[9] = args[9];
      [[fallthrough]];
    case 9:
      GlobalArgs[8] = args[8];
      [[fallthrough]];
    case 8:
      GlobalArgs[7] = args[7];
      [[fallthrough]];
    case 7:
      GlobalArgs[6] = args[6];
      [[fallthrough]];
    case 6:
      GlobalArgs[5] = args[5];
      [[fallthrough]];
    case 5:
      GlobalArgs[4] = args[4];
      [[fallthrough]];
    case 4:
      GlobalArgs[3] = args[3];
      [[fallthrough]];
    case 3:
      GlobalArgs[2] = args[2];
      [[fallthrough]];
    case 2:
      GlobalArgs[1] = args[1];
      [[fallthrough]];
    case 1:
      GlobalArgs[0] = args[0];
      [[fallthrough]];
    case 0:
      break;
    }
  }

  {
    // Note that the order here is important. `icv::Level` has to be updated
    // last or the other updates will cause a thread specific state to be
    // created.
    state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
                                          1u, true, ident,
                                          /*ForceTeamState=*/true);
    state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
                                          (void *)nullptr, true, ident,
                                          /*ForceTeamState=*/true);
    state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true, ident,
                                     /*ForceTeamState=*/true);
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident,
                               /*ForceTeamState=*/true);

    // Master signals work to activate workers.
    synchronize::threads(atomic::seq_cst);
    // Master waits for workers to signal.
    synchronize::threads(atomic::seq_cst);
  }

  if (nargs)
    __kmpc_end_sharing_variables();
}
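// For illustration, a `#pragma omp parallel num_threads(64)` inside a target
// region reaches this entry point roughly as the call below. This is a sketch:
// `outlined_fn`, `outlined_fn_wrapper`, and `captured_args` are hypothetical
// names, and the exact proc_bind constant depends on the clause used.
//   __kmpc_parallel_51(&loc, gtid, /*if_expr=*/1, /*num_threads=*/64,
//                      /*proc_bind=*/0, outlined_fn, outlined_fn_wrapper,
//                      captured_args, /*nargs=*/1);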

[[clang::noinline]] bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
  // Work function and arguments for L1 parallel region.
  *WorkFn = state::ParallelRegionFn;

  // If this is the termination signal from the master, quit early.
  if (!*WorkFn)
    return false;

  // Set to true for workers participating in the parallel region.
  uint32_t TId = mapping::getThreadIdInBlock();
  bool ThreadIsActive = TId < state::getEffectivePTeamSize();
  return ThreadIsActive;
}

[[clang::noinline]] void __kmpc_kernel_end_parallel() {
  // In case we have modified an ICV for this thread, a ThreadState was
  // created. We drop it now to not contaminate the next parallel region.
  ASSERT(!mapping::isSPMDMode(), nullptr);
  uint32_t TId = mapping::getThreadIdInBlock();
  state::resetStateForThread(TId);
  ASSERT(!mapping::isSPMDMode(), nullptr);
}

uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) { return omp_get_level(); }

int32_t __kmpc_global_thread_num(IdentTy *) { return omp_get_thread_num(); }

void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams,
                           int32_t thread_limit) {}

void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {}
}