//===---- Parallelism.cpp - OpenMP GPU parallel implementation ---- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Parallel implementation in the GPU. Here is the pattern:
//
//    while (not finished) {
//
//    if (master) {
//      sequential code, decide which par loop to do, or if finished
//      __kmpc_kernel_prepare_parallel() // exec by master only
//    }
//    syncthreads // A
//    __kmpc_kernel_parallel() // exec by all
//    if (this thread is included in the parallel) {
//      switch () for all parallel loops
//      __kmpc_kernel_end_parallel() // exec only by threads in parallel
//    }
//
//
//    The reason we don't exec end_parallel for the threads not included
//    in the parallel loop is that for each barrier in the parallel
//    region, these non-included threads will cycle through the
//    syncthread A. Thus they must preserve their current threadId that
//    is larger than thread in team.
//
// To make a long story short...
//
//===----------------------------------------------------------------------===//
34
35#include "Debug.h"
36#include "DeviceTypes.h"
37#include "DeviceUtils.h"
38#include "Interface.h"
39#include "LibC.h"
40#include "Mapping.h"
41#include "State.h"
42#include "Synchronization.h"
43
44using namespace ompx;
45
46namespace {
47
48uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
49 uint32_t NThreadsICV =
50 NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
51 uint32_t NumThreads = mapping::getMaxTeamThreads();
52
53 if (NThreadsICV != 0 && NThreadsICV < NumThreads)
54 NumThreads = NThreadsICV;
55
56 // SPMD mode allows any number of threads, for generic mode we round down to a
57 // multiple of WARPSIZE since it is legal to do so in OpenMP.
58 if (mapping::isSPMDMode())
59 return NumThreads;
60
61 if (NumThreads < mapping::getWarpSize())
62 NumThreads = 1;
63 else
64 NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));
65
66 return NumThreads;
67}
68
69// Invoke an outlined parallel function unwrapping arguments (up to 32).
70[[clang::always_inline]] void invokeMicrotask(int32_t global_tid,
71 int32_t bound_tid, void *fn,
72 void **args, int64_t nargs) {
73 switch (nargs) {
74#include "generated_microtask_cases.gen"
75 default:
76 printf(format: "Too many arguments in kmp_invoke_microtask, aborting execution.\n");
77 __builtin_trap();
78 }
79}
80
81} // namespace
82
83extern "C" {
84
85[[clang::always_inline]] void __kmpc_parallel_spmd(IdentTy *ident,
86 int32_t num_threads,
87 void *fn, void **args,
88 const int64_t nargs) {
89 uint32_t TId = mapping::getThreadIdInBlock();
90 uint32_t NumThreads = determineNumberOfThreads(NumThreadsClause: num_threads);
91 uint32_t PTeamSize =
92 NumThreads == mapping::getMaxTeamThreads() ? 0 : NumThreads;
93 // Avoid the race between the read of the `icv::Level` above and the write
94 // below by synchronizing all threads here.
95 synchronize::threadsAligned(atomic::seq_cst);
96 {
97 // Note that the order here is important. `icv::Level` has to be updated
98 // last or the other updates will cause a thread specific state to be
99 // created.
100 state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
101 1u, TId == 0, ident,
102 /*ForceTeamState=*/true);
103 state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0, ident,
104 /*ForceTeamState=*/true);
105 state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0, ident,
106 /*ForceTeamState=*/true);
107
108 // Synchronize all threads after the main thread (TId == 0) set up the
109 // team state properly.
110 synchronize::threadsAligned(atomic::acq_rel);
111
112 state::ParallelTeamSize.assert_eq(PTeamSize, ident,
113 /*ForceTeamState=*/true);
114 icv::ActiveLevel.assert_eq(1u, ident, /*ForceTeamState=*/true);
115 icv::Level.assert_eq(1u, ident, /*ForceTeamState=*/true);
116
117 // Ensure we synchronize before we run user code to avoid invalidating the
118 // assumptions above.
119 synchronize::threadsAligned(atomic::relaxed);
120
121 if (!PTeamSize || TId < PTeamSize)
122 invokeMicrotask(global_tid: TId, bound_tid: 0, fn, args, nargs);
123
124 // Synchronize all threads at the end of a parallel region.
125 synchronize::threadsAligned(atomic::seq_cst);
126 }
127
128 // Synchronize all threads to make sure every thread exits the scope above;
129 // otherwise the following assertions and the assumption in
130 // __kmpc_target_deinit may not hold.
131 synchronize::threadsAligned(atomic::acq_rel);
132
133 state::ParallelTeamSize.assert_eq(1u, ident, /*ForceTeamState=*/true);
134 icv::ActiveLevel.assert_eq(0u, ident, /*ForceTeamState=*/true);
135 icv::Level.assert_eq(0u, ident, /*ForceTeamState=*/true);
136
137 // Ensure we synchronize to create an aligned region around the assumptions.
138 synchronize::threadsAligned(atomic::relaxed);
139
140 return;
141}
142
143[[clang::always_inline]] void
144__kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
145 int32_t num_threads, int proc_bind, void *fn,
146 void *wrapper_fn, void **args, int64_t nargs) {
147 uint32_t TId = mapping::getThreadIdInBlock();
148
149 // Assert the parallelism level is zero if disabled by the user.
150 ASSERT((config::mayUseNestedParallelism() || icv::Level == 0),
151 "nested parallelism while disabled");
152
153 // Handle the serialized case first, same for SPMD/non-SPMD:
154 // 1) if-clause(0)
155 // 2) parallel in task or other thread state inducing construct
156 // 3) nested parallel regions
157 if (OMP_UNLIKELY(!if_expr || state::HasThreadState ||
158 (config::mayUseNestedParallelism() && icv::Level))) {
159 state::DateEnvironmentRAII DERAII(ident);
160 ++icv::Level;
161 invokeMicrotask(global_tid: TId, bound_tid: 0, fn, args, nargs);
162 return;
163 }
164
165 // From this point forward we know that there is no thread state used.
166 ASSERT(state::HasThreadState == false, nullptr);
167
168 if (mapping::isSPMDMode()) {
169 // This was moved to its own routine so it could be called directly
170 // in certain situations to avoid resource consumption of unused
171 // logic in parallel_51.
172 __kmpc_parallel_spmd(ident, num_threads, fn, args, nargs);
173
174 return;
175 }
176
177 uint32_t NumThreads = determineNumberOfThreads(NumThreadsClause: num_threads);
178 uint32_t MaxTeamThreads = mapping::getMaxTeamThreads();
179 uint32_t PTeamSize = NumThreads == MaxTeamThreads ? 0 : NumThreads;
180
181 // We do *not* create a new data environment because all threads in the team
182 // that are active are now running this parallel region. They share the
183 // TeamState, which has an increase level-var and potentially active-level
184 // set, but they do not have individual ThreadStates yet. If they ever
185 // modify the ICVs beyond this point a ThreadStates will be allocated.
186
187 bool IsActiveParallelRegion = NumThreads > 1;
188 if (!IsActiveParallelRegion) {
189 state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident);
190 invokeMicrotask(global_tid: TId, bound_tid: 0, fn, args, nargs);
191 return;
192 }
193
194 void **GlobalArgs = nullptr;
195 if (nargs) {
196 __kmpc_begin_sharing_variables(&GlobalArgs, nargs);
197 switch (nargs) {
198 default:
199 for (int I = 0; I < nargs; I++)
200 GlobalArgs[I] = args[I];
201 break;
202 case 16:
203 GlobalArgs[15] = args[15];
204 [[fallthrough]];
205 case 15:
206 GlobalArgs[14] = args[14];
207 [[fallthrough]];
208 case 14:
209 GlobalArgs[13] = args[13];
210 [[fallthrough]];
211 case 13:
212 GlobalArgs[12] = args[12];
213 [[fallthrough]];
214 case 12:
215 GlobalArgs[11] = args[11];
216 [[fallthrough]];
217 case 11:
218 GlobalArgs[10] = args[10];
219 [[fallthrough]];
220 case 10:
221 GlobalArgs[9] = args[9];
222 [[fallthrough]];
223 case 9:
224 GlobalArgs[8] = args[8];
225 [[fallthrough]];
226 case 8:
227 GlobalArgs[7] = args[7];
228 [[fallthrough]];
229 case 7:
230 GlobalArgs[6] = args[6];
231 [[fallthrough]];
232 case 6:
233 GlobalArgs[5] = args[5];
234 [[fallthrough]];
235 case 5:
236 GlobalArgs[4] = args[4];
237 [[fallthrough]];
238 case 4:
239 GlobalArgs[3] = args[3];
240 [[fallthrough]];
241 case 3:
242 GlobalArgs[2] = args[2];
243 [[fallthrough]];
244 case 2:
245 GlobalArgs[1] = args[1];
246 [[fallthrough]];
247 case 1:
248 GlobalArgs[0] = args[0];
249 [[fallthrough]];
250 case 0:
251 break;
252 }
253 }
254
255 {
256 // Note that the order here is important. `icv::Level` has to be updated
257 // last or the other updates will cause a thread specific state to be
258 // created.
259 state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
260 1u, true, ident,
261 /*ForceTeamState=*/true);
262 state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
263 (void *)nullptr, true, ident,
264 /*ForceTeamState=*/true);
265 state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true, ident,
266 /*ForceTeamState=*/true);
267 state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident,
268 /*ForceTeamState=*/true);
269
270 // Master signals work to activate workers.
271 synchronize::threads(atomic::seq_cst);
272 // Master waits for workers to signal.
273 synchronize::threads(atomic::seq_cst);
274 }
275
276 if (nargs)
277 __kmpc_end_sharing_variables();
278}
279
280[[clang::noinline]] bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
281 // Work function and arguments for L1 parallel region.
282 *WorkFn = state::ParallelRegionFn;
283
284 // If this is the termination signal from the master, quit early.
285 if (!*WorkFn)
286 return false;
287
288 // Set to true for workers participating in the parallel region.
289 uint32_t TId = mapping::getThreadIdInBlock();
290 bool ThreadIsActive = TId < state::getEffectivePTeamSize();
291 return ThreadIsActive;
292}
293
294[[clang::noinline]] void __kmpc_kernel_end_parallel() {
295 // In case we have modified an ICV for this thread before a ThreadState was
296 // created. We drop it now to not contaminate the next parallel region.
297 ASSERT(!mapping::isSPMDMode(), nullptr);
298 uint32_t TId = mapping::getThreadIdInBlock();
299 state::resetStateForThread(TId);
300 ASSERT(!mapping::isSPMDMode(), nullptr);
301}
302
303uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) { return omp_get_level(); }
304
305int32_t __kmpc_global_thread_num(IdentTy *) { return omp_get_thread_num(); }
306
307void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams,
308 int32_t thread_limit) {}
309
310void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {}
311}
312
