//===---- Parallelism.cpp - OpenMP GPU parallel implementation ---- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Parallel implementation on the GPU. Here is the pattern:
//
//  while (not finished) {
//
//    if (master) {
//      sequential code, decide which par loop to do, or if finished
//      __kmpc_kernel_prepare_parallel() // exec by master only
//    }
//    syncthreads // A
//    __kmpc_kernel_parallel() // exec by all
//    if (this thread is included in the parallel) {
//      switch () for all parallel loops
//      __kmpc_kernel_end_parallel() // exec only by threads in parallel
//    }
//  }
//
// The reason we don't exec end_parallel for the threads not included
// in the parallel loop is that for each barrier in the parallel
// region, these non-included threads will cycle through the
// syncthreads at A. Thus they must preserve their current threadId,
// which is larger than the number of threads in the team.
//
// To make a long story short...
//
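// For illustration, a minimal generic-mode target region that exercises this
// pattern might look as follows (a sketch, not part of this file):
//
//   #pragma omp target teams
//   {
//     /* sequential code executed by the team master */
//     #pragma omp parallel // workers activated/retired via the scheme above
//     { /* parallel region body */ }
//   }
//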
//===----------------------------------------------------------------------===//

#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"

using namespace ompx;

#pragma omp begin declare target device_type(nohost)

namespace {

uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
  uint32_t NThreadsICV =
      NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
  uint32_t NumThreads = mapping::getMaxTeamThreads();

  if (NThreadsICV != 0 && NThreadsICV < NumThreads)
    NumThreads = NThreadsICV;

  // SPMD mode allows any number of threads; in generic mode we round down to a
  // multiple of the warp size since it is legal to do so in OpenMP.
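  // Worked example (generic mode, warp size 32): a request for 50 threads is
  // rounded down to 32 (50 & ~31); a request for 17 yields 1, since it is
  // smaller than the warp size.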
  if (mapping::isSPMDMode())
    return NumThreads;

  if (NumThreads < mapping::getWarpSize())
    NumThreads = 1;
  else
    NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));

  return NumThreads;
}

// Invoke an outlined parallel function unwrapping arguments (up to 32).
[[clang::always_inline]] void invokeMicrotask(int32_t global_tid,
                                              int32_t bound_tid, void *fn,
                                              void **args, int64_t nargs) {
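  // Each generated case below casts `fn` to the matching signature and
  // forwards the arguments, roughly like this sketch for nargs == 2:
  //   case 2:
  //     ((void (*)(int32_t *, int32_t *, void *, void *))fn)(
  //         &global_tid, &bound_tid, args[0], args[1]);
  //     break;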
  switch (nargs) {
#include "generated_microtask_cases.gen"
  default:
    PRINT("Too many arguments in kmp_invoke_microtask, aborting execution.\n");
    __builtin_trap();
  }
}

} // namespace

extern "C" {

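// A call site for this entry point (emitted when the SPMD path is known
// statically) might look like the following sketch; the argument names are
// illustrative:
//   void *Args[] = {&A, &B};
//   __kmpc_parallel_spmd(Ident, /*num_threads=*/-1, OutlinedFn, Args,
//                        /*nargs=*/2);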
[[clang::always_inline]] void __kmpc_parallel_spmd(IdentTy *ident,
                                                   int32_t num_threads,
                                                   void *fn, void **args,
                                                   const int64_t nargs) {
  uint32_t TId = mapping::getThreadIdInBlock();
  uint32_t NumThreads = determineNumberOfThreads(num_threads);
  uint32_t PTeamSize =
      NumThreads == mapping::getMaxTeamThreads() ? 0 : NumThreads;
  // Avoid the race between the read of the `icv::Level` above and the write
  // below by synchronizing all threads here.
  synchronize::threadsAligned(atomic::seq_cst);
  {
    // Note that the order here is important. `icv::Level` has to be updated
    // last or the other updates will cause a thread specific state to be
    // created.
    state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
                                          1u, TId == 0, ident,
                                          /*ForceTeamState=*/true);
    state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0, ident,
                                     /*ForceTeamState=*/true);
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0, ident,
                               /*ForceTeamState=*/true);

    // Synchronize all threads after the main thread (TId == 0) has set up the
    // team state properly.
    synchronize::threadsAligned(atomic::acq_rel);

    state::ParallelTeamSize.assert_eq(PTeamSize, ident,
                                      /*ForceTeamState=*/true);
    icv::ActiveLevel.assert_eq(1u, ident, /*ForceTeamState=*/true);
    icv::Level.assert_eq(1u, ident, /*ForceTeamState=*/true);

    // Ensure we synchronize before we run user code to avoid invalidating the
    // assumptions above.
    synchronize::threadsAligned(atomic::relaxed);

    if (!PTeamSize || TId < PTeamSize)
      invokeMicrotask(TId, 0, fn, args, nargs);

    // Synchronize all threads at the end of a parallel region.
    synchronize::threadsAligned(atomic::seq_cst);
  }

  // Synchronize all threads to make sure every thread exits the scope above;
  // otherwise the following assertions and the assumption in
  // __kmpc_target_deinit may not hold.
  synchronize::threadsAligned(atomic::acq_rel);

  state::ParallelTeamSize.assert_eq(1u, ident, /*ForceTeamState=*/true);
  icv::ActiveLevel.assert_eq(0u, ident, /*ForceTeamState=*/true);
  icv::Level.assert_eq(0u, ident, /*ForceTeamState=*/true);

  // Ensure we synchronize to create an aligned region around the assumptions.
  synchronize::threadsAligned(atomic::relaxed);
}

[[clang::always_inline]] void
__kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
                   int32_t num_threads, int proc_bind, void *fn,
                   void *wrapper_fn, void **args, int64_t nargs) {
  uint32_t TId = mapping::getThreadIdInBlock();

  // Assert the parallelism level is zero if disabled by the user.
  ASSERT((config::mayUseNestedParallelism() || icv::Level == 0),
         "nested parallelism while disabled");

  // Handle the serialized case first, same for SPMD/non-SPMD:
  // 1) if-clause(0)
  // 2) parallel in task or other thread state inducing construct
  // 3) nested parallel regions
  if (OMP_UNLIKELY(!if_expr || state::HasThreadState ||
                   (config::mayUseNestedParallelism() && icv::Level))) {
    state::DateEnvironmentRAII DERAII(ident);
    ++icv::Level;
    invokeMicrotask(TId, 0, fn, args, nargs);
    return;
  }
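  // For example (sketch), the inner region below takes the serialized branch
  // above and executes on the encountering thread alone:
  //   #pragma omp parallel num_threads(4)
  //   {
  //     #pragma omp parallel // nested: serialized
  //     { ... }
  //   }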

  // From this point forward we know that there is no thread state used.
  ASSERT(state::HasThreadState == false, nullptr);

  uint32_t NumThreads = determineNumberOfThreads(num_threads);
  uint32_t MaxTeamThreads = mapping::getMaxTeamThreads();
  uint32_t PTeamSize = NumThreads == MaxTeamThreads ? 0 : NumThreads;
  if (mapping::isSPMDMode()) {
    // This code lives in its own routine so that it can be called directly in
    // certain situations, avoiding the overhead of the unused generic-mode
    // logic in __kmpc_parallel_51.
    __kmpc_parallel_spmd(ident, num_threads, fn, args, nargs);

    return;
  }

  // We do *not* create a new data environment because all threads in the team
  // that are active are now running this parallel region. They share the
  // TeamState, which has an increased level-var and potentially an
  // active-level set, but they do not have individual ThreadStates yet. If
  // they ever modify the ICVs beyond this point, a ThreadState will be
  // allocated for them.

  bool IsActiveParallelRegion = NumThreads > 1;
  if (!IsActiveParallelRegion) {
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident);
    invokeMicrotask(TId, 0, fn, args, nargs);
    return;
  }
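  // For instance (sketch), `#pragma omp parallel num_threads(1)` reaches the
  // inactive branch above: the region runs on the master thread alone and the
  // workers are never signaled.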

  void **GlobalArgs = nullptr;
  if (nargs) {
    __kmpc_begin_sharing_variables(&GlobalArgs, nargs);
    switch (nargs) {
    default:
      for (int I = 0; I < nargs; I++)
        GlobalArgs[I] = args[I];
      break;
    case 16:
      GlobalArgs[15] = args[15];
      [[fallthrough]];
    case 15:
      GlobalArgs[14] = args[14];
      [[fallthrough]];
    case 14:
      GlobalArgs[13] = args[13];
      [[fallthrough]];
    case 13:
      GlobalArgs[12] = args[12];
      [[fallthrough]];
    case 12:
      GlobalArgs[11] = args[11];
      [[fallthrough]];
    case 11:
      GlobalArgs[10] = args[10];
      [[fallthrough]];
    case 10:
      GlobalArgs[9] = args[9];
      [[fallthrough]];
    case 9:
      GlobalArgs[8] = args[8];
      [[fallthrough]];
    case 8:
      GlobalArgs[7] = args[7];
      [[fallthrough]];
    case 7:
      GlobalArgs[6] = args[6];
      [[fallthrough]];
    case 6:
      GlobalArgs[5] = args[5];
      [[fallthrough]];
    case 5:
      GlobalArgs[4] = args[4];
      [[fallthrough]];
    case 4:
      GlobalArgs[3] = args[3];
      [[fallthrough]];
    case 3:
      GlobalArgs[2] = args[2];
      [[fallthrough]];
    case 2:
      GlobalArgs[1] = args[1];
      [[fallthrough]];
    case 1:
      GlobalArgs[0] = args[0];
      [[fallthrough]];
    case 0:
      break;
    }
  }

  {
    // Note that the order here is important. `icv::Level` has to be updated
    // last or the other updates will cause a thread specific state to be
    // created.
    state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
                                          1u, true, ident,
                                          /*ForceTeamState=*/true);
    state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
                                          (void *)nullptr, true, ident,
                                          /*ForceTeamState=*/true);
    state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true, ident,
                                     /*ForceTeamState=*/true);
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident,
                               /*ForceTeamState=*/true);

    // Master signals work to activate workers.
    synchronize::threads(atomic::seq_cst);
    // Master waits for workers to signal.
    synchronize::threads(atomic::seq_cst);
  }
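  // The two barriers above pair with the generic-mode worker state machine
  // (a sketch of the loop in Kernel.cpp; details may differ):
  //   do {
  //     ParallelRegionFnTy WorkFn = nullptr;
  //     synchronize::threads(atomic::seq_cst); // wait for the master's signal
  //     bool IsActive = __kmpc_kernel_parallel(&WorkFn);
  //     if (!WorkFn)
  //       return; // termination signal
  //     if (IsActive) {
  //       ((void (*)(uint32_t, uint32_t))WorkFn)(0, TId);
  //       __kmpc_kernel_end_parallel();
  //     }
  //     synchronize::threads(atomic::seq_cst); // signal completion
  //   } while (true);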

  if (nargs)
    __kmpc_end_sharing_variables();
}

[[clang::noinline]] bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
  // Work function and arguments for L1 parallel region.
  *WorkFn = state::ParallelRegionFn;

  // If this is the termination signal from the master, quit early.
  if (!*WorkFn)
    return false;

  // Set to true for workers participating in the parallel region.
  uint32_t TId = mapping::getThreadIdInBlock();
  bool ThreadIsActive = TId < state::getEffectivePTeamSize();
  return ThreadIsActive;
}

[[clang::noinline]] void __kmpc_kernel_end_parallel() {
  // If this thread modified an ICV inside the region, a ThreadState was
  // created for it. We drop it now so it does not contaminate the next
  // parallel region.
  ASSERT(!mapping::isSPMDMode(), nullptr);
  uint32_t TId = mapping::getThreadIdInBlock();
  state::resetStateForThread(TId);
  ASSERT(!mapping::isSPMDMode(), nullptr);
}

uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) { return omp_get_level(); }

int32_t __kmpc_global_thread_num(IdentTy *) { return omp_get_thread_num(); }

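// The following entry points are intentionally no-ops in this device runtime;
// the corresponding clauses are handled before the kernel is launched or have
// no effect on the device.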
void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams,
                           int32_t thread_limit) {}

void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {}
}

#pragma omp end declare target