//===---- Parallelism.cpp - OpenMP GPU parallel implementation ---- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Parallel implementation on the GPU. Here is the pattern:
//
//    while (not finished) {
//
//      if (master) {
//        sequential code, decide which parallel loop to do, or if finished
//        __kmpc_kernel_prepare_parallel() // exec by master only
//      }
//      syncthreads // A
//      __kmpc_kernel_parallel() // exec by all
//      if (this thread is included in the parallel) {
//        switch () for all parallel loops
//        __kmpc_kernel_end_parallel() // exec only by threads in parallel
//      }
//    }
//
// The reason we don't exec end_parallel for the threads not included in the
// parallel loop is that for each barrier in the parallel region, these
// non-included threads will cycle through the syncthreads A. Thus they must
// preserve their current threadId, which is larger than the number of threads
// in the team.
//
// To make a long story short...
//
//===----------------------------------------------------------------------===//
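// For reference, a worker-side loop consistent with the pattern above might
// look roughly like the sketch below. This is illustrative only: the actual
// state machine lives in the kernel init path, and the exact signature cast
// applied to WorkFn is an assumption here.
//
//   ParallelRegionFnTy WorkFn = nullptr;
//   do {
//     synchronize::threads(atomic::seq_cst);            // syncthreads A
//     bool IsActive = __kmpc_kernel_parallel(&WorkFn);
//     if (!WorkFn)                                      // termination signal
//       break;
//     if (IsActive) {
//       ((void (*)(uint32_t, uint32_t))WorkFn)(0, mapping::getThreadIdInBlock());
//       __kmpc_kernel_end_parallel();
//     }
//     synchronize::threads(atomic::seq_cst);
//   } while (true);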

#include "Debug.h"
#include "DeviceTypes.h"
#include "DeviceUtils.h"
#include "Interface.h"
#include "LibC.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"

using namespace ompx;

namespace {

uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
  uint32_t NThreadsICV =
      NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
  uint32_t NumThreads = mapping::getMaxTeamThreads();

  if (NThreadsICV != 0 && NThreadsICV < NumThreads)
    NumThreads = NThreadsICV;

  // SPMD mode allows any number of threads; for generic mode we round down to
  // a multiple of WARPSIZE since it is legal to do so in OpenMP.
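  // For example, assuming a 32-lane warp: a num_threads(70) clause yields 64
  // threads in generic mode, while num_threads(20) yields a single thread.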
  if (mapping::isSPMDMode())
    return NumThreads;

  if (NumThreads < mapping::getWarpSize())
    NumThreads = 1;
  else
    NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));

  return NumThreads;
}

// Invoke an outlined parallel function unwrapping arguments (up to 32).
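// A representative generated case (illustrative only; the real entries live
// in generated_microtask_cases.gen) would look like:
//   case 2:
//     ((void (*)(int32_t *, int32_t *, void *, void *))fn)(
//         &global_tid, &bound_tid, args[0], args[1]);
//     break;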
[[clang::always_inline]] void invokeMicrotask(int32_t global_tid,
                                              int32_t bound_tid, void *fn,
                                              void **args, int64_t nargs) {
  switch (nargs) {
#include "generated_microtask_cases.gen"
  default:
    printf("Too many arguments in kmp_invoke_microtask, aborting execution.\n");
    __builtin_trap();
  }
}

} // namespace

extern "C" {

[[clang::always_inline]] void __kmpc_parallel_spmd(IdentTy *ident,
                                                   int32_t num_threads,
                                                   void *fn, void **args,
                                                   const int64_t nargs) {
  uint32_t TId = mapping::getThreadIdInBlock();
  uint32_t NumThreads = determineNumberOfThreads(num_threads);
  uint32_t PTeamSize =
      NumThreads == mapping::getMaxTeamThreads() ? 0 : NumThreads;
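  // A parallel team size of 0 encodes "all threads in the team"; see the
  // `!PTeamSize || TId < PTeamSize` check before the microtask invocation.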
  // Avoid the race between the read of the `icv::Level` above and the write
  // below by synchronizing all threads here.
  synchronize::threadsAligned(atomic::seq_cst);
  {
    // Note that the order here is important. `icv::Level` has to be updated
    // last or the other updates will cause a thread specific state to be
    // created.
    state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
                                          1u, TId == 0, ident,
                                          /*ForceTeamState=*/true);
    state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0, ident,
                                     /*ForceTeamState=*/true);
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0, ident,
                               /*ForceTeamState=*/true);

    // Synchronize all threads after the main thread (TId == 0) set up the
    // team state properly.
    synchronize::threadsAligned(atomic::acq_rel);

    state::ParallelTeamSize.assert_eq(PTeamSize, ident,
                                      /*ForceTeamState=*/true);
    icv::ActiveLevel.assert_eq(1u, ident, /*ForceTeamState=*/true);
    icv::Level.assert_eq(1u, ident, /*ForceTeamState=*/true);

    // Ensure we synchronize before we run user code to avoid invalidating the
    // assumptions above.
    synchronize::threadsAligned(atomic::relaxed);

    if (!PTeamSize || TId < PTeamSize)
      invokeMicrotask(TId, 0, fn, args, nargs);

    // Synchronize all threads at the end of a parallel region.
    synchronize::threadsAligned(atomic::seq_cst);
  }

  // Synchronize all threads to make sure every thread exits the scope above;
  // otherwise the following assertions and the assumption in
  // __kmpc_target_deinit may not hold.
  synchronize::threadsAligned(atomic::acq_rel);

  state::ParallelTeamSize.assert_eq(1u, ident, /*ForceTeamState=*/true);
  icv::ActiveLevel.assert_eq(0u, ident, /*ForceTeamState=*/true);
  icv::Level.assert_eq(0u, ident, /*ForceTeamState=*/true);

  // Ensure we synchronize to create an aligned region around the assumptions.
  synchronize::threadsAligned(atomic::relaxed);
}

[[clang::always_inline]] void
__kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
                   int32_t num_threads, int proc_bind, void *fn,
                   void *wrapper_fn, void **args, int64_t nargs) {
  uint32_t TId = mapping::getThreadIdInBlock();

  // Assert the parallelism level is zero if disabled by the user.
  ASSERT((config::mayUseNestedParallelism() || icv::Level == 0),
         "nested parallelism while disabled");

  // Handle the serialized case first, same for SPMD/non-SPMD:
  //  1) if-clause(0)
  //  2) parallel in task or other thread state inducing construct
  //  3) nested parallel regions
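  // For example (illustrative): `#pragma omp parallel if(0)` hits case 1, and
  // a `parallel` nested inside another active parallel region hits case 3.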
  if (OMP_UNLIKELY(!if_expr || state::HasThreadState ||
                   (config::mayUseNestedParallelism() && icv::Level))) {
    state::DateEnvironmentRAII DERAII(ident);
    ++icv::Level;
    invokeMicrotask(TId, 0, fn, args, nargs);
    return;
  }

  // From this point forward we know that there is no thread state used.
  ASSERT(state::HasThreadState == false, nullptr);

  if (mapping::isSPMDMode()) {
    // This was moved to its own routine so it could be called directly in
    // certain situations to avoid the resource consumption of unused logic
    // in parallel_51.
    __kmpc_parallel_spmd(ident, num_threads, fn, args, nargs);

    return;
  }

  uint32_t NumThreads = determineNumberOfThreads(num_threads);
  uint32_t MaxTeamThreads = mapping::getMaxTeamThreads();
  uint32_t PTeamSize = NumThreads == MaxTeamThreads ? 0 : NumThreads;

  // We do *not* create a new data environment because all threads in the team
  // that are active are now running this parallel region. They share the
  // TeamState, which has an increased level-var and potentially an
  // active-level set, but they do not have individual ThreadStates yet. If
  // they ever modify the ICVs beyond this point a ThreadState will be
  // allocated.
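  // For example (an assumption for illustration): a call such as
  // omp_set_num_threads(N) inside the region modifies the nthreads-var ICV
  // and would therefore trigger the allocation of a ThreadState.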

  bool IsActiveParallelRegion = NumThreads > 1;
  if (!IsActiveParallelRegion) {
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident);
    invokeMicrotask(TId, 0, fn, args, nargs);
    return;
  }

  void **GlobalArgs = nullptr;
  if (nargs) {
    __kmpc_begin_sharing_variables(&GlobalArgs, nargs);
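    // Unrolled copy for the common case of up to 16 arguments; larger counts
    // fall back to the loop in the `default` case below.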
    switch (nargs) {
    default:
      for (int I = 0; I < nargs; I++)
        GlobalArgs[I] = args[I];
      break;
    case 16:
      GlobalArgs[15] = args[15];
      [[fallthrough]];
    case 15:
      GlobalArgs[14] = args[14];
      [[fallthrough]];
    case 14:
      GlobalArgs[13] = args[13];
      [[fallthrough]];
    case 13:
      GlobalArgs[12] = args[12];
      [[fallthrough]];
    case 12:
      GlobalArgs[11] = args[11];
      [[fallthrough]];
    case 11:
      GlobalArgs[10] = args[10];
      [[fallthrough]];
    case 10:
      GlobalArgs[9] = args[9];
      [[fallthrough]];
    case 9:
      GlobalArgs[8] = args[8];
      [[fallthrough]];
    case 8:
      GlobalArgs[7] = args[7];
      [[fallthrough]];
    case 7:
      GlobalArgs[6] = args[6];
      [[fallthrough]];
    case 6:
      GlobalArgs[5] = args[5];
      [[fallthrough]];
    case 5:
      GlobalArgs[4] = args[4];
      [[fallthrough]];
    case 4:
      GlobalArgs[3] = args[3];
      [[fallthrough]];
    case 3:
      GlobalArgs[2] = args[2];
      [[fallthrough]];
    case 2:
      GlobalArgs[1] = args[1];
      [[fallthrough]];
    case 1:
      GlobalArgs[0] = args[0];
      [[fallthrough]];
    case 0:
      break;
    }
  }

  {
    // Note that the order here is important. `icv::Level` has to be updated
    // last or the other updates will cause a thread specific state to be
    // created.
    state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
                                          1u, true, ident,
                                          /*ForceTeamState=*/true);
    state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
                                          (void *)nullptr, true, ident,
                                          /*ForceTeamState=*/true);
    state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true, ident,
                                     /*ForceTeamState=*/true);
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident,
                               /*ForceTeamState=*/true);

    // Master signals work to activate workers.
    synchronize::threads(atomic::seq_cst);
    // Master waits for workers to signal.
    synchronize::threads(atomic::seq_cst);
  }

  if (nargs)
    __kmpc_end_sharing_variables();
}
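// For illustration, a `#pragma omp parallel num_threads(64)` inside a target
// region reaches this entry point roughly as the call below. This is a sketch:
// `outlined_fn`, `outlined_fn_wrapper`, and `captured_args` are hypothetical
// names, and the exact proc_bind constant depends on the clause used.
//   __kmpc_parallel_51(&loc, gtid, /*if_expr=*/1, /*num_threads=*/64,
//                      /*proc_bind=*/0, outlined_fn, outlined_fn_wrapper,
//                      captured_args, /*nargs=*/1);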

[[clang::noinline]] bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
  // Work function and arguments for L1 parallel region.
  *WorkFn = state::ParallelRegionFn;

  // If this is the termination signal from the master, quit early.
  if (!*WorkFn)
    return false;

  // Set to true for workers participating in the parallel region.
  uint32_t TId = mapping::getThreadIdInBlock();
  bool ThreadIsActive = TId < state::getEffectivePTeamSize();
  return ThreadIsActive;
}

[[clang::noinline]] void __kmpc_kernel_end_parallel() {
  // In case we have modified an ICV for this thread, a ThreadState was
  // created. We drop it now to not contaminate the next parallel region.
  ASSERT(!mapping::isSPMDMode(), nullptr);
  uint32_t TId = mapping::getThreadIdInBlock();
  state::resetStateForThread(TId);
  ASSERT(!mapping::isSPMDMode(), nullptr);
}

uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) { return omp_get_level(); }

int32_t __kmpc_global_thread_num(IdentTy *) { return omp_get_thread_num(); }

void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams,
                           int32_t thread_limit) {}

void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {}
}