//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//

#include "Shared/Environment.h"

#include "Allocator.h"
#include "Configuration.h"
#include "Debug.h"
#include "DeviceTypes.h"
#include "DeviceUtils.h"
#include "Interface.h"
#include "LibC.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"

using namespace ompx;

/// Memory implementation
///
///{

/// External symbol to access dynamic shared memory.
[[gnu::aligned(
    allocator::ALIGNMENT)]] extern Local<unsigned char> DynamicSharedBuffer[];

/// The kernel environment passed to the init method by the compiler.
[[clang::loader_uninitialized]] static Local<KernelEnvironmentTy *>
    KernelEnvironmentPtr;

/// The kernel launch environment passed as argument to the kernel by the
/// runtime.
[[clang::loader_uninitialized]] static Local<KernelLaunchEnvironmentTy *>
    KernelLaunchEnvironmentPtr;

///}

namespace {

/// Fallback implementations are missing to trigger a link time error.
/// Implementations for new devices, including the host, should go into a
/// dedicated begin/end declare variant.
///
///{
extern "C" {
#ifdef __AMDGPU__

[[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); }
[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }

#else

[[gnu::weak, gnu::leaf]] void *malloc(size_t Size);
[[gnu::weak, gnu::leaf]] void free(void *Ptr);

#endif
}
///}

67 | /// A "smart" stack in shared memory. |
68 | /// |
69 | /// The stack exposes a malloc/free interface but works like a stack internally. |
70 | /// In fact, it is a separate stack *per warp*. That means, each warp must push |
71 | /// and pop symmetrically or this breaks, badly. The implementation will (aim |
72 | /// to) detect non-lock-step warps and fallback to malloc/free. The same will |
73 | /// happen if a warp runs out of memory. The master warp in generic memory is |
74 | /// special and is given more memory than the rest. |
75 | /// |
struct SharedMemorySmartStackTy {
  /// Initialize the stack. Must be called by all threads.
  void init(bool IsSPMD);

  /// Allocate \p Bytes on the stack for the encountering thread. Each thread
  /// can call this function.
  void *push(uint64_t Bytes);

  /// Deallocate the last allocation made by the encountering thread and pointed
  /// to by \p Ptr from the stack. Each thread can call this function.
  void pop(void *Ptr, uint64_t Bytes);

private:
  /// Compute the size of the storage space reserved for a thread.
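  /// Illustrative arithmetic (numbers are hypothetical, not the configured
  /// defaults): a 16 KiB scratchpad shared by a block of 256 threads yields
  /// 16384 / 256 = 64 bytes per thread, rounded down to a multiple of
  /// allocator::ALIGNMENT.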
  uint32_t computeThreadStorageTotal() {
    uint32_t NumLanesInBlock = mapping::getNumberOfThreadsInBlock();
    return __builtin_align_down(state::SharedScratchpadSize / NumLanesInBlock,
                                allocator::ALIGNMENT);
  }

  /// Return the top address of the thread's data stack, that is the first
  /// address this thread will allocate memory at next.
  void *getThreadDataTop(uint32_t TId) {
    return &Data[computeThreadStorageTotal() * TId + Usage[TId]];
  }

  /// The actual storage, shared among all warps.
  [[gnu::aligned(
      allocator::ALIGNMENT)]] unsigned char Data[state::SharedScratchpadSize];
  [[gnu::aligned(
      allocator::ALIGNMENT)]] unsigned char Usage[mapping::MaxThreadsPerTeam];
};

static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
              "Shared scratchpad of this size not supported yet.");

/// The allocation of a single shared memory scratchpad.
[[clang::loader_uninitialized]] static Local<SharedMemorySmartStackTy>
    SharedMemorySmartStack;

void SharedMemorySmartStackTy::init(bool IsSPMD) {
  Usage[mapping::getThreadIdInBlock()] = 0;
}

void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
  // First align the number of requested bytes.
  /// FIXME: The stack shouldn't require worst-case padding. Alignment needs to
  /// be passed in as an argument and the stack rewritten to support it.
  uint64_t AlignedBytes = __builtin_align_up(Bytes, allocator::ALIGNMENT);

  uint32_t StorageTotal = computeThreadStorageTotal();

  // The main thread in generic mode gets the space of its entire warp as the
  // other threads do not participate in any computation at all.
  if (mapping::isMainThreadInGenericMode())
    StorageTotal *= mapping::getWarpSize();

  int TId = mapping::getThreadIdInBlock();
  if (Usage[TId] + AlignedBytes <= StorageTotal) {
    void *Ptr = getThreadDataTop(TId);
    Usage[TId] += AlignedBytes;
    return Ptr;
  }

  if (config::isDebugMode(DeviceDebugKind::CommonIssues))
    printf("Shared memory stack full, fallback to dynamic allocation of global "
           "memory will negatively impact performance.\n");
  void *GlobalMemory = memory::allocGlobal(
      AlignedBytes, "Slow path shared memory allocation, insufficient "
                    "shared memory stack memory!");
  ASSERT(GlobalMemory != nullptr, "nullptr returned by malloc!");

  return GlobalMemory;
}

void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) {
  uint64_t AlignedBytes = __builtin_align_up(Bytes, allocator::ALIGNMENT);
  if (utils::isSharedMemPtr(Ptr)) {
    int TId = mapping::getThreadIdInBlock();
    Usage[TId] -= AlignedBytes;
    return;
  }
  memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
}

} // namespace

void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }

void *memory::allocShared(uint64_t Bytes, const char *Reason) {
  return SharedMemorySmartStack.push(Bytes);
}

void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) {
  SharedMemorySmartStack.pop(Ptr, Bytes);
}

void *memory::allocGlobal(uint64_t Bytes, const char *Reason) {
  void *Ptr = malloc(Bytes);
  if (config::isDebugMode(DeviceDebugKind::CommonIssues) && Ptr == nullptr)
    printf("nullptr returned by malloc!\n");
  return Ptr;
}

void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); }

///}

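// Note: the comparisons below intentionally use the non-short-circuiting '&'
// so every member is evaluated unconditionally; presumably this keeps the
// comparison branch-free, which is cheaper on the GPU than divergent control
// flow.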
bool state::ICVStateTy::operator==(const ICVStateTy &Other) const {
  return (NThreadsVar == Other.NThreadsVar) & (LevelVar == Other.LevelVar) &
         (ActiveLevelVar == Other.ActiveLevelVar) &
         (MaxActiveLevelsVar == Other.MaxActiveLevelsVar) &
         (RunSchedVar == Other.RunSchedVar) &
         (RunSchedChunkVar == Other.RunSchedChunkVar);
}

void state::ICVStateTy::assertEqual(const ICVStateTy &Other) const {
  ASSERT(NThreadsVar == Other.NThreadsVar, nullptr);
  ASSERT(LevelVar == Other.LevelVar, nullptr);
  ASSERT(ActiveLevelVar == Other.ActiveLevelVar, nullptr);
  ASSERT(MaxActiveLevelsVar == Other.MaxActiveLevelsVar, nullptr);
  ASSERT(RunSchedVar == Other.RunSchedVar, nullptr);
  ASSERT(RunSchedChunkVar == Other.RunSchedChunkVar, nullptr);
}

void state::TeamStateTy::init(bool IsSPMD) {
  ICVState.NThreadsVar = 0;
  ICVState.LevelVar = 0;
  ICVState.ActiveLevelVar = 0;
  ICVState.Padding0Val = 0;
  ICVState.MaxActiveLevelsVar = 1;
  ICVState.RunSchedVar = omp_sched_static;
  ICVState.RunSchedChunkVar = 1;
  ParallelTeamSize = 1;
  HasThreadState = false;
  ParallelRegionFnVar = nullptr;
}

bool state::TeamStateTy::operator==(const TeamStateTy &Other) const {
  return (ICVState == Other.ICVState) &
         (HasThreadState == Other.HasThreadState) &
         (ParallelTeamSize == Other.ParallelTeamSize);
}

void state::TeamStateTy::assertEqual(TeamStateTy &Other) const {
  ICVState.assertEqual(Other.ICVState);
  ASSERT(ParallelTeamSize == Other.ParallelTeamSize, nullptr);
  ASSERT(HasThreadState == Other.HasThreadState, nullptr);
}

[[clang::loader_uninitialized]] Local<state::TeamStateTy>
    ompx::state::TeamState;
[[clang::loader_uninitialized]] Local<state::ThreadStateTy **>
    ompx::state::ThreadStates;

namespace {

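/// Return \p Val if \p Level refers to the (single) active parallel level,
/// \p DefaultVal if it refers to the initial level (0) or an inactive level,
/// and \p OutOfBoundsVal if it is negative or larger than the current nesting
/// level. For example, omp_get_ancestor_thread_num(0) maps to the DefaultVal
/// of 0, the id of the initial thread.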
int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
                             int OutOfBoundsVal = -1) {
  if (Level == 0)
    return DefaultVal;
  int LevelVar = omp_get_level();
  if (OMP_UNLIKELY(Level < 0 || Level > LevelVar))
    return OutOfBoundsVal;
  int ActiveLevel = icv::ActiveLevel;
  if (OMP_UNLIKELY(Level != ActiveLevel))
    return DefaultVal;
  return Val;
}

} // namespace

void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
                 KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
  SharedMemorySmartStack.init(IsSPMD);
  if (mapping::isInitialThreadInLevel0(IsSPMD)) {
    TeamState.init(IsSPMD);
    ThreadStates = nullptr;
    KernelEnvironmentPtr = &KernelEnvironment;
    KernelLaunchEnvironmentPtr = &KernelLaunchEnvironment;
  }
}

KernelEnvironmentTy &state::getKernelEnvironment() {
  return *KernelEnvironmentPtr;
}

KernelLaunchEnvironmentTy &state::getKernelLaunchEnvironment() {
  return *KernelLaunchEnvironmentPtr;
}

void state::enterDataEnvironment(IdentTy *Ident) {
  ASSERT(config::mayUseThreadStates(),
         "Thread state modified while explicitly disabled!");
  if (!config::mayUseThreadStates())
    return;

  unsigned TId = mapping::getThreadIdInBlock();
  ThreadStateTy *NewThreadState = static_cast<ThreadStateTy *>(
      memory::allocGlobal(sizeof(ThreadStateTy), "ThreadStates alloc"));
  uintptr_t *ThreadStatesBitsPtr = reinterpret_cast<uintptr_t *>(&ThreadStates);
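  // Lazily allocate the per-block ThreadStates array on first use. Racing
  // threads each allocate and zero a candidate array; a single compare-and-swap
  // picks the winner and the losers free their copy again.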
  if (!atomic::load(ThreadStatesBitsPtr, atomic::seq_cst)) {
    uint32_t Bytes =
        sizeof(ThreadStates[0]) * mapping::getNumberOfThreadsInBlock();
    void *ThreadStatesPtr =
        memory::allocGlobal(Bytes, "Thread state array allocation");
    __builtin_memset(ThreadStatesPtr, 0, Bytes);
    if (!atomic::cas(ThreadStatesBitsPtr, uintptr_t(0),
                     reinterpret_cast<uintptr_t>(ThreadStatesPtr),
                     atomic::seq_cst, atomic::seq_cst))
      memory::freeGlobal(ThreadStatesPtr,
                         "Thread state array allocated multiple times");
    ASSERT(atomic::load(ThreadStatesBitsPtr, atomic::seq_cst),
           "Expected valid thread states bit!");
  }
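  // Push the new state onto this thread's stack of states: init() chains it to
  // the previous top (if any) so that exitDataEnvironment / resetStateForThread
  // can pop back to it later.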
  NewThreadState->init(ThreadStates[TId]);
  TeamState.HasThreadState = true;
  ThreadStates[TId] = NewThreadState;
}

void state::exitDataEnvironment() {
  ASSERT(config::mayUseThreadStates(),
         "Thread state modified while explicitly disabled!");

  unsigned TId = mapping::getThreadIdInBlock();
  resetStateForThread(TId);
}

void state::resetStateForThread(uint32_t TId) {
  if (!config::mayUseThreadStates())
    return;
  if (OMP_LIKELY(!TeamState.HasThreadState || !ThreadStates[TId]))
    return;

  ThreadStateTy *PreviousThreadState = ThreadStates[TId]->PreviousThreadState;
  memory::freeGlobal(ThreadStates[TId], "ThreadStates dealloc");
  ThreadStates[TId] = PreviousThreadState;
}

void state::runAndCheckState(void(Func(void))) {
  TeamStateTy OldTeamState = TeamState;
  OldTeamState.assertEqual(TeamState);

  Func();

  OldTeamState.assertEqual(TeamState);
}

void state::assumeInitialState(bool IsSPMD) {
  TeamStateTy InitialTeamState;
  InitialTeamState.init(IsSPMD);
  InitialTeamState.assertEqual(TeamState);
  ASSERT(mapping::isSPMDMode() == IsSPMD, nullptr);
}

int state::getEffectivePTeamSize() {
  int PTeamSize = state::ParallelTeamSize;
  return PTeamSize ? PTeamSize : mapping::getMaxTeamThreads();
}

extern "C" {
void omp_set_dynamic(int V) {}

int omp_get_dynamic(void) { return 0; }

void omp_set_num_threads(int V) { icv::NThreads = V; }

int omp_get_max_threads(void) {
  int NT = icv::NThreads;
  return NT > 0 ? NT : mapping::getMaxTeamThreads();
}

int omp_get_level(void) {
  int LevelVar = icv::Level;
  ASSERT(LevelVar >= 0, nullptr);
  return LevelVar;
}

int omp_get_active_level(void) { return !!icv::ActiveLevel; }

int omp_in_parallel(void) { return !!icv::ActiveLevel; }

void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) {
  *ScheduleKind = static_cast<omp_sched_t>((int)icv::RunSched);
  *ChunkSize = state::RunSchedChunk;
}

void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) {
  icv::RunSched = (int)ScheduleKind;
  state::RunSchedChunk = ChunkSize;
}

int omp_get_ancestor_thread_num(int Level) {
  return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0);
}

int omp_get_thread_num(void) {
  return omp_get_ancestor_thread_num(omp_get_level());
}

int omp_get_team_size(int Level) {
  return returnValIfLevelIsActive(Level, state::getEffectivePTeamSize(), 1);
}

int omp_get_num_threads(void) {
  return omp_get_level() != 1 ? 1 : state::getEffectivePTeamSize();
}

int omp_get_thread_limit(void) { return mapping::getMaxTeamThreads(); }

int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); }

void omp_set_nested(int) {}

int omp_get_nested(void) { return false; }

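// Nested parallelism is not supported by the device runtime; requests for more
// than one active level are clamped to one.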
void omp_set_max_active_levels(int Levels) {
  icv::MaxActiveLevels = Levels > 0 ? 1 : 0;
}

int omp_get_max_active_levels(void) { return icv::MaxActiveLevels; }

omp_proc_bind_t omp_get_proc_bind(void) { return omp_proc_bind_false; }

int omp_get_num_places(void) { return 0; }

int omp_get_place_num_procs(int) { return omp_get_num_procs(); }

void omp_get_place_proc_ids(int, int *) {
  // TODO
}

int omp_get_place_num(void) { return 0; }

int omp_get_partition_num_places(void) { return 0; }

void omp_get_partition_place_nums(int *) {
  // TODO
}

int omp_get_cancellation(void) { return 0; }

void omp_set_default_device(int) {}

int omp_get_default_device(void) { return -1; }

int omp_get_num_devices(void) { return config::getNumDevices(); }

int omp_get_device_num(void) { return config::getDeviceNum(); }

int omp_get_num_teams(void) { return mapping::getNumberOfBlocksInKernel(); }

int omp_get_team_num() { return mapping::getBlockIdInKernel(); }

int omp_get_initial_device(void) { return -1; }

int omp_is_initial_device(void) { return 0; }
}

extern "C" {
[[clang::noinline]] void *__kmpc_alloc_shared(uint64_t Bytes) {
  return memory::allocShared(Bytes, "Frontend alloc shared");
}

[[clang::noinline]] void __kmpc_free_shared(void *Ptr, uint64_t Bytes) {
  memory::freeShared(Ptr, Bytes, "Frontend free shared");
}
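
// Illustrative sketch (not emitted by the runtime itself): the compiler pairs
// these calls per thread and in LIFO order, roughly like
//   void *A = __kmpc_alloc_shared(16);
//   void *B = __kmpc_alloc_shared(32);
//   ... use A and B ...
//   __kmpc_free_shared(B, 32);
//   __kmpc_free_shared(A, 16);
// Unbalanced or out-of-order pairs can corrupt the per-thread bookkeeping in
// SharedMemorySmartStackTy.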

void *__kmpc_get_dynamic_shared() { return memory::getDynamicBuffer(); }

void *llvm_omp_target_dynamic_shared_alloc() {
  return __kmpc_get_dynamic_shared();
}

void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); }

/// Allocate storage in shared memory to communicate arguments from the main
/// thread to the workers in generic mode. If we exceed
/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64;

[[clang::loader_uninitialized]] static Local<void *>
    SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
[[clang::loader_uninitialized]] static Local<void **>
    SharedMemVariableSharingSpacePtr;

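// Illustrative protocol sketch (hypothetical call sites, not runtime code):
// the main thread publishes nArgs pointers before waking the workers, each
// worker retrieves the same buffer, and the main thread tears it down again:
//   void **Args;
//   __kmpc_begin_sharing_variables(&Args, /*nArgs=*/2);   // main thread
//   Args[0] = &X; Args[1] = &Y;
//   ... workers call __kmpc_get_shared_variables(&Args) and read X, Y ...
//   __kmpc_end_sharing_variables();                        // main thread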
void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) {
  if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {
    SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0];
  } else {
    SharedMemVariableSharingSpacePtr = (void **)memory::allocGlobal(
        nArgs * sizeof(void *), "new extended args");
    ASSERT(SharedMemVariableSharingSpacePtr != nullptr,
           "Nullptr returned by malloc!");
  }
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}

void __kmpc_end_sharing_variables() {
  if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0])
    memory::freeGlobal(SharedMemVariableSharingSpacePtr, "new extended args");
}

void __kmpc_get_shared_variables(void ***GlobalArgs) {
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}
}