//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//

#include "Shared/Environment.h"

#include "Allocator.h"
#include "Configuration.h"
#include "Debug.h"
#include "DeviceTypes.h"
#include "DeviceUtils.h"
#include "Interface.h"
#include "LibC.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"

using namespace ompx;

/// Memory implementation
///
///{

/// External symbol to access dynamic shared memory.
[[gnu::aligned(
    allocator::ALIGNMENT)]] extern Local<unsigned char> DynamicSharedBuffer[];

/// The kernel environment passed to the init method by the compiler.
[[clang::loader_uninitialized]] static Local<KernelEnvironmentTy *>
    KernelEnvironmentPtr;

/// The kernel launch environment passed as argument to the kernel by the
/// runtime.
[[clang::loader_uninitialized]] static Local<KernelLaunchEnvironmentTy *>
    KernelLaunchEnvironmentPtr;

///}

namespace {

/// Fallback implementations are deliberately missing so that their absence
/// triggers a link time error.
/// Implementations for new devices, including the host, should go into a
/// dedicated begin/end declare variant.
///
///{
extern "C" {
#ifdef __AMDGPU__

[[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); }
[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }

#else

[[gnu::weak, gnu::leaf]] void *malloc(size_t Size);
[[gnu::weak, gnu::leaf]] void free(void *Ptr);

#endif
}
///}

/// A "smart" stack in shared memory.
///
/// The stack exposes a malloc/free interface but works like a stack
/// internally. In fact, it is a separate stack *per thread*. That means each
/// thread must push and pop symmetrically or this breaks, badly. The
/// implementation will (aim to) detect non-lock-step warps and fall back to
/// malloc/free. The same will happen if a thread runs out of stack space. The
/// main thread in generic mode is special and is given more memory than the
/// rest.
///
struct SharedMemorySmartStackTy {
  /// Initialize the stack. Must be called by all threads.
  void init(bool IsSPMD);

  /// Allocate \p Bytes on the stack for the encountering thread. Each thread
  /// can call this function.
  void *push(uint64_t Bytes);

  /// Deallocate the last allocation made by the encountering thread and
  /// pointed to by \p Ptr from the stack. Each thread can call this function.
  void pop(void *Ptr, uint64_t Bytes);

private:
  /// Compute the size of the storage space reserved for a thread.
  uint32_t computeThreadStorageTotal() {
    uint32_t NumLanesInBlock = mapping::getNumberOfThreadsInBlock();
    return __builtin_align_down(state::SharedScratchpadSize / NumLanesInBlock,
                                allocator::ALIGNMENT);
  }

  /// Return the top address of the thread's data stack, that is the first
  /// address this thread will allocate memory at next.
  void *getThreadDataTop(uint32_t TId) {
    return &Data[computeThreadStorageTotal() * TId + Usage[TId]];
  }

  /// The actual storage, shared among all warps.
  [[gnu::aligned(
      allocator::ALIGNMENT)]] unsigned char Data[state::SharedScratchpadSize];
  [[gnu::aligned(
      allocator::ALIGNMENT)]] unsigned char Usage[mapping::MaxThreadsPerTeam];
};

static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
              "Shared scratchpad of this size not supported yet.");
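
// A minimal usage sketch (illustrative only, not part of the runtime API
// surface): the smart stack requires each thread to allocate and deallocate in
// strict LIFO order with matching sizes, typically through the
// memory::allocShared / memory::freeShared wrappers defined below.
//
//   void *Buf = memory::allocShared(32, "example buffer"); // push 32 bytes
//   // ... use Buf ...
//   memory::freeShared(Buf, 32, "example buffer");         // pop the same 32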

/// The allocation of a single shared memory scratchpad.
[[clang::loader_uninitialized]] static Local<SharedMemorySmartStackTy>
    SharedMemorySmartStack;

void SharedMemorySmartStackTy::init(bool IsSPMD) {
  Usage[mapping::getThreadIdInBlock()] = 0;
}

void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
  // First align the number of requested bytes.
  /// FIXME: The stack shouldn't require worst-case padding. Alignment needs to
  /// be passed in as an argument and the stack rewritten to support it.
  uint64_t AlignedBytes = __builtin_align_up(Bytes, allocator::ALIGNMENT);

  uint32_t StorageTotal = computeThreadStorageTotal();

  // The main thread in generic mode gets the space of its entire warp as the
  // other threads do not participate in any computation at all.
  if (mapping::isMainThreadInGenericMode())
    StorageTotal *= mapping::getWarpSize();

  int TId = mapping::getThreadIdInBlock();
  if (Usage[TId] + AlignedBytes <= StorageTotal) {
    void *Ptr = getThreadDataTop(TId);
    Usage[TId] += AlignedBytes;
    return Ptr;
  }

  if (config::isDebugMode(DeviceDebugKind::CommonIssues))
    printf("Shared memory stack full, fallback to dynamic allocation of global "
           "memory will negatively impact performance.\n");
  void *GlobalMemory = memory::allocGlobal(
      AlignedBytes, "Slow path shared memory allocation, insufficient "
                    "shared memory stack memory!");
  ASSERT(GlobalMemory != nullptr, "nullptr returned by malloc!");

  return GlobalMemory;
}

void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) {
  uint64_t AlignedBytes = __builtin_align_up(Bytes, allocator::ALIGNMENT);
  if (utils::isSharedMemPtr(Ptr)) {
    int TId = mapping::getThreadIdInBlock();
    Usage[TId] -= AlignedBytes;
    return;
  }
  memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
}

} // namespace

void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }

void *memory::allocShared(uint64_t Bytes, const char *Reason) {
  return SharedMemorySmartStack.push(Bytes);
}

void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) {
  SharedMemorySmartStack.pop(Ptr, Bytes);
}

void *memory::allocGlobal(uint64_t Bytes, const char *Reason) {
  void *Ptr = malloc(Bytes);
  if (config::isDebugMode(DeviceDebugKind::CommonIssues) && Ptr == nullptr)
    printf("nullptr returned by malloc!\n");
  return Ptr;
}

void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); }

///}

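// Note: the comparisons below use bitwise '&' rather than short-circuiting
// '&&', presumably to keep the equality checks branch-free on the device.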
bool state::ICVStateTy::operator==(const ICVStateTy &Other) const {
  return (NThreadsVar == Other.NThreadsVar) & (LevelVar == Other.LevelVar) &
         (ActiveLevelVar == Other.ActiveLevelVar) &
         (MaxActiveLevelsVar == Other.MaxActiveLevelsVar) &
         (RunSchedVar == Other.RunSchedVar) &
         (RunSchedChunkVar == Other.RunSchedChunkVar);
}

void state::ICVStateTy::assertEqual(const ICVStateTy &Other) const {
  ASSERT(NThreadsVar == Other.NThreadsVar, nullptr);
  ASSERT(LevelVar == Other.LevelVar, nullptr);
  ASSERT(ActiveLevelVar == Other.ActiveLevelVar, nullptr);
  ASSERT(MaxActiveLevelsVar == Other.MaxActiveLevelsVar, nullptr);
  ASSERT(RunSchedVar == Other.RunSchedVar, nullptr);
  ASSERT(RunSchedChunkVar == Other.RunSchedChunkVar, nullptr);
}

void state::TeamStateTy::init(bool IsSPMD) {
  ICVState.NThreadsVar = 0;
  ICVState.LevelVar = 0;
  ICVState.ActiveLevelVar = 0;
  ICVState.Padding0Val = 0;
  ICVState.MaxActiveLevelsVar = 1;
  ICVState.RunSchedVar = omp_sched_static;
  ICVState.RunSchedChunkVar = 1;
  ParallelTeamSize = 1;
  HasThreadState = false;
  ParallelRegionFnVar = nullptr;
}

bool state::TeamStateTy::operator==(const TeamStateTy &Other) const {
  return (ICVState == Other.ICVState) &
         (HasThreadState == Other.HasThreadState) &
         (ParallelTeamSize == Other.ParallelTeamSize);
}

void state::TeamStateTy::assertEqual(TeamStateTy &Other) const {
  ICVState.assertEqual(Other.ICVState);
  ASSERT(ParallelTeamSize == Other.ParallelTeamSize, nullptr);
  ASSERT(HasThreadState == Other.HasThreadState, nullptr);
}

[[clang::loader_uninitialized]] Local<state::TeamStateTy>
    ompx::state::TeamState;
[[clang::loader_uninitialized]] Local<state::ThreadStateTy **>
    ompx::state::ThreadStates;

namespace {

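/// Return \p Val if \p Level is the currently active parallel level,
/// \p DefaultVal for level 0 or an inactive level, and \p OutOfBoundsVal for a
/// level outside of [0, omp_get_level()]. For example (a hypothetical
/// scenario), with a single active parallel region
/// omp_get_ancestor_thread_num(1) forwards here as
/// returnValIfLevelIsActive(1, <thread id>, 0) and yields the thread id, while
/// omp_get_ancestor_thread_num(0) yields the default value 0.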
int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
                             int OutOfBoundsVal = -1) {
  if (Level == 0)
    return DefaultVal;
  int LevelVar = omp_get_level();
  if (OMP_UNLIKELY(Level < 0 || Level > LevelVar))
    return OutOfBoundsVal;
  int ActiveLevel = icv::ActiveLevel;
  if (OMP_UNLIKELY(Level != ActiveLevel))
    return DefaultVal;
  return Val;
}

} // namespace

void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
                 KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
  SharedMemorySmartStack.init(IsSPMD);
  if (mapping::isInitialThreadInLevel0(IsSPMD)) {
    TeamState.init(IsSPMD);
    ThreadStates = nullptr;
    KernelEnvironmentPtr = &KernelEnvironment;
    KernelLaunchEnvironmentPtr = &KernelLaunchEnvironment;
  }
}

KernelEnvironmentTy &state::getKernelEnvironment() {
  return *KernelEnvironmentPtr;
}

KernelLaunchEnvironmentTy &state::getKernelLaunchEnvironment() {
  return *KernelLaunchEnvironmentPtr;
}

void state::enterDataEnvironment(IdentTy *Ident) {
  ASSERT(config::mayUseThreadStates(),
         "Thread state modified while explicitly disabled!");
  if (!config::mayUseThreadStates())
    return;

  unsigned TId = mapping::getThreadIdInBlock();
  ThreadStateTy *NewThreadState = static_cast<ThreadStateTy *>(
      memory::allocGlobal(sizeof(ThreadStateTy), "ThreadStates alloc"));
  uintptr_t *ThreadStatesBitsPtr = reinterpret_cast<uintptr_t *>(&ThreadStates);
  if (!atomic::load(ThreadStatesBitsPtr, atomic::seq_cst)) {
    uint32_t Bytes =
        sizeof(ThreadStates[0]) * mapping::getNumberOfThreadsInBlock();
    void *ThreadStatesPtr =
        memory::allocGlobal(Bytes, "Thread state array allocation");
    __builtin_memset(ThreadStatesPtr, 0, Bytes);
    if (!atomic::cas(ThreadStatesBitsPtr, uintptr_t(0),
                     reinterpret_cast<uintptr_t>(ThreadStatesPtr),
                     atomic::seq_cst, atomic::seq_cst))
      memory::freeGlobal(ThreadStatesPtr,
                         "Thread state array allocated multiple times");
    ASSERT(atomic::load(ThreadStatesBitsPtr, atomic::seq_cst),
           "Expected valid thread states bit!");
  }
  NewThreadState->init(ThreadStates[TId]);
  TeamState.HasThreadState = true;
  ThreadStates[TId] = NewThreadState;
}

void state::exitDataEnvironment() {
  ASSERT(config::mayUseThreadStates(),
         "Thread state modified while explicitly disabled!");

  unsigned TId = mapping::getThreadIdInBlock();
  resetStateForThread(TId);
}

void state::resetStateForThread(uint32_t TId) {
  if (!config::mayUseThreadStates())
    return;
  if (OMP_LIKELY(!TeamState.HasThreadState || !ThreadStates[TId]))
    return;

  ThreadStateTy *PreviousThreadState = ThreadStates[TId]->PreviousThreadState;
  memory::freeGlobal(ThreadStates[TId], "ThreadStates dealloc");
  ThreadStates[TId] = PreviousThreadState;
}

void state::runAndCheckState(void(Func(void))) {
  TeamStateTy OldTeamState = TeamState;
  OldTeamState.assertEqual(TeamState);

  Func();

  OldTeamState.assertEqual(TeamState);
}

void state::assumeInitialState(bool IsSPMD) {
  TeamStateTy InitialTeamState;
  InitialTeamState.init(IsSPMD);
  InitialTeamState.assertEqual(TeamState);
  ASSERT(mapping::isSPMDMode() == IsSPMD, nullptr);
}

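/// Return the parallel team size recorded in the team state, or, if that value
/// is zero, the maximum number of threads available to the team.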
int state::getEffectivePTeamSize() {
  int PTeamSize = state::ParallelTeamSize;
  return PTeamSize ? PTeamSize : mapping::getMaxTeamThreads();
}

extern "C" {
void omp_set_dynamic(int V) {}

int omp_get_dynamic(void) { return 0; }

void omp_set_num_threads(int V) { icv::NThreads = V; }

int omp_get_max_threads(void) {
  int NT = icv::NThreads;
  return NT > 0 ? NT : mapping::getMaxTeamThreads();
}

int omp_get_level(void) {
  int LevelVar = icv::Level;
  ASSERT(LevelVar >= 0, nullptr);
  return LevelVar;
}

int omp_get_active_level(void) { return !!icv::ActiveLevel; }

int omp_in_parallel(void) { return !!icv::ActiveLevel; }

void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) {
  *ScheduleKind = static_cast<omp_sched_t>((int)icv::RunSched);
  *ChunkSize = state::RunSchedChunk;
}

void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) {
  icv::RunSched = (int)ScheduleKind;
  state::RunSchedChunk = ChunkSize;
}

int omp_get_ancestor_thread_num(int Level) {
  return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0);
}

int omp_get_thread_num(void) {
  return omp_get_ancestor_thread_num(omp_get_level());
}

int omp_get_team_size(int Level) {
  return returnValIfLevelIsActive(Level, state::getEffectivePTeamSize(), 1);
}

int omp_get_num_threads(void) {
  return omp_get_level() != 1 ? 1 : state::getEffectivePTeamSize();
}

int omp_get_thread_limit(void) { return mapping::getMaxTeamThreads(); }

int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); }

void omp_set_nested(int) {}

int omp_get_nested(void) { return false; }

void omp_set_max_active_levels(int Levels) {
  icv::MaxActiveLevels = Levels > 0 ? 1 : 0;
}

int omp_get_max_active_levels(void) { return icv::MaxActiveLevels; }

omp_proc_bind_t omp_get_proc_bind(void) { return omp_proc_bind_false; }

int omp_get_num_places(void) { return 0; }

int omp_get_place_num_procs(int) { return omp_get_num_procs(); }

void omp_get_place_proc_ids(int, int *) {
  // TODO
}

int omp_get_place_num(void) { return 0; }

int omp_get_partition_num_places(void) { return 0; }

void omp_get_partition_place_nums(int *) {
  // TODO
}

int omp_get_cancellation(void) { return 0; }

void omp_set_default_device(int) {}

int omp_get_default_device(void) { return -1; }

int omp_get_num_devices(void) { return config::getNumDevices(); }

int omp_get_device_num(void) { return config::getDeviceNum(); }

int omp_get_num_teams(void) { return mapping::getNumberOfBlocksInKernel(); }

int omp_get_team_num() { return mapping::getBlockIdInKernel(); }

int omp_get_initial_device(void) { return -1; }

int omp_is_initial_device(void) { return 0; }
}

extern "C" {
[[clang::noinline]] void *__kmpc_alloc_shared(uint64_t Bytes) {
  return memory::allocShared(Bytes, "Frontend alloc shared");
}

[[clang::noinline]] void __kmpc_free_shared(void *Ptr, uint64_t Bytes) {
  memory::freeShared(Ptr, Bytes, "Frontend free shared");
}

void *__kmpc_get_dynamic_shared() { return memory::getDynamicBuffer(); }

void *llvm_omp_target_dynamic_shared_alloc() {
  return __kmpc_get_dynamic_shared();
}

void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); }

/// Allocate storage in shared memory to communicate arguments from the main
/// thread to the workers in generic mode. If we exceed
/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64;

[[clang::loader_uninitialized]] static Local<void *>
    SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
[[clang::loader_uninitialized]] static Local<void **>
    SharedMemVariableSharingSpacePtr;

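// A minimal usage sketch of the sharing protocol below (the callers are
// normally generated by the compiler, so this is illustrative only; PtrA and
// PtrB are hypothetical values):
//
//   void **Args;
//   __kmpc_begin_sharing_variables(&Args, /*nArgs=*/2); // main thread
//   Args[0] = PtrA;
//   Args[1] = PtrB;
//   // workers later call: __kmpc_get_shared_variables(&Args);
//   __kmpc_end_sharing_variables();                     // main thread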
void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) {
  if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {
    SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0];
  } else {
    SharedMemVariableSharingSpacePtr = (void **)memory::allocGlobal(
        nArgs * sizeof(void *), "new extended args");
    ASSERT(SharedMemVariableSharingSpacePtr != nullptr,
           "Nullptr returned by malloc!");
  }
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}

void __kmpc_end_sharing_variables() {
  if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0])
    memory::freeGlobal(SharedMemVariableSharingSpacePtr, "new extended args");
}

void __kmpc_get_shared_variables(void ***GlobalArgs) {
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}
}
