1//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9//===----------------------------------------------------------------------===//
10
11#include "Shared/Environment.h"
12
13#include "Allocator.h"
14#include "Configuration.h"
15#include "Debug.h"
16#include "Interface.h"
17#include "LibC.h"
18#include "Mapping.h"
19#include "State.h"
20#include "Synchronization.h"
21#include "Types.h"
22#include "Utils.h"
23
24using namespace ompx;
25
26#pragma omp begin declare target device_type(nohost)
27
28/// Memory implementation
29///
30///{
31
/// External symbol to access dynamic shared memory. It is only declared here
/// (`extern`, unknown bound) and is exposed to callers through
/// memory::getDynamicBuffer().
[[gnu::aligned(
    allocator::ALIGNMENT)]] extern unsigned char DynamicSharedBuffer[];
// Associate the buffer with the omp_pteam_mem_alloc allocator.
#pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)
36
/// The kernel environment passed to the init method by the compiler. Stored by
/// state::init() and read back through state::getKernelEnvironment().
static KernelEnvironmentTy *SHARED(KernelEnvironmentPtr);

/// The kernel launch environment passed as argument to the kernel by the
/// runtime. Stored by state::init() and read back through
/// state::getKernelLaunchEnvironment().
static KernelLaunchEnvironmentTy *SHARED(KernelLaunchEnvironmentPtr);
43
44///}
45
46namespace {
47
/// Fallback implementations are missing to trigger a link time error.
/// Implementations for new devices, including the host, should go into a
/// dedicated begin/end declare variant.
///
/// When __AMDGPU__ is defined, malloc/free are defined right here as weak
/// symbols that forward to allocator::alloc and allocator::free. Otherwise
/// they are only declared (weak, leaf) and no definition is provided in this
/// file.
///
///{
extern "C" {
#ifdef __AMDGPU__

[[gnu::weak]] void *malloc(uint64_t Size) { return allocator::alloc(Size); }
[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }

#else

[[gnu::weak, gnu::leaf]] void *malloc(uint64_t Size);
[[gnu::weak, gnu::leaf]] void free(void *Ptr);

#endif
}
///}
67
68/// A "smart" stack in shared memory.
69///
70/// The stack exposes a malloc/free interface but works like a stack internally.
71/// In fact, it is a separate stack *per warp*. That means, each warp must push
72/// and pop symmetrically or this breaks, badly. The implementation will (aim
73/// to) detect non-lock-step warps and fallback to malloc/free. The same will
74/// happen if a warp runs out of memory. The master warp in generic memory is
75/// special and is given more memory than the rest.
76///
77struct SharedMemorySmartStackTy {
78 /// Initialize the stack. Must be called by all threads.
79 void init(bool IsSPMD);
80
81 /// Allocate \p Bytes on the stack for the encountering thread. Each thread
82 /// can call this function.
83 void *push(uint64_t Bytes);
84
85 /// Deallocate the last allocation made by the encountering thread and pointed
86 /// to by \p Ptr from the stack. Each thread can call this function.
87 void pop(void *Ptr, uint32_t Bytes);
88
89private:
90 /// Compute the size of the storage space reserved for a thread.
91 uint32_t computeThreadStorageTotal() {
92 uint32_t NumLanesInBlock = mapping::getNumberOfThreadsInBlock();
93 return utils::align_down((state::SharedScratchpadSize / NumLanesInBlock),
94 allocator::ALIGNMENT);
95 }
96
97 /// Return the top address of the warp data stack, that is the first address
98 /// this warp will allocate memory at next.
99 void *getThreadDataTop(uint32_t TId) {
100 return &Data[computeThreadStorageTotal() * TId + Usage[TId]];
101 }
102
103 /// The actual storage, shared among all warps.
104 [[gnu::aligned(
105 allocator::ALIGNMENT)]] unsigned char Data[state::SharedScratchpadSize];
106 [[gnu::aligned(
107 allocator::ALIGNMENT)]] unsigned char Usage[mapping::MaxThreadsPerTeam];
108};
109
/// Compile-time limit on the scratchpad size relative to the maximal team
/// size.
/// NOTE(review): presumably related to the per-thread `Usage` counters being
/// `unsigned char` — confirm the intended bound.
static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
              "Shared scratchpad of this size not supported yet.");

/// The allocation of a single shared memory scratchpad. All memory::allocShared
/// and memory::freeShared requests are served by this object.
static SharedMemorySmartStackTy SHARED(SharedMemorySmartStack);
115
116void SharedMemorySmartStackTy::init(bool IsSPMD) {
117 Usage[mapping::getThreadIdInBlock()] = 0;
118}
119
120void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
121 // First align the number of requested bytes.
122 /// FIXME: The stack shouldn't require worst-case padding. Alignment needs to
123 /// be passed in as an argument and the stack rewritten to support it.
124 uint64_t AlignedBytes = utils::align_up(Bytes, allocator::ALIGNMENT);
125
126 uint32_t StorageTotal = computeThreadStorageTotal();
127
128 // The main thread in generic mode gets the space of its entire warp as the
129 // other threads do not participate in any computation at all.
130 if (mapping::isMainThreadInGenericMode())
131 StorageTotal *= mapping::getWarpSize();
132
133 int TId = mapping::getThreadIdInBlock();
134 if (Usage[TId] + AlignedBytes <= StorageTotal) {
135 void *Ptr = getThreadDataTop(TId);
136 Usage[TId] += AlignedBytes;
137 return Ptr;
138 }
139
140 if (config::isDebugMode(DeviceDebugKind::CommonIssues))
141 PRINT("Shared memory stack full, fallback to dynamic allocation of global "
142 "memory will negatively impact performance.\n");
143 void *GlobalMemory = memory::allocGlobal(
144 AlignedBytes, "Slow path shared memory allocation, insufficient "
145 "shared memory stack memory!");
146 ASSERT(GlobalMemory != nullptr, "nullptr returned by malloc!");
147
148 return GlobalMemory;
149}
150
151void SharedMemorySmartStackTy::pop(void *Ptr, uint32_t Bytes) {
152 uint64_t AlignedBytes = utils::align_up(Bytes, allocator::ALIGNMENT);
153 if (utils::isSharedMemPtr(Ptr)) {
154 int TId = mapping::getThreadIdInBlock();
155 Usage[TId] -= AlignedBytes;
156 return;
157 }
158 memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
159}
160
161} // namespace
162
163void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }
164
165void *memory::allocShared(uint64_t Bytes, const char *Reason) {
166 return SharedMemorySmartStack.push(Bytes);
167}
168
169void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) {
170 SharedMemorySmartStack.pop(Ptr, Bytes);
171}
172
173void *memory::allocGlobal(uint64_t Bytes, const char *Reason) {
174 void *Ptr = malloc(Bytes);
175 if (config::isDebugMode(DeviceDebugKind::CommonIssues) && Ptr == nullptr)
176 PRINT("nullptr returned by malloc!\n");
177 return Ptr;
178}
179
180void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); }
181
182///}
183
184bool state::ICVStateTy::operator==(const ICVStateTy &Other) const {
185 return (NThreadsVar == Other.NThreadsVar) & (LevelVar == Other.LevelVar) &
186 (ActiveLevelVar == Other.ActiveLevelVar) &
187 (MaxActiveLevelsVar == Other.MaxActiveLevelsVar) &
188 (RunSchedVar == Other.RunSchedVar) &
189 (RunSchedChunkVar == Other.RunSchedChunkVar);
190}
191
/// Assert, field by field, that this ICV state matches \p Other. The checked
/// fields are the same ones operator== compares; each has its own ASSERT so
/// a failure is attributed to a single field.
void state::ICVStateTy::assertEqual(const ICVStateTy &Other) const {
  ASSERT(NThreadsVar == Other.NThreadsVar, nullptr);
  ASSERT(LevelVar == Other.LevelVar, nullptr);
  ASSERT(ActiveLevelVar == Other.ActiveLevelVar, nullptr);
  ASSERT(MaxActiveLevelsVar == Other.MaxActiveLevelsVar, nullptr);
  ASSERT(RunSchedVar == Other.RunSchedVar, nullptr);
  ASSERT(RunSchedChunkVar == Other.RunSchedChunkVar, nullptr);
}
200
201void state::TeamStateTy::init(bool IsSPMD) {
202 ICVState.NThreadsVar = 0;
203 ICVState.LevelVar = 0;
204 ICVState.ActiveLevelVar = 0;
205 ICVState.Padding0Val = 0;
206 ICVState.MaxActiveLevelsVar = 1;
207 ICVState.RunSchedVar = omp_sched_static;
208 ICVState.RunSchedChunkVar = 1;
209 ParallelTeamSize = 1;
210 HasThreadState = false;
211 ParallelRegionFnVar = nullptr;
212}
213
214bool state::TeamStateTy::operator==(const TeamStateTy &Other) const {
215 return (ICVState == Other.ICVState) &
216 (HasThreadState == Other.HasThreadState) &
217 (ParallelTeamSize == Other.ParallelTeamSize);
218}
219
/// Assert that this team state matches \p Other: first the ICVs (field by
/// field, via ICVStateTy::assertEqual), then the parallel team size and the
/// thread state flag. ParallelRegionFnVar is not checked.
void state::TeamStateTy::assertEqual(TeamStateTy &Other) const {
  ICVState.assertEqual(Other.ICVState);
  ASSERT(ParallelTeamSize == Other.ParallelTeamSize, nullptr);
  ASSERT(HasThreadState == Other.HasThreadState, nullptr);
}
225
/// The team state; initialized in state::init().
state::TeamStateTy SHARED(ompx::state::TeamState);
/// Array with one thread state pointer per thread in the block. It is reset
/// to nullptr in state::init() and allocated on demand in
/// state::enterDataEnvironment().
state::ThreadStateTy **SHARED(ompx::state::ThreadStates);
228
229namespace {
230
231int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
232 int OutOfBoundsVal = -1) {
233 if (Level == 0)
234 return DefaultVal;
235 int LevelVar = omp_get_level();
236 if (OMP_UNLIKELY(Level < 0 || Level > LevelVar))
237 return OutOfBoundsVal;
238 int ActiveLevel = icv::ActiveLevel;
239 if (OMP_UNLIKELY(Level != ActiveLevel))
240 return DefaultVal;
241 return Val;
242}
243
244} // namespace
245
246void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
247 KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
248 SharedMemorySmartStack.init(IsSPMD);
249 if (mapping::isInitialThreadInLevel0(IsSPMD)) {
250 TeamState.init(IsSPMD);
251 ThreadStates = nullptr;
252 KernelEnvironmentPtr = &KernelEnvironment;
253 KernelLaunchEnvironmentPtr = &KernelLaunchEnvironment;
254 }
255}
256
257KernelEnvironmentTy &state::getKernelEnvironment() {
258 return *KernelEnvironmentPtr;
259}
260
261KernelLaunchEnvironmentTy &state::getKernelLaunchEnvironment() {
262 return *KernelLaunchEnvironmentPtr;
263}
264
/// Create a new thread state for the calling thread and make it the current
/// one.
///
/// A ThreadStateTy is allocated in global memory. If no thread state array
/// has been installed in `ThreadStates` yet, one is allocated and installed
/// first. \p Ident is not used.
///
/// NOTE(review): the result of the ThreadStateTy allocation and of the array
/// allocation is used without a nullptr check.
void state::enterDataEnvironment(IdentTy *Ident) {
  ASSERT(config::mayUseThreadStates(),
         "Thread state modified while explicitly disabled!");
  // Same condition as the assertion above, handled by returning early;
  // presumably for builds in which ASSERT does not stop execution.
  if (!config::mayUseThreadStates())
    return;

  unsigned TId = mapping::getThreadIdInBlock();
  ThreadStateTy *NewThreadState = static_cast<ThreadStateTy *>(
      memory::allocGlobal(sizeof(ThreadStateTy), "ThreadStates alloc"));
  // View the `ThreadStates` pointer as an integer so it can be read and
  // updated with the atomic helpers.
  uintptr_t *ThreadStatesBitsPtr = reinterpret_cast<uintptr_t *>(&ThreadStates);
  if (!atomic::load(ThreadStatesBitsPtr, atomic::seq_cst)) {
    // No array is installed yet: allocate one zeroed entry per thread in the
    // block and try to install it.
    uint32_t Bytes =
        sizeof(ThreadStates[0]) * mapping::getNumberOfThreadsInBlock();
    void *ThreadStatesPtr =
        memory::allocGlobal(Bytes, "Thread state array allocation");
    memset(ThreadStatesPtr, 0, Bytes);
    // The array is only installed if `ThreadStates` is still zero. If the
    // compare-and-swap reports failure, the array allocated here is released
    // again.
    if (!atomic::cas(ThreadStatesBitsPtr, uintptr_t(0),
                     reinterpret_cast<uintptr_t>(ThreadStatesPtr),
                     atomic::seq_cst, atomic::seq_cst))
      memory::freeGlobal(ThreadStatesPtr,
                         "Thread state array allocated multiple times");
    ASSERT(atomic::load(ThreadStatesBitsPtr, atomic::seq_cst),
           "Expected valid thread states bit!");
  }
  // The thread's current entry is passed to init() before it is replaced;
  // presumably it is kept as PreviousThreadState, which resetStateForThread()
  // restores.
  NewThreadState->init(ThreadStates[TId]);
  TeamState.HasThreadState = true;
  ThreadStates[TId] = NewThreadState;
}
293
294void state::exitDataEnvironment() {
295 ASSERT(config::mayUseThreadStates(),
296 "Thread state modified while explicitly disabled!");
297
298 unsigned TId = mapping::getThreadIdInBlock();
299 resetStateForThread(TId);
300}
301
302void state::resetStateForThread(uint32_t TId) {
303 if (!config::mayUseThreadStates())
304 return;
305 if (OMP_LIKELY(!TeamState.HasThreadState || !ThreadStates[TId]))
306 return;
307
308 ThreadStateTy *PreviousThreadState = ThreadStates[TId]->PreviousThreadState;
309 memory::freeGlobal(ThreadStates[TId], "ThreadStates dealloc");
310 ThreadStates[TId] = PreviousThreadState;
311}
312
313void state::runAndCheckState(void(Func(void))) {
314 TeamStateTy OldTeamState = TeamState;
315 OldTeamState.assertEqual(TeamState);
316
317 Func();
318
319 OldTeamState.assertEqual(TeamState);
320}
321
322void state::assumeInitialState(bool IsSPMD) {
323 TeamStateTy InitialTeamState;
324 InitialTeamState.init(IsSPMD);
325 InitialTeamState.assertEqual(TeamState);
326 ASSERT(mapping::isSPMDMode() == IsSPMD, nullptr);
327}
328
329int state::getEffectivePTeamSize() {
330 int PTeamSize = state::ParallelTeamSize;
331 return PTeamSize ? PTeamSize : mapping::getMaxTeamThreads();
332}
333
334extern "C" {
/// Accepted for API compatibility; the value \p V is ignored.
void omp_set_dynamic(int V) {
}

/// Always reports 0.
int omp_get_dynamic(void) {
  return 0;
}
338
339void omp_set_num_threads(int V) { icv::NThreads = V; }
340
341int omp_get_max_threads(void) {
342 int NT = icv::NThreads;
343 return NT > 0 ? NT : mapping::getMaxTeamThreads();
344}
345
/// Return the value of the Level ICV, which is asserted to be non-negative.
int omp_get_level(void) {
  int LevelVar = icv::Level;
  ASSERT(LevelVar >= 0, nullptr);
  return LevelVar;
}
351
352int omp_get_active_level(void) { return !!icv::ActiveLevel; }
353
354int omp_in_parallel(void) { return !!icv::ActiveLevel; }
355
356void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) {
357 *ScheduleKind = static_cast<omp_sched_t>((int)icv::RunSched);
358 *ChunkSize = state::RunSchedChunk;
359}
360
361void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) {
362 icv::RunSched = (int)ScheduleKind;
363 state::RunSchedChunk = ChunkSize;
364}
365
366int omp_get_ancestor_thread_num(int Level) {
367 return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0);
368}
369
370int omp_get_thread_num(void) {
371 return omp_get_ancestor_thread_num(Level: omp_get_level());
372}
373
374int omp_get_team_size(int Level) {
375 return returnValIfLevelIsActive(Level, state::getEffectivePTeamSize(), 1);
376}
377
378int omp_get_num_threads(void) {
379 return omp_get_level() != 1 ? 1 : state::getEffectivePTeamSize();
380}
381
382int omp_get_thread_limit(void) { return mapping::getMaxTeamThreads(); }
383
384int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); }
385
/// Accepted for API compatibility; the argument is ignored.
void omp_set_nested(int) {
}

/// Always reports 0 (false).
int omp_get_nested(void) {
  return 0;
}
389
390void omp_set_max_active_levels(int Levels) {
391 icv::MaxActiveLevels = Levels > 0 ? 1 : 0;
392}
393
394int omp_get_max_active_levels(void) { return icv::MaxActiveLevels; }
395
396omp_proc_bind_t omp_get_proc_bind(void) { return omp_proc_bind_false; }
397
/// Always reports 0 places.
int omp_get_num_places(void) {
  return 0;
}
399
400int omp_get_place_num_procs(int) { return omp_get_num_procs(); }
401
/// Not implemented yet; both arguments are ignored and nothing is written.
void omp_get_place_proc_ids(int, int *) {
  // TODO
}

/// Always reports place 0.
int omp_get_place_num(void) {
  return 0;
}

/// Always reports 0 places in the partition.
int omp_get_partition_num_places(void) {
  return 0;
}

/// Not implemented yet; the argument is ignored and nothing is written.
void omp_get_partition_place_nums(int *) {
  // TODO
}
413
/// Always reports 0.
int omp_get_cancellation(void) {
  return 0;
}

/// Accepted for API compatibility; the argument is ignored.
void omp_set_default_device(int) {
}

/// Always reports -1.
int omp_get_default_device(void) {
  return -1;
}
419
420int omp_get_num_devices(void) { return config::getNumDevices(); }
421
422int omp_get_device_num(void) { return config::getDeviceNum(); }
423
424int omp_get_num_teams(void) { return mapping::getNumberOfBlocksInKernel(); }
425
426int omp_get_team_num() { return mapping::getBlockIdInKernel(); }
427
/// Always reports -1.
int omp_get_initial_device(void) {
  return -1;
}

/// Always reports 0.
int omp_is_initial_device(void) {
  return 0;
}
431}
432
433extern "C" {
434[[clang::noinline]] void *__kmpc_alloc_shared(uint64_t Bytes) {
435 return memory::allocShared(Bytes, "Frontend alloc shared");
436}
437
438[[clang::noinline]] void __kmpc_free_shared(void *Ptr, uint64_t Bytes) {
439 memory::freeShared(Ptr, Bytes, "Frontend free shared");
440}
441
442void *__kmpc_get_dynamic_shared() { return memory::getDynamicBuffer(); }
443
444void *llvm_omp_target_dynamic_shared_alloc() {
445 return __kmpc_get_dynamic_shared();
446}
447
448void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); }
449
/// Allocate storage in shared memory to communicate arguments from the main
/// thread to the workers in generic mode. If we exceed
/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64;

/// Fixed-size argument array, used by __kmpc_begin_sharing_variables() when
/// at most NUM_SHARED_VARIABLES_IN_SHARED_MEM arguments are shared.
[[clang::loader_uninitialized]] static void
    *SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
#pragma omp allocate(SharedMemVariableSharingSpace)                            \
    allocator(omp_pteam_mem_alloc)
/// The argument array currently in use: either the fixed-size array above or
/// a global memory allocation made by __kmpc_begin_sharing_variables().
[[clang::loader_uninitialized]] static void **SharedMemVariableSharingSpacePtr;
#pragma omp allocate(SharedMemVariableSharingSpacePtr)                         \
    allocator(omp_pteam_mem_alloc)
462
463void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) {
464 if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {
465 SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0];
466 } else {
467 SharedMemVariableSharingSpacePtr = (void **)memory::allocGlobal(
468 nArgs * sizeof(void *), "new extended args");
469 ASSERT(SharedMemVariableSharingSpacePtr != nullptr,
470 "Nullptr returned by malloc!");
471 }
472 *GlobalArgs = SharedMemVariableSharingSpacePtr;
473}
474
475void __kmpc_end_sharing_variables() {
476 if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0])
477 memory::freeGlobal(SharedMemVariableSharingSpacePtr, "new extended args");
478}
479
480void __kmpc_get_shared_variables(void ***GlobalArgs) {
481 *GlobalArgs = SharedMemVariableSharingSpacePtr;
482}
483}
484#pragma omp end declare target
485

// Source: offload/DeviceRTL/src/State.cpp