1 | //===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | //===----------------------------------------------------------------------===// |
10 | |
11 | #include "Shared/Environment.h" |
12 | |
13 | #include "Allocator.h" |
14 | #include "Configuration.h" |
15 | #include "Debug.h" |
16 | #include "Interface.h" |
17 | #include "LibC.h" |
18 | #include "Mapping.h" |
19 | #include "State.h" |
20 | #include "Synchronization.h" |
21 | #include "Types.h" |
22 | #include "Utils.h" |
23 | |
24 | using namespace ompx; |
25 | |
26 | #pragma omp begin declare target device_type(nohost) |
27 | |
28 | /// Memory implementation |
29 | /// |
30 | ///{ |
31 | |
32 | /// External symbol to access dynamic shared memory. |
33 | [[gnu::aligned( |
34 | allocator::ALIGNMENT)]] extern unsigned char DynamicSharedBuffer[]; |
35 | #pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc) |
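
/// Device code obtains this buffer through the accessors defined later in this
/// file, e.g. (illustrative):
/// \code
///   void *Buf = llvm_omp_target_dynamic_shared_alloc();
/// \endcode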
36 | |
37 | /// The kernel environment passed to the init method by the compiler. |
38 | static KernelEnvironmentTy *SHARED(KernelEnvironmentPtr); |
39 | |
40 | /// The kernel launch environment passed as argument to the kernel by the |
41 | /// runtime. |
42 | static KernelLaunchEnvironmentTy *SHARED(KernelLaunchEnvironmentPtr); |
43 | |
44 | ///} |
45 | |
46 | namespace { |
47 | |
/// Fallback implementations are intentionally missing so that any use triggers
/// a link-time error. Implementations for new devices, including the host,
/// should go into a dedicated begin/end declare variant.
51 | /// |
52 | ///{ |
53 | extern "C" { |
54 | #ifdef __AMDGPU__ |
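
// On AMDGPU the device runtime provides malloc/free itself, backed by its own
// allocator. The weak linkage lets a device libc definition take precedence.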
55 | |
56 | [[gnu::weak]] void *malloc(uint64_t Size) { return allocator::alloc(Size); } |
57 | [[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); } |
58 | |
59 | #else |
60 | |
61 | [[gnu::weak, gnu::leaf]] void *malloc(uint64_t Size); |
62 | [[gnu::weak, gnu::leaf]] void free(void *Ptr); |
63 | |
64 | #endif |
65 | } |
66 | ///} |
67 | |
68 | /// A "smart" stack in shared memory. |
69 | /// |
/// The stack exposes a malloc/free interface but works like a stack
/// internally. In fact, it is a separate stack *per thread*. That means each
/// thread must push and pop symmetrically or this breaks, badly. The
/// implementation will (aim to) detect non-lock-step warps and fall back to
/// malloc/free. The same will happen if a thread runs out of memory. The main
/// thread in generic mode is special and is given more memory than the rest.
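///
/// A symmetric use through the public interface looks like this (illustrative
/// sketch, not code from this file):
/// \code
///   void *Ptr = memory::allocShared(Bytes, "example");
///   // ... use Ptr ...
///   memory::freeShared(Ptr, Bytes, "example");
/// \endcode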
76 | /// |
77 | struct SharedMemorySmartStackTy { |
78 | /// Initialize the stack. Must be called by all threads. |
79 | void init(bool IsSPMD); |
80 | |
81 | /// Allocate \p Bytes on the stack for the encountering thread. Each thread |
82 | /// can call this function. |
83 | void *push(uint64_t Bytes); |
84 | |
85 | /// Deallocate the last allocation made by the encountering thread and pointed |
86 | /// to by \p Ptr from the stack. Each thread can call this function. |
87 | void pop(void *Ptr, uint32_t Bytes); |
88 | |
89 | private: |
90 | /// Compute the size of the storage space reserved for a thread. |
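  /// E.g., with a hypothetical 8192-byte scratchpad, an ALIGNMENT of 16, and
  /// 320 threads in the block, each thread gets align_down(8192 / 320, 16) =
  /// 16 bytes.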
91 | uint32_t computeThreadStorageTotal() { |
92 | uint32_t NumLanesInBlock = mapping::getNumberOfThreadsInBlock(); |
93 | return utils::align_down((state::SharedScratchpadSize / NumLanesInBlock), |
94 | allocator::ALIGNMENT); |
95 | } |
96 | |
  /// Return the top address of the thread's data stack, that is the first
  /// address this thread will allocate memory at next.
99 | void *getThreadDataTop(uint32_t TId) { |
100 | return &Data[computeThreadStorageTotal() * TId + Usage[TId]]; |
101 | } |
102 | |
103 | /// The actual storage, shared among all warps. |
104 | [[gnu::aligned( |
105 | allocator::ALIGNMENT)]] unsigned char Data[state::SharedScratchpadSize]; |
106 | [[gnu::aligned( |
107 | allocator::ALIGNMENT)]] unsigned char Usage[mapping::MaxThreadsPerTeam]; |
108 | }; |
109 | |
110 | static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256, |
111 | "Shared scratchpad of this size not supported yet." ); |
112 | |
113 | /// The allocation of a single shared memory scratchpad. |
114 | static SharedMemorySmartStackTy SHARED(SharedMemorySmartStack); |
115 | |
116 | void SharedMemorySmartStackTy::init(bool IsSPMD) { |
117 | Usage[mapping::getThreadIdInBlock()] = 0; |
118 | } |
119 | |
120 | void *SharedMemorySmartStackTy::push(uint64_t Bytes) { |
121 | // First align the number of requested bytes. |
  // FIXME: The stack shouldn't require worst-case padding. Alignment needs to
  // be passed in as an argument and the stack rewritten to support it.
124 | uint64_t AlignedBytes = utils::align_up(Bytes, allocator::ALIGNMENT); |
125 | |
126 | uint32_t StorageTotal = computeThreadStorageTotal(); |
127 | |
128 | // The main thread in generic mode gets the space of its entire warp as the |
129 | // other threads do not participate in any computation at all. |
130 | if (mapping::isMainThreadInGenericMode()) |
131 | StorageTotal *= mapping::getWarpSize(); |
132 | |
133 | int TId = mapping::getThreadIdInBlock(); |
134 | if (Usage[TId] + AlignedBytes <= StorageTotal) { |
135 | void *Ptr = getThreadDataTop(TId); |
136 | Usage[TId] += AlignedBytes; |
137 | return Ptr; |
138 | } |
139 | |
140 | if (config::isDebugMode(DeviceDebugKind::CommonIssues)) |
141 | PRINT("Shared memory stack full, fallback to dynamic allocation of global " |
142 | "memory will negatively impact performance.\n" ); |
143 | void *GlobalMemory = memory::allocGlobal( |
144 | AlignedBytes, "Slow path shared memory allocation, insufficient " |
145 | "shared memory stack memory!" ); |
146 | ASSERT(GlobalMemory != nullptr, "nullptr returned by malloc!" ); |
147 | |
148 | return GlobalMemory; |
149 | } |
150 | |
151 | void SharedMemorySmartStackTy::pop(void *Ptr, uint32_t Bytes) { |
152 | uint64_t AlignedBytes = utils::align_up(Bytes, allocator::ALIGNMENT); |
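  // Allocations served from the stack live in shared memory; anything else
  // came from the global-memory fallback in push() and must be freed there.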
153 | if (utils::isSharedMemPtr(Ptr)) { |
154 | int TId = mapping::getThreadIdInBlock(); |
155 | Usage[TId] -= AlignedBytes; |
156 | return; |
157 | } |
158 | memory::freeGlobal(Ptr, "Slow path shared memory deallocation" ); |
159 | } |
160 | |
161 | } // namespace |
162 | |
163 | void *memory::getDynamicBuffer() { return DynamicSharedBuffer; } |
164 | |
165 | void *memory::allocShared(uint64_t Bytes, const char *Reason) { |
166 | return SharedMemorySmartStack.push(Bytes); |
167 | } |
168 | |
169 | void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) { |
170 | SharedMemorySmartStack.pop(Ptr, Bytes); |
171 | } |
172 | |
173 | void *memory::allocGlobal(uint64_t Bytes, const char *Reason) { |
174 | void *Ptr = malloc(Bytes); |
175 | if (config::isDebugMode(DeviceDebugKind::CommonIssues) && Ptr == nullptr) |
176 | PRINT("nullptr returned by malloc!\n" ); |
177 | return Ptr; |
178 | } |
179 | |
180 | void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); } |
181 | |
182 | ///} |
183 | |
184 | bool state::ICVStateTy::operator==(const ICVStateTy &Other) const { |
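  // Note: bitwise & rather than && avoids short-circuit branches; all fields
  // are compared and the results combined branch-free, which is cheap on GPU.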
185 | return (NThreadsVar == Other.NThreadsVar) & (LevelVar == Other.LevelVar) & |
186 | (ActiveLevelVar == Other.ActiveLevelVar) & |
187 | (MaxActiveLevelsVar == Other.MaxActiveLevelsVar) & |
188 | (RunSchedVar == Other.RunSchedVar) & |
189 | (RunSchedChunkVar == Other.RunSchedChunkVar); |
190 | } |
191 | |
192 | void state::ICVStateTy::assertEqual(const ICVStateTy &Other) const { |
193 | ASSERT(NThreadsVar == Other.NThreadsVar, nullptr); |
194 | ASSERT(LevelVar == Other.LevelVar, nullptr); |
195 | ASSERT(ActiveLevelVar == Other.ActiveLevelVar, nullptr); |
196 | ASSERT(MaxActiveLevelsVar == Other.MaxActiveLevelsVar, nullptr); |
197 | ASSERT(RunSchedVar == Other.RunSchedVar, nullptr); |
198 | ASSERT(RunSchedChunkVar == Other.RunSchedChunkVar, nullptr); |
199 | } |
200 | |
201 | void state::TeamStateTy::init(bool IsSPMD) { |
202 | ICVState.NThreadsVar = 0; |
203 | ICVState.LevelVar = 0; |
204 | ICVState.ActiveLevelVar = 0; |
205 | ICVState.Padding0Val = 0; |
206 | ICVState.MaxActiveLevelsVar = 1; |
207 | ICVState.RunSchedVar = omp_sched_static; |
208 | ICVState.RunSchedChunkVar = 1; |
209 | ParallelTeamSize = 1; |
210 | HasThreadState = false; |
211 | ParallelRegionFnVar = nullptr; |
212 | } |
213 | |
214 | bool state::TeamStateTy::operator==(const TeamStateTy &Other) const { |
215 | return (ICVState == Other.ICVState) & |
216 | (HasThreadState == Other.HasThreadState) & |
217 | (ParallelTeamSize == Other.ParallelTeamSize); |
218 | } |
219 | |
220 | void state::TeamStateTy::assertEqual(TeamStateTy &Other) const { |
221 | ICVState.assertEqual(Other.ICVState); |
222 | ASSERT(ParallelTeamSize == Other.ParallelTeamSize, nullptr); |
223 | ASSERT(HasThreadState == Other.HasThreadState, nullptr); |
224 | } |
225 | |
226 | state::TeamStateTy SHARED(ompx::state::TeamState); |
227 | state::ThreadStateTy **SHARED(ompx::state::ThreadStates); |
228 | |
229 | namespace { |
230 | |
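/// Return \p Val if \p Level is the active level, \p DefaultVal for level 0
/// and inactive levels, and \p OutOfBoundsVal for levels that do not exist.
/// E.g., omp_get_ancestor_thread_num(0) maps to the default value 0 below.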
231 | int returnValIfLevelIsActive(int Level, int Val, int DefaultVal, |
232 | int OutOfBoundsVal = -1) { |
233 | if (Level == 0) |
234 | return DefaultVal; |
235 | int LevelVar = omp_get_level(); |
236 | if (OMP_UNLIKELY(Level < 0 || Level > LevelVar)) |
237 | return OutOfBoundsVal; |
238 | int ActiveLevel = icv::ActiveLevel; |
239 | if (OMP_UNLIKELY(Level != ActiveLevel)) |
240 | return DefaultVal; |
241 | return Val; |
242 | } |
243 | |
244 | } // namespace |
245 | |
246 | void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, |
247 | KernelLaunchEnvironmentTy &KernelLaunchEnvironment) { |
248 | SharedMemorySmartStack.init(IsSPMD); |
249 | if (mapping::isInitialThreadInLevel0(IsSPMD)) { |
250 | TeamState.init(IsSPMD); |
251 | ThreadStates = nullptr; |
252 | KernelEnvironmentPtr = &KernelEnvironment; |
253 | KernelLaunchEnvironmentPtr = &KernelLaunchEnvironment; |
254 | } |
255 | } |
256 | |
257 | KernelEnvironmentTy &state::getKernelEnvironment() { |
258 | return *KernelEnvironmentPtr; |
259 | } |
260 | |
261 | KernelLaunchEnvironmentTy &state::getKernelLaunchEnvironment() { |
262 | return *KernelLaunchEnvironmentPtr; |
263 | } |
264 | |
265 | void state::enterDataEnvironment(IdentTy *Ident) { |
  ASSERT(config::mayUseThreadStates(),
         "Thread state modified while explicitly disabled!");
268 | if (!config::mayUseThreadStates()) |
269 | return; |
270 | |
271 | unsigned TId = mapping::getThreadIdInBlock(); |
  ThreadStateTy *NewThreadState = static_cast<ThreadStateTy *>(
      memory::allocGlobal(sizeof(ThreadStateTy), "ThreadStates alloc"));
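  // Lazily allocate the ThreadStates array on first use; the CAS below makes
  // sure exactly one racing allocation is kept and the losers are freed again.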
274 | uintptr_t *ThreadStatesBitsPtr = reinterpret_cast<uintptr_t *>(&ThreadStates); |
275 | if (!atomic::load(ThreadStatesBitsPtr, atomic::seq_cst)) { |
276 | uint32_t Bytes = |
277 | sizeof(ThreadStates[0]) * mapping::getNumberOfThreadsInBlock(); |
    void *ThreadStatesPtr =
        memory::allocGlobal(Bytes, "Thread state array allocation");
280 | memset(ThreadStatesPtr, 0, Bytes); |
281 | if (!atomic::cas(ThreadStatesBitsPtr, uintptr_t(0), |
282 | reinterpret_cast<uintptr_t>(ThreadStatesPtr), |
283 | atomic::seq_cst, atomic::seq_cst)) |
      memory::freeGlobal(ThreadStatesPtr,
                         "Thread state array allocated multiple times");
    ASSERT(atomic::load(ThreadStatesBitsPtr, atomic::seq_cst),
           "Expected valid thread states bit!");
288 | } |
289 | NewThreadState->init(ThreadStates[TId]); |
290 | TeamState.HasThreadState = true; |
291 | ThreadStates[TId] = NewThreadState; |
292 | } |
293 | |
294 | void state::exitDataEnvironment() { |
  ASSERT(config::mayUseThreadStates(),
         "Thread state modified while explicitly disabled!");
297 | |
298 | unsigned TId = mapping::getThreadIdInBlock(); |
299 | resetStateForThread(TId); |
300 | } |
301 | |
302 | void state::resetStateForThread(uint32_t TId) { |
303 | if (!config::mayUseThreadStates()) |
304 | return; |
305 | if (OMP_LIKELY(!TeamState.HasThreadState || !ThreadStates[TId])) |
306 | return; |
307 | |
308 | ThreadStateTy *PreviousThreadState = ThreadStates[TId]->PreviousThreadState; |
  memory::freeGlobal(ThreadStates[TId], "ThreadStates dealloc");
310 | ThreadStates[TId] = PreviousThreadState; |
311 | } |
312 | |
313 | void state::runAndCheckState(void(Func(void))) { |
314 | TeamStateTy OldTeamState = TeamState; |
315 | OldTeamState.assertEqual(TeamState); |
316 | |
317 | Func(); |
318 | |
319 | OldTeamState.assertEqual(TeamState); |
320 | } |
321 | |
322 | void state::assumeInitialState(bool IsSPMD) { |
323 | TeamStateTy InitialTeamState; |
324 | InitialTeamState.init(IsSPMD); |
325 | InitialTeamState.assertEqual(TeamState); |
326 | ASSERT(mapping::isSPMDMode() == IsSPMD, nullptr); |
327 | } |
328 | |
329 | int state::getEffectivePTeamSize() { |
330 | int PTeamSize = state::ParallelTeamSize; |
331 | return PTeamSize ? PTeamSize : mapping::getMaxTeamThreads(); |
332 | } |
333 | |
334 | extern "C" { |
335 | void omp_set_dynamic(int V) {} |
336 | |
337 | int omp_get_dynamic(void) { return 0; } |
338 | |
339 | void omp_set_num_threads(int V) { icv::NThreads = V; } |
340 | |
341 | int omp_get_max_threads(void) { |
342 | int NT = icv::NThreads; |
343 | return NT > 0 ? NT : mapping::getMaxTeamThreads(); |
344 | } |
345 | |
346 | int omp_get_level(void) { |
347 | int LevelVar = icv::Level; |
348 | ASSERT(LevelVar >= 0, nullptr); |
349 | return LevelVar; |
350 | } |
351 | |
352 | int omp_get_active_level(void) { return !!icv::ActiveLevel; } |
353 | |
354 | int omp_in_parallel(void) { return !!icv::ActiveLevel; } |
355 | |
356 | void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) { |
357 | *ScheduleKind = static_cast<omp_sched_t>((int)icv::RunSched); |
358 | *ChunkSize = state::RunSchedChunk; |
359 | } |
360 | |
361 | void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) { |
362 | icv::RunSched = (int)ScheduleKind; |
363 | state::RunSchedChunk = ChunkSize; |
364 | } |
365 | |
366 | int omp_get_ancestor_thread_num(int Level) { |
367 | return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0); |
368 | } |
369 | |
370 | int omp_get_thread_num(void) { |
  return omp_get_ancestor_thread_num(omp_get_level());
372 | } |
373 | |
374 | int omp_get_team_size(int Level) { |
375 | return returnValIfLevelIsActive(Level, state::getEffectivePTeamSize(), 1); |
376 | } |
377 | |
378 | int omp_get_num_threads(void) { |
379 | return omp_get_level() != 1 ? 1 : state::getEffectivePTeamSize(); |
380 | } |
381 | |
382 | int omp_get_thread_limit(void) { return mapping::getMaxTeamThreads(); } |
383 | |
384 | int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); } |
385 | |
386 | void omp_set_nested(int) {} |
387 | |
388 | int omp_get_nested(void) { return false; } |
389 | |
390 | void omp_set_max_active_levels(int Levels) { |
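  // Nested parallelism is not supported on the device; clamp any positive
  // request to a single active level.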
391 | icv::MaxActiveLevels = Levels > 0 ? 1 : 0; |
392 | } |
393 | |
394 | int omp_get_max_active_levels(void) { return icv::MaxActiveLevels; } |
395 | |
396 | omp_proc_bind_t omp_get_proc_bind(void) { return omp_proc_bind_false; } |
397 | |
398 | int omp_get_num_places(void) { return 0; } |
399 | |
400 | int omp_get_place_num_procs(int) { return omp_get_num_procs(); } |
401 | |
402 | void omp_get_place_proc_ids(int, int *) { |
403 | // TODO |
404 | } |
405 | |
406 | int omp_get_place_num(void) { return 0; } |
407 | |
408 | int omp_get_partition_num_places(void) { return 0; } |
409 | |
410 | void omp_get_partition_place_nums(int *) { |
411 | // TODO |
412 | } |
413 | |
414 | int omp_get_cancellation(void) { return 0; } |
415 | |
416 | void omp_set_default_device(int) {} |
417 | |
418 | int omp_get_default_device(void) { return -1; } |
419 | |
420 | int omp_get_num_devices(void) { return config::getNumDevices(); } |
421 | |
422 | int omp_get_device_num(void) { return config::getDeviceNum(); } |
423 | |
424 | int omp_get_num_teams(void) { return mapping::getNumberOfBlocksInKernel(); } |
425 | |
426 | int omp_get_team_num() { return mapping::getBlockIdInKernel(); } |
427 | |
428 | int omp_get_initial_device(void) { return -1; } |
429 | |
430 | int omp_is_initial_device(void) { return 0; } |
431 | } |
432 | |
433 | extern "C" { |
434 | [[clang::noinline]] void *__kmpc_alloc_shared(uint64_t Bytes) { |
  return memory::allocShared(Bytes, "Frontend alloc shared");
436 | } |
437 | |
438 | [[clang::noinline]] void __kmpc_free_shared(void *Ptr, uint64_t Bytes) { |
  memory::freeShared(Ptr, Bytes, "Frontend free shared");
440 | } |
441 | |
442 | void *__kmpc_get_dynamic_shared() { return memory::getDynamicBuffer(); } |
443 | |
444 | void *llvm_omp_target_dynamic_shared_alloc() { |
445 | return __kmpc_get_dynamic_shared(); |
446 | } |
447 | |
448 | void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); } |
449 | |
450 | /// Allocate storage in shared memory to communicate arguments from the main |
451 | /// thread to the workers in generic mode. If we exceed |
452 | /// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication. |
453 | constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64; |
454 | |
455 | [[clang::loader_uninitialized]] static void |
456 | *SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM]; |
457 | #pragma omp allocate(SharedMemVariableSharingSpace) \ |
458 | allocator(omp_pteam_mem_alloc) |
459 | [[clang::loader_uninitialized]] static void **SharedMemVariableSharingSpacePtr; |
460 | #pragma omp allocate(SharedMemVariableSharingSpacePtr) \ |
461 | allocator(omp_pteam_mem_alloc) |
462 | |
463 | void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) { |
464 | if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) { |
465 | SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0]; |
466 | } else { |
    SharedMemVariableSharingSpacePtr = (void **)memory::allocGlobal(
        nArgs * sizeof(void *), "new extended args");
    ASSERT(SharedMemVariableSharingSpacePtr != nullptr,
           "Nullptr returned by malloc!");
471 | } |
472 | *GlobalArgs = SharedMemVariableSharingSpacePtr; |
473 | } |
474 | |
475 | void __kmpc_end_sharing_variables() { |
476 | if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0]) |
    memory::freeGlobal(SharedMemVariableSharingSpacePtr, "new extended args");
478 | } |
479 | |
480 | void __kmpc_get_shared_variables(void ***GlobalArgs) { |
481 | *GlobalArgs = SharedMemVariableSharingSpacePtr; |
482 | } |
483 | } |
484 | #pragma omp end declare target |
485 | |