//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//

#include "Shared/Environment.h"

#include "Allocator.h"
#include "Configuration.h"
#include "Debug.h"
#include "DeviceTypes.h"
#include "DeviceUtils.h"
#include "Interface.h"
#include "LibC.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"

using namespace ompx;

/// Memory implementation
///
///{

/// External symbol to access dynamic shared memory.
[[gnu::aligned(
    allocator::ALIGNMENT)]] extern Local<unsigned char> DynamicSharedBuffer[];

/// The kernel environment passed to the init method by the compiler.
[[clang::loader_uninitialized]] static Local<KernelEnvironmentTy *>
    KernelEnvironmentPtr;

/// The kernel launch environment passed as argument to the kernel by the
/// runtime.
[[clang::loader_uninitialized]] static Local<KernelLaunchEnvironmentTy *>
    KernelLaunchEnvironmentPtr;

///}

namespace {

/// Fallback implementations are missing to trigger a link time error.
/// Implementations for new devices, including the host, should go into a
/// dedicated begin/end declare variant.
///
///{
extern "C" {
#ifdef __AMDGPU__

[[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); }
[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }

#else

[[gnu::weak, gnu::leaf]] void *malloc(size_t Size);
[[gnu::weak, gnu::leaf]] void free(void *Ptr);

#endif
}
///}

67 | /// A "smart" stack in shared memory. |
68 | /// |
69 | /// The stack exposes a malloc/free interface but works like a stack internally. |
70 | /// In fact, it is a separate stack *per warp*. That means, each warp must push |
71 | /// and pop symmetrically or this breaks, badly. The implementation will (aim |
72 | /// to) detect non-lock-step warps and fallback to malloc/free. The same will |
73 | /// happen if a warp runs out of memory. The master warp in generic memory is |
74 | /// special and is given more memory than the rest. |
75 | /// |
struct SharedMemorySmartStackTy {
  /// Initialize the stack. Must be called by all threads.
  void init(bool IsSPMD);

  /// Allocate \p Bytes on the stack for the encountering thread. Each thread
  /// can call this function.
  void *push(uint64_t Bytes);

  /// Deallocate the last allocation made by the encountering thread and pointed
  /// to by \p Ptr from the stack. Each thread can call this function.
  void pop(void *Ptr, uint64_t Bytes);

private:
  /// Compute the size of the storage space reserved for a thread.
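  /// Illustrative arithmetic (numbers are hypothetical, not the configured
  /// defaults): a 16 KiB scratchpad shared by a block of 256 threads yields
  /// 16384 / 256 = 64 bytes per thread, rounded down to a multiple of
  /// allocator::ALIGNMENT.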
  uint32_t computeThreadStorageTotal() {
    uint32_t NumLanesInBlock = mapping::getNumberOfThreadsInBlock();
    return __builtin_align_down(state::SharedScratchpadSize / NumLanesInBlock,
                                allocator::ALIGNMENT);
  }

  /// Return the top address of the thread's data stack, that is the first
  /// address this thread will allocate memory at next.
  void *getThreadDataTop(uint32_t TId) {
    return &Data[computeThreadStorageTotal() * TId + Usage[TId]];
  }

  /// The actual storage, shared among all warps.
  [[gnu::aligned(
      allocator::ALIGNMENT)]] unsigned char Data[state::SharedScratchpadSize];
  [[gnu::aligned(
      allocator::ALIGNMENT)]] unsigned char Usage[mapping::MaxThreadsPerTeam];
};

static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
              "Shared scratchpad of this size not supported yet.");

/// The allocation of a single shared memory scratchpad.
[[clang::loader_uninitialized]] static Local<SharedMemorySmartStackTy>
    SharedMemorySmartStack;

void SharedMemorySmartStackTy::init(bool IsSPMD) {
  Usage[mapping::getThreadIdInBlock()] = 0;
}

void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
  // First align the number of requested bytes.
  /// FIXME: The stack shouldn't require worst-case padding. Alignment needs to
  /// be passed in as an argument and the stack rewritten to support it.
  uint64_t AlignedBytes = __builtin_align_up(Bytes, allocator::ALIGNMENT);

  uint32_t StorageTotal = computeThreadStorageTotal();

  // The main thread in generic mode gets the space of its entire warp as the
  // other threads do not participate in any computation at all.
  if (mapping::isMainThreadInGenericMode())
    StorageTotal *= mapping::getWarpSize();

  int TId = mapping::getThreadIdInBlock();
  if (Usage[TId] + AlignedBytes <= StorageTotal) {
    void *Ptr = getThreadDataTop(TId);
    Usage[TId] += AlignedBytes;
    return Ptr;
  }

  if (config::isDebugMode(DeviceDebugKind::CommonIssues))
    printf("Shared memory stack full, fallback to dynamic allocation of global "
           "memory will negatively impact performance.\n");
  void *GlobalMemory = memory::allocGlobal(
      AlignedBytes, "Slow path shared memory allocation, insufficient "
                    "shared memory stack memory!");
  ASSERT(GlobalMemory != nullptr, "nullptr returned by malloc!");

  return GlobalMemory;
}

void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) {
  uint64_t AlignedBytes = __builtin_align_up(Bytes, allocator::ALIGNMENT);
  if (utils::isSharedMemPtr(Ptr)) {
    int TId = mapping::getThreadIdInBlock();
    Usage[TId] -= AlignedBytes;
    return;
  }
  memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
}

} // namespace

void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }

void *memory::allocShared(uint64_t Bytes, const char *Reason) {
  return SharedMemorySmartStack.push(Bytes);
}

void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) {
  SharedMemorySmartStack.pop(Ptr, Bytes);
}

void *memory::allocGlobal(uint64_t Bytes, const char *Reason) {
  void *Ptr = malloc(Bytes);
  if (config::isDebugMode(DeviceDebugKind::CommonIssues) && Ptr == nullptr)
    printf("nullptr returned by malloc!\n");
  return Ptr;
}

void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); }

///}

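// Note: the comparisons below intentionally use the non-short-circuiting '&'
// so every member is evaluated unconditionally; presumably this keeps the
// comparison branch-free, which is cheaper on the GPU than divergent control
// flow.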
bool state::ICVStateTy::operator==(const ICVStateTy &Other) const {
  return (NThreadsVar == Other.NThreadsVar) & (LevelVar == Other.LevelVar) &
         (ActiveLevelVar == Other.ActiveLevelVar) &
         (MaxActiveLevelsVar == Other.MaxActiveLevelsVar) &
         (RunSchedVar == Other.RunSchedVar) &
         (RunSchedChunkVar == Other.RunSchedChunkVar);
}

void state::ICVStateTy::assertEqual(const ICVStateTy &Other) const {
  ASSERT(NThreadsVar == Other.NThreadsVar, nullptr);
  ASSERT(LevelVar == Other.LevelVar, nullptr);
  ASSERT(ActiveLevelVar == Other.ActiveLevelVar, nullptr);
  ASSERT(MaxActiveLevelsVar == Other.MaxActiveLevelsVar, nullptr);
  ASSERT(RunSchedVar == Other.RunSchedVar, nullptr);
  ASSERT(RunSchedChunkVar == Other.RunSchedChunkVar, nullptr);
}

void state::TeamStateTy::init(bool IsSPMD) {
  ICVState.NThreadsVar = 0;
  ICVState.LevelVar = 0;
  ICVState.ActiveLevelVar = 0;
  ICVState.Padding0Val = 0;
  ICVState.MaxActiveLevelsVar = 1;
  ICVState.RunSchedVar = omp_sched_static;
  ICVState.RunSchedChunkVar = 1;
  ParallelTeamSize = 1;
  HasThreadState = false;
  ParallelRegionFnVar = nullptr;
}

bool state::TeamStateTy::operator==(const TeamStateTy &Other) const {
  return (ICVState == Other.ICVState) &
         (HasThreadState == Other.HasThreadState) &
         (ParallelTeamSize == Other.ParallelTeamSize);
}

void state::TeamStateTy::assertEqual(TeamStateTy &Other) const {
  ICVState.assertEqual(Other.ICVState);
  ASSERT(ParallelTeamSize == Other.ParallelTeamSize, nullptr);
  ASSERT(HasThreadState == Other.HasThreadState, nullptr);
}

[[clang::loader_uninitialized]] Local<state::TeamStateTy>
    ompx::state::TeamState;
[[clang::loader_uninitialized]] Local<state::ThreadStateTy **>
    ompx::state::ThreadStates;

namespace {

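/// Return \p Val if \p Level refers to the (single) active parallel level,
/// \p DefaultVal if it refers to the initial level (0) or an inactive level,
/// and \p OutOfBoundsVal if it is negative or larger than the current nesting
/// level. For example, omp_get_ancestor_thread_num(0) maps to the DefaultVal
/// of 0, the id of the initial thread.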
int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
                             int OutOfBoundsVal = -1) {
  if (Level == 0)
    return DefaultVal;
  int LevelVar = omp_get_level();
  if (OMP_UNLIKELY(Level < 0 || Level > LevelVar))
    return OutOfBoundsVal;
  int ActiveLevel = icv::ActiveLevel;
  if (OMP_UNLIKELY(Level != ActiveLevel))
    return DefaultVal;
  return Val;
}

} // namespace

void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
                 KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
  SharedMemorySmartStack.init(IsSPMD);
  if (mapping::isInitialThreadInLevel0(IsSPMD)) {
    TeamState.init(IsSPMD);
    ThreadStates = nullptr;
    KernelEnvironmentPtr = &KernelEnvironment;
    KernelLaunchEnvironmentPtr = &KernelLaunchEnvironment;
  }
}

KernelEnvironmentTy &state::getKernelEnvironment() {
  return *KernelEnvironmentPtr;
}

KernelLaunchEnvironmentTy &state::getKernelLaunchEnvironment() {
  return *KernelLaunchEnvironmentPtr;
}

void state::enterDataEnvironment(IdentTy *Ident) {
  ASSERT(config::mayUseThreadStates(),
         "Thread state modified while explicitly disabled!");
  if (!config::mayUseThreadStates())
    return;

  unsigned TId = mapping::getThreadIdInBlock();
  ThreadStateTy *NewThreadState = static_cast<ThreadStateTy *>(
      memory::allocGlobal(sizeof(ThreadStateTy), "ThreadStates alloc"));
  uintptr_t *ThreadStatesBitsPtr = reinterpret_cast<uintptr_t *>(&ThreadStates);
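  // Lazily allocate the per-block ThreadStates array on first use. Racing
  // threads each allocate and zero a candidate array; a single compare-and-swap
  // picks the winner and the losers free their copy again.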
  if (!atomic::load(ThreadStatesBitsPtr, atomic::seq_cst)) {
    uint32_t Bytes =
        sizeof(ThreadStates[0]) * mapping::getNumberOfThreadsInBlock();
    void *ThreadStatesPtr =
        memory::allocGlobal(Bytes, "Thread state array allocation");
    __builtin_memset(ThreadStatesPtr, 0, Bytes);
    if (!atomic::cas(ThreadStatesBitsPtr, uintptr_t(0),
                     reinterpret_cast<uintptr_t>(ThreadStatesPtr),
                     atomic::seq_cst, atomic::seq_cst))
      memory::freeGlobal(ThreadStatesPtr,
                         "Thread state array allocated multiple times");
    ASSERT(atomic::load(ThreadStatesBitsPtr, atomic::seq_cst),
           "Expected valid thread states bit!");
  }
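  // Push the new state onto this thread's stack of states: init() chains it to
  // the previous top (if any) so that exitDataEnvironment / resetStateForThread
  // can pop back to it later.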
  NewThreadState->init(ThreadStates[TId]);
  TeamState.HasThreadState = true;
  ThreadStates[TId] = NewThreadState;
}

void state::exitDataEnvironment() {
  ASSERT(config::mayUseThreadStates(),
         "Thread state modified while explicitly disabled!");

  unsigned TId = mapping::getThreadIdInBlock();
  resetStateForThread(TId);
}

void state::resetStateForThread(uint32_t TId) {
  if (!config::mayUseThreadStates())
    return;
  if (OMP_LIKELY(!TeamState.HasThreadState || !ThreadStates[TId]))
    return;

  ThreadStateTy *PreviousThreadState = ThreadStates[TId]->PreviousThreadState;
  memory::freeGlobal(ThreadStates[TId], "ThreadStates dealloc");
  ThreadStates[TId] = PreviousThreadState;
}

void state::runAndCheckState(void(Func(void))) {
  TeamStateTy OldTeamState = TeamState;
  OldTeamState.assertEqual(TeamState);

  Func();

  OldTeamState.assertEqual(TeamState);
}

void state::assumeInitialState(bool IsSPMD) {
  TeamStateTy InitialTeamState;
  InitialTeamState.init(IsSPMD);
  InitialTeamState.assertEqual(TeamState);
  ASSERT(mapping::isSPMDMode() == IsSPMD, nullptr);
}

int state::getEffectivePTeamSize() {
  int PTeamSize = state::ParallelTeamSize;
  return PTeamSize ? PTeamSize : mapping::getMaxTeamThreads();
}

extern "C" {
void omp_set_dynamic(int V) {}

int omp_get_dynamic(void) { return 0; }

void omp_set_num_threads(int V) { icv::NThreads = V; }

int omp_get_max_threads(void) {
  int NT = icv::NThreads;
  return NT > 0 ? NT : mapping::getMaxTeamThreads();
}

int omp_get_level(void) {
  int LevelVar = icv::Level;
  ASSERT(LevelVar >= 0, nullptr);
  return LevelVar;
}

int omp_get_active_level(void) { return !!icv::ActiveLevel; }

int omp_in_parallel(void) { return !!icv::ActiveLevel; }

void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) {
  *ScheduleKind = static_cast<omp_sched_t>((int)icv::RunSched);
  *ChunkSize = state::RunSchedChunk;
}

void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) {
  icv::RunSched = (int)ScheduleKind;
  state::RunSchedChunk = ChunkSize;
}

int omp_get_ancestor_thread_num(int Level) {
  return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0);
}

int omp_get_thread_num(void) {
  return omp_get_ancestor_thread_num(omp_get_level());
}

int omp_get_team_size(int Level) {
  return returnValIfLevelIsActive(Level, state::getEffectivePTeamSize(), 1);
}

int omp_get_num_threads(void) {
  return omp_get_level() != 1 ? 1 : state::getEffectivePTeamSize();
}

int omp_get_thread_limit(void) { return mapping::getMaxTeamThreads(); }

int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); }

void omp_set_nested(int) {}

int omp_get_nested(void) { return false; }

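// Nested parallelism is not supported by the device runtime; requests for more
// than one active level are clamped to one.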
void omp_set_max_active_levels(int Levels) {
  icv::MaxActiveLevels = Levels > 0 ? 1 : 0;
}

int omp_get_max_active_levels(void) { return icv::MaxActiveLevels; }

omp_proc_bind_t omp_get_proc_bind(void) { return omp_proc_bind_false; }

int omp_get_num_places(void) { return 0; }

int omp_get_place_num_procs(int) { return omp_get_num_procs(); }

void omp_get_place_proc_ids(int, int *) {
  // TODO
}

int omp_get_place_num(void) { return 0; }

int omp_get_partition_num_places(void) { return 0; }

void omp_get_partition_place_nums(int *) {
  // TODO
}

int omp_get_cancellation(void) { return 0; }

void omp_set_default_device(int) {}

int omp_get_default_device(void) { return -1; }

int omp_get_num_devices(void) { return config::getNumDevices(); }

int omp_get_device_num(void) { return config::getDeviceNum(); }

int omp_get_num_teams(void) { return mapping::getNumberOfBlocksInKernel(); }

int omp_get_team_num() { return mapping::getBlockIdInKernel(); }

int omp_get_initial_device(void) { return -1; }

int omp_is_initial_device(void) { return 0; }
}

extern "C" {
[[clang::noinline]] void *__kmpc_alloc_shared(uint64_t Bytes) {
  return memory::allocShared(Bytes, "Frontend alloc shared");
}

[[clang::noinline]] void __kmpc_free_shared(void *Ptr, uint64_t Bytes) {
  memory::freeShared(Ptr, Bytes, "Frontend free shared");
}
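
// Illustrative sketch (not emitted by the runtime itself): the compiler pairs
// these calls per thread and in LIFO order, roughly like
//   void *A = __kmpc_alloc_shared(16);
//   void *B = __kmpc_alloc_shared(32);
//   ... use A and B ...
//   __kmpc_free_shared(B, 32);
//   __kmpc_free_shared(A, 16);
// Unbalanced or out-of-order pairs can corrupt the per-thread bookkeeping in
// SharedMemorySmartStackTy.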

void *__kmpc_get_dynamic_shared() { return memory::getDynamicBuffer(); }

void *llvm_omp_target_dynamic_shared_alloc() {
  return __kmpc_get_dynamic_shared();
}

void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); }

/// Allocate storage in shared memory to communicate arguments from the main
/// thread to the workers in generic mode. If we exceed
/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64;

[[clang::loader_uninitialized]] static Local<void *>
    SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
[[clang::loader_uninitialized]] static Local<void **>
    SharedMemVariableSharingSpacePtr;

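// Illustrative protocol sketch (hypothetical call sites, not runtime code):
// the main thread publishes nArgs pointers before waking the workers, each
// worker retrieves the same buffer, and the main thread tears it down again:
//   void **Args;
//   __kmpc_begin_sharing_variables(&Args, /*nArgs=*/2);   // main thread
//   Args[0] = &X; Args[1] = &Y;
//   ... workers call __kmpc_get_shared_variables(&Args) and read X, Y ...
//   __kmpc_end_sharing_variables();                        // main thread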
void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) {
  if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {
    SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0];
  } else {
    SharedMemVariableSharingSpacePtr = (void **)memory::allocGlobal(
        nArgs * sizeof(void *), "new extended args");
    ASSERT(SharedMemVariableSharingSpacePtr != nullptr,
           "Nullptr returned by malloc!");
  }
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}

void __kmpc_end_sharing_variables() {
  if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0])
    memory::freeGlobal(SharedMemVariableSharingSpacePtr, "new extended args");
}

void __kmpc_get_shared_variables(void ***GlobalArgs) {
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}
}