//===----RTLs/amdgpu/src/rtl.cpp - Target RTLs Implementation ----- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL NextGen for AMDGPU machine
//
//===----------------------------------------------------------------------===//

#include <atomic>
#include <cassert>
#include <cstddef>
#include <deque>
#include <mutex>
#include <string>
#include <system_error>
#include <unistd.h>
#include <unordered_map>

#include "Shared/Debug.h"
#include "Shared/Environment.h"
#include "Shared/Utils.h"
#include "Utils/ELF.h"

#include "GlobalHandler.h"
#include "OpenMP/OMPT/Callback.h"
#include "PluginInterface.h"
#include "UtilitiesRTL.h"
#include "omptarget.h"

#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FileOutputBuffer.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Program.h"
#include "llvm/Support/raw_ostream.h"

#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) ||          \
    !defined(__ORDER_BIG_ENDIAN__)
#error "Missing preprocessor definitions for endianness detection."
#endif

// The HSA headers require these definitions.
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
#define LITTLEENDIAN_CPU
#elif defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
#define BIGENDIAN_CPU
#endif

#if defined(__has_include)
#if __has_include("hsa/hsa.h")
#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"
#elif __has_include("hsa.h")
#include "hsa.h"
#include "hsa_ext_amd.h"
#endif
#else
#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"
#endif

namespace llvm {
namespace omp {
namespace target {
namespace plugin {

/// Forward declarations for all specialized data structures.
struct AMDGPUKernelTy;
struct AMDGPUDeviceTy;
struct AMDGPUPluginTy;
struct AMDGPUStreamTy;
struct AMDGPUEventTy;
struct AMDGPUStreamManagerTy;
struct AMDGPUEventManagerTy;
struct AMDGPUDeviceImageTy;
struct AMDGPUMemoryManagerTy;
struct AMDGPUMemoryPoolTy;

namespace utils {

/// Iterate elements using an HSA iterate function. Do not use this function
/// directly; use the specialized wrappers below instead.
template <typename ElemTy, typename IterFuncTy, typename CallbackTy>
hsa_status_t iterate(IterFuncTy Func, CallbackTy Cb) {
  auto L = [](ElemTy Elem, void *Data) -> hsa_status_t {
    CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data);
    return (*Unwrapped)(Elem);
  };
  return Func(L, static_cast<void *>(&Cb));
}

/// Iterate elements using an HSA iterate function passing a parameter. Do not
/// use this function directly; use the specialized wrappers below instead.
template <typename ElemTy, typename IterFuncTy, typename IterFuncArgTy,
          typename CallbackTy>
hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) {
  auto L = [](ElemTy Elem, void *Data) -> hsa_status_t {
    CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data);
    return (*Unwrapped)(Elem);
  };
  return Func(FuncArg, L, static_cast<void *>(&Cb));
}

/// Iterate elements using an HSA iterate function passing a parameter. Do not
/// use this function directly; use the specialized wrappers below instead.
template <typename Elem1Ty, typename Elem2Ty, typename IterFuncTy,
          typename IterFuncArgTy, typename CallbackTy>
hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) {
  auto L = [](Elem1Ty Elem1, Elem2Ty Elem2, void *Data) -> hsa_status_t {
    CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data);
    return (*Unwrapped)(Elem1, Elem2);
  };
  return Func(FuncArg, L, static_cast<void *>(&Cb));
}

/// Iterate agents.
template <typename CallbackTy> Error iterateAgents(CallbackTy Callback) {
  hsa_status_t Status = iterate<hsa_agent_t>(hsa_iterate_agents, Callback);
  return Plugin::check(Status, "Error in hsa_iterate_agents: %s");
}
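
// Example (illustrative sketch, not used by the plugin): counting the GPU
// agents visible to HSA with the wrapper above. The callback returns an
// hsa_status_t, so it can abort the iteration early by returning an error.
//
//   uint32_t NumGPUs = 0;
//   auto Err = utils::iterateAgents([&](hsa_agent_t Agent) {
//     hsa_device_type_t Type;
//     hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &Type);
//     if (Type == HSA_DEVICE_TYPE_GPU)
//       ++NumGPUs;
//     return HSA_STATUS_SUCCESS;
//   });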

/// Iterate ISAs of an agent.
template <typename CallbackTy>
Error iterateAgentISAs(hsa_agent_t Agent, CallbackTy Cb) {
  hsa_status_t Status = iterate<hsa_isa_t>(hsa_agent_iterate_isas, Agent, Cb);
  return Plugin::check(Status, "Error in hsa_agent_iterate_isas: %s");
}

/// Iterate memory pools of an agent.
template <typename CallbackTy>
Error iterateAgentMemoryPools(hsa_agent_t Agent, CallbackTy Cb) {
  hsa_status_t Status = iterate<hsa_amd_memory_pool_t>(
      hsa_amd_agent_iterate_memory_pools, Agent, Cb);
  return Plugin::check(Status,
                       "Error in hsa_amd_agent_iterate_memory_pools: %s");
}

/// Dispatches an asynchronous memory copy.
/// Enables different SDMA engines for the dispatch in a round-robin fashion.
Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent,
                   const void *Src, hsa_agent_t SrcAgent, size_t Size,
                   uint32_t NumDepSignals, const hsa_signal_t *DepSignals,
                   hsa_signal_t CompletionSignal) {
  if (!UseMultipleSdmaEngines) {
    hsa_status_t S =
        hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, Size,
                                  NumDepSignals, DepSignals, CompletionSignal);
    return Plugin::check(S, "Error in hsa_amd_memory_async_copy: %s");
  }

// This solution is probably not the best.
#if !(HSA_AMD_INTERFACE_VERSION_MAJOR >= 1 &&                                  \
      HSA_AMD_INTERFACE_VERSION_MINOR >= 2)
  return Plugin::error("Async copy on selected SDMA requires ROCm 5.7");
#else
  static std::atomic<int> SdmaEngine{1};

  // This atomics solution is probably not the best, but should be sufficient
  // for now. In the worst-case scenario, in which several threads read the
  // same value, they will dispatch to the same SDMA engine. This may result in
  // sub-optimal performance, but the probability of that happening is fairly
  // low.
  int LocalSdmaEngine = SdmaEngine.load(std::memory_order_acquire);
  // This call is only available in ROCm >= 5.7.
  hsa_status_t S = hsa_amd_memory_async_copy_on_engine(
      Dst, DstAgent, Src, SrcAgent, Size, NumDepSignals, DepSignals,
      CompletionSignal, (hsa_amd_sdma_engine_id_t)LocalSdmaEngine,
      /*force_copy_on_sdma=*/true);
  // Increment to use one of two SDMA engines: 0x1, 0x2
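  // The shift-and-modulo below cycles between the two engine ids:
  // (1 << 1) % 3 == 2 and (2 << 1) % 3 == 1, so successive copies alternate
  // between engines 0x1 and 0x2.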
  LocalSdmaEngine = (LocalSdmaEngine << 1) % 3;
  SdmaEngine.store(LocalSdmaEngine, std::memory_order_relaxed);

  return Plugin::check(S, "Error in hsa_amd_memory_async_copy_on_engine: %s");
#endif
}

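/// Returns the feature portion of the agent's ISA name, i.e., everything after
/// the "amdgcn-amd-amdhsa" prefix (for example, a string of the form
/// "gfx90a:sramecc+:xnack-"; the exact value depends on the agent).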
Expected<std::string> getTargetTripleAndFeatures(hsa_agent_t Agent) {
  std::string Target;
  auto Err = utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) {
    uint32_t Length;
    hsa_status_t Status;
    Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME_LENGTH, &Length);
    if (Status != HSA_STATUS_SUCCESS)
      return Status;

    llvm::SmallVector<char> ISAName(Length);
    Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, ISAName.begin());
    if (Status != HSA_STATUS_SUCCESS)
      return Status;

    llvm::StringRef TripleTarget(ISAName.begin(), Length);
    if (TripleTarget.consume_front("amdgcn-amd-amdhsa"))
      Target = TripleTarget.ltrim('-').rtrim('\0').str();
    return HSA_STATUS_SUCCESS;
  });
  if (Err)
    return std::move(Err);
  return Target;
}
} // namespace utils

/// Utility class representing generic resource references to AMDGPU resources.
template <typename ResourceTy>
struct AMDGPUResourceRef : public GenericDeviceResourceRef {
  /// The underlying handle type for resources.
  using HandleTy = ResourceTy *;

  /// Create an empty reference to an invalid resource.
  AMDGPUResourceRef() : Resource(nullptr) {}

  /// Create a reference to an existing resource.
  AMDGPUResourceRef(HandleTy Resource) : Resource(Resource) {}

  virtual ~AMDGPUResourceRef() {}

  /// Create a new resource and save the reference. The reference must be empty
  /// before calling this function.
  Error create(GenericDeviceTy &Device) override;

  /// Destroy the referenced resource and invalidate the reference. The
  /// reference must point to a valid resource before calling this function.
  Error destroy(GenericDeviceTy &Device) override {
    if (!Resource)
      return Plugin::error("Destroying an invalid resource");

    if (auto Err = Resource->deinit())
      return Err;

    delete Resource;

    Resource = nullptr;
    return Plugin::success();
  }

  /// Get the underlying resource handle.
  operator HandleTy() const { return Resource; }

private:
  /// The handle to the actual resource.
  HandleTy Resource;
};

/// Class holding an HSA memory pool.
struct AMDGPUMemoryPoolTy {
  /// Create a memory pool from an HSA memory pool.
  AMDGPUMemoryPoolTy(hsa_amd_memory_pool_t MemoryPool)
      : MemoryPool(MemoryPool), GlobalFlags(0) {}

  /// Initialize the memory pool retrieving its properties.
  Error init() {
    if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_SEGMENT, Segment))
      return Err;

    if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, GlobalFlags))
      return Err;

    return Plugin::success();
  }

  /// Getter of the HSA memory pool.
  hsa_amd_memory_pool_t get() const { return MemoryPool; }

  /// Indicate the segment the memory pool belongs to.
  bool isGlobal() const { return (Segment == HSA_AMD_SEGMENT_GLOBAL); }
  bool isReadOnly() const { return (Segment == HSA_AMD_SEGMENT_READONLY); }
  bool isPrivate() const { return (Segment == HSA_AMD_SEGMENT_PRIVATE); }
  bool isGroup() const { return (Segment == HSA_AMD_SEGMENT_GROUP); }

  /// Indicate if it is fine-grained memory. Valid only for global.
  bool isFineGrained() const {
    assert(isGlobal() && "Not global memory");
    return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED);
  }

  /// Indicate if it is coarse-grained memory. Valid only for global.
  bool isCoarseGrained() const {
    assert(isGlobal() && "Not global memory");
    return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED);
  }

  /// Indicate if it supports storing kernel arguments. Valid only for global.
  bool supportsKernelArgs() const {
    assert(isGlobal() && "Not global memory");
    return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT);
  }

  /// Allocate memory on the memory pool.
  Error allocate(size_t Size, void **PtrStorage) {
    hsa_status_t Status =
        hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, PtrStorage);
    return Plugin::check(Status, "Error in hsa_amd_memory_pool_allocate: %s");
  }

  /// Return memory to the memory pool.
  Error deallocate(void *Ptr) {
    hsa_status_t Status = hsa_amd_memory_pool_free(Ptr);
    return Plugin::check(Status, "Error in hsa_amd_memory_pool_free: %s");
  }

  /// Allow the device to access a specific allocation.
  Error enableAccess(void *Ptr, int64_t Size,
                     const llvm::SmallVector<hsa_agent_t> &Agents) const {
#ifdef OMPTARGET_DEBUG
    for (hsa_agent_t Agent : Agents) {
      hsa_amd_memory_pool_access_t Access;
      if (auto Err =
              getAttr(Agent, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, Access))
        return Err;

      // The agent is not allowed to access the memory pool in any case. Do not
      // continue because otherwise it results in undefined behavior.
      if (Access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED)
        return Plugin::error("An agent is not allowed to access a memory pool");
    }
#endif

    // The agents can access the memory pool, but access is disabled by
    // default. Enable it explicitly.
    hsa_status_t Status =
        hsa_amd_agents_allow_access(Agents.size(), Agents.data(), nullptr, Ptr);
    return Plugin::check(Status, "Error in hsa_amd_agents_allow_access: %s");
  }

  /// Get attribute from the memory pool.
  template <typename Ty>
  Error getAttr(hsa_amd_memory_pool_info_t Kind, Ty &Value) const {
    hsa_status_t Status;
    Status = hsa_amd_memory_pool_get_info(MemoryPool, Kind, &Value);
    return Plugin::check(Status, "Error in hsa_amd_memory_pool_get_info: %s");
  }
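
  // Example (illustrative sketch, not used by the plugin): querying the pool
  // size in bytes with the typed wrapper above, where Pool is an
  // AMDGPUMemoryPoolTy instance.
  //
  //   size_t PoolSize = 0;
  //   if (auto Err = Pool.getAttr(HSA_AMD_MEMORY_POOL_INFO_SIZE, PoolSize))
  //     return Err;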

  template <typename Ty>
  hsa_status_t getAttrRaw(hsa_amd_memory_pool_info_t Kind, Ty &Value) const {
    return hsa_amd_memory_pool_get_info(MemoryPool, Kind, &Value);
  }

  /// Get attribute from the memory pool relating to an agent.
  template <typename Ty>
  Error getAttr(hsa_agent_t Agent, hsa_amd_agent_memory_pool_info_t Kind,
                Ty &Value) const {
    hsa_status_t Status;
    Status =
        hsa_amd_agent_memory_pool_get_info(Agent, MemoryPool, Kind, &Value);
    return Plugin::check(Status,
                         "Error in hsa_amd_agent_memory_pool_get_info: %s");
  }

private:
  /// The HSA memory pool.
  hsa_amd_memory_pool_t MemoryPool;

  /// The segment to which the memory pool belongs.
  hsa_amd_segment_t Segment;

  /// The global flags of the memory pool. Only valid if the memory pool
  /// belongs to the global segment.
  uint32_t GlobalFlags;
};

/// Class that implements a memory manager that gets memory from a specific
/// memory pool.
struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy {

  /// Create an empty memory manager.
  AMDGPUMemoryManagerTy(AMDGPUPluginTy &Plugin)
      : Plugin(Plugin), MemoryPool(nullptr), MemoryManager(nullptr) {}

  /// Initialize the memory manager from a memory pool.
  Error init(AMDGPUMemoryPoolTy &MemoryPool) {
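    // 1 << 30 bytes == 1 GiB. Allocations above this threshold are expected to
    // bypass the reuse cache and go straight to the underlying allocator (a
    // sketch of the intent; the exact policy is implemented by MemoryManagerTy).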
    const uint32_t Threshold = 1 << 30;
    this->MemoryManager = new MemoryManagerTy(*this, Threshold);
    this->MemoryPool = &MemoryPool;
    return Plugin::success();
  }

  /// Deinitialize the memory manager and free its allocations.
  Error deinit() {
    assert(MemoryManager && "Invalid memory manager");

    // Delete and invalidate the memory manager. At this point, the memory
    // manager will deallocate all its allocations.
    delete MemoryManager;
    MemoryManager = nullptr;

    return Plugin::success();
  }

  /// Reuse or allocate memory through the memory manager.
  Error allocate(size_t Size, void **PtrStorage) {
    assert(MemoryManager && "Invalid memory manager");
    assert(PtrStorage && "Invalid pointer storage");

    *PtrStorage = MemoryManager->allocate(Size, nullptr);
    if (*PtrStorage == nullptr)
      return Plugin::error("Failure to allocate from AMDGPU memory manager");

    return Plugin::success();
  }

  /// Release an allocation to be reused.
  Error deallocate(void *Ptr) {
    assert(Ptr && "Invalid pointer");

    if (MemoryManager->free(Ptr))
      return Plugin::error("Failure to deallocate from AMDGPU memory manager");

    return Plugin::success();
  }

private:
  /// Allocation callback that will be called when the memory manager has no
  /// more previously allocated buffers to reuse.
  void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) override;

  /// Deallocation callback that will be called by the memory manager.
  int free(void *TgtPtr, TargetAllocTy Kind) override {
    if (auto Err = MemoryPool->deallocate(TgtPtr)) {
      consumeError(std::move(Err));
      return OFFLOAD_FAIL;
    }
    return OFFLOAD_SUCCESS;
  }

  /// The underlying plugin that owns this memory manager.
  AMDGPUPluginTy &Plugin;

  /// The memory pool used to allocate memory.
  AMDGPUMemoryPoolTy *MemoryPool;

  /// Reference to the actual memory manager.
  MemoryManagerTy *MemoryManager;
};

/// Class implementing the AMDGPU device images' properties.
struct AMDGPUDeviceImageTy : public DeviceImageTy {
  /// Create the AMDGPU image with the id and the target image pointer.
  AMDGPUDeviceImageTy(int32_t ImageId, GenericDeviceTy &Device,
                      const __tgt_device_image *TgtImage)
      : DeviceImageTy(ImageId, Device, TgtImage) {}

  /// Prepare and load the executable corresponding to the image.
  Error loadExecutable(const AMDGPUDeviceTy &Device);

  /// Unload the executable.
  Error unloadExecutable() {
    hsa_status_t Status = hsa_executable_destroy(Executable);
    if (auto Err = Plugin::check(Status, "Error in hsa_executable_destroy: %s"))
      return Err;

    Status = hsa_code_object_destroy(CodeObject);
    return Plugin::check(Status, "Error in hsa_code_object_destroy: %s");
  }

  /// Get the executable.
  hsa_executable_t getExecutable() const { return Executable; }

  /// Get the code object version of the ELF.
  uint16_t getELFABIVersion() const { return ELFABIVersion; }

  /// Find an HSA device symbol by its name on the executable.
  Expected<hsa_executable_symbol_t>
  findDeviceSymbol(GenericDeviceTy &Device, StringRef SymbolName) const;

  /// Get additional info for a kernel, e.g., register spill counts.
  std::optional<utils::KernelMetaDataTy>
  getKernelInfo(StringRef Identifier) const {
    auto It = KernelInfoMap.find(Identifier);

    if (It == KernelInfoMap.end())
      return {};

    return It->second;
  }

private:
  /// The executable loaded on the agent.
  hsa_executable_t Executable;
  hsa_code_object_t CodeObject;
  StringMap<utils::KernelMetaDataTy> KernelInfoMap;
  uint16_t ELFABIVersion;
};

/// Class implementing the AMDGPU kernel functionalities which derives from the
/// generic kernel class.
struct AMDGPUKernelTy : public GenericKernelTy {
  /// Create an AMDGPU kernel with a name and an execution mode.
  AMDGPUKernelTy(const char *Name) : GenericKernelTy(Name) {}

  /// Initialize the AMDGPU kernel.
  Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override {
    AMDGPUDeviceImageTy &AMDImage = static_cast<AMDGPUDeviceImageTy &>(Image);

    // Kernel symbols have a ".kd" suffix.
    std::string KernelName(getName());
    KernelName += ".kd";

    // Find the symbol on the device executable.
    auto SymbolOrErr = AMDImage.findDeviceSymbol(Device, KernelName);
    if (!SymbolOrErr)
      return SymbolOrErr.takeError();

    hsa_executable_symbol_t Symbol = *SymbolOrErr;
    hsa_symbol_kind_t SymbolType;
    hsa_status_t Status;

    // Retrieve different properties of the kernel symbol.
    std::pair<hsa_executable_symbol_info_t, void *> RequiredInfos[] = {
        {HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &SymbolType},
        {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &KernelObject},
        {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &ArgsSize},
        {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &GroupSize},
        {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &DynamicStack},
        {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &PrivateSize}};

    for (auto &Info : RequiredInfos) {
      Status = hsa_executable_symbol_get_info(Symbol, Info.first, Info.second);
      if (auto Err = Plugin::check(
              Status, "Error in hsa_executable_symbol_get_info: %s"))
        return Err;
    }

    // Make sure it is a kernel symbol.
    if (SymbolType != HSA_SYMBOL_KIND_KERNEL)
      return Plugin::error("Symbol %s is not a kernel function");

    // TODO: Read the kernel descriptor for the max threads per block. May be
    // read from the image.

    ImplicitArgsSize = utils::getImplicitArgsSize(AMDImage.getELFABIVersion());
    DP("ELFABIVersion: %d\n", AMDImage.getELFABIVersion());

    // Get additional kernel info read from the image.
    KernelInfo = AMDImage.getKernelInfo(getName());
    if (!KernelInfo.has_value())
      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device.getDeviceId(),
           "Could not read extra information for kernel %s.", getName());

    return Plugin::success();
  }

  /// Launch the AMDGPU kernel function.
  Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
                   uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args,
                   AsyncInfoWrapperTy &AsyncInfoWrapper) const override;

  /// Print more elaborate kernel launch info for AMDGPU.
  Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
                               KernelArgsTy &KernelArgs, uint32_t NumThreads,
                               uint64_t NumBlocks) const override;

  /// Get group and private segment kernel size.
  uint32_t getGroupSize() const { return GroupSize; }
  uint32_t getPrivateSize() const { return PrivateSize; }

  /// Get the HSA kernel object representing the kernel function.
  uint64_t getKernelObject() const { return KernelObject; }

  /// Get the size of implicit arguments based on the code object version.
  /// @return 56 for cov4 and 256 for cov5
  uint32_t getImplicitArgsSize() const { return ImplicitArgsSize; }

  /// Indicates whether or not we need to set up our own private segment size.
  bool usesDynamicStack() const { return DynamicStack; }

private:
  /// The kernel object to execute.
  uint64_t KernelObject;

  /// The args, group and private segments sizes required by a kernel instance.
  uint32_t ArgsSize;
  uint32_t GroupSize;
  uint32_t PrivateSize;
  bool DynamicStack;

  /// The size of implicit kernel arguments.
  uint32_t ImplicitArgsSize;

  /// Additional info for the AMD GPU kernel.
  std::optional<utils::KernelMetaDataTy> KernelInfo;
};

/// Class representing an HSA signal. Signals are used to define dependencies
/// between asynchronous operations: kernel launches and memory transfers.
struct AMDGPUSignalTy {
  /// Create an empty signal.
  AMDGPUSignalTy() : HSASignal({0}), UseCount() {}
  AMDGPUSignalTy(AMDGPUDeviceTy &Device) : HSASignal({0}), UseCount() {}

  /// Initialize the signal with an initial value.
  Error init(uint32_t InitialValue = 1) {
    hsa_status_t Status =
        hsa_amd_signal_create(InitialValue, 0, nullptr, 0, &HSASignal);
    return Plugin::check(Status, "Error in hsa_amd_signal_create: %s");
  }

  /// Deinitialize the signal.
  Error deinit() {
    hsa_status_t Status = hsa_signal_destroy(HSASignal);
    return Plugin::check(Status, "Error in hsa_signal_destroy: %s");
  }

  /// Wait until the signal gets a zero value.
  Error wait(const uint64_t ActiveTimeout = 0, RPCServerTy *RPCServer = nullptr,
             GenericDeviceTy *Device = nullptr) const {
    if (ActiveTimeout && !RPCServer) {
      hsa_signal_value_t Got = 1;
      Got = hsa_signal_wait_scacquire(HSASignal, HSA_SIGNAL_CONDITION_EQ, 0,
                                      ActiveTimeout, HSA_WAIT_STATE_ACTIVE);
      if (Got == 0)
        return Plugin::success();
    }

    // If there is an RPC device attached to this stream we run it as a server.
    uint64_t Timeout = RPCServer ? 8192 : UINT64_MAX;
    auto WaitState = RPCServer ? HSA_WAIT_STATE_ACTIVE : HSA_WAIT_STATE_BLOCKED;
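    // The short timeout makes the wait return periodically, so the loop below
    // can give the RPC server a chance to make progress between polls.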
    while (hsa_signal_wait_scacquire(HSASignal, HSA_SIGNAL_CONDITION_EQ, 0,
                                     Timeout, WaitState) != 0) {
      if (RPCServer && Device)
        if (auto Err = RPCServer->runServer(*Device))
          return Err;
    }
    return Plugin::success();
  }

  /// Load the value on the signal.
  hsa_signal_value_t load() const {
    return hsa_signal_load_scacquire(HSASignal);
  }

  /// Decrement the signal by one.
  void signal() {
    assert(load() > 0 && "Invalid signal value");
    hsa_signal_subtract_screlease(HSASignal, 1);
  }

  /// Reset the signal value before reusing the signal. Do not call this
  /// function if the signal is being currently used by any watcher, such as a
  /// plugin thread or the HSA runtime.
  void reset() { hsa_signal_store_screlease(HSASignal, 1); }

  /// Increase the number of concurrent uses.
  void increaseUseCount() { UseCount.increase(); }

  /// Decrease the number of concurrent uses and return whether it was the last.
  bool decreaseUseCount() { return UseCount.decrease(); }

  hsa_signal_t get() const { return HSASignal; }

private:
  /// The underlying HSA signal.
  hsa_signal_t HSASignal;

  /// Reference counter for tracking the concurrent use count. This is mainly
  /// used for knowing how many streams are using the signal.
  RefCountTy<> UseCount;
};

/// Classes for holding AMDGPU signals and managing signals.
using AMDGPUSignalRef = AMDGPUResourceRef<AMDGPUSignalTy>;
using AMDGPUSignalManagerTy = GenericDeviceResourceManagerTy<AMDGPUSignalRef>;

/// Class holding an HSA queue to submit kernel and barrier packets.
struct AMDGPUQueueTy {
  /// Create an empty queue.
  AMDGPUQueueTy() : Queue(nullptr), Mutex(), NumUsers(0) {}

  /// Lazily initialize a new queue belonging to a specific agent.
  Error init(hsa_agent_t Agent, int32_t QueueSize) {
    if (Queue)
      return Plugin::success();
    hsa_status_t Status =
        hsa_queue_create(Agent, QueueSize, HSA_QUEUE_TYPE_MULTI, callbackError,
                         nullptr, UINT32_MAX, UINT32_MAX, &Queue);
    return Plugin::check(Status, "Error in hsa_queue_create: %s");
  }

  /// Deinitialize the queue and destroy its resources.
  Error deinit() {
    std::lock_guard<std::mutex> Lock(Mutex);
    if (!Queue)
      return Plugin::success();
    hsa_status_t Status = hsa_queue_destroy(Queue);
    return Plugin::check(Status, "Error in hsa_queue_destroy: %s");
  }

  /// Returns the number of streams this queue is currently assigned to.
  uint32_t getUserCount() const { return NumUsers; }

  /// Returns whether the underlying HSA queue is initialized.
  bool isInitialized() { return Queue != nullptr; }

  /// Decrement the user count of the queue object.
  void removeUser() { --NumUsers; }

  /// Increase the user count of the queue object.
  void addUser() { ++NumUsers; }

  /// Push a kernel launch to the queue. The kernel launch requires an output
  /// signal and can define an optional input signal (nullptr if none).
  Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
                         uint32_t NumThreads, uint64_t NumBlocks,
                         uint32_t GroupSize, uint64_t StackSize,
                         AMDGPUSignalTy *OutputSignal,
                         AMDGPUSignalTy *InputSignal) {
    assert(OutputSignal && "Invalid kernel output signal");

    // Lock the queue during the packet publishing process. Notice this blocks
    // the addition of other packets to the queue. The following piece of code
    // should be lightweight; do not block the thread, allocate memory, etc.
    std::lock_guard<std::mutex> Lock(Mutex);
    assert(Queue && "Interacted with a non-initialized queue!");

    // Add a barrier packet before the kernel packet in case there is a pending
    // preceding operation. The barrier packet will delay the processing of
    // subsequent queue packets until the barrier's input signals are satisfied.
    // No output signal is needed because the dependency is already guaranteed
    // by the queue barrier itself.
    if (InputSignal && InputSignal->load())
      if (auto Err = pushBarrierImpl(nullptr, InputSignal))
        return Err;

    // Now prepare the kernel packet.
    uint64_t PacketId;
    hsa_kernel_dispatch_packet_t *Packet = acquirePacket(PacketId);
    assert(Packet && "Invalid packet");

    // The first 32 bits of the packet are written after the other fields.
    uint16_t Setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
    Packet->workgroup_size_x = NumThreads;
    Packet->workgroup_size_y = 1;
    Packet->workgroup_size_z = 1;
    Packet->reserved0 = 0;
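    // HSA expresses the grid size in work-items (threads), not work-groups, so
    // the total is the number of blocks times the threads per block.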
    Packet->grid_size_x = NumBlocks * NumThreads;
    Packet->grid_size_y = 1;
    Packet->grid_size_z = 1;
    Packet->private_segment_size =
        Kernel.usesDynamicStack() ? StackSize : Kernel.getPrivateSize();
    Packet->group_segment_size = GroupSize;
    Packet->kernel_object = Kernel.getKernelObject();
    Packet->kernarg_address = KernelArgs;
    Packet->reserved2 = 0;
    Packet->completion_signal = OutputSignal->get();

    // Publish the packet. Do not modify the packet after this point.
    publishKernelPacket(PacketId, Setup, Packet);

    return Plugin::success();
  }

  /// Push a barrier packet that will wait on up to two input signals. All
  /// signals are optional (nullptr if none).
  Error pushBarrier(AMDGPUSignalTy *OutputSignal,
                    const AMDGPUSignalTy *InputSignal1,
                    const AMDGPUSignalTy *InputSignal2) {
    // Lock the queue during the packet publishing process.
    std::lock_guard<std::mutex> Lock(Mutex);
    assert(Queue && "Interacted with a non-initialized queue!");

    // Push the barrier with the lock acquired.
    return pushBarrierImpl(OutputSignal, InputSignal1, InputSignal2);
  }

private:
  /// Push a barrier packet that will wait on up to two input signals. Assumes
  /// the queue lock is acquired.
  Error pushBarrierImpl(AMDGPUSignalTy *OutputSignal,
                        const AMDGPUSignalTy *InputSignal1,
                        const AMDGPUSignalTy *InputSignal2 = nullptr) {
    // Add a queue barrier waiting on both the other stream's operation and the
    // last operation on the current stream (if any).
    uint64_t PacketId;
    hsa_barrier_and_packet_t *Packet =
        (hsa_barrier_and_packet_t *)acquirePacket(PacketId);
    assert(Packet && "Invalid packet");

    Packet->reserved0 = 0;
    Packet->reserved1 = 0;
    Packet->dep_signal[0] = {0};
    Packet->dep_signal[1] = {0};
    Packet->dep_signal[2] = {0};
    Packet->dep_signal[3] = {0};
    Packet->dep_signal[4] = {0};
    Packet->reserved2 = 0;
    Packet->completion_signal = {0};

    // Set input and output dependencies if needed.
    if (OutputSignal)
      Packet->completion_signal = OutputSignal->get();
    if (InputSignal1)
      Packet->dep_signal[0] = InputSignal1->get();
    if (InputSignal2)
      Packet->dep_signal[1] = InputSignal2->get();

    // Publish the packet. Do not modify the packet after this point.
    publishBarrierPacket(PacketId, Packet);

    return Plugin::success();
  }

  /// Acquire a packet from the queue. This call may block the thread if there
  /// is no space in the underlying HSA queue. It may need to wait until the
  /// HSA runtime processes some packets. Assumes the queue lock is acquired.
  hsa_kernel_dispatch_packet_t *acquirePacket(uint64_t &PacketId) {
    // Increase the queue index with relaxed memory order. Notice this will
    // need another subsequent atomic operation with acquire order.
    PacketId = hsa_queue_add_write_index_relaxed(Queue, 1);

    // Wait for the packet to be available. Notice the atomic operation uses
    // the acquire memory order.
    while (PacketId - hsa_queue_load_read_index_scacquire(Queue) >= Queue->size)
      ;

    // Return the packet reference.
    const uint32_t Mask = Queue->size - 1; // The size is a power of 2.
    return (hsa_kernel_dispatch_packet_t *)Queue->base_address +
           (PacketId & Mask);
  }

  /// Publish the kernel packet so that the HSA runtime can start processing
  /// the kernel launch. Do not modify the packet once this function is called.
  /// Assumes the queue lock is acquired.
  void publishKernelPacket(uint64_t PacketId, uint16_t Setup,
                           hsa_kernel_dispatch_packet_t *Packet) {
    uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet);

    uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
    Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
    Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;

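    // The 16-bit header occupies the low half of the packet's first 32-bit
    // word and the 16-bit setup field the high half. Writing the combined word
    // with a single atomic release store ensures the packet processor never
    // observes a partially written header.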
    // Publish the packet. Do not modify the packet after this point.
    uint32_t HeaderWord = Header | (Setup << 16u);
    __atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE);

    // Signal the doorbell about the published packet.
    hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
  }

  /// Publish the barrier packet so that the HSA runtime can start processing
  /// the barrier. Next packets in the queue will not be processed until all
  /// barrier dependencies (signals) are satisfied. Assumes the queue is locked.
  void publishBarrierPacket(uint64_t PacketId,
                            hsa_barrier_and_packet_t *Packet) {
    uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet);
    uint16_t Setup = 0;
    uint16_t Header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
    Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
    Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;

    // Publish the packet. Do not modify the packet after this point.
    uint32_t HeaderWord = Header | (Setup << 16u);
    __atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE);

    // Signal the doorbell about the published packet.
    hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
  }

  /// Callback that will be called when an error is detected on the HSA queue.
  static void callbackError(hsa_status_t Status, hsa_queue_t *Source, void *) {
    auto Err = Plugin::check(Status, "Received error in queue %p: %s", Source);
    FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data());
  }

  /// The HSA queue.
  hsa_queue_t *Queue;

  /// Mutex to protect the acquiring and publishing of packets. For the moment,
  /// we need this mutex to prevent publishing packets that are not ready to be
  /// published in a multi-thread scenario. Without a queue lock, a thread T1
  /// could acquire packet P and thread T2 acquire packet P+1. Thread T2 could
  /// publish its packet P+1 (signaling the queue's doorbell) before packet P
  /// from T1 is ready to be processed. That scenario should be invalid. Thus,
  /// we use the following mutex to make packet acquiring and publishing atomic.
  /// TODO: There are other more advanced approaches to avoid this mutex using
  /// atomic operations. We can further investigate it if this is a bottleneck.
  std::mutex Mutex;

  /// The number of streams this queue is currently assigned to. A queue is
  /// considered idle when this is zero, otherwise: busy.
  uint32_t NumUsers;
};

/// Struct that implements a stream of asynchronous operations for AMDGPU
/// devices. This class relies on signals to implement streams and define the
/// dependencies between asynchronous operations.
struct AMDGPUStreamTy {
private:
  /// Utility struct holding arguments for async H2H memory copies.
  struct MemcpyArgsTy {
    void *Dst;
    const void *Src;
    size_t Size;
  };

  /// Utility struct holding arguments for freeing buffers to memory managers.
  struct ReleaseBufferArgsTy {
    void *Buffer;
    AMDGPUMemoryManagerTy *MemoryManager;
  };

  /// Utility struct holding arguments for releasing signals to signal managers.
  struct ReleaseSignalArgsTy {
    AMDGPUSignalTy *Signal;
    AMDGPUSignalManagerTy *SignalManager;
  };

  /// The stream is composed of N stream slots. The struct below represents the
  /// fields of each slot. Each slot has a signal and an optional action
  /// function. When appending an HSA asynchronous operation to the stream, one
  /// slot is consumed and used to store the operation's information. The
  /// operation's output signal is set to the consumed slot's signal. If there
  /// is a previous asynchronous operation on the previous slot, the HSA async
  /// operation's input signal is set to the signal of the previous slot. This
  /// way, we obtain a chain of dependent async operations. The action is a
  /// function that will be executed eventually after the operation is
  /// completed, e.g., for releasing a buffer.
  struct StreamSlotTy {
    /// The output signal of the stream operation. May be used by the subsequent
    /// operation as input signal.
    AMDGPUSignalTy *Signal;

    /// The action that must be performed after the operation's completion. Set
    /// to nullptr when there is no action to perform.
    Error (*ActionFunction)(void *);

    /// Space for the action's arguments. A pointer to these arguments is passed
    /// to the action function. Notice the space of arguments is limited.
    union {
      MemcpyArgsTy MemcpyArgs;
      ReleaseBufferArgsTy ReleaseBufferArgs;
      ReleaseSignalArgsTy ReleaseSignalArgs;
    } ActionArgs;

    /// Create an empty slot.
    StreamSlotTy() : Signal(nullptr), ActionFunction(nullptr) {}

    /// Schedule a host memory copy action on the slot.
    Error schedHostMemoryCopy(void *Dst, const void *Src, size_t Size) {
      ActionFunction = memcpyAction;
      ActionArgs.MemcpyArgs = MemcpyArgsTy{Dst, Src, Size};
      return Plugin::success();
    }

    /// Schedule a release buffer action on the slot.
    Error schedReleaseBuffer(void *Buffer, AMDGPUMemoryManagerTy &Manager) {
      ActionFunction = releaseBufferAction;
      ActionArgs.ReleaseBufferArgs = ReleaseBufferArgsTy{Buffer, &Manager};
      return Plugin::success();
    }

    /// Schedule a signal release action on the slot.
    Error schedReleaseSignal(AMDGPUSignalTy *SignalToRelease,
                             AMDGPUSignalManagerTy *SignalManager) {
      ActionFunction = releaseSignalAction;
      ActionArgs.ReleaseSignalArgs =
          ReleaseSignalArgsTy{SignalToRelease, SignalManager};
      return Plugin::success();
    }

    // Perform the action if needed.
    Error performAction() {
      if (!ActionFunction)
        return Plugin::success();

      // Perform the action.
      if (ActionFunction == memcpyAction) {
        if (auto Err = memcpyAction(&ActionArgs))
          return Err;
      } else if (ActionFunction == releaseBufferAction) {
        if (auto Err = releaseBufferAction(&ActionArgs))
          return Err;
      } else if (ActionFunction == releaseSignalAction) {
        if (auto Err = releaseSignalAction(&ActionArgs))
          return Err;
      } else {
        return Plugin::error("Unknown action function!");
      }

      // Invalidate the action.
      ActionFunction = nullptr;

      return Plugin::success();
    }
  };

  /// The device agent where the stream was created.
  hsa_agent_t Agent;

  /// The queue that the stream uses to launch kernels.
  AMDGPUQueueTy *Queue;

  /// The manager of signals to reuse signals.
  AMDGPUSignalManagerTy &SignalManager;

  /// A reference to the associated device.
  GenericDeviceTy &Device;

  /// Array of stream slots. Use std::deque because it can dynamically grow
  /// without invalidating the already inserted elements. For instance,
  /// std::vector may invalidate the elements by reallocating the internal
  /// array if there is not enough space on new insertions.
  std::deque<StreamSlotTy> Slots;

  /// The next available slot on the queue. This is reset to zero each time the
  /// stream is synchronized. It also indicates the current number of consumed
  /// slots at a given time.
  uint32_t NextSlot;

  /// The synchronization id. This number is increased each time the stream is
  /// synchronized. It is useful to detect if an AMDGPUEventTy points to an
  /// operation that was already finalized in a previous stream synchronize.
  uint32_t SyncCycle;

  /// A pointer associated with an RPC server running on the given device. If
  /// RPC is not being used this will be a null pointer. Otherwise, this
  /// indicates that an RPC server is expected to be run on this stream.
  RPCServerTy *RPCServer;

  /// Mutex to protect stream's management.
  mutable std::mutex Mutex;

  /// Timeout hint for HSA actively waiting for signal value to change.
  const uint64_t StreamBusyWaitMicroseconds;

  /// Indicate whether to spread data transfers across all available SDMA
  /// engines.
  bool UseMultipleSdmaEngines;

  /// Return the current number of asynchronous operations on the stream.
  uint32_t size() const { return NextSlot; }

  /// Return the last valid slot on the stream.
  uint32_t last() const { return size() - 1; }

  /// Consume one slot from the stream. Since the stream uses signals on demand
  /// and releases them once the slot is no longer used, the function requires
  /// an idle signal for the new consumed slot.
  std::pair<uint32_t, AMDGPUSignalTy *> consume(AMDGPUSignalTy *OutputSignal) {
    // Double the stream size if needed. Since we use std::deque, this
    // operation does not invalidate the already added slots.
    if (Slots.size() == NextSlot)
      Slots.resize(Slots.size() * 2);

    // Update the next available slot and the stream size.
    uint32_t Curr = NextSlot++;

    // Retrieve the input signal, if any, of the current operation.
    AMDGPUSignalTy *InputSignal = (Curr > 0) ? Slots[Curr - 1].Signal : nullptr;

    // Set the output signal of the current slot.
    Slots[Curr].Signal = OutputSignal;

    return std::make_pair(Curr, InputSignal);
  }

  /// Complete all pending post actions and reset the stream after synchronizing
  /// or positively querying the stream.
  Error complete() {
    for (uint32_t Slot = 0; Slot < NextSlot; ++Slot) {
      // Take the post action of the operation if any.
      if (auto Err = Slots[Slot].performAction())
        return Err;

      // Release the slot's signal if possible. Otherwise, another user will.
      if (Slots[Slot].Signal->decreaseUseCount())
        if (auto Err = SignalManager.returnResource(Slots[Slot].Signal))
          return Err;

      Slots[Slot].Signal = nullptr;
    }

    // Reset the stream slots to zero.
    NextSlot = 0;

    // Increase the synchronization id since the stream completed a sync cycle.
    SyncCycle += 1;

    return Plugin::success();
  }

  /// Make the current stream wait on a specific operation of another stream.
  /// The idea is to make the current stream wait on two signals: 1) the last
  /// signal of the current stream, and 2) the last signal of the other stream.
  /// Use a barrier packet with two input signals.
  Error waitOnStreamOperation(AMDGPUStreamTy &OtherStream, uint32_t Slot) {
    if (Queue == nullptr)
      return Plugin::error("Target queue was nullptr");

    /// The signal from the other stream that we must wait on.
    AMDGPUSignalTy *OtherSignal = OtherStream.Slots[Slot].Signal;

    // Prevent the release of the other stream's signal.
    OtherSignal->increaseUseCount();

    // Retrieve an available signal for the operation's output.
    AMDGPUSignalTy *OutputSignal = nullptr;
    if (auto Err = SignalManager.getResource(OutputSignal))
      return Err;
    OutputSignal->reset();
    OutputSignal->increaseUseCount();

    // Consume stream slot and compute dependencies.
    auto [Curr, InputSignal] = consume(OutputSignal);

    // Setup the post action to release the signal.
    if (auto Err = Slots[Curr].schedReleaseSignal(OtherSignal, &SignalManager))
      return Err;

    // Push a barrier into the queue with both input signals.
    return Queue->pushBarrier(OutputSignal, InputSignal, OtherSignal);
  }

  /// Callback for running a specific asynchronous operation. This callback is
  /// used for hsa_amd_signal_async_handler. The argument is the operation that
  /// should be executed. Notice we use the post action mechanism to codify the
  /// asynchronous operation.
  static bool asyncActionCallback(hsa_signal_value_t Value, void *Args) {
    StreamSlotTy *Slot = reinterpret_cast<StreamSlotTy *>(Args);
    assert(Slot && "Invalid slot");
    assert(Slot->Signal && "Invalid signal");

    // This thread is outside the stream mutex. Make sure the thread sees the
    // changes on the slot.
    std::atomic_thread_fence(std::memory_order_acquire);

    // Perform the operation.
    if (auto Err = Slot->performAction())
      FATAL_MESSAGE(1, "Error performing post action: %s",
                    toString(std::move(Err)).data());

    // Signal the output signal to notify that the asynchronous operation
    // finalized.
    Slot->Signal->signal();

    // Unregister the callback.
    return false;
  }

  // Callback for host-to-host memory copies. This is an asynchronous action.
  static Error memcpyAction(void *Data) {
    MemcpyArgsTy *Args = reinterpret_cast<MemcpyArgsTy *>(Data);
    assert(Args && "Invalid arguments");
    assert(Args->Dst && "Invalid destination buffer");
    assert(Args->Src && "Invalid source buffer");

    std::memcpy(Args->Dst, Args->Src, Args->Size);

    return Plugin::success();
  }

  /// Releasing a memory buffer to a memory manager. This is a post completion
  /// action. There are three kinds of memory buffers:
  /// 1. For kernel arguments. This buffer can be freed after receiving the
  /// kernel completion signal.
  /// 2. For H2D transfers that need pinned memory space for staging. This
  /// buffer can be freed after receiving the transfer completion signal.
  /// 3. For D2H transfers that need pinned memory space for staging. This
  /// buffer cannot be freed after receiving the transfer completion signal
  /// because of the following asynchronous H2H callback.
  /// For this reason, this action can only be taken at
  /// AMDGPUStreamTy::complete().
  /// Because of case 3, all releaseBufferActions are taken at
  /// AMDGPUStreamTy::complete() in the current implementation.
  static Error releaseBufferAction(void *Data) {
    ReleaseBufferArgsTy *Args = reinterpret_cast<ReleaseBufferArgsTy *>(Data);
    assert(Args && "Invalid arguments");
    assert(Args->MemoryManager && "Invalid memory manager");
    assert(Args->Buffer && "Invalid buffer");

    // Release the allocation to the memory manager.
    return Args->MemoryManager->deallocate(Args->Buffer);
  }

  /// Releasing a signal object back to the SignalManager. This is a post
  /// completion action. This action can only be taken at
  /// AMDGPUStreamTy::complete().
  static Error releaseSignalAction(void *Data) {
    ReleaseSignalArgsTy *Args = reinterpret_cast<ReleaseSignalArgsTy *>(Data);
    assert(Args && "Invalid arguments");
    assert(Args->Signal && "Invalid signal");
    assert(Args->SignalManager && "Invalid signal manager");

    // Release the signal if needed.
    if (Args->Signal->decreaseUseCount())
      if (auto Err = Args->SignalManager->returnResource(Args->Signal))
        return Err;

    return Plugin::success();
  }

public:
  /// Create an empty stream associated with a specific device.
  AMDGPUStreamTy(AMDGPUDeviceTy &Device);

  /// Initialize the stream's signals.
  Error init() { return Plugin::success(); }

  /// Deinitialize the stream's signals.
  Error deinit() { return Plugin::success(); }

  /// Attach an RPC server to this stream.
  void setRPCServer(RPCServerTy *Server) { RPCServer = Server; }

  /// Push an asynchronous kernel to the stream. The kernel arguments must be
  /// placed in a special allocation for kernel args and must be kept alive
  /// until the kernel finishes. Once the kernel is finished, the stream will
  /// release the kernel args buffer to the specified memory manager.
  Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
                         uint32_t NumThreads, uint64_t NumBlocks,
                         uint32_t GroupSize, uint64_t StackSize,
                         AMDGPUMemoryManagerTy &MemoryManager) {
    if (Queue == nullptr)
      return Plugin::error("Target queue was nullptr");

    // Retrieve an available signal for the operation's output.
    AMDGPUSignalTy *OutputSignal = nullptr;
    if (auto Err = SignalManager.getResource(OutputSignal))
      return Err;
    OutputSignal->reset();
    OutputSignal->increaseUseCount();

    std::lock_guard<std::mutex> StreamLock(Mutex);

    // Consume stream slot and compute dependencies.
    auto [Curr, InputSignal] = consume(OutputSignal);

    // Setup the post action to release the kernel args buffer.
    if (auto Err = Slots[Curr].schedReleaseBuffer(KernelArgs, MemoryManager))
      return Err;

    // Push the kernel with the output signal and an input signal (optional).
    return Queue->pushKernelLaunch(Kernel, KernelArgs, NumThreads, NumBlocks,
                                   GroupSize, StackSize, OutputSignal,
                                   InputSignal);
  }

  /// Push an asynchronous memory copy between pinned memory buffers.
  Error pushPinnedMemoryCopyAsync(void *Dst, const void *Src,
                                  uint64_t CopySize) {
    // Retrieve an available signal for the operation's output.
    AMDGPUSignalTy *OutputSignal = nullptr;
    if (auto Err = SignalManager.getResource(OutputSignal))
      return Err;
    OutputSignal->reset();
    OutputSignal->increaseUseCount();

    std::lock_guard<std::mutex> Lock(Mutex);

    // Consume stream slot and compute dependencies.
    auto [Curr, InputSignal] = consume(OutputSignal);

    // Issue the async memory copy.
    if (InputSignal && InputSignal->load()) {
      hsa_signal_t InputSignalRaw = InputSignal->get();
      return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, Agent,
                                 CopySize, 1, &InputSignalRaw,
                                 OutputSignal->get());
    }

    return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, Agent,
                               CopySize, 0, nullptr, OutputSignal->get());
  }

  /// Push an asynchronous memory copy device-to-host involving an unpinned
  /// memory buffer. The operation consists of a two-step copy from the
  /// device buffer to an intermediate pinned host buffer, and then, to an
  /// unpinned host buffer. Both operations are asynchronous and dependent.
  /// The intermediate pinned buffer will be released to the specified memory
  /// manager once the operation completes.
  Error pushMemoryCopyD2HAsync(void *Dst, const void *Src, void *Inter,
                               uint64_t CopySize,
                               AMDGPUMemoryManagerTy &MemoryManager) {
    // Retrieve available signals for the operation's outputs.
    AMDGPUSignalTy *OutputSignals[2] = {};
    if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals))
      return Err;
    for (auto *Signal : OutputSignals) {
      Signal->reset();
      Signal->increaseUseCount();
    }

    std::lock_guard<std::mutex> Lock(Mutex);

    // Consume stream slot and compute dependencies.
    auto [Curr, InputSignal] = consume(OutputSignals[0]);

    // Setup the post action for releasing the intermediate buffer.
    if (auto Err = Slots[Curr].schedReleaseBuffer(Inter, MemoryManager))
      return Err;

    // Issue the first step: device to host transfer. Avoid defining the input
    // dependency if already satisfied.
    if (InputSignal && InputSignal->load()) {
      hsa_signal_t InputSignalRaw = InputSignal->get();
      if (auto Err = utils::asyncMemCopy(
              UseMultipleSdmaEngines, Inter, Agent, Src, Agent, CopySize, 1,
              &InputSignalRaw, OutputSignals[0]->get()))
        return Err;
    } else {
      if (auto Err = utils::asyncMemCopy(UseMultipleSdmaEngines, Inter, Agent,
                                         Src, Agent, CopySize, 0, nullptr,
                                         OutputSignals[0]->get()))
        return Err;
    }

    // Consume another stream slot and compute dependencies.
    std::tie(Curr, InputSignal) = consume(OutputSignals[1]);
    assert(InputSignal && "Invalid input signal");

    // The std::memcpy is done asynchronously using an async handler. We store
    // the function's information in the action but it is not actually a post
    // action.
    if (auto Err = Slots[Curr].schedHostMemoryCopy(Dst, Inter, CopySize))
      return Err;

    // Make changes on this slot visible to the async handler's thread.
    std::atomic_thread_fence(std::memory_order_release);

    // Issue the second step: host to host transfer.
    hsa_status_t Status = hsa_amd_signal_async_handler(
        InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback,
        (void *)&Slots[Curr]);

    return Plugin::check(Status, "Error in hsa_amd_signal_async_handler: %s");
  }

  /// Push an asynchronous memory copy host-to-device involving an unpinned
  /// memory buffer. The operation consists of a two-step copy from the
  /// unpinned host buffer to an intermediate pinned host buffer, and then, to
  /// the device buffer. Both operations are asynchronous and dependent.
  /// The intermediate pinned buffer will be released to the specified memory
  /// manager once the operation completes.
  Error pushMemoryCopyH2DAsync(void *Dst, const void *Src, void *Inter,
                               uint64_t CopySize,
                               AMDGPUMemoryManagerTy &MemoryManager) {
    // Retrieve available signals for the operation's outputs.
    AMDGPUSignalTy *OutputSignals[2] = {};
    if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals))
      return Err;
    for (auto *Signal : OutputSignals) {
      Signal->reset();
      Signal->increaseUseCount();
    }

    AMDGPUSignalTy *OutputSignal = OutputSignals[0];

    std::lock_guard<std::mutex> Lock(Mutex);

    // Consume stream slot and compute dependencies.
    auto [Curr, InputSignal] = consume(OutputSignal);

    // Issue the first step: host to host transfer.
    if (InputSignal && InputSignal->load()) {
      // The std::memcpy is done asynchronously using an async handler. We
      // store the function's information in the action but it is not actually
      // a post action.
      if (auto Err = Slots[Curr].schedHostMemoryCopy(Inter, Src, CopySize))
        return Err;

      // Make changes on this slot visible to the async handler's thread.
      std::atomic_thread_fence(std::memory_order_release);

      hsa_status_t Status = hsa_amd_signal_async_handler(
          InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback,
          (void *)&Slots[Curr]);

      if (auto Err = Plugin::check(Status,
                                   "Error in hsa_amd_signal_async_handler: %s"))
        return Err;

      // Now use the second output signal.
      OutputSignal = OutputSignals[1];

      // Consume another stream slot and compute dependencies.
      std::tie(Curr, InputSignal) = consume(OutputSignal);
    } else {
      // All preceding operations completed, copy the memory synchronously.
      std::memcpy(Inter, Src, CopySize);

      // Return the second signal because it will not be used.
      OutputSignals[1]->decreaseUseCount();
      if (auto Err = SignalManager.returnResource(OutputSignals[1]))
        return Err;
    }

    // Setup the post action to release the intermediate pinned buffer.
    if (auto Err = Slots[Curr].schedReleaseBuffer(Inter, MemoryManager))
      return Err;

    // Issue the second step: host to device transfer. Avoid defining the input
    // dependency if already satisfied.
    if (InputSignal && InputSignal->load()) {
      hsa_signal_t InputSignalRaw = InputSignal->get();
      return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter,
                                 Agent, CopySize, 1, &InputSignalRaw,
                                 OutputSignal->get());
    }
    return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, Agent,
                               CopySize, 0, nullptr, OutputSignal->get());
  }
1405
1406 // AMDGPUDeviceTy is incomplete here, passing the underlying agent instead
1407 Error pushMemoryCopyD2DAsync(void *Dst, hsa_agent_t DstAgent, const void *Src,
1408 hsa_agent_t SrcAgent, uint64_t CopySize) {
1409 AMDGPUSignalTy *OutputSignal;
1410 if (auto Err = SignalManager.getResources(/*Num=*/1, &OutputSignal))
1411 return Err;
1412 OutputSignal->reset();
1413 OutputSignal->increaseUseCount();
1414
1415 std::lock_guard<std::mutex> Lock(Mutex);
1416
1417 // Consume stream slot and compute dependencies.
1418 auto [Curr, InputSignal] = consume(OutputSignal);
1419
1420 // The agents need to have access to the corresponding memory
1421 // This is presently only true if the pointers were originally
1422 // allocated by this runtime or the caller made the appropriate
1423 // access calls.
1424
1425 if (InputSignal && InputSignal->load()) {
1426 hsa_signal_t InputSignalRaw = InputSignal->get();
1427 return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src,
1428 SrcAgent, CopySize, 1, &InputSignalRaw,
1429 OutputSignal->get());
1430 }
1431 return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src,
1432 SrcAgent, CopySize, 0, nullptr,
1433 OutputSignal->get());
1434 }
1435
1436 /// Synchronize with the stream. The current thread waits until all operations
  /// are finalized and performs the pending post actions (i.e., releasing
1438 /// intermediate buffers).
1439 Error synchronize() {
1440 std::lock_guard<std::mutex> Lock(Mutex);
1441
1442 // No need to synchronize anything.
1443 if (size() == 0)
1444 return Plugin::success();
1445
1446 // Wait until all previous operations on the stream have completed.
1447 if (auto Err = Slots[last()].Signal->wait(StreamBusyWaitMicroseconds,
1448 RPCServer, &Device))
1449 return Err;
1450
1451 // Reset the stream and perform all pending post actions.
1452 return complete();
1453 }
1454
1455 /// Query the stream and complete pending post actions if operations finished.
1456 /// Return whether all the operations completed. This operation does not block
1457 /// the calling thread.
1458 Expected<bool> query() {
1459 std::lock_guard<std::mutex> Lock(Mutex);
1460
1461 // No need to query anything.
1462 if (size() == 0)
1463 return true;
1464
1465 // The last operation did not complete yet. Return directly.
1466 if (Slots[last()].Signal->load())
1467 return false;
1468
1469 // Reset the stream and perform all pending post actions.
1470 if (auto Err = complete())
1471 return std::move(Err);
1472
1473 return true;
1474 }
1475
1476 /// Record the state of the stream on an event.
1477 Error recordEvent(AMDGPUEventTy &Event) const;
1478
1479 /// Make the stream wait on an event.
1480 Error waitEvent(const AMDGPUEventTy &Event);
1481
1482 friend struct AMDGPUStreamManagerTy;
1483};
1484
/// Class representing an event on AMDGPU. The event stores information
/// regarding the state of the recorded stream at the time it was recorded.
1487struct AMDGPUEventTy {
1488 /// Create an empty event.
1489 AMDGPUEventTy(AMDGPUDeviceTy &Device)
1490 : RecordedStream(nullptr), RecordedSlot(-1), RecordedSyncCycle(-1) {}
1491
1492 /// Initialize and deinitialize.
1493 Error init() { return Plugin::success(); }
1494 Error deinit() { return Plugin::success(); }
1495
1496 /// Record the state of a stream on the event.
1497 Error record(AMDGPUStreamTy &Stream) {
1498 std::lock_guard<std::mutex> Lock(Mutex);
1499
    // Disregard any previously recorded stream and register the new one.
1501 RecordedStream = &Stream;
1502
    return Stream.recordEvent(*this);
1504 }
1505
1506 /// Make a stream wait on the current event.
1507 Error wait(AMDGPUStreamTy &Stream) {
1508 std::lock_guard<std::mutex> Lock(Mutex);
1509
1510 if (!RecordedStream)
1511 return Plugin::error("Event does not have any recorded stream");
1512
1513 // Synchronizing the same stream. Do nothing.
1514 if (RecordedStream == &Stream)
1515 return Plugin::success();
1516
    // No need to wait for anything; the recorded stream already finished the
    // corresponding operation.
1519 if (RecordedSlot < 0)
1520 return Plugin::success();
1521
    return Stream.waitEvent(*this);
1523 }
1524
1525protected:
1526 /// The stream registered in this event.
1527 AMDGPUStreamTy *RecordedStream;
1528
  /// The recorded operation (slot index) on the recorded stream.
1530 int64_t RecordedSlot;
1531
1532 /// The sync cycle when the stream was recorded. Used to detect stale events.
1533 int64_t RecordedSyncCycle;
1534
1535 /// Mutex to safely access event fields.
1536 mutable std::mutex Mutex;
1537
1538 friend struct AMDGPUStreamTy;
1539};
1540
1541Error AMDGPUStreamTy::recordEvent(AMDGPUEventTy &Event) const {
1542 std::lock_guard<std::mutex> Lock(Mutex);
1543
1544 if (size() > 0) {
1545 // Record the synchronize identifier (to detect stale recordings) and
1546 // the last valid stream's operation.
1547 Event.RecordedSyncCycle = SyncCycle;
1548 Event.RecordedSlot = last();
1549
1550 assert(Event.RecordedSyncCycle >= 0 && "Invalid recorded sync cycle");
1551 assert(Event.RecordedSlot >= 0 && "Invalid recorded slot");
1552 } else {
1553 // The stream is empty, everything already completed, record nothing.
1554 Event.RecordedSyncCycle = -1;
1555 Event.RecordedSlot = -1;
1556 }
1557 return Plugin::success();
1558}
1559
1560Error AMDGPUStreamTy::waitEvent(const AMDGPUEventTy &Event) {
1561 // Retrieve the recorded stream on the event.
1562 AMDGPUStreamTy &RecordedStream = *Event.RecordedStream;
1563
1564 std::scoped_lock<std::mutex, std::mutex> Lock(Mutex, RecordedStream.Mutex);
1565
1566 // The recorded stream already completed the operation because the synchronize
1567 // identifier is already outdated.
1568 if (RecordedStream.SyncCycle != (uint32_t)Event.RecordedSyncCycle)
1569 return Plugin::success();
1570
1571 // Again, the recorded stream already completed the operation, the last
1572 // operation's output signal is satisfied.
1573 if (!RecordedStream.Slots[Event.RecordedSlot].Signal->load())
1574 return Plugin::success();
1575
1576 // Otherwise, make the current stream wait on the other stream's operation.
  return waitOnStreamOperation(RecordedStream, Event.RecordedSlot);
1578}
1579
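/// Class implementing a resource manager for AMDGPU streams. Streams are
/// mapped onto a pool of HSA queues; idle or lightly used queues are reused
/// when new streams are requested.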
1580struct AMDGPUStreamManagerTy final
1581 : GenericDeviceResourceManagerTy<AMDGPUResourceRef<AMDGPUStreamTy>> {
1582 using ResourceRef = AMDGPUResourceRef<AMDGPUStreamTy>;
1583 using ResourcePoolTy = GenericDeviceResourceManagerTy<ResourceRef>;
1584
1585 AMDGPUStreamManagerTy(GenericDeviceTy &Device, hsa_agent_t HSAAgent)
1586 : GenericDeviceResourceManagerTy(Device),
1587 OMPX_QueueTracking("LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING", true),
1588 NextQueue(0), Agent(HSAAgent) {}
1589
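  /// Initialize the stream pool and the underlying HSA queues. Only the first
  /// queue is created eagerly; the remaining ones are initialized on demand
  /// when a stream is assigned to them.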
1590 Error init(uint32_t InitialSize, int NumHSAQueues, int HSAQueueSize) {
1591 Queues = std::vector<AMDGPUQueueTy>(NumHSAQueues);
1592 QueueSize = HSAQueueSize;
1593 MaxNumQueues = NumHSAQueues;
1594 // Initialize one queue eagerly
1595 if (auto Err = Queues.front().init(Agent, QueueSize))
1596 return Err;
1597
1598 return GenericDeviceResourceManagerTy::init(InitialSize);
1599 }
1600
1601 /// Deinitialize the resource pool and delete all resources. This function
1602 /// must be called before the destructor.
1603 Error deinit() override {
1604 // De-init all queues
1605 for (AMDGPUQueueTy &Queue : Queues) {
1606 if (auto Err = Queue.deinit())
1607 return Err;
1608 }
1609
1610 return GenericDeviceResourceManagerTy::deinit();
1611 }
1612
1613 /// Get a single stream from the pool or create new resources.
1614 virtual Error getResource(AMDGPUStreamTy *&StreamHandle) override {
1615 return getResourcesImpl(1, &StreamHandle, [this](AMDGPUStreamTy *&Handle) {
      return assignNextQueue(Handle);
1617 });
1618 }
1619
1620 /// Return stream to the pool.
1621 virtual Error returnResource(AMDGPUStreamTy *StreamHandle) override {
1622 return returnResourceImpl(StreamHandle, [](AMDGPUStreamTy *Handle) {
1623 Handle->Queue->removeUser();
1624 return Plugin::success();
1625 });
1626 }
1627
1628private:
  /// Search for and assign a preferably idle queue to the given Stream. If
  /// there is no queue without current users, choose the queue with the lowest
  /// user count. If utilization tracking is disabled, use round-robin
  /// selection.
1632 inline Error assignNextQueue(AMDGPUStreamTy *Stream) {
    // Start from zero when tracking utilization; otherwise, use round-robin
    // selection.
1634 uint32_t Index = OMPX_QueueTracking ? 0 : NextQueue++ % MaxNumQueues;
1635
1636 if (OMPX_QueueTracking) {
1637 // Find the least used queue.
1638 for (uint32_t I = 0; I < MaxNumQueues; ++I) {
1639 // Early exit when an initialized queue is idle.
1640 if (Queues[I].isInitialized() && Queues[I].getUserCount() == 0) {
1641 Index = I;
1642 break;
1643 }
1644
1645 // Update the least used queue.
1646 if (Queues[Index].getUserCount() > Queues[I].getUserCount())
1647 Index = I;
1648 }
1649 }
1650
1651 // Make sure the queue is initialized, then add user & assign.
1652 if (auto Err = Queues[Index].init(Agent, QueueSize))
1653 return Err;
1654 Queues[Index].addUser();
1655 Stream->Queue = &Queues[Index];
1656
1657 return Plugin::success();
1658 }
1659
1660 /// Envar for controlling the tracking of busy HSA queues.
1661 BoolEnvar OMPX_QueueTracking;
1662
1663 /// The next queue index to use for round robin selection.
1664 uint32_t NextQueue;
1665
1666 /// The queues which are assigned to requested streams.
1667 std::vector<AMDGPUQueueTy> Queues;
1668
1669 /// The corresponding device as HSA agent.
1670 hsa_agent_t Agent;
1671
1672 /// The maximum number of queues.
1673 int MaxNumQueues;
1674
1675 /// The size of created queues.
1676 int QueueSize;
1677};
1678
1679/// Abstract class that holds the common members of the actual kernel devices
1680/// and the host device. Both types should inherit from this class.
1681struct AMDGenericDeviceTy {
1682 AMDGenericDeviceTy() {}
1683
1684 virtual ~AMDGenericDeviceTy() {}
1685
1686 /// Create all memory pools which the device has access to and classify them.
1687 Error initMemoryPools() {
1688 // Retrieve all memory pools from the device agent(s).
1689 Error Err = retrieveAllMemoryPools();
1690 if (Err)
1691 return Err;
1692
1693 for (AMDGPUMemoryPoolTy *MemoryPool : AllMemoryPools) {
1694 // Initialize the memory pool and retrieve some basic info.
1695 Error Err = MemoryPool->init();
1696 if (Err)
1697 return Err;
1698
1699 if (!MemoryPool->isGlobal())
1700 continue;
1701
1702 // Classify the memory pools depending on their properties.
1703 if (MemoryPool->isFineGrained()) {
1704 FineGrainedMemoryPools.push_back(MemoryPool);
1705 if (MemoryPool->supportsKernelArgs())
1706 ArgsMemoryPools.push_back(MemoryPool);
1707 } else if (MemoryPool->isCoarseGrained()) {
1708 CoarseGrainedMemoryPools.push_back(MemoryPool);
1709 }
1710 }
1711 return Plugin::success();
1712 }
1713
1714 /// Destroy all memory pools.
1715 Error deinitMemoryPools() {
1716 for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools)
1717 delete Pool;
1718
1719 AllMemoryPools.clear();
1720 FineGrainedMemoryPools.clear();
1721 CoarseGrainedMemoryPools.clear();
1722 ArgsMemoryPools.clear();
1723
1724 return Plugin::success();
1725 }
1726
1727 /// Retrieve and construct all memory pools from the device agent(s).
1728 virtual Error retrieveAllMemoryPools() = 0;
1729
1730 /// Get the device agent.
1731 virtual hsa_agent_t getAgent() const = 0;
1732
1733protected:
1734 /// Array of all memory pools available to the host agents.
1735 llvm::SmallVector<AMDGPUMemoryPoolTy *> AllMemoryPools;
1736
1737 /// Array of fine-grained memory pools available to the host agents.
1738 llvm::SmallVector<AMDGPUMemoryPoolTy *> FineGrainedMemoryPools;
1739
1740 /// Array of coarse-grained memory pools available to the host agents.
1741 llvm::SmallVector<AMDGPUMemoryPoolTy *> CoarseGrainedMemoryPools;
1742
1743 /// Array of kernel args memory pools available to the host agents.
1744 llvm::SmallVector<AMDGPUMemoryPoolTy *> ArgsMemoryPools;
1745};
1746
1747/// Class representing the host device. This host device may have more than one
1748/// HSA host agent. We aggregate all its resources into the same instance.
1749struct AMDHostDeviceTy : public AMDGenericDeviceTy {
1750 /// Create a host device from an array of host agents.
1751 AMDHostDeviceTy(AMDGPUPluginTy &Plugin,
1752 const llvm::SmallVector<hsa_agent_t> &HostAgents)
1753 : AMDGenericDeviceTy(), Agents(HostAgents), ArgsMemoryManager(Plugin),
1754 PinnedMemoryManager(Plugin) {
1755 assert(HostAgents.size() && "No host agent found");
1756 }
1757
1758 /// Initialize the host device memory pools and the memory managers for
1759 /// kernel args and host pinned memory allocations.
1760 Error init() {
1761 if (auto Err = initMemoryPools())
1762 return Err;
1763
1764 if (auto Err = ArgsMemoryManager.init(getArgsMemoryPool()))
1765 return Err;
1766
1767 if (auto Err = PinnedMemoryManager.init(getFineGrainedMemoryPool()))
1768 return Err;
1769
1770 return Plugin::success();
1771 }
1772
1773 /// Deinitialize memory pools and managers.
1774 Error deinit() {
1775 if (auto Err = deinitMemoryPools())
1776 return Err;
1777
1778 if (auto Err = ArgsMemoryManager.deinit())
1779 return Err;
1780
1781 if (auto Err = PinnedMemoryManager.deinit())
1782 return Err;
1783
1784 return Plugin::success();
1785 }
1786
1787 /// Retrieve and construct all memory pools from the host agents.
1788 Error retrieveAllMemoryPools() override {
1789 // Iterate through the available pools across the host agents.
1790 for (hsa_agent_t Agent : Agents) {
1791 Error Err = utils::iterateAgentMemoryPools(
1792 Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) {
1793 AMDGPUMemoryPoolTy *MemoryPool =
1794 new AMDGPUMemoryPoolTy(HSAMemoryPool);
1795 AllMemoryPools.push_back(MemoryPool);
1796 return HSA_STATUS_SUCCESS;
1797 });
1798 if (Err)
1799 return Err;
1800 }
1801 return Plugin::success();
1802 }
1803
  /// Get one of the host agents. Always return the first agent.
1805 hsa_agent_t getAgent() const override { return Agents[0]; }
1806
1807 /// Get a memory pool for fine-grained allocations.
1808 AMDGPUMemoryPoolTy &getFineGrainedMemoryPool() {
1809 assert(!FineGrainedMemoryPools.empty() && "No fine-grained mempool");
    // Retrieve any memory pool.
1811 return *FineGrainedMemoryPools[0];
1812 }
1813
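  /// Get a memory pool for coarse-grained allocations.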
1814 AMDGPUMemoryPoolTy &getCoarseGrainedMemoryPool() {
1815 assert(!CoarseGrainedMemoryPools.empty() && "No coarse-grained mempool");
    // Retrieve any memory pool.
1817 return *CoarseGrainedMemoryPools[0];
1818 }
1819
1820 /// Get a memory pool for kernel args allocations.
1821 AMDGPUMemoryPoolTy &getArgsMemoryPool() {
1822 assert(!ArgsMemoryPools.empty() && "No kernelargs mempool");
1823 // Retrieve any memory pool.
1824 return *ArgsMemoryPools[0];
1825 }
1826
1827 /// Getters for kernel args and host pinned memory managers.
1828 AMDGPUMemoryManagerTy &getArgsMemoryManager() { return ArgsMemoryManager; }
1829 AMDGPUMemoryManagerTy &getPinnedMemoryManager() {
1830 return PinnedMemoryManager;
1831 }
1832
1833private:
1834 /// Array of agents on the host side.
1835 const llvm::SmallVector<hsa_agent_t> Agents;
1836
1837 // Memory manager for kernel arguments.
1838 AMDGPUMemoryManagerTy ArgsMemoryManager;
1839
1840 // Memory manager for pinned memory.
1841 AMDGPUMemoryManagerTy PinnedMemoryManager;
1842};
1843
1844/// Class implementing the AMDGPU device functionalities which derives from the
1845/// generic device class.
1846struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
1847 // Create an AMDGPU device with a device id and default AMDGPU grid values.
1848 AMDGPUDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices,
1849 AMDHostDeviceTy &HostDevice, hsa_agent_t Agent)
1850 : GenericDeviceTy(Plugin, DeviceId, NumDevices, {0}),
1851 AMDGenericDeviceTy(),
1852 OMPX_NumQueues("LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES", 4),
1853 OMPX_QueueSize("LIBOMPTARGET_AMDGPU_HSA_QUEUE_SIZE", 512),
1854 OMPX_DefaultTeamsPerCU("LIBOMPTARGET_AMDGPU_TEAMS_PER_CU", 4),
1855 OMPX_MaxAsyncCopyBytes("LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES",
1856 1 * 1024 * 1024), // 1MB
1857 OMPX_InitialNumSignals("LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS",
1858 64),
1859 OMPX_StreamBusyWait("LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT", 2000000),
1860 OMPX_UseMultipleSdmaEngines(
1861 "LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES", false),
1862 OMPX_ApuMaps("OMPX_APU_MAPS", false), AMDGPUStreamManager(*this, Agent),
1863 AMDGPUEventManager(*this), AMDGPUSignalManager(*this), Agent(Agent),
1864 HostDevice(HostDevice) {}
1865
1866 ~AMDGPUDeviceTy() {}
1867
1868 /// Initialize the device, its resources and get its properties.
1869 Error initImpl(GenericPluginTy &Plugin) override {
1870 // First setup all the memory pools.
1871 if (auto Err = initMemoryPools())
1872 return Err;
1873
1874 char GPUName[64];
1875 if (auto Err = getDeviceAttr(HSA_AGENT_INFO_NAME, GPUName))
1876 return Err;
1877 ComputeUnitKind = GPUName;
1878
1879 // Get the wavefront size.
1880 uint32_t WavefrontSize = 0;
1881 if (auto Err = getDeviceAttr(HSA_AGENT_INFO_WAVEFRONT_SIZE, WavefrontSize))
1882 return Err;
1883 GridValues.GV_Warp_Size = WavefrontSize;
1884
1885 // Get the frequency of the steady clock. If the attribute is missing
1886 // assume running on an older libhsa and default to 0, omp_get_wtime
1887 // will be inaccurate but otherwise programs can still run.
1888 if (getDeviceAttrRaw(HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY,
1889 ClockFrequency) != HSA_STATUS_SUCCESS)
1890 ClockFrequency = 0;
1891
    // Load the grid values depending on the wavefront size.
1893 if (WavefrontSize == 32)
1894 GridValues = getAMDGPUGridValues<32>();
1895 else if (WavefrontSize == 64)
1896 GridValues = getAMDGPUGridValues<64>();
1897 else
1898 return Plugin::error("Unexpected AMDGPU wavefront %d", WavefrontSize);
1899
1900 // Get maximum number of workitems per workgroup.
1901 uint16_t WorkgroupMaxDim[3];
1902 if (auto Err =
1903 getDeviceAttr(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgroupMaxDim))
1904 return Err;
1905 GridValues.GV_Max_WG_Size = WorkgroupMaxDim[0];
1906
1907 // Get maximum number of workgroups.
1908 hsa_dim3_t GridMaxDim;
1909 if (auto Err = getDeviceAttr(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim))
1910 return Err;
1911
1912 GridValues.GV_Max_Teams = GridMaxDim.x / GridValues.GV_Max_WG_Size;
1913 if (GridValues.GV_Max_Teams == 0)
1914 return Plugin::error("Maximum number of teams cannot be zero");
1915
1916 // Compute the default number of teams.
1917 uint32_t ComputeUnits = 0;
1918 if (auto Err =
1919 getDeviceAttr(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, ComputeUnits))
1920 return Err;
1921 GridValues.GV_Default_Num_Teams = ComputeUnits * OMPX_DefaultTeamsPerCU;
1922
1923 uint32_t WavesPerCU = 0;
1924 if (auto Err =
1925 getDeviceAttr(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, WavesPerCU))
1926 return Err;
1927 HardwareParallelism = ComputeUnits * WavesPerCU;
1928
1929 // Get maximum size of any device queues and maximum number of queues.
1930 uint32_t MaxQueueSize;
1931 if (auto Err = getDeviceAttr(HSA_AGENT_INFO_QUEUE_MAX_SIZE, MaxQueueSize))
1932 return Err;
1933
1934 uint32_t MaxQueues;
1935 if (auto Err = getDeviceAttr(HSA_AGENT_INFO_QUEUES_MAX, MaxQueues))
1936 return Err;
1937
1938 // Compute the number of queues and their size.
1939 OMPX_NumQueues = std::max(1U, std::min(OMPX_NumQueues.get(), MaxQueues));
1940 OMPX_QueueSize = std::min(OMPX_QueueSize.get(), MaxQueueSize);
1941
1942 // Initialize stream pool.
1943 if (auto Err = AMDGPUStreamManager.init(OMPX_InitialNumStreams,
1944 OMPX_NumQueues, OMPX_QueueSize))
1945 return Err;
1946
1947 // Initialize event pool.
1948 if (auto Err = AMDGPUEventManager.init(OMPX_InitialNumEvents))
1949 return Err;
1950
1951 // Initialize signal pool.
1952 if (auto Err = AMDGPUSignalManager.init(OMPX_InitialNumSignals))
1953 return Err;
1954
    // Detect if XNACK is enabled.
    auto TargetTripleAndFeaturesOrError =
        utils::getTargetTripleAndFeatures(Agent);
    if (!TargetTripleAndFeaturesOrError)
      return TargetTripleAndFeaturesOrError.takeError();
    if (static_cast<StringRef>(*TargetTripleAndFeaturesOrError)
            .contains("xnack+"))
      IsXnackEnabled = true;
1963
    // Detect if the device is an APU.
1965 if (auto Err = checkIfAPU())
1966 return Err;
1967
1968 return Plugin::success();
1969 }
1970
1971 /// Deinitialize the device and release its resources.
1972 Error deinitImpl() override {
1973 // Deinitialize the stream and event pools.
1974 if (auto Err = AMDGPUStreamManager.deinit())
1975 return Err;
1976
1977 if (auto Err = AMDGPUEventManager.deinit())
1978 return Err;
1979
1980 if (auto Err = AMDGPUSignalManager.deinit())
1981 return Err;
1982
1983 // Close modules if necessary.
1984 if (!LoadedImages.empty()) {
1985 // Each image has its own module.
1986 for (DeviceImageTy *Image : LoadedImages) {
1987 AMDGPUDeviceImageTy &AMDImage =
1988 static_cast<AMDGPUDeviceImageTy &>(*Image);
1989
1990 // Unload the executable of the image.
1991 if (auto Err = AMDImage.unloadExecutable())
1992 return Err;
1993 }
1994 }
1995
1996 // Invalidate agent reference.
1997 Agent = {0};
1998
1999 return Plugin::success();
2000 }
2001
2002 virtual Error callGlobalConstructors(GenericPluginTy &Plugin,
2003 DeviceImageTy &Image) override {
2004 GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
2005 if (Handler.isSymbolInImage(*this, Image, "amdgcn.device.fini"))
2006 Image.setPendingGlobalDtors();
2007
2008 return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/true);
2009 }
2010
2011 virtual Error callGlobalDestructors(GenericPluginTy &Plugin,
2012 DeviceImageTy &Image) override {
2013 if (Image.hasPendingGlobalDtors())
2014 return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/false);
2015 return Plugin::success();
2016 }
2017
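  /// Get the time (in microseconds) a stream busy-waits before entering a
  /// blocking HSA wait state. See OMPX_StreamBusyWait.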
2018 const uint64_t getStreamBusyWaitMicroseconds() const {
2019 return OMPX_StreamBusyWait;
2020 }
2021
2022 Expected<std::unique_ptr<MemoryBuffer>>
2023 doJITPostProcessing(std::unique_ptr<MemoryBuffer> MB) const override {
2024
2025 // TODO: We should try to avoid materialization but there seems to be no
2026 // good linker interface w/o file i/o.
2027 SmallString<128> LinkerInputFilePath;
2028 std::error_code EC = sys::fs::createTemporaryFile("amdgpu-pre-link-jit",
2029 "o", LinkerInputFilePath);
2030 if (EC)
2031 return Plugin::error("Failed to create temporary file for linker");
2032
2033 // Write the file's contents to the output file.
2034 Expected<std::unique_ptr<FileOutputBuffer>> OutputOrErr =
2035 FileOutputBuffer::create(LinkerInputFilePath, MB->getBuffer().size());
2036 if (!OutputOrErr)
2037 return OutputOrErr.takeError();
2038 std::unique_ptr<FileOutputBuffer> Output = std::move(*OutputOrErr);
    llvm::copy(MB->getBuffer(), Output->getBufferStart());
2040 if (Error E = Output->commit())
2041 return std::move(E);
2042
2043 SmallString<128> LinkerOutputFilePath;
2044 EC = sys::fs::createTemporaryFile("amdgpu-pre-link-jit", "so",
2045 LinkerOutputFilePath);
2046 if (EC)
2047 return Plugin::error("Failed to create temporary file for linker");
2048
2049 const auto &ErrorOrPath = sys::findProgramByName("lld");
2050 if (!ErrorOrPath)
      return createStringError(inconvertibleErrorCode(),
                               "Failed to find `lld` on the PATH.");
2053
2054 std::string LLDPath = ErrorOrPath.get();
2055 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(),
         "Using `%s` to link JITed amdgcn output.", LLDPath.c_str());
2057
2058 std::string MCPU = "-plugin-opt=mcpu=" + getComputeUnitKind();
2059 StringRef Args[] = {LLDPath,
2060 "-flavor",
2061 "gnu",
2062 "--no-undefined",
2063 "-shared",
2064 MCPU,
2065 "-o",
2066 LinkerOutputFilePath.data(),
2067 LinkerInputFilePath.data()};
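    // The resulting link step is roughly equivalent to invoking:
    //   lld -flavor gnu --no-undefined -shared -plugin-opt=mcpu=<gfx arch> \
    //       -o <LinkerOutputFilePath> <LinkerInputFilePath>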
2068
2069 std::string Error;
2070 int RC = sys::ExecuteAndWait(LLDPath, Args, std::nullopt, {}, 0, 0, &Error);
2071 if (RC)
2072 return Plugin::error("Linking optimized bitcode failed: %s",
2073 Error.c_str());
2074
2075 auto BufferOrErr = MemoryBuffer::getFileOrSTDIN(LinkerOutputFilePath);
2076 if (!BufferOrErr)
2077 return Plugin::error("Failed to open temporary file for lld");
2078
2079 // Clean up the temporary files afterwards.
2080 if (sys::fs::remove(LinkerOutputFilePath))
2081 return Plugin::error("Failed to remove temporary output file for lld");
2082 if (sys::fs::remove(LinkerInputFilePath))
2083 return Plugin::error("Failed to remove temporary input file for lld");
2084
2085 return std::move(*BufferOrErr);
2086 }
2087
2088 /// See GenericDeviceTy::getComputeUnitKind().
2089 std::string getComputeUnitKind() const override { return ComputeUnitKind; }
2090
2091 /// Returns the clock frequency for the given AMDGPU device.
2092 uint64_t getClockFrequency() const override { return ClockFrequency; }
2093
2094 /// Allocate and construct an AMDGPU kernel.
2095 Expected<GenericKernelTy &> constructKernel(const char *Name) override {
2096 // Allocate and construct the AMDGPU kernel.
2097 AMDGPUKernelTy *AMDGPUKernel = Plugin.allocate<AMDGPUKernelTy>();
2098 if (!AMDGPUKernel)
2099 return Plugin::error("Failed to allocate memory for AMDGPU kernel");
2100
2101 new (AMDGPUKernel) AMDGPUKernelTy(Name);
2102
2103 return *AMDGPUKernel;
2104 }
2105
2106 /// Set the current context to this device's context. Do nothing since the
2107 /// AMDGPU devices do not have the concept of contexts.
2108 Error setContext() override { return Plugin::success(); }
2109
2110 /// AMDGPU returns the product of the number of compute units and the waves
2111 /// per compute unit.
2112 uint64_t getHardwareParallelism() const override {
2113 return HardwareParallelism;
2114 }
2115
2116 /// We want to set up the RPC server for host services to the GPU if it is
  /// available.
2118 bool shouldSetupRPCServer() const override {
2119 return libomptargetSupportsRPC();
2120 }
2121
  /// The RPC interface should have enough space for all available parallelism.
2123 uint64_t requestedRPCPortCount() const override {
2124 return getHardwareParallelism();
2125 }
2126
  /// Get the stream of the asynchronous info structure or get a new one.
2128 Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper,
2129 AMDGPUStreamTy *&Stream) {
2130 // Get the stream (if any) from the async info.
2131 Stream = AsyncInfoWrapper.getQueueAs<AMDGPUStreamTy *>();
2132 if (!Stream) {
2133 // There was no stream; get an idle one.
2134 if (auto Err = AMDGPUStreamManager.getResource(Stream))
2135 return Err;
2136
2137 // Modify the async info's stream.
2138 AsyncInfoWrapper.setQueueAs<AMDGPUStreamTy *>(Stream);
2139 }
2140 return Plugin::success();
2141 }
2142
2143 /// Load the binary image into the device and allocate an image object.
2144 Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage,
2145 int32_t ImageId) override {
2146 // Allocate and initialize the image object.
2147 AMDGPUDeviceImageTy *AMDImage = Plugin.allocate<AMDGPUDeviceImageTy>();
2148 new (AMDImage) AMDGPUDeviceImageTy(ImageId, *this, TgtImage);
2149
2150 // Load the HSA executable.
    if (Error Err = AMDImage->loadExecutable(*this))
2152 return std::move(Err);
2153
2154 return AMDImage;
2155 }
2156
2157 /// Allocate memory on the device or related to the device.
2158 void *allocate(size_t Size, void *, TargetAllocTy Kind) override;
2159
2160 /// Deallocate memory on the device or related to the device.
2161 int free(void *TgtPtr, TargetAllocTy Kind) override {
2162 if (TgtPtr == nullptr)
2163 return OFFLOAD_SUCCESS;
2164
2165 AMDGPUMemoryPoolTy *MemoryPool = nullptr;
2166 switch (Kind) {
2167 case TARGET_ALLOC_DEFAULT:
2168 case TARGET_ALLOC_DEVICE:
2169 case TARGET_ALLOC_DEVICE_NON_BLOCKING:
2170 MemoryPool = CoarseGrainedMemoryPools[0];
2171 break;
2172 case TARGET_ALLOC_HOST:
2173 MemoryPool = &HostDevice.getFineGrainedMemoryPool();
2174 break;
2175 case TARGET_ALLOC_SHARED:
2176 MemoryPool = &HostDevice.getFineGrainedMemoryPool();
2177 break;
2178 }
2179
2180 if (!MemoryPool) {
2181 REPORT("No memory pool for the specified allocation kind\n");
2182 return OFFLOAD_FAIL;
2183 }
2184
    if (Error Err = MemoryPool->deallocate(TgtPtr)) {
      REPORT("%s\n", toString(std::move(Err)).data());
2187 return OFFLOAD_FAIL;
2188 }
2189
2190 return OFFLOAD_SUCCESS;
2191 }
2192
2193 /// Synchronize current thread with the pending operations on the async info.
2194 Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
2195 AMDGPUStreamTy *Stream =
2196 reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
2197 assert(Stream && "Invalid stream");
2198
2199 if (auto Err = Stream->synchronize())
2200 return Err;
2201
2202 // Once the stream is synchronized, return it to stream pool and reset
2203 // AsyncInfo. This is to make sure the synchronization only works for its
2204 // own tasks.
2205 AsyncInfo.Queue = nullptr;
2206 return AMDGPUStreamManager.returnResource(Stream);
2207 }
2208
2209 /// Query for the completion of the pending operations on the async info.
2210 Error queryAsyncImpl(__tgt_async_info &AsyncInfo) override {
2211 AMDGPUStreamTy *Stream =
2212 reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
2213 assert(Stream && "Invalid stream");
2214
2215 auto CompletedOrErr = Stream->query();
2216 if (!CompletedOrErr)
2217 return CompletedOrErr.takeError();
2218
    // Return if the stream did not complete yet.
2220 if (!(*CompletedOrErr))
2221 return Plugin::success();
2222
2223 // Once the stream is completed, return it to stream pool and reset
2224 // AsyncInfo. This is to make sure the synchronization only works for its
2225 // own tasks.
2226 AsyncInfo.Queue = nullptr;
2227 return AMDGPUStreamManager.returnResource(Stream);
2228 }
2229
2230 /// Pin the host buffer and return the device pointer that should be used for
2231 /// device transfers.
2232 Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override {
2233 void *PinnedPtr = nullptr;
2234
2235 hsa_status_t Status =
2236 hsa_amd_memory_lock(HstPtr, Size, nullptr, 0, &PinnedPtr);
2237 if (auto Err = Plugin::check(Status, "Error in hsa_amd_memory_lock: %s\n"))
2238 return std::move(Err);
2239
2240 return PinnedPtr;
2241 }
2242
2243 /// Unpin the host buffer.
2244 Error dataUnlockImpl(void *HstPtr) override {
2245 hsa_status_t Status = hsa_amd_memory_unlock(HstPtr);
2246 return Plugin::check(Status, "Error in hsa_amd_memory_unlock: %s\n");
2247 }
2248
2249 /// Check through the HSA runtime whether the \p HstPtr buffer is pinned.
2250 Expected<bool> isPinnedPtrImpl(void *HstPtr, void *&BaseHstPtr,
2251 void *&BaseDevAccessiblePtr,
2252 size_t &BaseSize) const override {
2253 hsa_amd_pointer_info_t Info;
2254 Info.size = sizeof(hsa_amd_pointer_info_t);
2255
2256 hsa_status_t Status = hsa_amd_pointer_info(
2257 HstPtr, &Info, /*Allocator=*/nullptr, /*num_agents_accessible=*/nullptr,
2258 /*accessible=*/nullptr);
2259 if (auto Err = Plugin::check(Status, "Error in hsa_amd_pointer_info: %s"))
2260 return std::move(Err);
2261
    // The buffer may be locked or allocated through HSA allocators. Assume
    // that the buffer is host pinned if the runtime reports an HSA type.
2264 if (Info.type != HSA_EXT_POINTER_TYPE_LOCKED &&
2265 Info.type != HSA_EXT_POINTER_TYPE_HSA)
2266 return false;
2267
2268 assert(Info.hostBaseAddress && "Invalid host pinned address");
2269 assert(Info.agentBaseAddress && "Invalid agent pinned address");
2270 assert(Info.sizeInBytes > 0 && "Invalid pinned allocation size");
2271
2272 // Save the allocation info in the output parameters.
2273 BaseHstPtr = Info.hostBaseAddress;
2274 BaseDevAccessiblePtr = Info.agentBaseAddress;
2275 BaseSize = Info.sizeInBytes;
2276
2277 return true;
2278 }
2279
2280 /// Submit data to the device (host to device transfer).
2281 Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
2282 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
2283 AMDGPUStreamTy *Stream = nullptr;
2284 void *PinnedPtr = nullptr;
2285
2286 // Use one-step asynchronous operation when host memory is already pinned.
2287 if (void *PinnedPtr =
2288 PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) {
2289 if (auto Err = getStream(AsyncInfoWrapper, Stream))
2290 return Err;
      return Stream->pushPinnedMemoryCopyAsync(TgtPtr, PinnedPtr, Size);
2292 }
2293
2294 // For large transfers use synchronous behavior.
2295 if (Size >= OMPX_MaxAsyncCopyBytes) {
2296 if (AsyncInfoWrapper.hasQueue())
2297 if (auto Err = synchronize(AsyncInfoWrapper))
2298 return Err;
2299
2300 hsa_status_t Status;
2301 Status = hsa_amd_memory_lock(const_cast<void *>(HstPtr), Size, nullptr, 0,
2302 &PinnedPtr);
2303 if (auto Err =
2304 Plugin::check(Status, "Error in hsa_amd_memory_lock: %s\n"))
2305 return Err;
2306
2307 AMDGPUSignalTy Signal;
2308 if (auto Err = Signal.init())
2309 return Err;
2310
2311 if (auto Err = utils::asyncMemCopy(useMultipleSdmaEngines(), TgtPtr,
2312 Agent, PinnedPtr, Agent, Size, 0,
2313 nullptr, Signal.get()))
2314 return Err;
2315
2316 if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
2317 return Err;
2318
2319 if (auto Err = Signal.deinit())
2320 return Err;
2321
2322 Status = hsa_amd_memory_unlock(const_cast<void *>(HstPtr));
2323 return Plugin::check(Status, "Error in hsa_amd_memory_unlock: %s\n");
2324 }
2325
2326 // Otherwise, use two-step copy with an intermediate pinned host buffer.
2327 AMDGPUMemoryManagerTy &PinnedMemoryManager =
2328 HostDevice.getPinnedMemoryManager();
    if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedPtr))
2330 return Err;
2331
2332 if (auto Err = getStream(AsyncInfoWrapper, Stream))
2333 return Err;
2334
    return Stream->pushMemoryCopyH2DAsync(TgtPtr, HstPtr, PinnedPtr, Size,
                                          PinnedMemoryManager);
2337 }
2338
2339 /// Retrieve data from the device (device to host transfer).
2340 Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
2341 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
2342 AMDGPUStreamTy *Stream = nullptr;
2343 void *PinnedPtr = nullptr;
2344
2345 // Use one-step asynchronous operation when host memory is already pinned.
2346 if (void *PinnedPtr =
2347 PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) {
2348 if (auto Err = getStream(AsyncInfoWrapper, Stream))
2349 return Err;
2350
      return Stream->pushPinnedMemoryCopyAsync(PinnedPtr, TgtPtr, Size);
2352 }
2353
2354 // For large transfers use synchronous behavior.
2355 if (Size >= OMPX_MaxAsyncCopyBytes) {
2356 if (AsyncInfoWrapper.hasQueue())
2357 if (auto Err = synchronize(AsyncInfoWrapper))
2358 return Err;
2359
2360 hsa_status_t Status;
2361 Status = hsa_amd_memory_lock(const_cast<void *>(HstPtr), Size, nullptr, 0,
2362 &PinnedPtr);
2363 if (auto Err =
2364 Plugin::check(Status, "Error in hsa_amd_memory_lock: %s\n"))
2365 return Err;
2366
2367 AMDGPUSignalTy Signal;
2368 if (auto Err = Signal.init())
2369 return Err;
2370
2371 if (auto Err = utils::asyncMemCopy(useMultipleSdmaEngines(), PinnedPtr,
2372 Agent, TgtPtr, Agent, Size, 0, nullptr,
2373 Signal.get()))
2374 return Err;
2375
2376 if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
2377 return Err;
2378
2379 if (auto Err = Signal.deinit())
2380 return Err;
2381
2382 Status = hsa_amd_memory_unlock(const_cast<void *>(HstPtr));
2383 return Plugin::check(Status, "Error in hsa_amd_memory_unlock: %s\n");
2384 }
2385
2386 // Otherwise, use two-step copy with an intermediate pinned host buffer.
2387 AMDGPUMemoryManagerTy &PinnedMemoryManager =
2388 HostDevice.getPinnedMemoryManager();
    if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedPtr))
2390 return Err;
2391
2392 if (auto Err = getStream(AsyncInfoWrapper, Stream))
2393 return Err;
2394
    return Stream->pushMemoryCopyD2HAsync(HstPtr, TgtPtr, PinnedPtr, Size,
                                          PinnedMemoryManager);
2397 }
2398
2399 /// Exchange data between two devices within the plugin.
2400 Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice,
2401 void *DstPtr, int64_t Size,
2402 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
2403 AMDGPUDeviceTy &DstDevice = static_cast<AMDGPUDeviceTy &>(DstGenericDevice);
2404
2405 // For large transfers use synchronous behavior.
2406 if (Size >= OMPX_MaxAsyncCopyBytes) {
2407 if (AsyncInfoWrapper.hasQueue())
2408 if (auto Err = synchronize(AsyncInfoWrapper))
2409 return Err;
2410
2411 AMDGPUSignalTy Signal;
2412 if (auto Err = Signal.init())
2413 return Err;
2414
2415 if (auto Err = utils::asyncMemCopy(
2416 useMultipleSdmaEngines(), DstPtr, DstDevice.getAgent(), SrcPtr,
2417 getAgent(), (uint64_t)Size, 0, nullptr, Signal.get()))
2418 return Err;
2419
2420 if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
2421 return Err;
2422
2423 return Signal.deinit();
2424 }
2425
2426 AMDGPUStreamTy *Stream = nullptr;
2427 if (auto Err = getStream(AsyncInfoWrapper, Stream))
2428 return Err;
2429 if (Size <= 0)
2430 return Plugin::success();
2431
2432 return Stream->pushMemoryCopyD2DAsync(DstPtr, DstDevice.getAgent(), SrcPtr,
2433 getAgent(), (uint64_t)Size);
2434 }
2435
2436 /// Initialize the async info for interoperability purposes.
2437 Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
2438 // TODO: Implement this function.
2439 return Plugin::success();
2440 }
2441
2442 /// Initialize the device info for interoperability purposes.
2443 Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override {
2444 DeviceInfo->Context = nullptr;
2445
2446 if (!DeviceInfo->Device)
2447 DeviceInfo->Device = reinterpret_cast<void *>(Agent.handle);
2448
2449 return Plugin::success();
2450 }
2451
2452 /// Create an event.
2453 Error createEventImpl(void **EventPtrStorage) override {
2454 AMDGPUEventTy **Event = reinterpret_cast<AMDGPUEventTy **>(EventPtrStorage);
2455 return AMDGPUEventManager.getResource(*Event);
2456 }
2457
2458 /// Destroy a previously created event.
2459 Error destroyEventImpl(void *EventPtr) override {
2460 AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);
2461 return AMDGPUEventManager.returnResource(Event);
2462 }
2463
2464 /// Record the event.
2465 Error recordEventImpl(void *EventPtr,
2466 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
2467 AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);
2468 assert(Event && "Invalid event");
2469
2470 AMDGPUStreamTy *Stream = nullptr;
2471 if (auto Err = getStream(AsyncInfoWrapper, Stream))
2472 return Err;
2473
    return Event->record(*Stream);
2475 }
2476
2477 /// Make the stream wait on the event.
2478 Error waitEventImpl(void *EventPtr,
2479 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
2480 AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);
2481
2482 AMDGPUStreamTy *Stream = nullptr;
2483 if (auto Err = getStream(AsyncInfoWrapper, Stream))
2484 return Err;
2485
    return Event->wait(*Stream);
2487 }
2488
2489 /// Synchronize the current thread with the event.
2490 Error syncEventImpl(void *EventPtr) override {
2491 return Plugin::error("Synchronize event not implemented");
2492 }
2493
2494 /// Print information about the device.
2495 Error obtainInfoImpl(InfoQueueTy &Info) override {
2496 char TmpChar[1000];
2497 const char *TmpCharPtr = "Unknown";
2498 uint16_t Major, Minor;
2499 uint32_t TmpUInt, TmpUInt2;
2500 uint32_t CacheSize[4];
2501 size_t TmpSt;
2502 bool TmpBool;
2503 uint16_t WorkgrpMaxDim[3];
2504 hsa_dim3_t GridMaxDim;
2505 hsa_status_t Status, Status2;
2506
2507 Status = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major);
2508 Status2 = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor);
2509 if (Status == HSA_STATUS_SUCCESS && Status2 == HSA_STATUS_SUCCESS)
2510 Info.add("HSA Runtime Version",
               std::to_string(Major) + "." + std::to_string(Minor));
2512
2513 Info.add("HSA OpenMP Device Number", DeviceId);
2514
2515 Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar);
2516 if (Status == HSA_STATUS_SUCCESS)
2517 Info.add("Product Name", TmpChar);
2518
2519 Status = getDeviceAttrRaw(HSA_AGENT_INFO_NAME, TmpChar);
2520 if (Status == HSA_STATUS_SUCCESS)
2521 Info.add("Device Name", TmpChar);
2522
2523 Status = getDeviceAttrRaw(HSA_AGENT_INFO_VENDOR_NAME, TmpChar);
2524 if (Status == HSA_STATUS_SUCCESS)
2525 Info.add("Vendor Name", TmpChar);
2526
2527 hsa_device_type_t DevType;
2528 Status = getDeviceAttrRaw(HSA_AGENT_INFO_DEVICE, DevType);
2529 if (Status == HSA_STATUS_SUCCESS) {
2530 switch (DevType) {
2531 case HSA_DEVICE_TYPE_CPU:
2532 TmpCharPtr = "CPU";
2533 break;
2534 case HSA_DEVICE_TYPE_GPU:
2535 TmpCharPtr = "GPU";
2536 break;
2537 case HSA_DEVICE_TYPE_DSP:
2538 TmpCharPtr = "DSP";
2539 break;
2540 }
2541 Info.add("Device Type", TmpCharPtr);
2542 }
2543
2544 Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUES_MAX, TmpUInt);
2545 if (Status == HSA_STATUS_SUCCESS)
2546 Info.add("Max Queues", TmpUInt);
2547
2548 Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUE_MIN_SIZE, TmpUInt);
2549 if (Status == HSA_STATUS_SUCCESS)
2550 Info.add("Queue Min Size", TmpUInt);
2551
2552 Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUE_MAX_SIZE, TmpUInt);
2553 if (Status == HSA_STATUS_SUCCESS)
2554 Info.add("Queue Max Size", TmpUInt);
2555
2556 // FIXME: This is deprecated according to HSA documentation. But using
2557 // hsa_agent_iterate_caches and hsa_cache_get_info breaks execution during
2558 // runtime.
2559 Status = getDeviceAttrRaw(HSA_AGENT_INFO_CACHE_SIZE, CacheSize);
2560 if (Status == HSA_STATUS_SUCCESS) {
2561 Info.add("Cache");
2562
2563 for (int I = 0; I < 4; I++)
2564 if (CacheSize[I])
2565 Info.add<InfoLevel2>("L" + std::to_string(I), CacheSize[I]);
2566 }
2567
2568 Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_CACHELINE_SIZE, TmpUInt);
2569 if (Status == HSA_STATUS_SUCCESS)
2570 Info.add("Cacheline Size", TmpUInt);
2571
2572 Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, TmpUInt);
2573 if (Status == HSA_STATUS_SUCCESS)
2574 Info.add("Max Clock Freq", TmpUInt, "MHz");
2575
2576 Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, TmpUInt);
2577 if (Status == HSA_STATUS_SUCCESS)
2578 Info.add("Compute Units", TmpUInt);
2579
2580 Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU, TmpUInt);
2581 if (Status == HSA_STATUS_SUCCESS)
2582 Info.add("SIMD per CU", TmpUInt);
2583
2584 Status = getDeviceAttrRaw(HSA_AGENT_INFO_FAST_F16_OPERATION, TmpBool);
2585 if (Status == HSA_STATUS_SUCCESS)
2586 Info.add("Fast F16 Operation", TmpBool);
2587
2588 Status = getDeviceAttrRaw(HSA_AGENT_INFO_WAVEFRONT_SIZE, TmpUInt2);
2589 if (Status == HSA_STATUS_SUCCESS)
2590 Info.add("Wavefront Size", TmpUInt2);
2591
2592 Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, TmpUInt);
2593 if (Status == HSA_STATUS_SUCCESS)
2594 Info.add("Workgroup Max Size", TmpUInt);
2595
2596 Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgrpMaxDim);
2597 if (Status == HSA_STATUS_SUCCESS) {
2598 Info.add("Workgroup Max Size per Dimension");
2599 Info.add<InfoLevel2>("x", WorkgrpMaxDim[0]);
2600 Info.add<InfoLevel2>("y", WorkgrpMaxDim[1]);
2601 Info.add<InfoLevel2>("z", WorkgrpMaxDim[2]);
2602 }
2603
2604 Status = getDeviceAttrRaw(
2605 (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, TmpUInt);
2606 if (Status == HSA_STATUS_SUCCESS) {
2607 Info.add("Max Waves Per CU", TmpUInt);
2608 Info.add("Max Work-item Per CU", TmpUInt * TmpUInt2);
2609 }
2610
2611 Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_SIZE, TmpUInt);
2612 if (Status == HSA_STATUS_SUCCESS)
2613 Info.add("Grid Max Size", TmpUInt);
2614
2615 Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim);
2616 if (Status == HSA_STATUS_SUCCESS) {
2617 Info.add("Grid Max Size per Dimension");
2618 Info.add<InfoLevel2>("x", GridMaxDim.x);
2619 Info.add<InfoLevel2>("y", GridMaxDim.y);
2620 Info.add<InfoLevel2>("z", GridMaxDim.z);
2621 }
2622
2623 Status = getDeviceAttrRaw(HSA_AGENT_INFO_FBARRIER_MAX_SIZE, TmpUInt);
2624 if (Status == HSA_STATUS_SUCCESS)
2625 Info.add("Max fbarriers/Workgrp", TmpUInt);
2626
2627 Info.add("Memory Pools");
2628 for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
2629 std::string TmpStr, TmpStr2;
2630
2631 if (Pool->isGlobal())
2632 TmpStr = "Global";
2633 else if (Pool->isReadOnly())
2634 TmpStr = "ReadOnly";
2635 else if (Pool->isPrivate())
2636 TmpStr = "Private";
2637 else if (Pool->isGroup())
2638 TmpStr = "Group";
2639 else
2640 TmpStr = "Unknown";
2641
2642 Info.add<InfoLevel2>(std::string("Pool ") + TmpStr);
2643
2644 if (Pool->isGlobal()) {
2645 if (Pool->isFineGrained())
2646 TmpStr2 += "Fine Grained ";
2647 if (Pool->isCoarseGrained())
2648 TmpStr2 += "Coarse Grained ";
2649 if (Pool->supportsKernelArgs())
2650 TmpStr2 += "Kernarg ";
2651
2652 Info.add<InfoLevel3>("Flags", TmpStr2);
2653 }
2654
2655 Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, TmpSt);
2656 if (Status == HSA_STATUS_SUCCESS)
2657 Info.add<InfoLevel3>("Size", TmpSt, "bytes");
2658
2659 Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
2660 TmpBool);
2661 if (Status == HSA_STATUS_SUCCESS)
2662 Info.add<InfoLevel3>("Allocatable", TmpBool);
2663
2664 Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE,
2665 TmpSt);
2666 if (Status == HSA_STATUS_SUCCESS)
2667 Info.add<InfoLevel3>("Runtime Alloc Granule", TmpSt, "bytes");
2668
2669 Status = Pool->getAttrRaw(
2670 HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, TmpSt);
2671 if (Status == HSA_STATUS_SUCCESS)
2672 Info.add<InfoLevel3>("Runtime Alloc Alignment", TmpSt, "bytes");
2673
2674 Status =
2675 Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, TmpBool);
2676 if (Status == HSA_STATUS_SUCCESS)
        Info.add<InfoLevel3>("Accessible by all", TmpBool);
2678 }
2679
2680 Info.add("ISAs");
2681 auto Err = utils::iterateAgentISAs(getAgent(), [&](hsa_isa_t ISA) {
2682 Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, TmpChar);
2683 if (Status == HSA_STATUS_SUCCESS)
2684 Info.add<InfoLevel2>("Name", TmpChar);
2685
2686 return Status;
2687 });
2688
2689 // Silently consume the error.
2690 if (Err)
2691 consumeError(std::move(Err));
2692
2693 return Plugin::success();
2694 }
2695
  /// Returns true if automatic zero-copy is the best configuration for the
  /// current architecture.
2698 /// On AMDGPUs, automatic zero-copy is turned on
2699 /// when running on an APU with XNACK (unified memory) support
2700 /// enabled. On discrete GPUs, automatic zero-copy is triggered
2701 /// if the user sets the environment variable OMPX_APU_MAPS=1
2702 /// and if XNACK is enabled. The rationale is that zero-copy
2703 /// is the best configuration (performance, memory footprint) on APUs,
2704 /// while it is often not the best on discrete GPUs.
2705 /// XNACK can be enabled with a kernel boot parameter or with
2706 /// the HSA_XNACK environment variable.
2707 bool useAutoZeroCopyImpl() override {
2708 return ((IsAPU || OMPX_ApuMaps) && IsXnackEnabled);
2709 }
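  // Illustrative usage (not a default): on a discrete GPU, automatic
  // zero-copy can be requested by enabling XNACK and setting the envar, e.g.:
  //   HSA_XNACK=1 OMPX_APU_MAPS=1 ./my_offload_app
  // where ./my_offload_app is a placeholder for the user's offload binary.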
2710
2711 /// Getters and setters for stack and heap sizes.
2712 Error getDeviceStackSize(uint64_t &Value) override {
2713 Value = StackSize;
2714 return Plugin::success();
2715 }
2716 Error setDeviceStackSize(uint64_t Value) override {
2717 StackSize = Value;
2718 return Plugin::success();
2719 }
2720 Error getDeviceHeapSize(uint64_t &Value) override {
2721 Value = DeviceMemoryPoolSize;
2722 return Plugin::success();
2723 }
2724 Error setDeviceHeapSize(uint64_t Value) override {
2725 for (DeviceImageTy *Image : LoadedImages)
2726 if (auto Err = setupDeviceMemoryPool(Plugin, *Image, Value))
2727 return Err;
2728 DeviceMemoryPoolSize = Value;
2729 return Plugin::success();
2730 }
2731 Error getDeviceMemorySize(uint64_t &Value) override {
2732 for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
2733 if (Pool->isGlobal()) {
2734 hsa_status_t Status =
2735 Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, Value);
2736 return Plugin::check(Status, "Error in getting device memory size: %s");
2737 }
2738 }
2739 return Plugin::error("getDeviceMemorySize:: no global pool");
2740 }
2741
2742 /// AMDGPU-specific function to get device attributes.
2743 template <typename Ty> Error getDeviceAttr(uint32_t Kind, Ty &Value) {
2744 hsa_status_t Status =
2745 hsa_agent_get_info(Agent, (hsa_agent_info_t)Kind, &Value);
2746 return Plugin::check(Status, "Error in hsa_agent_get_info: %s");
2747 }
2748
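  /// AMDGPU-specific function to get device attributes without wrapping the
  /// result in an Error; returns the raw HSA status instead.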
2749 template <typename Ty>
2750 hsa_status_t getDeviceAttrRaw(uint32_t Kind, Ty &Value) {
2751 return hsa_agent_get_info(Agent, (hsa_agent_info_t)Kind, &Value);
2752 }
2753
2754 /// Get the device agent.
2755 hsa_agent_t getAgent() const override { return Agent; }
2756
2757 /// Get the signal manager.
2758 AMDGPUSignalManagerTy &getSignalManager() { return AMDGPUSignalManager; }
2759
2760 /// Retrieve and construct all memory pools of the device agent.
2761 Error retrieveAllMemoryPools() override {
2762 // Iterate through the available pools of the device agent.
2763 return utils::iterateAgentMemoryPools(
2764 Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) {
2765 AMDGPUMemoryPoolTy *MemoryPool =
2766 Plugin.allocate<AMDGPUMemoryPoolTy>();
2767 new (MemoryPool) AMDGPUMemoryPoolTy(HSAMemoryPool);
2768 AllMemoryPools.push_back(MemoryPool);
2769 return HSA_STATUS_SUCCESS;
2770 });
2771 }
2772
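  /// Returns whether multiple SDMA engines may be used for memory copies. See
  /// OMPX_UseMultipleSdmaEngines.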
2773 bool useMultipleSdmaEngines() const { return OMPX_UseMultipleSdmaEngines; }
2774
2775private:
2776 using AMDGPUEventRef = AMDGPUResourceRef<AMDGPUEventTy>;
2777 using AMDGPUEventManagerTy = GenericDeviceResourceManagerTy<AMDGPUEventRef>;
2778
2779 /// Common method to invoke a single threaded constructor or destructor
2780 /// kernel by name.
2781 Error callGlobalCtorDtorCommon(GenericPluginTy &Plugin, DeviceImageTy &Image,
2782 bool IsCtor) {
2783 const char *KernelName =
2784 IsCtor ? "amdgcn.device.init" : "amdgcn.device.fini";
2785 // Perform a quick check for the named kernel in the image. The kernel
2786 // should be created by the 'amdgpu-lower-ctor-dtor' pass.
2787 GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
2788 if (IsCtor && !Handler.isSymbolInImage(*this, Image, KernelName))
2789 return Plugin::success();
2790
2791 // Allocate and construct the AMDGPU kernel.
2792 AMDGPUKernelTy AMDGPUKernel(KernelName);
2793 if (auto Err = AMDGPUKernel.init(*this, Image))
2794 return Err;
2795
2796 AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);
2797
2798 KernelArgsTy KernelArgs = {};
2799 if (auto Err = AMDGPUKernel.launchImpl(*this, /*NumThread=*/1u,
2800 /*NumBlocks=*/1ul, KernelArgs,
2801 /*Args=*/nullptr, AsyncInfoWrapper))
2802 return Err;
2803
2804 Error Err = Plugin::success();
2805 AsyncInfoWrapper.finalize(Err);
2806
2807 return Err;
2808 }
2809
2810 /// Detect if current architecture is an APU.
2811 Error checkIfAPU() {
2812 // TODO: replace with ROCr API once it becomes available.
2813 llvm::StringRef StrGfxName(ComputeUnitKind);
2814 IsAPU = llvm::StringSwitch<bool>(StrGfxName)
                .Case("gfx940", true)
                .Default(false);
2817 if (IsAPU)
2818 return Plugin::success();
2819
2820 bool MayBeAPU = llvm::StringSwitch<bool>(StrGfxName)
                        .Case("gfx942", true)
                        .Default(false);
2823 if (!MayBeAPU)
2824 return Plugin::success();
2825
    // The device can be either MI300A or MI300X.
2827 uint32_t ChipID = 0;
2828 if (auto Err = getDeviceAttr(HSA_AMD_AGENT_INFO_CHIP_ID, ChipID))
2829 return Err;
2830
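    // Assumption: an even chip ID identifies the APU variant (e.g., MI300A),
    // while an odd chip ID identifies the discrete variant (e.g., MI300X).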
2831 if (!(ChipID & 0x1)) {
2832 IsAPU = true;
2833 return Plugin::success();
2834 }
2835 return Plugin::success();
2836 }
2837
2838 /// Envar for controlling the number of HSA queues per device. High number of
2839 /// queues may degrade performance.
2840 UInt32Envar OMPX_NumQueues;
2841
2842 /// Envar for controlling the size of each HSA queue. The size is the number
2843 /// of HSA packets a queue is expected to hold. It is also the number of HSA
  /// packets that can be pushed into each queue without waiting for the
  /// driver to process them.
2846 UInt32Envar OMPX_QueueSize;
2847
2848 /// Envar for controlling the default number of teams relative to the number
2849 /// of compute units (CUs) the device has:
2850 /// #default_teams = OMPX_DefaultTeamsPerCU * #CUs.
2851 UInt32Envar OMPX_DefaultTeamsPerCU;
2852
2853 /// Envar specifying the maximum size in bytes where the memory copies are
2854 /// asynchronous operations. Up to this transfer size, the memory copies are
  /// asynchronous operations pushed to the corresponding stream. For larger
2856 /// transfers, they are synchronous transfers.
2857 UInt32Envar OMPX_MaxAsyncCopyBytes;
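  // Illustrative example (not a default): setting
  //   LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES=4194304
  // raises the asynchronous copy threshold to 4 MB.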
2858
2859 /// Envar controlling the initial number of HSA signals per device. There is
2860 /// one manager of signals per device managing several pre-allocated signals.
2861 /// These signals are mainly used by AMDGPU streams. If needed, more signals
2862 /// will be created.
2863 UInt32Envar OMPX_InitialNumSignals;
2864
  /// Envar to set the time to wait in active state before switching to a
  /// blocked state. The default of 2000000 busy-waits for 2 seconds before
  /// going into a blocking HSA wait state. The unit for this variable is
  /// microseconds.
2869 UInt32Envar OMPX_StreamBusyWait;
2870
2871 /// Use ROCm 5.7 interface for multiple SDMA engines
2872 BoolEnvar OMPX_UseMultipleSdmaEngines;
2873
2874 /// Value of OMPX_APU_MAPS env var used to force
2875 /// automatic zero-copy behavior on non-APU GPUs.
2876 BoolEnvar OMPX_ApuMaps;
2877
2878 /// Stream manager for AMDGPU streams.
2879 AMDGPUStreamManagerTy AMDGPUStreamManager;
2880
2881 /// Event manager for AMDGPU events.
2882 AMDGPUEventManagerTy AMDGPUEventManager;
2883
2884 /// Signal manager for AMDGPU signals.
2885 AMDGPUSignalManagerTy AMDGPUSignalManager;
2886
  /// The agent handle corresponding to the device.
2888 hsa_agent_t Agent;
2889
2890 /// The GPU architecture.
2891 std::string ComputeUnitKind;
2892
2893 /// The frequency of the steady clock inside the device.
2894 uint64_t ClockFrequency;
2895
2896 /// The total number of concurrent work items that can be running on the GPU.
2897 uint64_t HardwareParallelism;
2898
2899 /// Reference to the host device.
2900 AMDHostDeviceTy &HostDevice;
2901
2902 /// The current size of the global device memory pool (managed by us).
  uint64_t DeviceMemoryPoolSize = 1L << 29L /* 512MB */;
2904
2905 /// The current size of the stack that will be used in cases where it could
2906 /// not be statically determined.
2907 uint64_t StackSize = 16 * 1024 /* 16 KB */;
2908
2909 /// Is the plugin associated with an APU?
2910 bool IsAPU = false;
2911
  /// True if the system is configured with XNACK enabled, false otherwise.
2914 bool IsXnackEnabled = false;
2915};
2916
2917Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
2918 hsa_status_t Status;
2919 Status = hsa_code_object_deserialize(getStart(), getSize(), "", &CodeObject);
2920 if (auto Err =
2921 Plugin::check(Status, "Error in hsa_code_object_deserialize: %s"))
2922 return Err;
2923
2924 Status = hsa_executable_create_alt(
2925 HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, "", &Executable);
2926 if (auto Err =
2927 Plugin::check(Status, "Error in hsa_executable_create_alt: %s"))
2928 return Err;
2929
2930 Status = hsa_executable_load_code_object(Executable, Device.getAgent(),
2931 CodeObject, "");
2932 if (auto Err =
2933 Plugin::check(Status, "Error in hsa_executable_load_code_object: %s"))
2934 return Err;
2935
2936 Status = hsa_executable_freeze(Executable, "");
2937 if (auto Err = Plugin::check(Status, "Error in hsa_executable_freeze: %s"))
2938 return Err;
2939
2940 uint32_t Result;
2941 Status = hsa_executable_validate(Executable, &Result);
2942 if (auto Err = Plugin::check(Status, "Error in hsa_executable_validate: %s"))
2943 return Err;
2944
2945 if (Result)
2946 return Plugin::error("Loaded HSA executable does not validate");
2947
2948 if (auto Err = utils::readAMDGPUMetaDataFromImage(
2949 getMemoryBuffer(), KernelInfoMap, ELFABIVersion))
2950 return Err;
2951
2952 return Plugin::success();
2953}
2954
2955Expected<hsa_executable_symbol_t>
2956AMDGPUDeviceImageTy::findDeviceSymbol(GenericDeviceTy &Device,
2957 StringRef SymbolName) const {
2958
2959 AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(Device);
2960 hsa_agent_t Agent = AMDGPUDevice.getAgent();
2961
2962 hsa_executable_symbol_t Symbol;
2963 hsa_status_t Status = hsa_executable_get_symbol_by_name(
2964 Executable, SymbolName.data(), &Agent, &Symbol);
2965 if (auto Err = Plugin::check(
2966 Status, "Error in hsa_executable_get_symbol_by_name(%s): %s",
2967 SymbolName.data()))
2968 return std::move(Err);
2969
2970 return Symbol;
2971}
2972
2973template <typename ResourceTy>
2974Error AMDGPUResourceRef<ResourceTy>::create(GenericDeviceTy &Device) {
2975 if (Resource)
2976 return Plugin::error("Creating an existing resource");
2977
2978 AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(Device);
2979
2980 Resource = new ResourceTy(AMDGPUDevice);
2981
2982 return Resource->init();
2983}
2984
2985AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
2986 : Agent(Device.getAgent()), Queue(nullptr),
2987 SignalManager(Device.getSignalManager()), Device(Device),
2988 // Initialize the std::deque with some empty positions.
2989 Slots(32), NextSlot(0), SyncCycle(0), RPCServer(nullptr),
2990 StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
2991 UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()) {}
2992
2993/// Class implementing the AMDGPU-specific functionalities of the global
2994/// handler.
2995struct AMDGPUGlobalHandlerTy final : public GenericGlobalHandlerTy {
2996 /// Get the metadata of a global from the device. The name and size of the
  /// global are read from DeviceGlobal and the address of the global is written
2998 /// to DeviceGlobal.
  Error getGlobalMetadataFromDevice(GenericDeviceTy &Device,
                                    DeviceImageTy &Image,
                                    GlobalTy &DeviceGlobal) override {
    AMDGPUDeviceImageTy &AMDImage = static_cast<AMDGPUDeviceImageTy &>(Image);

    // Find the symbol on the device executable.
    auto SymbolOrErr =
        AMDImage.findDeviceSymbol(Device, DeviceGlobal.getName());
    if (!SymbolOrErr)
      return SymbolOrErr.takeError();

    hsa_executable_symbol_t Symbol = *SymbolOrErr;
    hsa_symbol_kind_t SymbolType;
    hsa_status_t Status;
    uint64_t SymbolAddr;
    uint32_t SymbolSize;

    // Retrieve the type, address and size of the symbol.
    std::pair<hsa_executable_symbol_info_t, void *> RequiredInfos[] = {
        {HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &SymbolType},
        {HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &SymbolAddr},
        {HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &SymbolSize}};

    for (auto &Info : RequiredInfos) {
      Status = hsa_executable_symbol_get_info(Symbol, Info.first, Info.second);
      if (auto Err = Plugin::check(
              Status, "Error in hsa_executable_symbol_get_info: %s"))
        return Err;
    }

    // Check the size of the symbol.
    if (SymbolSize != DeviceGlobal.getSize())
      return Plugin::error(
          "Failed to load global '%s' due to size mismatch (%zu != %zu)",
          DeviceGlobal.getName().data(), (size_t)SymbolSize,
          (size_t)DeviceGlobal.getSize());

    // Store the symbol address on the device global metadata.
    DeviceGlobal.setPtr(reinterpret_cast<void *>(SymbolAddr));

    return Plugin::success();
  }
};
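
// Illustrative sketch (an assumption, not upstream code): how the generic
// plugin layer could use this handler to obtain the device address of a
// global of Size bytes. GlobalTy's exact constructor signature is assumed
// here; getPtr()/setPtr() are the accessors used above.
//
//   GlobalTy DeviceGlobal("my_device_global", Size);
//   if (auto Err =
//           Handler.getGlobalMetadataFromDevice(Device, Image, DeviceGlobal))
//     return Err;
//   void *DevicePtr = DeviceGlobal.getPtr();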

/// Class implementing the AMDGPU-specific functionalities of the plugin.
struct AMDGPUPluginTy final : public GenericPluginTy {
  /// Create an AMDGPU plugin and initialize the AMDGPU driver.
  AMDGPUPluginTy()
      : GenericPluginTy(getTripleArch()), Initialized(false),
        HostDevice(nullptr) {}

  /// This class should not be copied.
  AMDGPUPluginTy(const AMDGPUPluginTy &) = delete;
  AMDGPUPluginTy(AMDGPUPluginTy &&) = delete;

  /// Initialize the plugin and return the number of devices.
  Expected<int32_t> initImpl() override {
    hsa_status_t Status = hsa_init();
    if (Status != HSA_STATUS_SUCCESS) {
      // Cannot call hsa_status_string to get a readable error message here.
      DP("Failed to initialize AMDGPU's HSA library\n");
      return 0;
    }

    // The initialization of HSA was successful. It should be safe to call
    // HSA functions from now on, e.g., hsa_shut_down.
    Initialized = true;

#ifdef OMPT_SUPPORT
    ompt::connectLibrary();
#endif

    // Register event handler to detect memory errors on the devices.
    Status = hsa_amd_register_system_event_handler(eventHandler, nullptr);
    if (auto Err = Plugin::check(
            Status, "Error in hsa_amd_register_system_event_handler: %s"))
      return std::move(Err);

    // List of host (CPU) agents.
    llvm::SmallVector<hsa_agent_t> HostAgents;

    // Count the number of available agents.
    auto Err = utils::iterateAgents([&](hsa_agent_t Agent) {
      // Get the device type of the agent.
      hsa_device_type_t DeviceType;
      hsa_status_t Status =
          hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType);
      if (Status != HSA_STATUS_SUCCESS)
        return Status;

      // Classify the agents into kernel (GPU) and host (CPU) agents.
      if (DeviceType == HSA_DEVICE_TYPE_GPU) {
        // Ensure that the GPU agent supports kernel dispatch packets.
        hsa_agent_feature_t Features;
        Status = hsa_agent_get_info(Agent, HSA_AGENT_INFO_FEATURE, &Features);
        if (Status != HSA_STATUS_SUCCESS)
          return Status;
        if (Features & HSA_AGENT_FEATURE_KERNEL_DISPATCH)
          KernelAgents.push_back(Agent);
      } else if (DeviceType == HSA_DEVICE_TYPE_CPU) {
        HostAgents.push_back(Agent);
      }
      return HSA_STATUS_SUCCESS;
    });

    if (Err)
      return std::move(Err);

    int32_t NumDevices = KernelAgents.size();
    if (NumDevices == 0) {
      // Do not initialize if there are no devices.
      DP("There are no devices supporting AMDGPU.\n");
      return 0;
    }

    // There are kernel agents but there is no host agent. That should be
    // treated as an error.
    if (HostAgents.empty())
      return Plugin::error("No AMDGPU host agents");

    // Initialize the host device using host agents.
    HostDevice = allocate<AMDHostDeviceTy>();
    new (HostDevice) AMDHostDeviceTy(*this, HostAgents);

    // Set up the memory pools available to the host.
    if (auto Err = HostDevice->init())
      return std::move(Err);

    return NumDevices;
  }

  /// Deinitialize the plugin.
  Error deinitImpl() override {
    // The HSA runtime was not initialized, so nothing from the plugin was
    // actually initialized.
    if (!Initialized)
      return Plugin::success();

    if (HostDevice)
      if (auto Err = HostDevice->deinit())
        return Err;

    // Finalize the HSA runtime.
    hsa_status_t Status = hsa_shut_down();
    return Plugin::check(Status, "Error in hsa_shut_down: %s");
  }

  /// Creates an AMDGPU device.
  GenericDeviceTy *createDevice(GenericPluginTy &Plugin, int32_t DeviceId,
                                int32_t NumDevices) override {
    return new AMDGPUDeviceTy(Plugin, DeviceId, NumDevices, getHostDevice(),
                              getKernelAgent(DeviceId));
  }

  /// Creates an AMDGPU global handler.
  GenericGlobalHandlerTy *createGlobalHandler() override {
    return new AMDGPUGlobalHandlerTy();
  }

  Triple::ArchType getTripleArch() const override { return Triple::amdgcn; }

  /// Get the ELF code for recognizing the compatible image binary.
  uint16_t getMagicElfBits() const override { return ELF::EM_AMDGPU; }

  /// Check whether the image is compatible with an AMDGPU device.
  Expected<bool> isELFCompatible(StringRef Image) const override {
    // Get the associated architecture and flags from the ELF.
    auto ElfOrErr = ELF64LEObjectFile::create(
        MemoryBufferRef(Image, /*Identifier=*/""), /*InitContent=*/false);
    if (!ElfOrErr)
      return ElfOrErr.takeError();
    std::optional<StringRef> Processor = ElfOrErr->tryGetCPUName();

    for (hsa_agent_t Agent : KernelAgents) {
      auto TargetTripleAndFeaturesOrError =
          utils::getTargetTripleAndFeatures(Agent);
      if (!TargetTripleAndFeaturesOrError)
        return TargetTripleAndFeaturesOrError.takeError();
      if (!utils::isImageCompatibleWithEnv(Processor ? *Processor : "",
                                           ElfOrErr->getPlatformFlags(),
                                           *TargetTripleAndFeaturesOrError))
        return false;
    }
    return true;
  }

  bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) override {
    return true;
  }

  /// Get the host device instance.
  AMDHostDeviceTy &getHostDevice() {
    assert(HostDevice && "Host device not initialized");
    return *HostDevice;
  }

  /// Get the kernel agent with the corresponding agent id.
  hsa_agent_t getKernelAgent(int32_t AgentId) const {
    assert((uint32_t)AgentId < KernelAgents.size() && "Invalid agent id");
    return KernelAgents[AgentId];
  }

  /// Get the list of the available kernel agents.
  const llvm::SmallVector<hsa_agent_t> &getKernelAgents() const {
    return KernelAgents;
  }

private:
  /// Event handler that will be called by ROCr if an event is detected.
  static hsa_status_t eventHandler(const hsa_amd_event_t *Event, void *) {
    if (Event->event_type != HSA_AMD_GPU_MEMORY_FAULT_EVENT)
      return HSA_STATUS_SUCCESS;

    SmallVector<std::string> Reasons;
    uint32_t ReasonsMask = Event->memory_fault.fault_reason_mask;
    if (ReasonsMask & HSA_AMD_MEMORY_FAULT_PAGE_NOT_PRESENT)
      Reasons.emplace_back("Page not present or supervisor privilege");
    if (ReasonsMask & HSA_AMD_MEMORY_FAULT_READ_ONLY)
      Reasons.emplace_back("Write access to a read-only page");
    if (ReasonsMask & HSA_AMD_MEMORY_FAULT_NX)
      Reasons.emplace_back("Execute access to a page marked NX");
    if (ReasonsMask & HSA_AMD_MEMORY_FAULT_HOST_ONLY)
      Reasons.emplace_back("GPU attempted access to a host only page");
    if (ReasonsMask & HSA_AMD_MEMORY_FAULT_DRAMECC)
      Reasons.emplace_back("DRAM ECC failure");
    if (ReasonsMask & HSA_AMD_MEMORY_FAULT_IMPRECISE)
      Reasons.emplace_back("Can't determine the exact fault address");
    if (ReasonsMask & HSA_AMD_MEMORY_FAULT_SRAMECC)
      Reasons.emplace_back("SRAM ECC failure (ie registers, no fault address)");
    if (ReasonsMask & HSA_AMD_MEMORY_FAULT_HANG)
      Reasons.emplace_back("GPU reset following unspecified hang");

    // If we do not know the reason, report the raw fault mask instead.
    if (Reasons.empty())
      Reasons.emplace_back("Unknown (" + std::to_string(ReasonsMask) + ")");

    uint32_t Node = -1;
    hsa_agent_get_info(Event->memory_fault.agent, HSA_AGENT_INFO_NODE, &Node);

    // Abort the execution since we do not recover from this error.
    FATAL_MESSAGE(1,
                  "Memory access fault by GPU %" PRIu32 " (agent 0x%" PRIx64
                  ") at virtual address %p. Reasons: %s",
                  Node, Event->memory_fault.agent.handle,
                  (void *)Event->memory_fault.virtual_address,
                  llvm::join(Reasons, ", ").c_str());

    return HSA_STATUS_ERROR;
  }

  /// Indicate whether the HSA runtime was correctly initialized. Even if there
  /// are no available devices this boolean will be true. It indicates whether
  /// we can safely call HSA functions (e.g., hsa_shut_down).
  bool Initialized;

  /// Array of the available kernel (GPU) agents. These handles should not be
  /// stored here but in the AMDGPUDeviceTy structures directly. However, the
  /// HSA standard does not provide API functions to retrieve agents directly,
  /// only iterating functions. We cache the agents here for convenience.
  llvm::SmallVector<hsa_agent_t> KernelAgents;

  /// The device representing all HSA host agents.
  AMDHostDeviceTy *HostDevice;
};

Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
                                 uint32_t NumThreads, uint64_t NumBlocks,
                                 KernelArgsTy &KernelArgs, void *Args,
                                 AsyncInfoWrapperTy &AsyncInfoWrapper) const {
  const uint32_t KernelArgsSize = KernelArgs.NumArgs * sizeof(void *);

  if (ArgsSize < KernelArgsSize)
    return Plugin::error("Mismatch of kernel arguments size");

  // The args size reported by HSA may or may not contain the implicit args.
  // For now, assume that HSA does not consider the implicit arguments when
  // reporting the arguments of a kernel. In the worst case, we can waste
  // 56 bytes per allocation.
  uint32_t AllArgsSize = KernelArgsSize + ImplicitArgsSize;

  AMDGPUPluginTy &AMDGPUPlugin =
      static_cast<AMDGPUPluginTy &>(GenericDevice.Plugin);
  AMDHostDeviceTy &HostDevice = AMDGPUPlugin.getHostDevice();
  AMDGPUMemoryManagerTy &ArgsMemoryManager = HostDevice.getArgsMemoryManager();

  void *AllArgs = nullptr;
  if (auto Err = ArgsMemoryManager.allocate(AllArgsSize, &AllArgs))
    return Err;

  // Account for user requested dynamic shared memory.
  uint32_t GroupSize = getGroupSize();
  if (uint32_t MaxDynCGroupMem = std::max(
          KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize())) {
    GroupSize += MaxDynCGroupMem;
  }

  uint64_t StackSize;
  if (auto Err = GenericDevice.getDeviceStackSize(StackSize))
    return Err;

  // Initialize implicit arguments.
  utils::AMDGPUImplicitArgsTy *ImplArgs =
      reinterpret_cast<utils::AMDGPUImplicitArgsTy *>(
          advanceVoidPtr(AllArgs, KernelArgsSize));

  // Initialize the implicit arguments to zero.
  std::memset(ImplArgs, 0, ImplicitArgsSize);

  // Copy the explicit arguments.
  // TODO: We should expose the args memory manager alloc to the common part as
  // alternative to copying them twice.
  if (KernelArgs.NumArgs)
    std::memcpy(AllArgs, *static_cast<void **>(Args),
                sizeof(void *) * KernelArgs.NumArgs);

  AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(GenericDevice);

  AMDGPUStreamTy *Stream = nullptr;
  if (auto Err = AMDGPUDevice.getStream(AsyncInfoWrapper, Stream))
    return Err;

  // If this kernel requires an RPC server we attach its pointer to the stream.
  if (GenericDevice.getRPCServer())
    Stream->setRPCServer(GenericDevice.getRPCServer());

  // Only COV5 implicit args need to be set up. COV4 implicit args are not used.
  if (getImplicitArgsSize() == sizeof(utils::AMDGPUImplicitArgsTy)) {
    ImplArgs->BlockCountX = NumBlocks;
    ImplArgs->BlockCountY = 1;
    ImplArgs->BlockCountZ = 1;
    ImplArgs->GroupSizeX = NumThreads;
    ImplArgs->GroupSizeY = 1;
    ImplArgs->GroupSizeZ = 1;
    ImplArgs->GridDims = 1;
    ImplArgs->DynamicLdsSize = KernelArgs.DynCGroupMem;
  }

  // Push the kernel launch into the stream.
  return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
                                  GroupSize, StackSize, ArgsMemoryManager);
}
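
// For reference, a sketch of the single host-accessible buffer assembled by
// launchImpl above (layout inferred from the code; sizes are the ones used
// there):
//
//   AllArgs
//   +------------------------------+----------------------------------------+
//   | explicit kernel arguments    | implicit arguments, zero-initialized,  |
//   | NumArgs * sizeof(void *)     | ImplicitArgsSize bytes; COV5 fields    |
//   | bytes, copied from Args      | filled in only for COV5 kernels        |
//   +------------------------------+----------------------------------------+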

Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
                                             KernelArgsTy &KernelArgs,
                                             uint32_t NumThreads,
                                             uint64_t NumBlocks) const {
  // Only do all this when the output is requested.
  if (!(getInfoLevel() & OMP_INFOTYPE_PLUGIN_KERNEL))
    return Plugin::success();

  // We don't have data to print additional info, but no hard error.
  if (!KernelInfo.has_value())
    return Plugin::success();

  // General Info
  auto NumGroups = NumBlocks;
  auto ThreadsPerGroup = NumThreads;

  // Kernel Arguments Info
  auto ArgNum = KernelArgs.NumArgs;
  auto LoopTripCount = KernelArgs.Tripcount;

  // Details for AMDGPU kernels (read from image)
  // https://www.llvm.org/docs/AMDGPUUsage.html#code-object-v4-metadata
  auto GroupSegmentSize = (*KernelInfo).GroupSegmentList;
  auto SGPRCount = (*KernelInfo).SGPRCount;
  auto VGPRCount = (*KernelInfo).VGPRCount;
  auto SGPRSpillCount = (*KernelInfo).SGPRSpillCount;
  auto VGPRSpillCount = (*KernelInfo).VGPRSpillCount;
  auto MaxFlatWorkgroupSize = (*KernelInfo).MaxFlatWorkgroupSize;

  // Prints additional launch info that contains the following.
  // Num Args: The number of kernel arguments
  // Teams x Thrds: The number of teams and the number of threads actually
  //                running.
  // MaxFlatWorkgroupSize: Maximum flat work-group size supported by the
  //                       kernel in work-items
  // LDS Usage: Amount of bytes used in LDS storage
  // S/VGPR Count: the number of S/V GPRs occupied by the kernel
  // S/VGPR Spill Count: how many S/VGPRs are spilled by the kernel
  // Tripcount: loop tripcount for the kernel
  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(),
       "#Args: %d Teams x Thrds: %4lux%4u (MaxFlatWorkGroupSize: %u) LDS "
       "Usage: %uB #SGPRs/VGPRs: %u/%u #SGPR/VGPR Spills: %u/%u Tripcount: "
       "%lu\n",
       ArgNum, NumGroups, ThreadsPerGroup, MaxFlatWorkgroupSize,
       GroupSegmentSize, SGPRCount, VGPRCount, SGPRSpillCount, VGPRSpillCount,
       LoopTripCount);

  return Plugin::success();
}
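
// Illustrative example with hypothetical values: when plugin kernel info is
// requested on the host (e.g. by setting LIBOMPTARGET_INFO to enable the
// OMP_INFOTYPE_PLUGIN_KERNEL bit), the INFO call above emits a line of
// roughly this shape:
//
//   #Args: 3 Teams x Thrds:  104x 256 (MaxFlatWorkGroupSize: 1024) LDS
//   Usage: 0B #SGPRs/VGPRs: 32/64 #SGPR/VGPR Spills: 0/0 Tripcount: 0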

GenericPluginTy *PluginTy::createPlugin() { return new AMDGPUPluginTy(); }

template <typename... ArgsTy>
static Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) {
  hsa_status_t ResultCode = static_cast<hsa_status_t>(Code);
  if (ResultCode == HSA_STATUS_SUCCESS || ResultCode == HSA_STATUS_INFO_BREAK)
    return Error::success();

  const char *Desc = "Unknown error";
  hsa_status_t Ret = hsa_status_string(ResultCode, &Desc);
  if (Ret != HSA_STATUS_SUCCESS)
    REPORT("Unrecognized " GETNAME(TARGET_NAME) " error code %d\n", Code);

  return createStringError<ArgsTy..., const char *>(inconvertibleErrorCode(),
                                                     ErrFmt, Args..., Desc);
}
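
// Illustrative usage of Plugin::check, following the pattern used throughout
// this file: convert an HSA status into an llvm::Error, with the trailing %s
// in the format string filled by the human-readable HSA description.
//
//   hsa_status_t Status = hsa_init();
//   if (auto Err = Plugin::check(Status, "Error in hsa_init: %s"))
//     return Err;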

void *AMDGPUMemoryManagerTy::allocate(size_t Size, void *HstPtr,
                                      TargetAllocTy Kind) {
  // Allocate memory from the pool.
  void *Ptr = nullptr;
  if (auto Err = MemoryPool->allocate(Size, &Ptr)) {
    consumeError(std::move(Err));
    return nullptr;
  }
  assert(Ptr && "Invalid pointer");

  auto &KernelAgents = Plugin.getKernelAgents();

  // Allow all kernel agents to access the allocation.
  if (auto Err = MemoryPool->enableAccess(Ptr, Size, KernelAgents)) {
    REPORT("%s\n", toString(std::move(Err)).data());
    return nullptr;
  }
  return Ptr;
}

void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
  if (Size == 0)
    return nullptr;

  // Find the correct memory pool.
  AMDGPUMemoryPoolTy *MemoryPool = nullptr;
  switch (Kind) {
  case TARGET_ALLOC_DEFAULT:
  case TARGET_ALLOC_DEVICE:
  case TARGET_ALLOC_DEVICE_NON_BLOCKING:
    MemoryPool = CoarseGrainedMemoryPools[0];
    break;
  case TARGET_ALLOC_HOST:
    MemoryPool = &HostDevice.getFineGrainedMemoryPool();
    break;
  case TARGET_ALLOC_SHARED:
    MemoryPool = &HostDevice.getFineGrainedMemoryPool();
    break;
  }

  if (!MemoryPool) {
    REPORT("No memory pool for the specified allocation kind\n");
    return nullptr;
  }

  // Allocate from the corresponding memory pool.
  void *Alloc = nullptr;
  if (Error Err = MemoryPool->allocate(Size, &Alloc)) {
    REPORT("%s\n", toString(std::move(Err)).data());
    return nullptr;
  }

  if (Alloc) {
    auto &KernelAgents =
        static_cast<AMDGPUPluginTy &>(Plugin).getKernelAgents();
    // Enable all kernel agents to access the buffer. This is inherently
    // necessary for host or shared allocations, and it is also enabled for
    // device memory to allow device-to-device memcpy.
    if (auto Err = MemoryPool->enableAccess(Alloc, Size, KernelAgents)) {
      REPORT("%s\n", toString(std::move(Err)).data());
      return nullptr;
    }
  }

  return Alloc;
}
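
// Illustrative sketch (an assumption about the host-side entry points, not
// part of this file): how the allocation kinds handled above are typically
// reached from an application through libomptarget's allocation APIs.
//
//   void *Dev  = llvm_omp_target_alloc_device(Size, DevId); // TARGET_ALLOC_DEVICE
//   void *Hst  = llvm_omp_target_alloc_host(Size, DevId);   // TARGET_ALLOC_HOST
//   void *Shrd = llvm_omp_target_alloc_shared(Size, DevId); // TARGET_ALLOC_SHARED
//   void *Ptr  = omp_target_alloc(Size, DevId);             // TARGET_ALLOC_DEFAULT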

} // namespace plugin
} // namespace target
} // namespace omp
} // namespace llvm
