1//===----RTLs/amdgpu/src/rtl.cpp - Target RTLs Implementation ----- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// RTL NextGen for AMDGPU machine
10//
11//===----------------------------------------------------------------------===//
12
13#include <atomic>
14#include <cassert>
15#include <cstddef>
16#include <cstdint>
17#include <deque>
18#include <functional>
19#include <mutex>
20#include <string>
21#include <system_error>
22#include <unistd.h>
23#include <unordered_map>
24
25#include "ErrorReporting.h"
26#include "Shared/APITypes.h"
27#include "Shared/Debug.h"
28#include "Shared/Environment.h"
29#include "Shared/RefCnt.h"
30#include "Shared/Utils.h"
31#include "Utils/ELF.h"
32
33#include "GlobalHandler.h"
34#include "OpenMP/OMPT/Callback.h"
35#include "PluginInterface.h"
36#include "UtilitiesRTL.h"
37#include "omptarget.h"
38
39#include "llvm/ADT/SmallString.h"
40#include "llvm/ADT/SmallVector.h"
41#include "llvm/ADT/StringRef.h"
42#include "llvm/BinaryFormat/ELF.h"
43#include "llvm/Frontend/OpenMP/OMPConstants.h"
44#include "llvm/Frontend/OpenMP/OMPGridValues.h"
45#include "llvm/Support/Error.h"
46#include "llvm/Support/FileOutputBuffer.h"
47#include "llvm/Support/FileSystem.h"
48#include "llvm/Support/MemoryBuffer.h"
49#include "llvm/Support/Program.h"
50#include "llvm/Support/Signals.h"
51#include "llvm/Support/raw_ostream.h"
52
53#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || \
54 !defined(__ORDER_BIG_ENDIAN__)
55#error "Missing preprocessor definitions for endianness detection."
56#endif
57
58// The HSA headers require these definitions.
59#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
60#define LITTLEENDIAN_CPU
61#elif defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
62#define BIGENDIAN_CPU
63#endif
64
65#if defined(__has_include)
66#if __has_include("hsa.h")
67#include "hsa.h"
68#include "hsa_ext_amd.h"
69#elif __has_include("hsa/hsa.h")
70#include "hsa/hsa.h"
71#include "hsa/hsa_ext_amd.h"
72#endif
73#else
74#include "hsa/hsa.h"
75#include "hsa/hsa_ext_amd.h"
76#endif
77
78using namespace error;
79
80namespace llvm {
81namespace omp {
82namespace target {
83namespace plugin {
84
85/// Forward declarations for all specialized data structures.
86struct AMDGPUKernelTy;
87struct AMDGPUDeviceTy;
88struct AMDGPUPluginTy;
89struct AMDGPUStreamTy;
90struct AMDGPUEventTy;
91struct AMDGPUStreamManagerTy;
92struct AMDGPUEventManagerTy;
93struct AMDGPUDeviceImageTy;
94struct AMDGPUMemoryManagerTy;
95struct AMDGPUMemoryPoolTy;
96
97namespace hsa_utils {
98
99/// Iterate elements using an HSA iterate function. Do not use this function
100/// directly but the specialized ones below instead.
101template <typename ElemTy, typename IterFuncTy, typename CallbackTy>
102hsa_status_t iterate(IterFuncTy Func, CallbackTy Cb) {
103 auto L = [](ElemTy Elem, void *Data) -> hsa_status_t {
104 CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data);
105 return (*Unwrapped)(Elem);
106 };
107 return Func(L, static_cast<void *>(&Cb));
108}
109
110/// Iterate elements using an HSA iterate function passing a parameter. Do not
111/// use this function directly but the specialized ones below instead.
112template <typename ElemTy, typename IterFuncTy, typename IterFuncArgTy,
113 typename CallbackTy>
114hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) {
115 auto L = [](ElemTy Elem, void *Data) -> hsa_status_t {
116 CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data);
117 return (*Unwrapped)(Elem);
118 };
119 return Func(FuncArg, L, static_cast<void *>(&Cb));
120}
121
122/// Iterate elements using an HSA iterate function passing a parameter. Do not
123/// use this function directly but the specialized ones below instead.
124template <typename Elem1Ty, typename Elem2Ty, typename IterFuncTy,
125 typename IterFuncArgTy, typename CallbackTy>
126hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) {
127 auto L = [](Elem1Ty Elem1, Elem2Ty Elem2, void *Data) -> hsa_status_t {
128 CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data);
129 return (*Unwrapped)(Elem1, Elem2);
130 };
131 return Func(FuncArg, L, static_cast<void *>(&Cb));
132}
133
134/// Iterate agents.
135template <typename CallbackTy> Error iterateAgents(CallbackTy Callback) {
136 hsa_status_t Status = iterate<hsa_agent_t>(hsa_iterate_agents, Callback);
137 return Plugin::check(Status, "error in hsa_iterate_agents: %s");
138}
139
140/// Iterate ISAs of an agent.
141template <typename CallbackTy>
142Error iterateAgentISAs(hsa_agent_t Agent, CallbackTy Cb) {
143 hsa_status_t Status = iterate<hsa_isa_t>(hsa_agent_iterate_isas, Agent, Cb);
144 return Plugin::check(Status, "error in hsa_agent_iterate_isas: %s");
145}
146
147/// Iterate memory pools of an agent.
148template <typename CallbackTy>
149Error iterateAgentMemoryPools(hsa_agent_t Agent, CallbackTy Cb) {
150 hsa_status_t Status = iterate<hsa_amd_memory_pool_t>(
151 hsa_amd_agent_iterate_memory_pools, Agent, Cb);
152 return Plugin::check(Status,
153 "error in hsa_amd_agent_iterate_memory_pools: %s");
154}
155
/// Dispatches an asynchronous memory copy between two agents.
/// The copy starts once all \p NumDepSignals entries of \p DepSignals are
/// satisfied and completes when \p CompletionSignal reaches zero. When
/// \p UseMultipleSdmaEngines is set, dispatches alternate between two SDMA
/// engines in a round-robin fashion; otherwise the runtime picks the engine.
Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent,
                   const void *Src, hsa_agent_t SrcAgent, size_t Size,
                   uint32_t NumDepSignals, const hsa_signal_t *DepSignals,
                   hsa_signal_t CompletionSignal) {
  if (!UseMultipleSdmaEngines) {
    hsa_status_t S =
        hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, Size,
                                  NumDepSignals, DepSignals, CompletionSignal);
    return Plugin::check(S, "error in hsa_amd_memory_async_copy: %s");
  }

// hsa_amd_memory_async_copy_on_engine is only available from ROCm 5.7
// (AMD interface version 1.2) onwards.
#if !(HSA_AMD_INTERFACE_VERSION_MAJOR >= 1 &&                                  \
      HSA_AMD_INTERFACE_VERSION_MINOR >= 2)
  return Plugin::error(ErrorCode::UNSUPPORTED,
                       "async copy on selected SDMA requires ROCm 5.7");
#else
  // Engine selector shared by all threads; starts at engine id 0x1.
  static std::atomic<int> SdmaEngine{1};

  // This atomics solution is probably not the best, but should be sufficient
  // for now. In a worst-case scenario, threads that read the same value will
  // dispatch to the same SDMA engine. This may result in sub-optimal
  // performance (never incorrect behavior), and the likelihood is fairly low.
  int LocalSdmaEngine = SdmaEngine.load(std::memory_order_acquire);
  // This call is only available in ROCm >= 5.7.
  hsa_status_t S = hsa_amd_memory_async_copy_on_engine(
      Dst, DstAgent, Src, SrcAgent, Size, NumDepSignals, DepSignals,
      CompletionSignal, (hsa_amd_sdma_engine_id_t)LocalSdmaEngine,
      /*force_copy_on_sdma=*/true);
  // Rotate between the two SDMA engine ids 0x1 and 0x2 (1 -> 2 -> 1 -> ...).
  LocalSdmaEngine = (LocalSdmaEngine << 1) % 3;
  SdmaEngine.store(LocalSdmaEngine, std::memory_order_relaxed);

  return Plugin::check(S, "error in hsa_amd_memory_async_copy_on_engine: %s");
#endif
}
195
196Error getTargetTripleAndFeatures(hsa_agent_t Agent,
197 SmallVector<SmallString<32>> &Targets) {
198 auto Err = hsa_utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) {
199 uint32_t Length;
200 hsa_status_t Status;
201 Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME_LENGTH, &Length);
202 if (Status != HSA_STATUS_SUCCESS)
203 return Status;
204
205 llvm::SmallVector<char> ISAName(Length);
206 Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, ISAName.begin());
207 if (Status != HSA_STATUS_SUCCESS)
208 return Status;
209
210 llvm::StringRef TripleTarget(ISAName.begin(), Length);
211 if (TripleTarget.consume_front(Prefix: "amdgcn-amd-amdhsa")) {
212 auto Target = TripleTarget.ltrim(Char: '-').rtrim(Char: '\0');
213 Targets.push_back(Elt: Target);
214 }
215 return HSA_STATUS_SUCCESS;
216 });
217 return Err;
218}
219} // namespace hsa_utils
220
221/// Utility class representing generic resource references to AMDGPU resources.
222template <typename ResourceTy>
223struct AMDGPUResourceRef : public GenericDeviceResourceRef {
224 /// The underlying handle type for resources.
225 using HandleTy = ResourceTy *;
226
227 /// Create an empty reference to an invalid resource.
228 AMDGPUResourceRef() : Resource(nullptr) {}
229
230 /// Create a reference to an existing resource.
231 AMDGPUResourceRef(HandleTy Resource) : Resource(Resource) {}
232
233 virtual ~AMDGPUResourceRef() {}
234
235 /// Create a new resource and save the reference. The reference must be empty
236 /// before calling to this function.
237 Error create(GenericDeviceTy &Device) override;
238
239 /// Destroy the referenced resource and invalidate the reference. The
240 /// reference must be to a valid resource before calling to this function.
241 Error destroy(GenericDeviceTy &Device) override {
242 if (!Resource)
243 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
244 "destroying an invalid resource");
245
246 if (auto Err = Resource->deinit())
247 return Err;
248
249 delete Resource;
250
251 Resource = nullptr;
252 return Plugin::success();
253 }
254
255 /// Get the underlying resource handle.
256 operator HandleTy() const { return Resource; }
257
258private:
259 /// The handle to the actual resource.
260 HandleTy Resource;
261};
262
/// Class holding an HSA memory pool together with its cached segment and
/// global-flag properties.
struct AMDGPUMemoryPoolTy {
  /// Create a memory pool from an HSA memory pool. Properties are not queried
  /// until init() is called.
  AMDGPUMemoryPoolTy(hsa_amd_memory_pool_t MemoryPool)
      : MemoryPool(MemoryPool), GlobalFlags(0) {}

  /// Initialize the memory pool retrieving its properties (segment kind and
  /// global flags).
  Error init() {
    if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_SEGMENT, Segment))
      return Err;

    if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, GlobalFlags))
      return Err;

    return Plugin::success();
  }

  /// Getter of the HSA memory pool.
  hsa_amd_memory_pool_t get() const { return MemoryPool; }

  /// Predicates on the segment this pool belongs to (cached by init()).
  bool isGlobal() const { return (Segment == HSA_AMD_SEGMENT_GLOBAL); }
  bool isReadOnly() const { return (Segment == HSA_AMD_SEGMENT_READONLY); }
  bool isPrivate() const { return (Segment == HSA_AMD_SEGMENT_PRIVATE); }
  bool isGroup() const { return (Segment == HSA_AMD_SEGMENT_GROUP); }

  /// Indicate if it is fine-grained memory. Valid only for global.
  bool isFineGrained() const {
    assert(isGlobal() && "Not global memory");
    return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED);
  }

  /// Indicate if it is coarse-grained memory. Valid only for global.
  bool isCoarseGrained() const {
    assert(isGlobal() && "Not global memory");
    return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED);
  }

  /// Indicate if it supports storing kernel arguments. Valid only for global.
  bool supportsKernelArgs() const {
    assert(isGlobal() && "Not global memory");
    return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT);
  }

  /// Allocate memory on the memory pool, storing the result in \p PtrStorage.
  Error allocate(size_t Size, void **PtrStorage) {
    hsa_status_t Status =
        hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, PtrStorage);
    return Plugin::check(Status, "error in hsa_amd_memory_pool_allocate: %s");
  }

  /// Return memory to the memory pool.
  Error deallocate(void *Ptr) {
    hsa_status_t Status = hsa_amd_memory_pool_free(Ptr);
    return Plugin::check(Status, "error in hsa_amd_memory_pool_free: %s");
  }

  /// Returns if the \p Agent can access the memory pool. A failing HSA query
  /// is conservatively treated as "no access".
  bool canAccess(hsa_agent_t Agent) {
    hsa_amd_memory_pool_access_t Access;
    if (hsa_amd_agent_memory_pool_get_info(
            Agent, MemoryPool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &Access))
      return false;
    return Access != HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED;
  }

  /// Allow the listed \p Agents to access the allocation at \p Ptr.
  /// \p Size is currently unused by the underlying HSA call.
  Error enableAccess(void *Ptr, int64_t Size,
                     const llvm::SmallVector<hsa_agent_t> &Agents) const {
#ifdef OMPTARGET_DEBUG
    for (hsa_agent_t Agent : Agents) {
      hsa_amd_memory_pool_access_t Access;
      if (auto Err =
              getAttr(Agent, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, Access))
        return Err;

      // The agent is not allowed to access the memory pool in any case. Do not
      // continue because otherwise it results in undefined behavior.
      if (Access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED)
        return Plugin::error(ErrorCode::INVALID_VALUE,
                             "an agent is not allowed to access a memory pool");
    }
#endif

    // Access is possible but disabled by default; enable it for all agents.
    hsa_status_t Status =
        hsa_amd_agents_allow_access(Agents.size(), Agents.data(), nullptr, Ptr);
    return Plugin::check(Status, "error in hsa_amd_agents_allow_access: %s");
  }

  /// Get attribute from the memory pool.
  template <typename Ty>
  Error getAttr(hsa_amd_memory_pool_info_t Kind, Ty &Value) const {
    hsa_status_t Status;
    Status = hsa_amd_memory_pool_get_info(MemoryPool, Kind, &Value);
    return Plugin::check(Status, "error in hsa_amd_memory_pool_get_info: %s");
  }

  /// Same as getAttr but returning the raw HSA status instead of an Error.
  template <typename Ty>
  hsa_status_t getAttrRaw(hsa_amd_memory_pool_info_t Kind, Ty &Value) const {
    return hsa_amd_memory_pool_get_info(MemoryPool, Kind, &Value);
  }

  /// Get attribute from the memory pool relating to an agent.
  template <typename Ty>
  Error getAttr(hsa_agent_t Agent, hsa_amd_agent_memory_pool_info_t Kind,
                Ty &Value) const {
    hsa_status_t Status;
    Status =
        hsa_amd_agent_memory_pool_get_info(Agent, MemoryPool, Kind, &Value);
    return Plugin::check(Status,
                         "error in hsa_amd_agent_memory_pool_get_info: %s");
  }

private:
  /// The HSA memory pool.
  hsa_amd_memory_pool_t MemoryPool;

  /// The segment where the memory pool belongs to. Set by init().
  hsa_amd_segment_t Segment;

  /// The global flags of memory pool. Only valid if the memory pool belongs to
  /// the global segment. Set by init().
  uint32_t GlobalFlags;
};
388
389/// Class that implements a memory manager that gets memory from a specific
390/// memory pool.
391struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy {
392
393 /// Create an empty memory manager.
394 AMDGPUMemoryManagerTy(AMDGPUPluginTy &Plugin)
395 : Plugin(Plugin), MemoryPool(nullptr), MemoryManager(nullptr) {}
396
397 /// Initialize the memory manager from a memory pool.
398 Error init(AMDGPUMemoryPoolTy &MemoryPool) {
399 const uint32_t Threshold = 1 << 30;
400 this->MemoryManager = new MemoryManagerTy(*this, Threshold);
401 this->MemoryPool = &MemoryPool;
402 return Plugin::success();
403 }
404
405 /// Deinitialize the memory manager and free its allocations.
406 Error deinit() {
407 assert(MemoryManager && "Invalid memory manager");
408
409 // Delete and invalidate the memory manager. At this point, the memory
410 // manager will deallocate all its allocations.
411 delete MemoryManager;
412 MemoryManager = nullptr;
413
414 return Plugin::success();
415 }
416
417 /// Reuse or allocate memory through the memory manager.
418 Error allocate(size_t Size, void **PtrStorage) {
419 assert(MemoryManager && "Invalid memory manager");
420 assert(PtrStorage && "Invalid pointer storage");
421
422 *PtrStorage = MemoryManager->allocate(Size, nullptr);
423 if (Size && *PtrStorage == nullptr)
424 return Plugin::error(ErrorCode::OUT_OF_RESOURCES,
425 "failure to allocate from AMDGPU memory manager");
426
427 return Plugin::success();
428 }
429
430 /// Release an allocation to be reused.
431 Error deallocate(void *Ptr) {
432 if (MemoryManager->free(Ptr))
433 return Plugin::error(ErrorCode::UNKNOWN,
434 "failure to deallocate from AMDGPU memory manager");
435
436 return Plugin::success();
437 }
438
439private:
440 /// Allocation callback that will be called once the memory manager does not
441 /// have more previously allocated buffers.
442 void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) override;
443
444 /// Deallocation callback that will be called by the memory manager.
445 int free(void *TgtPtr, TargetAllocTy Kind) override {
446 if (auto Err = MemoryPool->deallocate(Ptr: TgtPtr)) {
447 consumeError(Err: std::move(Err));
448 return OFFLOAD_FAIL;
449 }
450 return OFFLOAD_SUCCESS;
451 }
452
453 /// The underlying plugin that owns this memory manager.
454 AMDGPUPluginTy &Plugin;
455
456 /// The memory pool used to allocate memory.
457 AMDGPUMemoryPoolTy *MemoryPool;
458
459 /// Reference to the actual memory manager.
460 MemoryManagerTy *MemoryManager;
461};
462
463/// Class implementing the AMDGPU device images' properties.
464struct AMDGPUDeviceImageTy : public DeviceImageTy {
465 /// Create the AMDGPU image with the id and the target image pointer.
466 AMDGPUDeviceImageTy(int32_t ImageId, GenericDeviceTy &Device,
467 const __tgt_device_image *TgtImage)
468 : DeviceImageTy(ImageId, Device, TgtImage) {}
469
470 /// Prepare and load the executable corresponding to the image.
471 Error loadExecutable(const AMDGPUDeviceTy &Device);
472
473 /// Unload the executable.
474 Error unloadExecutable() {
475 hsa_status_t Status = hsa_executable_destroy(Executable);
476 return Plugin::check(Status, "error in hsa_executable_destroy: %s");
477 }
478
479 /// Get the executable.
480 hsa_executable_t getExecutable() const { return Executable; }
481
482 /// Get to Code Object Version of the ELF
483 uint16_t getELFABIVersion() const { return ELFABIVersion; }
484
485 /// Find an HSA device symbol by its name on the executable.
486 Expected<hsa_executable_symbol_t>
487 findDeviceSymbol(GenericDeviceTy &Device, StringRef SymbolName) const;
488
489 /// Get additional info for kernel, e.g., register spill counts
490 std::optional<offloading::amdgpu::AMDGPUKernelMetaData>
491 getKernelInfo(StringRef Identifier) const {
492 auto It = KernelInfoMap.find(Identifier);
493
494 if (It == KernelInfoMap.end())
495 return {};
496
497 return It->second;
498 }
499
500private:
501 /// The executable loaded on the agent.
502 hsa_executable_t Executable;
503 StringMap<offloading::amdgpu::AMDGPUKernelMetaData> KernelInfoMap;
504 uint16_t ELFABIVersion;
505};
506
507/// Class implementing the AMDGPU kernel functionalities which derives from the
508/// generic kernel class.
509struct AMDGPUKernelTy : public GenericKernelTy {
510 /// Create an AMDGPU kernel with a name and an execution mode.
511 AMDGPUKernelTy(const char *Name) : GenericKernelTy(Name) {}
512
513 /// Initialize the AMDGPU kernel.
514 Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override {
515 AMDGPUDeviceImageTy &AMDImage = static_cast<AMDGPUDeviceImageTy &>(Image);
516
517 // Kernel symbols have a ".kd" suffix.
518 std::string KernelName(getName());
519 KernelName += ".kd";
520
521 // Find the symbol on the device executable.
522 auto SymbolOrErr = AMDImage.findDeviceSymbol(Device, KernelName);
523 if (!SymbolOrErr)
524 return SymbolOrErr.takeError();
525
526 hsa_executable_symbol_t Symbol = *SymbolOrErr;
527 hsa_symbol_kind_t SymbolType;
528 hsa_status_t Status;
529
530 // Retrieve different properties of the kernel symbol.
531 std::pair<hsa_executable_symbol_info_t, void *> RequiredInfos[] = {
532 {HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &SymbolType},
533 {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &KernelObject},
534 {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &ArgsSize},
535 {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &GroupSize},
536 {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &DynamicStack},
537 {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &PrivateSize}};
538
539 for (auto &Info : RequiredInfos) {
540 Status = hsa_executable_symbol_get_info(Symbol, Info.first, Info.second);
541 if (auto Err = Plugin::check(
542 Status, "error in hsa_executable_symbol_get_info: %s"))
543 return Err;
544 }
545
546 // Make sure it is a kernel symbol.
547 if (SymbolType != HSA_SYMBOL_KIND_KERNEL)
548 return Plugin::error(ErrorCode::INVALID_BINARY,
549 "symbol %s is not a kernel function");
550
551 // TODO: Read the kernel descriptor for the max threads per block. May be
552 // read from the image.
553
554 ImplicitArgsSize =
555 hsa_utils::getImplicitArgsSize(AMDImage.getELFABIVersion());
556 DP("ELFABIVersion: %d\n", AMDImage.getELFABIVersion());
557
558 // Get additional kernel info read from image
559 KernelInfo = AMDImage.getKernelInfo(getName());
560 if (!KernelInfo.has_value())
561 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device.getDeviceId(),
562 "Could not read extra information for kernel %s.", getName());
563
564 return Plugin::success();
565 }
566
567 /// Launch the AMDGPU kernel function.
568 Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
569 uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
570 KernelLaunchParamsTy LaunchParams,
571 AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
572
573 /// Print more elaborate kernel launch info for AMDGPU
574 Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
575 KernelArgsTy &KernelArgs, uint32_t NumThreads[3],
576 uint32_t NumBlocks[3]) const override;
577
578 /// Get group and private segment kernel size.
579 uint32_t getGroupSize() const { return GroupSize; }
580 uint32_t getPrivateSize() const { return PrivateSize; }
581
582 /// Get the HSA kernel object representing the kernel function.
583 uint64_t getKernelObject() const { return KernelObject; }
584
585 /// Get the size of implicitargs based on the code object version.
586 uint32_t getImplicitArgsSize() const { return ImplicitArgsSize; }
587
588 /// Indicates whether or not we need to set up our own private segment size.
589 bool usesDynamicStack() const { return DynamicStack; }
590
591private:
592 /// The kernel object to execute.
593 uint64_t KernelObject;
594
595 /// The args, group and private segments sizes required by a kernel instance.
596 uint32_t ArgsSize;
597 uint32_t GroupSize;
598 uint32_t PrivateSize;
599 bool DynamicStack;
600
601 /// The size of implicit kernel arguments.
602 uint32_t ImplicitArgsSize;
603
604 /// Additional Info for the AMD GPU Kernel
605 std::optional<offloading::amdgpu::AMDGPUKernelMetaData> KernelInfo;
606};
607
608/// Class representing an HSA signal. Signals are used to define dependencies
609/// between asynchronous operations: kernel launches and memory transfers.
610struct AMDGPUSignalTy {
611 /// Create an empty signal.
612 AMDGPUSignalTy() : HSASignal({0}), UseCount() {}
613 AMDGPUSignalTy(AMDGPUDeviceTy &Device) : HSASignal({0}), UseCount() {}
614
615 /// Initialize the signal with an initial value.
616 Error init(uint32_t InitialValue = 1) {
617 hsa_status_t Status =
618 hsa_amd_signal_create(InitialValue, 0, nullptr, 0, &HSASignal);
619 return Plugin::check(Status, "error in hsa_signal_create: %s");
620 }
621
622 /// Deinitialize the signal.
623 Error deinit() {
624 hsa_status_t Status = hsa_signal_destroy(HSASignal);
625 return Plugin::check(Status, "error in hsa_signal_destroy: %s");
626 }
627
628 /// Wait until the signal gets a zero value.
629 Error wait(const uint64_t ActiveTimeout = 0,
630 GenericDeviceTy *Device = nullptr) const {
631 if (ActiveTimeout) {
632 hsa_signal_value_t Got = 1;
633 Got = hsa_signal_wait_scacquire(HSASignal, HSA_SIGNAL_CONDITION_EQ, 0,
634 ActiveTimeout, HSA_WAIT_STATE_ACTIVE);
635 if (Got == 0)
636 return Plugin::success();
637 }
638
639 // If there is an RPC device attached to this stream we run it as a server.
640 uint64_t Timeout = UINT64_MAX;
641 auto WaitState = HSA_WAIT_STATE_BLOCKED;
642 while (hsa_signal_wait_scacquire(HSASignal, HSA_SIGNAL_CONDITION_EQ, 0,
643 Timeout, WaitState) != 0)
644 ;
645 return Plugin::success();
646 }
647
648 /// Load the value on the signal.
649 hsa_signal_value_t load() const {
650 return hsa_signal_load_scacquire(HSASignal);
651 }
652
653 /// Signal decrementing by one.
654 void signal() {
655 assert(load() > 0 && "Invalid signal value");
656 hsa_signal_subtract_screlease(HSASignal, 1);
657 }
658
659 /// Reset the signal value before reusing the signal. Do not call this
660 /// function if the signal is being currently used by any watcher, such as a
661 /// plugin thread or the HSA runtime.
662 void reset() { hsa_signal_store_screlease(HSASignal, 1); }
663
664 /// Increase the number of concurrent uses.
665 void increaseUseCount() { UseCount.increase(); }
666
667 /// Decrease the number of concurrent uses and return whether was the last.
668 bool decreaseUseCount() { return UseCount.decrease(); }
669
670 hsa_signal_t get() const { return HSASignal; }
671
672private:
673 /// The underlying HSA signal.
674 hsa_signal_t HSASignal;
675
676 /// Reference counter for tracking the concurrent use count. This is mainly
677 /// used for knowing how many streams are using the signal.
678 RefCountTy<> UseCount;
679};
680
/// Convenience aliases: a reference-counted handle to an AMDGPU signal and the
/// generic resource manager specialized for recycling such signals.
using AMDGPUSignalRef = AMDGPUResourceRef<AMDGPUSignalTy>;
using AMDGPUSignalManagerTy = GenericDeviceResourceManagerTy<AMDGPUSignalRef>;
684
/// Class holding an HSA queue to submit kernel and barrier packets.
struct AMDGPUQueueTy {
  /// Create an empty queue; the HSA queue itself is created lazily in init().
  AMDGPUQueueTy() : Queue(nullptr), Mutex(), NumUsers(0) {}
689
  /// Lazily initialize a new queue belonging to a specific agent.
  /// NOTE(review): unlike deinit(), this does not take Mutex — presumably
  /// callers serialize initialization; confirm against the stream manager.
  Error init(GenericDeviceTy &Device, hsa_agent_t Agent, int32_t QueueSize) {
    if (Queue)
      return Plugin::success();
    // callbackError is invoked by the HSA runtime if the queue enters an error
    // state; Device is passed through as its user data.
    hsa_status_t Status =
        hsa_queue_create(Agent, QueueSize, HSA_QUEUE_TYPE_MULTI, callbackError,
                         &Device, UINT32_MAX, UINT32_MAX, &Queue);
    return Plugin::check(Status, "error in hsa_queue_create: %s");
  }
699
700 /// Deinitialize the queue and destroy its resources.
701 Error deinit() {
702 std::lock_guard<std::mutex> Lock(Mutex);
703 if (!Queue)
704 return Plugin::success();
705 hsa_status_t Status = hsa_queue_destroy(Queue);
706 return Plugin::check(Status, "error in hsa_queue_destroy: %s");
707 }
708
709 /// Returns the number of streams, this queue is currently assigned to.
710 bool getUserCount() const { return NumUsers; }
711
712 /// Returns if the underlying HSA queue is initialized.
713 bool isInitialized() { return Queue != nullptr; }
714
715 /// Decrement user count of the queue object.
716 void removeUser() { --NumUsers; }
717
718 /// Increase user count of the queue object.
719 void addUser() { ++NumUsers; }
720
721 /// Push a kernel launch to the queue. The kernel launch requires an output
722 /// signal and can define an optional input signal (nullptr if none).
723 Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
724 uint32_t NumThreads[3], uint32_t NumBlocks[3],
725 uint32_t GroupSize, uint64_t StackSize,
726 AMDGPUSignalTy *OutputSignal,
727 AMDGPUSignalTy *InputSignal) {
728 assert(OutputSignal && "Invalid kernel output signal");
729
730 // Lock the queue during the packet publishing process. Notice this blocks
731 // the addition of other packets to the queue. The following piece of code
732 // should be lightweight; do not block the thread, allocate memory, etc.
733 std::lock_guard<std::mutex> Lock(Mutex);
734 assert(Queue && "Interacted with a non-initialized queue!");
735
736 // Add a barrier packet before the kernel packet in case there is a pending
737 // preceding operation. The barrier packet will delay the processing of
738 // subsequent queue's packets until the barrier input signal are satisfied.
739 // No need output signal needed because the dependency is already guaranteed
740 // by the queue barrier itself.
741 if (InputSignal && InputSignal->load())
742 if (auto Err = pushBarrierImpl(OutputSignal: nullptr, InputSignal1: InputSignal))
743 return Err;
744
745 // Now prepare the kernel packet.
746 uint64_t PacketId;
747 hsa_kernel_dispatch_packet_t *Packet = acquirePacket(PacketId);
748 assert(Packet && "Invalid packet");
749
750 // The first 32 bits of the packet are written after the other fields
751 uint16_t Dims = NumBlocks[2] * NumThreads[2] > 1
752 ? 3
753 : 1 + (NumBlocks[1] * NumThreads[1] != 1);
754 uint16_t Setup = UINT16_C(Dims)
755 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
756 Packet->workgroup_size_x = NumThreads[0];
757 Packet->workgroup_size_y = NumThreads[1];
758 Packet->workgroup_size_z = NumThreads[2];
759 Packet->reserved0 = 0;
760 Packet->grid_size_x = NumBlocks[0] * NumThreads[0];
761 Packet->grid_size_y = NumBlocks[1] * NumThreads[1];
762 Packet->grid_size_z = NumBlocks[2] * NumThreads[2];
763 Packet->private_segment_size =
764 Kernel.usesDynamicStack() ? StackSize : Kernel.getPrivateSize();
765 Packet->group_segment_size = GroupSize;
766 Packet->kernel_object = Kernel.getKernelObject();
767 Packet->kernarg_address = KernelArgs;
768 Packet->reserved2 = 0;
769 Packet->completion_signal = OutputSignal->get();
770
771 // Publish the packet. Do not modify the packet after this point.
772 publishKernelPacket(PacketId, Setup, Packet);
773
774 return Plugin::success();
775 }
776
777 /// Push a barrier packet that will wait up to two input signals. All signals
778 /// are optional (nullptr if none).
779 Error pushBarrier(AMDGPUSignalTy *OutputSignal,
780 const AMDGPUSignalTy *InputSignal1,
781 const AMDGPUSignalTy *InputSignal2) {
782 // Lock the queue during the packet publishing process.
783 std::lock_guard<std::mutex> Lock(Mutex);
784 assert(Queue && "Interacted with a non-initialized queue!");
785
786 // Push the barrier with the lock acquired.
787 return pushBarrierImpl(OutputSignal, InputSignal1, InputSignal2);
788 }
789
790private:
791 /// Push a barrier packet that will wait up to two input signals. Assumes the
792 /// the queue lock is acquired.
793 Error pushBarrierImpl(AMDGPUSignalTy *OutputSignal,
794 const AMDGPUSignalTy *InputSignal1,
795 const AMDGPUSignalTy *InputSignal2 = nullptr) {
796 // Add a queue barrier waiting on both the other stream's operation and the
797 // last operation on the current stream (if any).
798 uint64_t PacketId;
799 hsa_barrier_and_packet_t *Packet =
800 (hsa_barrier_and_packet_t *)acquirePacket(PacketId);
801 assert(Packet && "Invalid packet");
802
803 Packet->reserved0 = 0;
804 Packet->reserved1 = 0;
805 Packet->dep_signal[0] = {0};
806 Packet->dep_signal[1] = {0};
807 Packet->dep_signal[2] = {0};
808 Packet->dep_signal[3] = {0};
809 Packet->dep_signal[4] = {0};
810 Packet->reserved2 = 0;
811 Packet->completion_signal = {0};
812
813 // Set input and output dependencies if needed.
814 if (OutputSignal)
815 Packet->completion_signal = OutputSignal->get();
816 if (InputSignal1)
817 Packet->dep_signal[0] = InputSignal1->get();
818 if (InputSignal2)
819 Packet->dep_signal[1] = InputSignal2->get();
820
821 // Publish the packet. Do not modify the packet after this point.
822 publishBarrierPacket(PacketId, Packet);
823
824 return Plugin::success();
825 }
826
  /// Acquire a packet from the queue. This call may block the thread if there
  /// is no space in the underlying HSA queue. It may need to wait until the HSA
  /// runtime processes some packets. Assumes the queue lock is acquired.
  hsa_kernel_dispatch_packet_t *acquirePacket(uint64_t &PacketId) {
    // Increase the queue index with relaxed memory order. Notice this will need
    // another subsequent atomic operation with acquire order.
    PacketId = hsa_queue_add_write_index_relaxed(Queue, 1);

    // Busy-wait until the claimed slot is free, i.e., until the packet
    // processor's read index is within Queue->size of our write index. The
    // acquire load pairs with the processor's updates.
    while (PacketId - hsa_queue_load_read_index_scacquire(Queue) >= Queue->size)
      ;

    // Return the packet reference: the monotonically increasing id masked by
    // size-1 yields the ring-buffer slot.
    const uint32_t Mask = Queue->size - 1; // The size is a power of 2.
    return (hsa_kernel_dispatch_packet_t *)Queue->base_address +
           (PacketId & Mask);
  }
845
846 /// Publish the kernel packet so that the HSA runtime can start processing
847 /// the kernel launch. Do not modify the packet once this function is called.
848 /// Assumes the queue lock is acquired.
849 void publishKernelPacket(uint64_t PacketId, uint16_t Setup,
850 hsa_kernel_dispatch_packet_t *Packet) {
851 uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet);
852
853 uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
854 Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
855 Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
856
857 // Publish the packet. Do not modify the package after this point.
858 uint32_t HeaderWord = Header | (Setup << 16u);
859 __atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE);
860
861 // Signal the doorbell about the published packet.
862 hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
863 }
864
865 /// Publish the barrier packet so that the HSA runtime can start processing
866 /// the barrier. Next packets in the queue will not be processed until all
867 /// barrier dependencies (signals) are satisfied. Assumes the queue is locked
868 void publishBarrierPacket(uint64_t PacketId,
869 hsa_barrier_and_packet_t *Packet) {
870 uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet);
871 uint16_t Setup = 0;
872 uint16_t Header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
873 Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
874 Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
875
876 // Publish the packet. Do not modify the package after this point.
877 uint32_t HeaderWord = Header | (Setup << 16u);
878 __atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE);
879
880 // Signal the doorbell about the published packet.
881 hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
882 }
883
884 /// Callback that will be called when an error is detected on the HSA queue.
885 static void callbackError(hsa_status_t Status, hsa_queue_t *Source,
886 void *Data);
887
888 /// The HSA queue.
889 hsa_queue_t *Queue;
890
891 /// Mutex to protect the acquiring and publishing of packets. For the moment,
892 /// we need this mutex to prevent publishing packets that are not ready to be
893 /// published in a multi-thread scenario. Without a queue lock, a thread T1
894 /// could acquire packet P and thread T2 acquire packet P+1. Thread T2 could
895 /// publish its packet P+1 (signaling the queue's doorbell) before packet P
896 /// from T1 is ready to be processed. That scenario should be invalid. Thus,
897 /// we use the following mutex to make packet acquiring and publishing atomic.
898 /// TODO: There are other more advanced approaches to avoid this mutex using
899 /// atomic operations. We can further investigate it if this is a bottleneck.
900 std::mutex Mutex;
901
902 /// The number of streams, this queue is currently assigned to. A queue is
903 /// considered idle when this is zero, otherwise: busy.
904 uint32_t NumUsers;
905};
906
907/// Struct that implements a stream of asynchronous operations for AMDGPU
908/// devices. This class relies on signals to implement streams and define the
909/// dependencies between asynchronous operations.
910struct AMDGPUStreamTy {
911private:
912 /// Utility struct holding arguments for async H2H memory copies.
913 struct MemcpyArgsTy {
914 void *Dst;
915 const void *Src;
916 size_t Size;
917 };
918
919 /// Utility struct holding arguments for freeing buffers to memory managers.
920 struct ReleaseBufferArgsTy {
921 void *Buffer;
922 AMDGPUMemoryManagerTy *MemoryManager;
923 };
924
925 /// Utility struct holding arguments for releasing signals to signal managers.
926 struct ReleaseSignalArgsTy {
927 AMDGPUSignalTy *Signal;
928 AMDGPUSignalManagerTy *SignalManager;
929 };
930
931 using AMDGPUStreamCallbackTy = Error(void *Data);
932
933 /// The stream is composed of N stream's slots. The struct below represents
934 /// the fields of each slot. Each slot has a signal and an optional action
935 /// function. When appending an HSA asynchronous operation to the stream, one
936 /// slot is consumed and used to store the operation's information. The
937 /// operation's output signal is set to the consumed slot's signal. If there
938 /// is a previous asynchronous operation on the previous slot, the HSA async
939 /// operation's input signal is set to the signal of the previous slot. This
940 /// way, we obtain a chain of dependent async operations. The action is a
941 /// function that will be executed eventually after the operation is
942 /// completed, e.g., for releasing a buffer.
943 struct StreamSlotTy {
944 /// The output signal of the stream operation. May be used by the subsequent
945 /// operation as input signal.
946 AMDGPUSignalTy *Signal;
947
948 /// The actions that must be performed after the operation's completion. Set
949 /// to nullptr when there is no action to perform.
950 llvm::SmallVector<AMDGPUStreamCallbackTy *> Callbacks;
951
952 /// Space for the action's arguments. A pointer to these arguments is passed
953 /// to the action function. Notice the space of arguments is limited.
954 union ActionArgsTy {
955 MemcpyArgsTy MemcpyArgs;
956 ReleaseBufferArgsTy ReleaseBufferArgs;
957 ReleaseSignalArgsTy ReleaseSignalArgs;
958 void *CallbackArgs;
959 };
960
961 llvm::SmallVector<ActionArgsTy> ActionArgs;
962
963 /// Create an empty slot.
964 StreamSlotTy() : Signal(nullptr), Callbacks({}), ActionArgs({}) {}
965
966 /// Schedule a host memory copy action on the slot.
967 Error schedHostMemoryCopy(void *Dst, const void *Src, size_t Size) {
968 Callbacks.emplace_back(Args&: memcpyAction);
969 ActionArgs.emplace_back().MemcpyArgs = MemcpyArgsTy{.Dst: Dst, .Src: Src, .Size: Size};
970 return Plugin::success();
971 }
972
973 /// Schedule a release buffer action on the slot.
974 Error schedReleaseBuffer(void *Buffer, AMDGPUMemoryManagerTy &Manager) {
975 Callbacks.emplace_back(Args&: releaseBufferAction);
976 ActionArgs.emplace_back().ReleaseBufferArgs =
977 ReleaseBufferArgsTy{.Buffer: Buffer, .MemoryManager: &Manager};
978 return Plugin::success();
979 }
980
981 /// Schedule a signal release action on the slot.
982 Error schedReleaseSignal(AMDGPUSignalTy *SignalToRelease,
983 AMDGPUSignalManagerTy *SignalManager) {
984 Callbacks.emplace_back(Args&: releaseSignalAction);
985 ActionArgs.emplace_back().ReleaseSignalArgs =
986 ReleaseSignalArgsTy{SignalToRelease, SignalManager};
987 return Plugin::success();
988 }
989
990 /// Register a callback to be called on compleition
991 Error schedCallback(AMDGPUStreamCallbackTy *Func, void *Data) {
992 Callbacks.emplace_back(Args&: Func);
993 ActionArgs.emplace_back().CallbackArgs = Data;
994
995 return Plugin::success();
996 }
997
998 // Perform the action if needed.
999 Error performAction() {
1000 if (Callbacks.empty())
1001 return Plugin::success();
1002
1003 assert(Callbacks.size() == ActionArgs.size() && "Size mismatch");
1004 for (auto [Callback, ActionArg] : llvm::zip(t&: Callbacks, u&: ActionArgs)) {
1005 // Perform the action.
1006 if (Callback == memcpyAction) {
1007 if (auto Err = memcpyAction(Data: &ActionArg))
1008 return Err;
1009 } else if (Callback == releaseBufferAction) {
1010 if (auto Err = releaseBufferAction(Data: &ActionArg))
1011 return Err;
1012 } else if (Callback == releaseSignalAction) {
1013 if (auto Err = releaseSignalAction(Data: &ActionArg))
1014 return Err;
1015 } else if (Callback) {
1016 if (auto Err = Callback(ActionArg.CallbackArgs))
1017 return Err;
1018 }
1019 }
1020
1021 // Invalidate the action.
1022 Callbacks.clear();
1023 ActionArgs.clear();
1024
1025 return Plugin::success();
1026 }
1027 };
1028
1029 /// The device agent where the stream was created.
1030 hsa_agent_t Agent;
1031
1032 /// The queue that the stream uses to launch kernels.
1033 AMDGPUQueueTy *Queue;
1034
1035 /// The manager of signals to reuse signals.
1036 AMDGPUSignalManagerTy &SignalManager;
1037
1038 /// A reference to the associated device.
1039 GenericDeviceTy &Device;
1040
1041 /// Array of stream slots. Use std::deque because it can dynamically grow
1042 /// without invalidating the already inserted elements. For instance, the
1043 /// std::vector may invalidate the elements by reallocating the internal
1044 /// array if there is not enough space on new insertions.
1045 std::deque<StreamSlotTy> Slots;
1046
1047 /// The next available slot on the queue. This is reset to zero each time the
1048 /// stream is synchronized. It also indicates the current number of consumed
1049 /// slots at a given time.
1050 uint32_t NextSlot;
1051
1052 /// The synchronization id. This number is increased each time the stream is
1053 /// synchronized. It is useful to detect if an AMDGPUEventTy points to an
1054 /// operation that was already finalized in a previous stream sycnhronize.
1055 uint32_t SyncCycle;
1056
1057 /// Mutex to protect stream's management.
1058 mutable std::mutex Mutex;
1059
1060 /// Timeout hint for HSA actively waiting for signal value to change
1061 const uint64_t StreamBusyWaitMicroseconds;
1062
1063 /// Indicate to spread data transfers across all available SDMAs
1064 bool UseMultipleSdmaEngines;
1065
1066 /// Return the current number of asynchronous operations on the stream.
1067 uint32_t size() const { return NextSlot; }
1068
1069 /// Return the last valid slot on the stream.
1070 uint32_t last() const { return size() - 1; }
1071
1072 /// Consume one slot from the stream. Since the stream uses signals on demand
1073 /// and releases them once the slot is no longer used, the function requires
1074 /// an idle signal for the new consumed slot.
1075 std::pair<uint32_t, AMDGPUSignalTy *> consume(AMDGPUSignalTy *OutputSignal) {
1076 // Double the stream size if needed. Since we use std::deque, this operation
1077 // does not invalidate the already added slots.
1078 if (Slots.size() == NextSlot)
1079 Slots.resize(new_size: Slots.size() * 2);
1080
1081 // Update the next available slot and the stream size.
1082 uint32_t Curr = NextSlot++;
1083
1084 // Retrieve the input signal, if any, of the current operation.
1085 AMDGPUSignalTy *InputSignal = (Curr > 0) ? Slots[Curr - 1].Signal : nullptr;
1086
1087 // Set the output signal of the current slot.
1088 Slots[Curr].Signal = OutputSignal;
1089
1090 return std::make_pair(x&: Curr, y&: InputSignal);
1091 }
1092
1093 /// Complete all pending post actions and reset the stream after synchronizing
1094 /// or positively querying the stream.
1095 Error complete() {
1096 for (uint32_t Slot = 0; Slot < NextSlot; ++Slot) {
1097 // Take the post action of the operation if any.
1098 if (auto Err = Slots[Slot].performAction())
1099 return Err;
1100
1101 // Release the slot's signal if possible. Otherwise, another user will.
1102 if (Slots[Slot].Signal->decreaseUseCount())
1103 if (auto Err = SignalManager.returnResource(Slots[Slot].Signal))
1104 return Err;
1105
1106 Slots[Slot].Signal = nullptr;
1107 }
1108
1109 // Reset the stream slots to zero.
1110 NextSlot = 0;
1111
1112 // Increase the synchronization id since the stream completed a sync cycle.
1113 SyncCycle += 1;
1114
1115 return Plugin::success();
1116 }
1117
1118 /// Make the current stream wait on a specific operation of another stream.
1119 /// The idea is to make the current stream waiting on two signals: 1) the last
1120 /// signal of the current stream, and 2) the last signal of the other stream.
1121 /// Use a barrier packet with two input signals.
1122 Error waitOnStreamOperation(AMDGPUStreamTy &OtherStream, uint32_t Slot) {
1123 if (Queue == nullptr)
1124 return Plugin::error(ErrorCode::INVALID_NULL_POINTER,
1125 "target queue was nullptr");
1126
1127 /// The signal that we must wait from the other stream.
1128 AMDGPUSignalTy *OtherSignal = OtherStream.Slots[Slot].Signal;
1129
1130 // Prevent the release of the other stream's signal.
1131 OtherSignal->increaseUseCount();
1132
1133 // Retrieve an available signal for the operation's output.
1134 AMDGPUSignalTy *OutputSignal = nullptr;
1135 if (auto Err = SignalManager.getResource(OutputSignal))
1136 return Err;
1137 OutputSignal->reset();
1138 OutputSignal->increaseUseCount();
1139
1140 // Consume stream slot and compute dependencies.
1141 auto [Curr, InputSignal] = consume(OutputSignal);
1142
1143 // Setup the post action to release the signal.
1144 if (auto Err = Slots[Curr].schedReleaseSignal(OtherSignal, &SignalManager))
1145 return Err;
1146
1147 // Push a barrier into the queue with both input signals.
1148 return Queue->pushBarrier(OutputSignal, InputSignal1: InputSignal, InputSignal2: OtherSignal);
1149 }
1150
1151 /// Callback for running a specific asynchronous operation. This callback is
1152 /// used for hsa_amd_signal_async_handler. The argument is the operation that
1153 /// should be executed. Notice we use the post action mechanism to codify the
1154 /// asynchronous operation.
1155 static bool asyncActionCallback(hsa_signal_value_t Value, void *Args) {
1156 StreamSlotTy *Slot = reinterpret_cast<StreamSlotTy *>(Args);
1157 assert(Slot && "Invalid slot");
1158 assert(Slot->Signal && "Invalid signal");
1159
1160 // This thread is outside the stream mutex. Make sure the thread sees the
1161 // changes on the slot.
1162 std::atomic_thread_fence(m: std::memory_order_acquire);
1163
1164 // Perform the operation.
1165 if (auto Err = Slot->performAction())
1166 FATAL_MESSAGE(1, "Error performing post action: %s",
1167 toString(E: std::move(Err)).data());
1168
1169 // Signal the output signal to notify the asynchronous operation finalized.
1170 Slot->Signal->signal();
1171
1172 // Unregister callback.
1173 return false;
1174 }
1175
1176 // Callback for host-to-host memory copies. This is an asynchronous action.
1177 static Error memcpyAction(void *Data) {
1178 MemcpyArgsTy *Args = reinterpret_cast<MemcpyArgsTy *>(Data);
1179 assert(Args && "Invalid arguments");
1180 assert(Args->Dst && "Invalid destination buffer");
1181 assert(Args->Src && "Invalid source buffer");
1182
1183 std::memcpy(dest: Args->Dst, src: Args->Src, n: Args->Size);
1184
1185 return Plugin::success();
1186 }
1187
1188 /// Releasing a memory buffer to a memory manager. This is a post completion
1189 /// action. There are two kinds of memory buffers:
1190 /// 1. For kernel arguments. This buffer can be freed after receiving the
1191 /// kernel completion signal.
1192 /// 2. For H2D transfers that need pinned memory space for staging. This
1193 /// buffer can be freed after receiving the transfer completion signal.
1194 /// 3. For D2H transfers that need pinned memory space for staging. This
1195 /// buffer cannot be freed after receiving the transfer completion signal
1196 /// because of the following asynchronous H2H callback.
1197 /// For this reason, This action can only be taken at
1198 /// AMDGPUStreamTy::complete()
1199 /// Because of the case 3, all releaseBufferActions are taken at
1200 /// AMDGPUStreamTy::complete() in the current implementation.
1201 static Error releaseBufferAction(void *Data) {
1202 ReleaseBufferArgsTy *Args = reinterpret_cast<ReleaseBufferArgsTy *>(Data);
1203 assert(Args && "Invalid arguments");
1204 assert(Args->MemoryManager && "Invalid memory manager");
1205
1206 // Release the allocation to the memory manager.
1207 return Args->MemoryManager->deallocate(Ptr: Args->Buffer);
1208 }
1209
1210 /// Releasing a signal object back to SignalManager. This is a post completion
1211 /// action. This action can only be taken at AMDGPUStreamTy::complete()
1212 static Error releaseSignalAction(void *Data) {
1213 ReleaseSignalArgsTy *Args = reinterpret_cast<ReleaseSignalArgsTy *>(Data);
1214 assert(Args && "Invalid arguments");
1215 assert(Args->Signal && "Invalid signal");
1216 assert(Args->SignalManager && "Invalid signal manager");
1217
1218 // Release the signal if needed.
1219 if (Args->Signal->decreaseUseCount())
1220 if (auto Err = Args->SignalManager->returnResource(Args->Signal))
1221 return Err;
1222
1223 return Plugin::success();
1224 }
1225
1226public:
1227 /// Create an empty stream associated with a specific device.
1228 AMDGPUStreamTy(AMDGPUDeviceTy &Device);
1229
1230 /// Initialize the stream's signals.
1231 Error init() { return Plugin::success(); }
1232
1233 /// Deinitialize the stream's signals.
1234 Error deinit() { return Plugin::success(); }
1235
1236 /// Push a asynchronous kernel to the stream. The kernel arguments must be
1237 /// placed in a special allocation for kernel args and must keep alive until
1238 /// the kernel finalizes. Once the kernel is finished, the stream will release
1239 /// the kernel args buffer to the specified memory manager.
1240 Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
1241 uint32_t NumThreads[3], uint32_t NumBlocks[3],
1242 uint32_t GroupSize, uint64_t StackSize,
1243 AMDGPUMemoryManagerTy &MemoryManager) {
1244 if (Queue == nullptr)
1245 return Plugin::error(ErrorCode::INVALID_NULL_POINTER,
1246 "target queue was nullptr");
1247
1248 // Retrieve an available signal for the operation's output.
1249 AMDGPUSignalTy *OutputSignal = nullptr;
1250 if (auto Err = SignalManager.getResource(OutputSignal))
1251 return Err;
1252 OutputSignal->reset();
1253 OutputSignal->increaseUseCount();
1254
1255 std::lock_guard<std::mutex> StreamLock(Mutex);
1256
1257 // Consume stream slot and compute dependencies.
1258 auto [Curr, InputSignal] = consume(OutputSignal);
1259
1260 // Setup the post action to release the kernel args buffer.
1261 if (auto Err = Slots[Curr].schedReleaseBuffer(Buffer: KernelArgs, Manager&: MemoryManager))
1262 return Err;
1263
1264 // If we are running an RPC server we want to wake up the server thread
1265 // whenever there is a kernel running and let it sleep otherwise.
1266 if (Device.getRPCServer())
1267 Device.Plugin.getRPCServer().Thread->notify();
1268
1269 // Push the kernel with the output signal and an input signal (optional)
1270 if (auto Err = Queue->pushKernelLaunch(Kernel, KernelArgs, NumThreads,
1271 NumBlocks, GroupSize, StackSize,
1272 OutputSignal, InputSignal))
1273 return Err;
1274
1275 // Register a callback to indicate when the kernel is complete.
1276 if (Device.getRPCServer()) {
1277 if (auto Err = Slots[Curr].schedCallback(
1278 [](void *Data) -> llvm::Error {
1279 GenericPluginTy &Plugin =
1280 *reinterpret_cast<GenericPluginTy *>(Data);
1281 Plugin.getRPCServer().Thread->finish();
1282 return Error::success();
1283 },
1284 &Device.Plugin))
1285 return Err;
1286 }
1287 return Plugin::success();
1288 }
1289
1290 /// Push an asynchronous memory copy between pinned memory buffers.
1291 Error pushPinnedMemoryCopyAsync(void *Dst, const void *Src,
1292 uint64_t CopySize) {
1293 // Retrieve an available signal for the operation's output.
1294 AMDGPUSignalTy *OutputSignal = nullptr;
1295 if (auto Err = SignalManager.getResource(OutputSignal))
1296 return Err;
1297 OutputSignal->reset();
1298 OutputSignal->increaseUseCount();
1299
1300 std::lock_guard<std::mutex> Lock(Mutex);
1301
1302 // Consume stream slot and compute dependencies.
1303 auto [Curr, InputSignal] = consume(OutputSignal);
1304
1305 // Issue the async memory copy.
1306 if (InputSignal && InputSignal->load()) {
1307 hsa_signal_t InputSignalRaw = InputSignal->get();
1308 return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src,
1309 Agent, CopySize, 1, &InputSignalRaw,
1310 OutputSignal->get());
1311 }
1312
1313 return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src,
1314 Agent, CopySize, 0, nullptr,
1315 OutputSignal->get());
1316 }
1317
1318 /// Push an asynchronous memory copy device-to-host involving an unpinned
1319 /// memory buffer. The operation consists of a two-step copy from the
1320 /// device buffer to an intermediate pinned host buffer, and then, to a
1321 /// unpinned host buffer. Both operations are asynchronous and dependent.
1322 /// The intermediate pinned buffer will be released to the specified memory
1323 /// manager once the operation completes.
1324 Error pushMemoryCopyD2HAsync(void *Dst, const void *Src, void *Inter,
1325 uint64_t CopySize,
1326 AMDGPUMemoryManagerTy &MemoryManager) {
1327 // Retrieve available signals for the operation's outputs.
1328 AMDGPUSignalTy *OutputSignals[2] = {};
1329 if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals))
1330 return Err;
1331 for (auto *Signal : OutputSignals) {
1332 Signal->reset();
1333 Signal->increaseUseCount();
1334 }
1335
1336 std::lock_guard<std::mutex> Lock(Mutex);
1337
1338 // Consume stream slot and compute dependencies.
1339 auto [Curr, InputSignal] = consume(OutputSignal: OutputSignals[0]);
1340
1341 // Setup the post action for releasing the intermediate buffer.
1342 if (auto Err = Slots[Curr].schedReleaseBuffer(Buffer: Inter, Manager&: MemoryManager))
1343 return Err;
1344
1345 // Issue the first step: device to host transfer. Avoid defining the input
1346 // dependency if already satisfied.
1347 if (InputSignal && InputSignal->load()) {
1348 hsa_signal_t InputSignalRaw = InputSignal->get();
1349 if (auto Err = hsa_utils::asyncMemCopy(
1350 UseMultipleSdmaEngines, Inter, Agent, Src, Agent, CopySize, 1,
1351 &InputSignalRaw, OutputSignals[0]->get()))
1352 return Err;
1353 } else {
1354 if (auto Err = hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Inter,
1355 Agent, Src, Agent, CopySize, 0,
1356 nullptr, OutputSignals[0]->get()))
1357 return Err;
1358 }
1359
1360 // Consume another stream slot and compute dependencies.
1361 std::tie(args&: Curr, args&: InputSignal) = consume(OutputSignal: OutputSignals[1]);
1362 assert(InputSignal && "Invalid input signal");
1363
1364 // The std::memcpy is done asynchronously using an async handler. We store
1365 // the function's information in the action but it's not actually an action.
1366 if (auto Err = Slots[Curr].schedHostMemoryCopy(Dst, Src: Inter, Size: CopySize))
1367 return Err;
1368
1369 // Make changes on this slot visible to the async handler's thread.
1370 std::atomic_thread_fence(m: std::memory_order_release);
1371
1372 // Issue the second step: host to host transfer.
1373 hsa_status_t Status = hsa_amd_signal_async_handler(
1374 InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback,
1375 (void *)&Slots[Curr]);
1376
1377 return Plugin::check(Status, "error in hsa_amd_signal_async_handler: %s");
1378 }
1379
1380 /// Push an asynchronous memory copy host-to-device involving an unpinned
1381 /// memory buffer. The operation consists of a two-step copy from the
1382 /// unpinned host buffer to an intermediate pinned host buffer, and then, to
1383 /// the pinned host buffer. Both operations are asynchronous and dependent.
1384 /// The intermediate pinned buffer will be released to the specified memory
1385 /// manager once the operation completes.
1386 Error pushMemoryCopyH2DAsync(void *Dst, const void *Src, void *Inter,
1387 uint64_t CopySize,
1388 AMDGPUMemoryManagerTy &MemoryManager) {
1389 // Retrieve available signals for the operation's outputs.
1390 AMDGPUSignalTy *OutputSignals[2] = {};
1391 if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals))
1392 return Err;
1393 for (auto *Signal : OutputSignals) {
1394 Signal->reset();
1395 Signal->increaseUseCount();
1396 }
1397
1398 AMDGPUSignalTy *OutputSignal = OutputSignals[0];
1399
1400 std::lock_guard<std::mutex> Lock(Mutex);
1401
1402 // Consume stream slot and compute dependencies.
1403 auto [Curr, InputSignal] = consume(OutputSignal);
1404
1405 // Issue the first step: host to host transfer.
1406 if (InputSignal && InputSignal->load()) {
1407 // The std::memcpy is done asynchronously using an async handler. We store
1408 // the function's information in the action but it is not actually a
1409 // post action.
1410 if (auto Err = Slots[Curr].schedHostMemoryCopy(Dst: Inter, Src, Size: CopySize))
1411 return Err;
1412
1413 // Make changes on this slot visible to the async handler's thread.
1414 std::atomic_thread_fence(m: std::memory_order_release);
1415
1416 hsa_status_t Status = hsa_amd_signal_async_handler(
1417 InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback,
1418 (void *)&Slots[Curr]);
1419
1420 if (auto Err = Plugin::check(Status,
1421 "error in hsa_amd_signal_async_handler: %s"))
1422 return Err;
1423
1424 // Let's use now the second output signal.
1425 OutputSignal = OutputSignals[1];
1426
1427 // Consume another stream slot and compute dependencies.
1428 std::tie(args&: Curr, args&: InputSignal) = consume(OutputSignal);
1429 } else {
1430 // All preceding operations completed, copy the memory synchronously.
1431 std::memcpy(dest: Inter, src: Src, n: CopySize);
1432
1433 // Return the second signal because it will not be used.
1434 OutputSignals[1]->decreaseUseCount();
1435 if (auto Err = SignalManager.returnResource(OutputSignals[1]))
1436 return Err;
1437 }
1438
1439 // Setup the post action to release the intermediate pinned buffer.
1440 if (auto Err = Slots[Curr].schedReleaseBuffer(Buffer: Inter, Manager&: MemoryManager))
1441 return Err;
1442
1443 // Issue the second step: host to device transfer. Avoid defining the input
1444 // dependency if already satisfied.
1445 if (InputSignal && InputSignal->load()) {
1446 hsa_signal_t InputSignalRaw = InputSignal->get();
1447 return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter,
1448 Agent, CopySize, 1, &InputSignalRaw,
1449 OutputSignal->get());
1450 }
1451 return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter,
1452 Agent, CopySize, 0, nullptr,
1453 OutputSignal->get());
1454 }
1455
1456 // AMDGPUDeviceTy is incomplete here, passing the underlying agent instead
1457 Error pushMemoryCopyD2DAsync(void *Dst, hsa_agent_t DstAgent, const void *Src,
1458 hsa_agent_t SrcAgent, uint64_t CopySize) {
1459 AMDGPUSignalTy *OutputSignal;
1460 if (auto Err = SignalManager.getResources(/*Num=*/1, &OutputSignal))
1461 return Err;
1462 OutputSignal->reset();
1463 OutputSignal->increaseUseCount();
1464
1465 std::lock_guard<std::mutex> Lock(Mutex);
1466
1467 // Consume stream slot and compute dependencies.
1468 auto [Curr, InputSignal] = consume(OutputSignal);
1469
1470 // The agents need to have access to the corresponding memory
1471 // This is presently only true if the pointers were originally
1472 // allocated by this runtime or the caller made the appropriate
1473 // access calls.
1474
1475 if (InputSignal && InputSignal->load()) {
1476 hsa_signal_t InputSignalRaw = InputSignal->get();
1477 return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src,
1478 SrcAgent, CopySize, 1, &InputSignalRaw,
1479 OutputSignal->get());
1480 }
1481 return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src,
1482 SrcAgent, CopySize, 0, nullptr,
1483 OutputSignal->get());
1484 }
1485
1486 /// Synchronize with the stream. The current thread waits until all operations
1487 /// are finalized and it performs the pending post actions (i.e., releasing
1488 /// intermediate buffers).
1489 Error synchronize() {
1490 std::lock_guard<std::mutex> Lock(Mutex);
1491
1492 // No need to synchronize anything.
1493 if (size() == 0)
1494 return Plugin::success();
1495
1496 // Wait until all previous operations on the stream have completed.
1497 if (auto Err =
1498 Slots[last()].Signal->wait(StreamBusyWaitMicroseconds, &Device))
1499 return Err;
1500
1501 // Reset the stream and perform all pending post actions.
1502 return complete();
1503 }
1504
1505 /// Query the stream and complete pending post actions if operations finished.
1506 /// Return whether all the operations completed. This operation does not block
1507 /// the calling thread.
1508 Expected<bool> query() {
1509 std::lock_guard<std::mutex> Lock(Mutex);
1510
1511 // No need to query anything.
1512 if (size() == 0)
1513 return true;
1514
1515 // The last operation did not complete yet. Return directly.
1516 if (Slots[last()].Signal->load())
1517 return false;
1518
1519 // Reset the stream and perform all pending post actions.
1520 if (auto Err = complete())
1521 return std::move(Err);
1522
1523 return true;
1524 }
1525
1526 const AMDGPUQueueTy *getQueue() const { return Queue; }
1527
1528 /// Record the state of the stream on an event.
1529 Error recordEvent(AMDGPUEventTy &Event) const;
1530
1531 /// Make the stream wait on an event.
1532 Error waitEvent(const AMDGPUEventTy &Event);
1533
1534 friend struct AMDGPUStreamManagerTy;
1535};
1536
1537/// Class representing an event on AMDGPU. The event basically stores some
1538/// information regarding the state of the recorded stream.
1539struct AMDGPUEventTy {
1540 /// Create an empty event.
1541 AMDGPUEventTy(AMDGPUDeviceTy &Device)
1542 : RecordedStream(nullptr), RecordedSlot(-1), RecordedSyncCycle(-1) {}
1543
1544 /// Initialize and deinitialize.
1545 Error init() { return Plugin::success(); }
1546 Error deinit() { return Plugin::success(); }
1547
1548 /// Record the state of a stream on the event.
1549 Error record(AMDGPUStreamTy &Stream) {
1550 std::lock_guard<std::mutex> Lock(Mutex);
1551
1552 // Ignore the last recorded stream.
1553 RecordedStream = &Stream;
1554
1555 return Stream.recordEvent(Event&: *this);
1556 }
1557
1558 /// Make a stream wait on the current event.
1559 Error wait(AMDGPUStreamTy &Stream) {
1560 std::lock_guard<std::mutex> Lock(Mutex);
1561
1562 if (!RecordedStream)
1563 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
1564 "event does not have any recorded stream");
1565
1566 // Synchronizing the same stream. Do nothing.
1567 if (RecordedStream == &Stream)
1568 return Plugin::success();
1569
1570 // No need to wait anything, the recorded stream already finished the
1571 // corresponding operation.
1572 if (RecordedSlot < 0)
1573 return Plugin::success();
1574
1575 return Stream.waitEvent(Event: *this);
1576 }
1577
1578protected:
1579 /// The stream registered in this event.
1580 AMDGPUStreamTy *RecordedStream;
1581
1582 /// The recordered operation on the recorded stream.
1583 int64_t RecordedSlot;
1584
1585 /// The sync cycle when the stream was recorded. Used to detect stale events.
1586 int64_t RecordedSyncCycle;
1587
1588 /// Mutex to safely access event fields.
1589 mutable std::mutex Mutex;
1590
1591 friend struct AMDGPUStreamTy;
1592};
1593
1594Error AMDGPUStreamTy::recordEvent(AMDGPUEventTy &Event) const {
1595 std::lock_guard<std::mutex> Lock(Mutex);
1596
1597 if (size() > 0) {
1598 // Record the synchronize identifier (to detect stale recordings) and
1599 // the last valid stream's operation.
1600 Event.RecordedSyncCycle = SyncCycle;
1601 Event.RecordedSlot = last();
1602
1603 assert(Event.RecordedSyncCycle >= 0 && "Invalid recorded sync cycle");
1604 assert(Event.RecordedSlot >= 0 && "Invalid recorded slot");
1605 } else {
1606 // The stream is empty, everything already completed, record nothing.
1607 Event.RecordedSyncCycle = -1;
1608 Event.RecordedSlot = -1;
1609 }
1610 return Plugin::success();
1611}
1612
1613Error AMDGPUStreamTy::waitEvent(const AMDGPUEventTy &Event) {
1614 // Retrieve the recorded stream on the event.
1615 AMDGPUStreamTy &RecordedStream = *Event.RecordedStream;
1616
1617 std::scoped_lock<std::mutex, std::mutex> Lock(Mutex, RecordedStream.Mutex);
1618
1619 // The recorded stream already completed the operation because the synchronize
1620 // identifier is already outdated.
1621 if (RecordedStream.SyncCycle != (uint32_t)Event.RecordedSyncCycle)
1622 return Plugin::success();
1623
1624 // Again, the recorded stream already completed the operation, the last
1625 // operation's output signal is satisfied.
1626 if (!RecordedStream.Slots[Event.RecordedSlot].Signal->load())
1627 return Plugin::success();
1628
1629 // Otherwise, make the current stream wait on the other stream's operation.
1630 return waitOnStreamOperation(OtherStream&: RecordedStream, Slot: Event.RecordedSlot);
1631}
1632
1633struct AMDGPUStreamManagerTy final
1634 : GenericDeviceResourceManagerTy<AMDGPUResourceRef<AMDGPUStreamTy>> {
1635 using ResourceRef = AMDGPUResourceRef<AMDGPUStreamTy>;
1636 using ResourcePoolTy = GenericDeviceResourceManagerTy<ResourceRef>;
1637
  /// Create a stream manager bound to the given device and HSA agent. Queue
  /// busy-tracking is controlled by the
  /// LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING environment variable and is
  /// enabled by default.
  AMDGPUStreamManagerTy(GenericDeviceTy &Device, hsa_agent_t HSAAgent)
      : GenericDeviceResourceManagerTy(Device), Device(Device),
        OMPX_QueueTracking("LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING", true),
        NextQueue(0), Agent(HSAAgent) {}
1642
1643 Error init(uint32_t InitialSize, int NumHSAQueues, int HSAQueueSize) {
1644 Queues = std::vector<AMDGPUQueueTy>(NumHSAQueues);
1645 QueueSize = HSAQueueSize;
1646 MaxNumQueues = NumHSAQueues;
1647 // Initialize one queue eagerly
1648 if (auto Err = Queues.front().init(Device, Agent, QueueSize))
1649 return Err;
1650
1651 return GenericDeviceResourceManagerTy::init(InitialSize);
1652 }
1653
1654 /// Deinitialize the resource pool and delete all resources. This function
1655 /// must be called before the destructor.
1656 Error deinit() override {
1657 // De-init all queues
1658 for (AMDGPUQueueTy &Queue : Queues) {
1659 if (auto Err = Queue.deinit())
1660 return Err;
1661 }
1662
1663 return GenericDeviceResourceManagerTy::deinit();
1664 }
1665
1666 /// Get a single stream from the pool or create new resources.
1667 virtual Error getResource(AMDGPUStreamTy *&StreamHandle) override {
1668 return getResourcesImpl(1, &StreamHandle, [this](AMDGPUStreamTy *&Handle) {
1669 return assignNextQueue(Stream: Handle);
1670 });
1671 }
1672
1673 /// Return stream to the pool.
1674 virtual Error returnResource(AMDGPUStreamTy *StreamHandle) override {
1675 return returnResourceImpl(StreamHandle, [](AMDGPUStreamTy *Handle) {
1676 Handle->Queue->removeUser();
1677 return Plugin::success();
1678 });
1679 }
1680
1681private:
1682 /// Search for and assign an preferably idle queue to the given Stream. If
1683 /// there is no queue without current users, choose the queue with the lowest
1684 /// user count. If utilization is ignored: use round robin selection.
1685 inline Error assignNextQueue(AMDGPUStreamTy *Stream) {
1686 // Start from zero when tracking utilization, otherwise: round robin policy.
1687 uint32_t Index = OMPX_QueueTracking ? 0 : NextQueue++ % MaxNumQueues;
1688
1689 if (OMPX_QueueTracking) {
1690 // Find the least used queue.
1691 for (uint32_t I = 0; I < MaxNumQueues; ++I) {
1692 // Early exit when an initialized queue is idle.
1693 if (Queues[I].isInitialized() && Queues[I].getUserCount() == 0) {
1694 Index = I;
1695 break;
1696 }
1697
1698 // Update the least used queue.
1699 if (Queues[Index].getUserCount() > Queues[I].getUserCount())
1700 Index = I;
1701 }
1702 }
1703
1704 // Make sure the queue is initialized, then add user & assign.
1705 if (auto Err = Queues[Index].init(Device, Agent, QueueSize))
1706 return Err;
1707 Queues[Index].addUser();
1708 Stream->Queue = &Queues[Index];
1709
1710 return Plugin::success();
1711 }
1712
1713 /// The device associated with this stream.
1714 GenericDeviceTy &Device;
1715
1716 /// Envar for controlling the tracking of busy HSA queues.
1717 BoolEnvar OMPX_QueueTracking;
1718
1719 /// The next queue index to use for round robin selection.
1720 uint32_t NextQueue;
1721
1722 /// The queues which are assigned to requested streams.
1723 std::vector<AMDGPUQueueTy> Queues;
1724
1725 /// The corresponding device as HSA agent.
1726 hsa_agent_t Agent;
1727
1728 /// The maximum number of queues.
1729 uint32_t MaxNumQueues;
1730
1731 /// The size of created queues.
1732 uint32_t QueueSize;
1733};
1734
1735/// Abstract class that holds the common members of the actual kernel devices
1736/// and the host device. Both types should inherit from this class.
1737struct AMDGenericDeviceTy {
1738 AMDGenericDeviceTy() {}
1739
1740 virtual ~AMDGenericDeviceTy() {}
1741
1742 /// Create all memory pools which the device has access to and classify them.
1743 Error initMemoryPools() {
1744 // Retrieve all memory pools from the device agent(s).
1745 Error Err = retrieveAllMemoryPools();
1746 if (Err)
1747 return Err;
1748
1749 for (AMDGPUMemoryPoolTy *MemoryPool : AllMemoryPools) {
1750 // Initialize the memory pool and retrieve some basic info.
1751 Error Err = MemoryPool->init();
1752 if (Err)
1753 return Err;
1754
1755 if (!MemoryPool->isGlobal())
1756 continue;
1757
1758 // Classify the memory pools depending on their properties.
1759 if (MemoryPool->isFineGrained()) {
1760 FineGrainedMemoryPools.push_back(Elt: MemoryPool);
1761 if (MemoryPool->supportsKernelArgs())
1762 ArgsMemoryPools.push_back(Elt: MemoryPool);
1763 } else if (MemoryPool->isCoarseGrained()) {
1764 CoarseGrainedMemoryPools.push_back(Elt: MemoryPool);
1765 }
1766 }
1767 return Plugin::success();
1768 }
1769
1770 /// Destroy all memory pools.
1771 Error deinitMemoryPools() {
1772 for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools)
1773 delete Pool;
1774
1775 AllMemoryPools.clear();
1776 FineGrainedMemoryPools.clear();
1777 CoarseGrainedMemoryPools.clear();
1778 ArgsMemoryPools.clear();
1779
1780 return Plugin::success();
1781 }
1782
1783 /// Retrieve and construct all memory pools from the device agent(s).
1784 virtual Error retrieveAllMemoryPools() = 0;
1785
1786 /// Get the device agent.
1787 virtual hsa_agent_t getAgent() const = 0;
1788
1789protected:
1790 /// Array of all memory pools available to the host agents.
1791 llvm::SmallVector<AMDGPUMemoryPoolTy *> AllMemoryPools;
1792
1793 /// Array of fine-grained memory pools available to the host agents.
1794 llvm::SmallVector<AMDGPUMemoryPoolTy *> FineGrainedMemoryPools;
1795
1796 /// Array of coarse-grained memory pools available to the host agents.
1797 llvm::SmallVector<AMDGPUMemoryPoolTy *> CoarseGrainedMemoryPools;
1798
1799 /// Array of kernel args memory pools available to the host agents.
1800 llvm::SmallVector<AMDGPUMemoryPoolTy *> ArgsMemoryPools;
1801};
1802
/// Class representing the host device. This host device may have more than one
/// HSA host agent. We aggregate all its resources into the same instance.
struct AMDHostDeviceTy : public AMDGenericDeviceTy {
  /// Create a host device from an array of host agents.
  AMDHostDeviceTy(AMDGPUPluginTy &Plugin,
                  const llvm::SmallVector<hsa_agent_t> &HostAgents)
      : AMDGenericDeviceTy(), Agents(HostAgents), ArgsMemoryManager(Plugin),
        PinnedMemoryManager(Plugin) {
    assert(HostAgents.size() && "No host agent found");
  }

  /// Initialize the host device memory pools and the memory managers for
  /// kernel args and host pinned memory allocations.
  Error init() {
    if (auto Err = initMemoryPools())
      return Err;

    // The kernel-args manager draws from the kernarg-capable pool, the pinned
    // manager from the fine-grained pool; both require initMemoryPools() to
    // have classified the pools first.
    if (auto Err = ArgsMemoryManager.init(getArgsMemoryPool()))
      return Err;

    if (auto Err = PinnedMemoryManager.init(getFineGrainedMemoryPool()))
      return Err;

    return Plugin::success();
  }

  /// Deinitialize memory pools and managers.
  Error deinit() {
    if (auto Err = deinitMemoryPools())
      return Err;

    if (auto Err = ArgsMemoryManager.deinit())
      return Err;

    if (auto Err = PinnedMemoryManager.deinit())
      return Err;

    return Plugin::success();
  }

  /// Retrieve and construct all memory pools from the host agents.
  Error retrieveAllMemoryPools() override {
    // Iterate through the available pools across the host agents. The created
    // pool objects are owned by AllMemoryPools (freed in deinitMemoryPools).
    for (hsa_agent_t Agent : Agents) {
      Error Err = hsa_utils::iterateAgentMemoryPools(
          Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) {
            AMDGPUMemoryPoolTy *MemoryPool =
                new AMDGPUMemoryPoolTy(HSAMemoryPool);
            AllMemoryPools.push_back(MemoryPool);
            return HSA_STATUS_SUCCESS;
          });
      if (Err)
        return Err;
    }
    return Plugin::success();
  }

  /// Get one of the host agents. Return always the first agent.
  hsa_agent_t getAgent() const override { return Agents[0]; }

  /// Get a memory pool for fine-grained allocations.
  AMDGPUMemoryPoolTy &getFineGrainedMemoryPool() {
    assert(!FineGrainedMemoryPools.empty() && "No fine-grained mempool");
    // Retrieve any memory pool.
    return *FineGrainedMemoryPools[0];
  }

  /// Get a memory pool for coarse-grained allocations.
  AMDGPUMemoryPoolTy &getCoarseGrainedMemoryPool() {
    assert(!CoarseGrainedMemoryPools.empty() && "No coarse-grained mempool");
    // Retrieve any memory pool.
    return *CoarseGrainedMemoryPools[0];
  }

  /// Get a memory pool for kernel args allocations.
  AMDGPUMemoryPoolTy &getArgsMemoryPool() {
    assert(!ArgsMemoryPools.empty() && "No kernelargs mempool");
    // Retrieve any memory pool.
    return *ArgsMemoryPools[0];
  }

  /// Getters for kernel args and host pinned memory managers.
  AMDGPUMemoryManagerTy &getArgsMemoryManager() { return ArgsMemoryManager; }
  AMDGPUMemoryManagerTy &getPinnedMemoryManager() {
    return PinnedMemoryManager;
  }

private:
  /// Array of agents on the host side.
  const llvm::SmallVector<hsa_agent_t> Agents;

  /// Memory manager for kernel arguments.
  AMDGPUMemoryManagerTy ArgsMemoryManager;

  /// Memory manager for pinned memory.
  AMDGPUMemoryManagerTy PinnedMemoryManager;
};
1899
1900/// Class implementing the AMDGPU device functionalities which derives from the
1901/// generic device class.
1902struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
  // Create an AMDGPU device with a device id and default AMDGPU grid values.
  // Each OMPX_* member reads its value from the named environment variable,
  // falling back to the default given as the second argument.
  AMDGPUDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices,
                 AMDHostDeviceTy &HostDevice, hsa_agent_t Agent)
      : GenericDeviceTy(Plugin, DeviceId, NumDevices, {}), AMDGenericDeviceTy(),
        OMPX_NumQueues("LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES", 4),
        OMPX_QueueSize("LIBOMPTARGET_AMDGPU_HSA_QUEUE_SIZE", 512),
        OMPX_DefaultTeamsPerCU("LIBOMPTARGET_AMDGPU_TEAMS_PER_CU", 4),
        OMPX_MaxAsyncCopyBytes("LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES",
                               1 * 1024 * 1024), // 1MB
        OMPX_InitialNumSignals("LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS",
                               64),
        OMPX_StreamBusyWait("LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT", 2000000),
        OMPX_UseMultipleSdmaEngines(
            "LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES", false),
        OMPX_ApuMaps("OMPX_APU_MAPS", false), AMDGPUStreamManager(*this, Agent),
        AMDGPUEventManager(*this), AMDGPUSignalManager(*this), Agent(Agent),
        HostDevice(HostDevice) {}

  // Resource teardown happens in deinitImpl(), not the destructor.
  ~AMDGPUDeviceTy() {}
1922
1923 /// Initialize the device, its resources and get its properties.
1924 Error initImpl(GenericPluginTy &Plugin) override {
1925 // First setup all the memory pools.
1926 if (auto Err = initMemoryPools())
1927 return Err;
1928
1929 char GPUName[64];
1930 if (auto Err = getDeviceAttr(HSA_AGENT_INFO_NAME, GPUName))
1931 return Err;
1932 ComputeUnitKind = GPUName;
1933
1934 // Get the wavefront size.
1935 uint32_t WavefrontSize = 0;
1936 if (auto Err = getDeviceAttr(HSA_AGENT_INFO_WAVEFRONT_SIZE, WavefrontSize))
1937 return Err;
1938 GridValues.GV_Warp_Size = WavefrontSize;
1939
1940 // Get the frequency of the steady clock. If the attribute is missing
1941 // assume running on an older libhsa and default to 0, omp_get_wtime
1942 // will be inaccurate but otherwise programs can still run.
1943 if (getDeviceAttrRaw(HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY,
1944 ClockFrequency) != HSA_STATUS_SUCCESS)
1945 ClockFrequency = 0;
1946
1947 // Load the grid values depending on the wavefront.
1948 if (WavefrontSize == 32)
1949 GridValues = getAMDGPUGridValues<32>();
1950 else if (WavefrontSize == 64)
1951 GridValues = getAMDGPUGridValues<64>();
1952 else
1953 return Plugin::error(ErrorCode::UNSUPPORTED,
1954 "unexpected AMDGPU wavefront %d", WavefrontSize);
1955
1956 // Get maximum number of workitems per workgroup.
1957 uint16_t WorkgroupMaxDim[3];
1958 if (auto Err =
1959 getDeviceAttr(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgroupMaxDim))
1960 return Err;
1961 GridValues.GV_Max_WG_Size = WorkgroupMaxDim[0];
1962
1963 // Get maximum number of workgroups.
1964 hsa_dim3_t GridMaxDim;
1965 if (auto Err = getDeviceAttr(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim))
1966 return Err;
1967
1968 GridValues.GV_Max_Teams = GridMaxDim.x / GridValues.GV_Max_WG_Size;
1969 if (GridValues.GV_Max_Teams == 0)
1970 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
1971 "maximum number of teams cannot be zero");
1972
1973 // Compute the default number of teams.
1974 uint32_t ComputeUnits = 0;
1975 if (auto Err =
1976 getDeviceAttr(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, ComputeUnits))
1977 return Err;
1978 GridValues.GV_Default_Num_Teams = ComputeUnits * OMPX_DefaultTeamsPerCU;
1979
1980 uint32_t WavesPerCU = 0;
1981 if (auto Err =
1982 getDeviceAttr(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, WavesPerCU))
1983 return Err;
1984 HardwareParallelism = ComputeUnits * WavesPerCU;
1985
1986 // Get maximum size of any device queues and maximum number of queues.
1987 uint32_t MaxQueueSize;
1988 if (auto Err = getDeviceAttr(HSA_AGENT_INFO_QUEUE_MAX_SIZE, MaxQueueSize))
1989 return Err;
1990
1991 uint32_t MaxQueues;
1992 if (auto Err = getDeviceAttr(HSA_AGENT_INFO_QUEUES_MAX, MaxQueues))
1993 return Err;
1994
1995 // Compute the number of queues and their size.
1996 OMPX_NumQueues = std::max(1U, std::min(OMPX_NumQueues.get(), MaxQueues));
1997 OMPX_QueueSize = std::min(OMPX_QueueSize.get(), MaxQueueSize);
1998
1999 // Initialize stream pool.
2000 if (auto Err = AMDGPUStreamManager.init(OMPX_InitialNumStreams,
2001 OMPX_NumQueues, OMPX_QueueSize))
2002 return Err;
2003
2004 // Initialize event pool.
2005 if (auto Err = AMDGPUEventManager.init(OMPX_InitialNumEvents))
2006 return Err;
2007
2008 // Initialize signal pool.
2009 if (auto Err = AMDGPUSignalManager.init(OMPX_InitialNumSignals))
2010 return Err;
2011
2012 // Detect if XNACK is enabled
2013 SmallVector<SmallString<32>> Targets;
2014 if (auto Err = hsa_utils::getTargetTripleAndFeatures(Agent, Targets))
2015 return Err;
2016 if (!Targets.empty() && Targets[0].str().contains(Other: "xnack+"))
2017 IsXnackEnabled = true;
2018
2019 // detect if device is an APU.
2020 if (auto Err = checkIfAPU())
2021 return Err;
2022
2023 return Plugin::success();
2024 }
2025
2026 Error unloadBinaryImpl(DeviceImageTy *Image) override {
2027 AMDGPUDeviceImageTy &AMDImage = static_cast<AMDGPUDeviceImageTy &>(*Image);
2028
2029 // Unload the executable of the image.
2030 return AMDImage.unloadExecutable();
2031 }
2032
  /// Deinitialize the device and release its resources.
  /// Tears down the stream, event, and signal pools in that order, then
  /// invalidates the HSA agent handle.
  Error deinitImpl() override {
    // Deinitialize the stream and event pools.
    if (auto Err = AMDGPUStreamManager.deinit())
      return Err;

    if (auto Err = AMDGPUEventManager.deinit())
      return Err;

    if (auto Err = AMDGPUSignalManager.deinit())
      return Err;

    // Invalidate agent reference.
    Agent = {0};

    return Plugin::success();
  }
2050
2051 virtual Error callGlobalConstructors(GenericPluginTy &Plugin,
2052 DeviceImageTy &Image) override {
2053 GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
2054 if (Handler.isSymbolInImage(*this, Image, "amdgcn.device.fini"))
2055 Image.setPendingGlobalDtors();
2056
2057 return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/true);
2058 }
2059
2060 virtual Error callGlobalDestructors(GenericPluginTy &Plugin,
2061 DeviceImageTy &Image) override {
2062 if (Image.hasPendingGlobalDtors())
2063 return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/false);
2064 return Plugin::success();
2065 }
2066
  /// Busy-wait timeout (microseconds) used when waiting on stream signals;
  /// configured via LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT.
  uint64_t getStreamBusyWaitMicroseconds() const { return OMPX_StreamBusyWait; }
2068
2069 Expected<std::unique_ptr<MemoryBuffer>>
2070 doJITPostProcessing(std::unique_ptr<MemoryBuffer> MB) const override {
2071
2072 // TODO: We should try to avoid materialization but there seems to be no
2073 // good linker interface w/o file i/o.
2074 SmallString<128> LinkerInputFilePath;
2075 std::error_code EC = sys::fs::createTemporaryFile(Prefix: "amdgpu-pre-link-jit",
2076 Suffix: "o", ResultPath&: LinkerInputFilePath);
2077 if (EC)
2078 return Plugin::error(ErrorCode::HOST_IO,
2079 "failed to create temporary file for linker");
2080
2081 // Write the file's contents to the output file.
2082 Expected<std::unique_ptr<FileOutputBuffer>> OutputOrErr =
2083 FileOutputBuffer::create(FilePath: LinkerInputFilePath, Size: MB->getBuffer().size());
2084 if (!OutputOrErr)
2085 return OutputOrErr.takeError();
2086 std::unique_ptr<FileOutputBuffer> Output = std::move(*OutputOrErr);
2087 llvm::copy(Range: MB->getBuffer(), Out: Output->getBufferStart());
2088 if (Error E = Output->commit())
2089 return std::move(E);
2090
2091 SmallString<128> LinkerOutputFilePath;
2092 EC = sys::fs::createTemporaryFile(Prefix: "amdgpu-pre-link-jit", Suffix: "so",
2093 ResultPath&: LinkerOutputFilePath);
2094 if (EC)
2095 return Plugin::error(ErrorCode::HOST_IO,
2096 "failed to create temporary file for linker");
2097
2098 const auto &ErrorOrPath = sys::findProgramByName(Name: "lld");
2099 if (!ErrorOrPath)
2100 return createStringError(ErrorCode::HOST_TOOL_NOT_FOUND,
2101 "failed to find `lld` on the PATH.");
2102
2103 std::string LLDPath = ErrorOrPath.get();
2104 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(),
2105 "Using `%s` to link JITed amdgcn output.", LLDPath.c_str());
2106
2107 std::string MCPU = "-plugin-opt=mcpu=" + getComputeUnitKind();
2108 StringRef Args[] = {LLDPath,
2109 "-flavor",
2110 "gnu",
2111 "--no-undefined",
2112 "-shared",
2113 MCPU,
2114 "-o",
2115 LinkerOutputFilePath.data(),
2116 LinkerInputFilePath.data()};
2117
2118 std::string Error;
2119 int RC = sys::ExecuteAndWait(Program: LLDPath, Args, Env: std::nullopt, Redirects: {}, SecondsToWait: 0, MemoryLimit: 0, ErrMsg: &Error);
2120 if (RC)
2121 return Plugin::error(ErrorCode::LINK_FAILURE,
2122 "linking optimized bitcode failed: %s",
2123 Error.c_str());
2124
2125 auto BufferOrErr = MemoryBuffer::getFileOrSTDIN(Filename: LinkerOutputFilePath);
2126 if (!BufferOrErr)
2127 return Plugin::error(ErrorCode::HOST_IO,
2128 "failed to open temporary file for lld");
2129
2130 // Clean up the temporary files afterwards.
2131 if (sys::fs::remove(LinkerOutputFilePath))
2132 return Plugin::error(ErrorCode::HOST_IO,
2133 "failed to remove temporary output file for lld");
2134 if (sys::fs::remove(LinkerInputFilePath))
2135 return Plugin::error(ErrorCode::HOST_IO,
2136 "failed to remove temporary input file for lld");
2137
2138 return std::move(*BufferOrErr);
2139 }
2140
  /// See GenericDeviceTy::getComputeUnitKind().
  /// Returns the GPU name cached from HSA_AGENT_INFO_NAME during initImpl().
  std::string getComputeUnitKind() const override { return ComputeUnitKind; }

  /// Returns the clock frequency for the given AMDGPU device.
  /// Cached from HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY in initImpl();
  /// zero if the attribute was unavailable.
  uint64_t getClockFrequency() const override { return ClockFrequency; }
2146
2147 /// Allocate and construct an AMDGPU kernel.
2148 Expected<GenericKernelTy &> constructKernel(const char *Name) override {
2149 // Allocate and construct the AMDGPU kernel.
2150 AMDGPUKernelTy *AMDGPUKernel = Plugin.allocate<AMDGPUKernelTy>();
2151 if (!AMDGPUKernel)
2152 return Plugin::error(ErrorCode::OUT_OF_RESOURCES,
2153 "failed to allocate memory for AMDGPU kernel");
2154
2155 new (AMDGPUKernel) AMDGPUKernelTy(Name);
2156
2157 return *AMDGPUKernel;
2158 }
2159
  /// Set the current context to this device's context. Do nothing since the
  /// AMDGPU devices do not have the concept of contexts.
  Error setContext() override { return Plugin::success(); }

  /// AMDGPU returns the product of the number of compute units and the waves
  /// per compute unit. Computed once in initImpl().
  uint64_t getHardwareParallelism() const override {
    return HardwareParallelism;
  }

  /// We want to set up the RPC server for host services to the GPU if it is
  /// available.
  bool shouldSetupRPCServer() const override { return true; }

  /// The RPC interface should have enough space for all available parallelism.
  uint64_t requestedRPCPortCount() const override {
    return getHardwareParallelism();
  }
2178
2179 /// Get the stream of the asynchronous info structure or get a new one.
2180 Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper,
2181 AMDGPUStreamTy *&Stream) {
2182 // Get the stream (if any) from the async info.
2183 Stream = AsyncInfoWrapper.getQueueAs<AMDGPUStreamTy *>();
2184 if (!Stream) {
2185 // There was no stream; get an idle one.
2186 if (auto Err = AMDGPUStreamManager.getResource(Stream))
2187 return Err;
2188
2189 // Modify the async info's stream.
2190 AsyncInfoWrapper.setQueueAs<AMDGPUStreamTy *>(Stream);
2191 }
2192 return Plugin::success();
2193 }
2194
2195 /// Load the binary image into the device and allocate an image object.
2196 Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage,
2197 int32_t ImageId) override {
2198 // Allocate and initialize the image object.
2199 AMDGPUDeviceImageTy *AMDImage = Plugin.allocate<AMDGPUDeviceImageTy>();
2200 new (AMDImage) AMDGPUDeviceImageTy(ImageId, *this, TgtImage);
2201
2202 // Load the HSA executable.
2203 if (Error Err = AMDImage->loadExecutable(Device: *this))
2204 return std::move(Err);
2205
2206 return AMDImage;
2207 }
2208
  /// Allocate memory on the device or related to the device.
  /// Defined out of line (not visible in this section of the file).
  void *allocate(size_t Size, void *, TargetAllocTy Kind) override;
2211
2212 /// Deallocate memory on the device or related to the device.
2213 int free(void *TgtPtr, TargetAllocTy Kind) override {
2214 if (TgtPtr == nullptr)
2215 return OFFLOAD_SUCCESS;
2216
2217 AMDGPUMemoryPoolTy *MemoryPool = nullptr;
2218 switch (Kind) {
2219 case TARGET_ALLOC_DEFAULT:
2220 case TARGET_ALLOC_DEVICE:
2221 case TARGET_ALLOC_DEVICE_NON_BLOCKING:
2222 MemoryPool = CoarseGrainedMemoryPools[0];
2223 break;
2224 case TARGET_ALLOC_HOST:
2225 MemoryPool = &HostDevice.getFineGrainedMemoryPool();
2226 break;
2227 case TARGET_ALLOC_SHARED:
2228 MemoryPool = &HostDevice.getFineGrainedMemoryPool();
2229 break;
2230 }
2231
2232 if (!MemoryPool) {
2233 REPORT("No memory pool for the specified allocation kind\n");
2234 return OFFLOAD_FAIL;
2235 }
2236
2237 if (Error Err = MemoryPool->deallocate(Ptr: TgtPtr)) {
2238 REPORT("%s\n", toString(E: std::move(Err)).data());
2239 return OFFLOAD_FAIL;
2240 }
2241
2242 return OFFLOAD_SUCCESS;
2243 }
2244
2245 /// Synchronize current thread with the pending operations on the async info.
2246 Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
2247 AMDGPUStreamTy *Stream =
2248 reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
2249 assert(Stream && "Invalid stream");
2250
2251 if (auto Err = Stream->synchronize())
2252 return Err;
2253
2254 // Once the stream is synchronized, return it to stream pool and reset
2255 // AsyncInfo. This is to make sure the synchronization only works for its
2256 // own tasks.
2257 AsyncInfo.Queue = nullptr;
2258 return AMDGPUStreamManager.returnResource(Stream);
2259 }
2260
2261 /// Query for the completion of the pending operations on the async info.
2262 Error queryAsyncImpl(__tgt_async_info &AsyncInfo) override {
2263 AMDGPUStreamTy *Stream =
2264 reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
2265 assert(Stream && "Invalid stream");
2266
2267 auto CompletedOrErr = Stream->query();
2268 if (!CompletedOrErr)
2269 return CompletedOrErr.takeError();
2270
2271 // Return if it the stream did not complete yet.
2272 if (!(*CompletedOrErr))
2273 return Plugin::success();
2274
2275 // Once the stream is completed, return it to stream pool and reset
2276 // AsyncInfo. This is to make sure the synchronization only works for its
2277 // own tasks.
2278 AsyncInfo.Queue = nullptr;
2279 return AMDGPUStreamManager.returnResource(Stream);
2280 }
2281
2282 /// Pin the host buffer and return the device pointer that should be used for
2283 /// device transfers.
2284 Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override {
2285 void *PinnedPtr = nullptr;
2286
2287 hsa_status_t Status =
2288 hsa_amd_memory_lock(HstPtr, Size, nullptr, 0, &PinnedPtr);
2289 if (auto Err = Plugin::check(Status, "error in hsa_amd_memory_lock: %s\n"))
2290 return std::move(Err);
2291
2292 return PinnedPtr;
2293 }
2294
2295 /// Unpin the host buffer.
2296 Error dataUnlockImpl(void *HstPtr) override {
2297 hsa_status_t Status = hsa_amd_memory_unlock(HstPtr);
2298 return Plugin::check(Status, "error in hsa_amd_memory_unlock: %s\n");
2299 }
2300
2301 /// Check through the HSA runtime whether the \p HstPtr buffer is pinned.
2302 Expected<bool> isPinnedPtrImpl(void *HstPtr, void *&BaseHstPtr,
2303 void *&BaseDevAccessiblePtr,
2304 size_t &BaseSize) const override {
2305 hsa_amd_pointer_info_t Info;
2306 Info.size = sizeof(hsa_amd_pointer_info_t);
2307
2308 hsa_status_t Status = hsa_amd_pointer_info(
2309 HstPtr, &Info, /*Allocator=*/nullptr, /*num_agents_accessible=*/nullptr,
2310 /*accessible=*/nullptr);
2311 if (auto Err = Plugin::check(Status, "error in hsa_amd_pointer_info: %s"))
2312 return std::move(Err);
2313
2314 // The buffer may be locked or allocated through HSA allocators. Assume that
2315 // the buffer is host pinned if the runtime reports a HSA type.
2316 if (Info.type != HSA_EXT_POINTER_TYPE_LOCKED &&
2317 Info.type != HSA_EXT_POINTER_TYPE_HSA)
2318 return false;
2319
2320 assert(Info.hostBaseAddress && "Invalid host pinned address");
2321 assert(Info.agentBaseAddress && "Invalid agent pinned address");
2322 assert(Info.sizeInBytes > 0 && "Invalid pinned allocation size");
2323
2324 // Save the allocation info in the output parameters.
2325 BaseHstPtr = Info.hostBaseAddress;
2326 BaseDevAccessiblePtr = Info.agentBaseAddress;
2327 BaseSize = Info.sizeInBytes;
2328
2329 return true;
2330 }
2331
2332 /// Submit data to the device (host to device transfer).
2333 Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
2334 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
2335 AMDGPUStreamTy *Stream = nullptr;
2336 void *PinnedPtr = nullptr;
2337
2338 // Use one-step asynchronous operation when host memory is already pinned.
2339 if (void *PinnedPtr =
2340 PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) {
2341 if (auto Err = getStream(AsyncInfoWrapper, Stream))
2342 return Err;
2343 return Stream->pushPinnedMemoryCopyAsync(Dst: TgtPtr, Src: PinnedPtr, CopySize: Size);
2344 }
2345
2346 // For large transfers use synchronous behavior.
2347 if (Size >= OMPX_MaxAsyncCopyBytes) {
2348 if (AsyncInfoWrapper.hasQueue())
2349 if (auto Err = synchronize(AsyncInfoWrapper))
2350 return Err;
2351
2352 hsa_status_t Status;
2353 Status = hsa_amd_memory_lock(const_cast<void *>(HstPtr), Size, nullptr, 0,
2354 &PinnedPtr);
2355 if (auto Err =
2356 Plugin::check(Status, "error in hsa_amd_memory_lock: %s\n"))
2357 return Err;
2358
2359 AMDGPUSignalTy Signal;
2360 if (auto Err = Signal.init())
2361 return Err;
2362
2363 if (auto Err = hsa_utils::asyncMemCopy(useMultipleSdmaEngines(), TgtPtr,
2364 Agent, PinnedPtr, Agent, Size, 0,
2365 nullptr, Signal.get()))
2366 return Err;
2367
2368 if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
2369 return Err;
2370
2371 if (auto Err = Signal.deinit())
2372 return Err;
2373
2374 Status = hsa_amd_memory_unlock(const_cast<void *>(HstPtr));
2375 return Plugin::check(Status, "error in hsa_amd_memory_unlock: %s\n");
2376 }
2377
2378 // Otherwise, use two-step copy with an intermediate pinned host buffer.
2379 AMDGPUMemoryManagerTy &PinnedMemoryManager =
2380 HostDevice.getPinnedMemoryManager();
2381 if (auto Err = PinnedMemoryManager.allocate(Size, PtrStorage: &PinnedPtr))
2382 return Err;
2383
2384 if (auto Err = getStream(AsyncInfoWrapper, Stream))
2385 return Err;
2386
2387 return Stream->pushMemoryCopyH2DAsync(Dst: TgtPtr, Src: HstPtr, Inter: PinnedPtr, CopySize: Size,
2388 MemoryManager&: PinnedMemoryManager);
2389 }
2390
2391 /// Retrieve data from the device (device to host transfer).
2392 Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
2393 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
2394 AMDGPUStreamTy *Stream = nullptr;
2395 void *PinnedPtr = nullptr;
2396
2397 // Use one-step asynchronous operation when host memory is already pinned.
2398 if (void *PinnedPtr =
2399 PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) {
2400 if (auto Err = getStream(AsyncInfoWrapper, Stream))
2401 return Err;
2402
2403 return Stream->pushPinnedMemoryCopyAsync(Dst: PinnedPtr, Src: TgtPtr, CopySize: Size);
2404 }
2405
2406 // For large transfers use synchronous behavior.
2407 if (Size >= OMPX_MaxAsyncCopyBytes) {
2408 if (AsyncInfoWrapper.hasQueue())
2409 if (auto Err = synchronize(AsyncInfoWrapper))
2410 return Err;
2411
2412 hsa_status_t Status;
2413 Status = hsa_amd_memory_lock(const_cast<void *>(HstPtr), Size, nullptr, 0,
2414 &PinnedPtr);
2415 if (auto Err =
2416 Plugin::check(Status, "error in hsa_amd_memory_lock: %s\n"))
2417 return Err;
2418
2419 AMDGPUSignalTy Signal;
2420 if (auto Err = Signal.init())
2421 return Err;
2422
2423 if (auto Err = hsa_utils::asyncMemCopy(useMultipleSdmaEngines(),
2424 PinnedPtr, Agent, TgtPtr, Agent,
2425 Size, 0, nullptr, Signal.get()))
2426 return Err;
2427
2428 if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
2429 return Err;
2430
2431 if (auto Err = Signal.deinit())
2432 return Err;
2433
2434 Status = hsa_amd_memory_unlock(const_cast<void *>(HstPtr));
2435 return Plugin::check(Status, "error in hsa_amd_memory_unlock: %s\n");
2436 }
2437
2438 // Otherwise, use two-step copy with an intermediate pinned host buffer.
2439 AMDGPUMemoryManagerTy &PinnedMemoryManager =
2440 HostDevice.getPinnedMemoryManager();
2441 if (auto Err = PinnedMemoryManager.allocate(Size, PtrStorage: &PinnedPtr))
2442 return Err;
2443
2444 if (auto Err = getStream(AsyncInfoWrapper, Stream))
2445 return Err;
2446
2447 return Stream->pushMemoryCopyD2HAsync(Dst: HstPtr, Src: TgtPtr, Inter: PinnedPtr, CopySize: Size,
2448 MemoryManager&: PinnedMemoryManager);
2449 }
2450
2451 /// Exchange data between two devices within the plugin.
2452 Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice,
2453 void *DstPtr, int64_t Size,
2454 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
2455 AMDGPUDeviceTy &DstDevice = static_cast<AMDGPUDeviceTy &>(DstGenericDevice);
2456
2457 // For large transfers use synchronous behavior.
2458 if (Size >= OMPX_MaxAsyncCopyBytes) {
2459 if (AsyncInfoWrapper.hasQueue())
2460 if (auto Err = synchronize(AsyncInfoWrapper))
2461 return Err;
2462
2463 AMDGPUSignalTy Signal;
2464 if (auto Err = Signal.init())
2465 return Err;
2466
2467 if (auto Err = hsa_utils::asyncMemCopy(
2468 useMultipleSdmaEngines(), DstPtr, DstDevice.getAgent(), SrcPtr,
2469 getAgent(), (uint64_t)Size, 0, nullptr, Signal.get()))
2470 return Err;
2471
2472 if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
2473 return Err;
2474
2475 return Signal.deinit();
2476 }
2477
2478 AMDGPUStreamTy *Stream = nullptr;
2479 if (auto Err = getStream(AsyncInfoWrapper, Stream))
2480 return Err;
2481 if (Size <= 0)
2482 return Plugin::success();
2483
2484 return Stream->pushMemoryCopyD2DAsync(DstPtr, DstDevice.getAgent(), SrcPtr,
2485 getAgent(), (uint64_t)Size);
2486 }
2487
  /// Initialize the async info for interoperability purposes.
  /// Currently a no-op that always reports success.
  Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
    // TODO: Implement this function.
    return Plugin::success();
  }
2493
2494 /// Initialize the device info for interoperability purposes.
2495 Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override {
2496 DeviceInfo->Context = nullptr;
2497
2498 if (!DeviceInfo->Device)
2499 DeviceInfo->Device = reinterpret_cast<void *>(Agent.handle);
2500
2501 return Plugin::success();
2502 }
2503
  /// Create an event. Takes an event from the pooled event manager and stores
  /// its address in \p EventPtrStorage.
  Error createEventImpl(void **EventPtrStorage) override {
    AMDGPUEventTy **Event = reinterpret_cast<AMDGPUEventTy **>(EventPtrStorage);
    return AMDGPUEventManager.getResource(*Event);
  }

  /// Destroy a previously created event by returning it to the event pool.
  Error destroyEventImpl(void *EventPtr) override {
    AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);
    return AMDGPUEventManager.returnResource(Event);
  }
2515
2516 /// Record the event.
2517 Error recordEventImpl(void *EventPtr,
2518 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
2519 AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);
2520 assert(Event && "Invalid event");
2521
2522 AMDGPUStreamTy *Stream = nullptr;
2523 if (auto Err = getStream(AsyncInfoWrapper, Stream))
2524 return Err;
2525
2526 return Event->record(Stream&: *Stream);
2527 }
2528
2529 /// Make the stream wait on the event.
2530 Error waitEventImpl(void *EventPtr,
2531 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
2532 AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);
2533
2534 AMDGPUStreamTy *Stream = nullptr;
2535 if (auto Err = getStream(AsyncInfoWrapper, Stream))
2536 return Err;
2537
2538 return Event->wait(Stream&: *Stream);
2539 }
2540
2541 /// Synchronize the current thread with the event.
2542 Error syncEventImpl(void *EventPtr) override {
2543 return Plugin::error(ErrorCode::UNIMPLEMENTED,
2544 "synchronize event not implemented");
2545 }
2546
2547 /// Print information about the device.
2548 Expected<InfoTreeNode> obtainInfoImpl() override {
2549 char TmpChar[1000];
2550 const char *TmpCharPtr = "Unknown";
2551 uint16_t Major, Minor;
2552 uint32_t TmpUInt, TmpUInt2;
2553 uint32_t CacheSize[4];
2554 size_t TmpSt;
2555 bool TmpBool;
2556 uint16_t WorkgrpMaxDim[3];
2557 hsa_dim3_t GridMaxDim;
2558 hsa_status_t Status, Status2;
2559 InfoTreeNode Info;
2560
2561 Status = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major);
2562 Status2 = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor);
2563 if (Status == HSA_STATUS_SUCCESS && Status2 == HSA_STATUS_SUCCESS)
2564 Info.add("HSA Runtime Version",
2565 std::to_string(val: Major) + "." + std::to_string(val: Minor));
2566
2567 Info.add("HSA OpenMP Device Number", DeviceId);
2568
2569 Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar);
2570 if (Status == HSA_STATUS_SUCCESS)
2571 Info.add("Product Name", TmpChar);
2572
2573 Status = getDeviceAttrRaw(HSA_AGENT_INFO_NAME, TmpChar);
2574 if (Status == HSA_STATUS_SUCCESS)
2575 Info.add("Device Name", TmpChar);
2576
2577 Status = getDeviceAttrRaw(HSA_AGENT_INFO_VENDOR_NAME, TmpChar);
2578 if (Status == HSA_STATUS_SUCCESS)
2579 Info.add("Vendor Name", TmpChar);
2580
2581 hsa_device_type_t DevType;
2582 Status = getDeviceAttrRaw(HSA_AGENT_INFO_DEVICE, DevType);
2583 if (Status == HSA_STATUS_SUCCESS) {
2584 switch (DevType) {
2585 case HSA_DEVICE_TYPE_CPU:
2586 TmpCharPtr = "CPU";
2587 break;
2588 case HSA_DEVICE_TYPE_GPU:
2589 TmpCharPtr = "GPU";
2590 break;
2591 case HSA_DEVICE_TYPE_DSP:
2592 TmpCharPtr = "DSP";
2593 break;
2594 default:
2595 TmpCharPtr = "Unknown";
2596 break;
2597 }
2598 Info.add("Device Type", TmpCharPtr);
2599 }
2600
2601 Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUES_MAX, TmpUInt);
2602 if (Status == HSA_STATUS_SUCCESS)
2603 Info.add("Max Queues", TmpUInt);
2604
2605 Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUE_MIN_SIZE, TmpUInt);
2606 if (Status == HSA_STATUS_SUCCESS)
2607 Info.add("Queue Min Size", TmpUInt);
2608
2609 Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUE_MAX_SIZE, TmpUInt);
2610 if (Status == HSA_STATUS_SUCCESS)
2611 Info.add("Queue Max Size", TmpUInt);
2612
2613 // FIXME: This is deprecated according to HSA documentation. But using
2614 // hsa_agent_iterate_caches and hsa_cache_get_info breaks execution during
2615 // runtime.
2616 Status = getDeviceAttrRaw(HSA_AGENT_INFO_CACHE_SIZE, CacheSize);
2617 if (Status == HSA_STATUS_SUCCESS) {
2618 auto &Cache = *Info.add("Cache");
2619
2620 for (int I = 0; I < 4; I++)
2621 if (CacheSize[I])
2622 Cache.add("L" + std::to_string(val: I), CacheSize[I]);
2623 }
2624
2625 Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_CACHELINE_SIZE, TmpUInt);
2626 if (Status == HSA_STATUS_SUCCESS)
2627 Info.add("Cacheline Size", TmpUInt);
2628
2629 Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, TmpUInt);
2630 if (Status == HSA_STATUS_SUCCESS)
2631 Info.add("Max Clock Freq", TmpUInt, "MHz");
2632
2633 Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, TmpUInt);
2634 if (Status == HSA_STATUS_SUCCESS)
2635 Info.add("Compute Units", TmpUInt);
2636
2637 Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU, TmpUInt);
2638 if (Status == HSA_STATUS_SUCCESS)
2639 Info.add("SIMD per CU", TmpUInt);
2640
2641 Status = getDeviceAttrRaw(HSA_AGENT_INFO_FAST_F16_OPERATION, TmpBool);
2642 if (Status == HSA_STATUS_SUCCESS)
2643 Info.add("Fast F16 Operation", TmpBool);
2644
2645 Status = getDeviceAttrRaw(HSA_AGENT_INFO_WAVEFRONT_SIZE, TmpUInt2);
2646 if (Status == HSA_STATUS_SUCCESS)
2647 Info.add("Wavefront Size", TmpUInt2);
2648
2649 Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, TmpUInt);
2650 if (Status == HSA_STATUS_SUCCESS)
2651 Info.add("Workgroup Max Size", TmpUInt);
2652
2653 Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgrpMaxDim);
2654 if (Status == HSA_STATUS_SUCCESS) {
2655 auto &MaxSize = *Info.add("Workgroup Max Size per Dimension");
2656 MaxSize.add("x", WorkgrpMaxDim[0]);
2657 MaxSize.add("y", WorkgrpMaxDim[1]);
2658 MaxSize.add("z", WorkgrpMaxDim[2]);
2659 }
2660
2661 Status = getDeviceAttrRaw(
2662 (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, TmpUInt);
2663 if (Status == HSA_STATUS_SUCCESS) {
2664 Info.add("Max Waves Per CU", TmpUInt);
2665 Info.add("Max Work-item Per CU", TmpUInt * TmpUInt2);
2666 }
2667
2668 Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_SIZE, TmpUInt);
2669 if (Status == HSA_STATUS_SUCCESS)
2670 Info.add("Grid Max Size", TmpUInt);
2671
2672 Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim);
2673 if (Status == HSA_STATUS_SUCCESS) {
2674 auto &MaxDim = *Info.add("Grid Max Size per Dimension");
2675 MaxDim.add("x", GridMaxDim.x);
2676 MaxDim.add("y", GridMaxDim.y);
2677 MaxDim.add("z", GridMaxDim.z);
2678 }
2679
2680 Status = getDeviceAttrRaw(HSA_AGENT_INFO_FBARRIER_MAX_SIZE, TmpUInt);
2681 if (Status == HSA_STATUS_SUCCESS)
2682 Info.add("Max fbarriers/Workgrp", TmpUInt);
2683
2684 auto &RootPool = *Info.add("Memory Pools");
2685 for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
2686 std::string TmpStr, TmpStr2;
2687
2688 if (Pool->isGlobal())
2689 TmpStr = "Global";
2690 else if (Pool->isReadOnly())
2691 TmpStr = "ReadOnly";
2692 else if (Pool->isPrivate())
2693 TmpStr = "Private";
2694 else if (Pool->isGroup())
2695 TmpStr = "Group";
2696 else
2697 TmpStr = "Unknown";
2698
2699 auto &PoolNode = *RootPool.add(std::string("Pool ") + TmpStr);
2700
2701 if (Pool->isGlobal()) {
2702 if (Pool->isFineGrained())
2703 TmpStr2 += "Fine Grained ";
2704 if (Pool->isCoarseGrained())
2705 TmpStr2 += "Coarse Grained ";
2706 if (Pool->supportsKernelArgs())
2707 TmpStr2 += "Kernarg ";
2708
2709 PoolNode.add("Flags", TmpStr2);
2710 }
2711
2712 Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, TmpSt);
2713 if (Status == HSA_STATUS_SUCCESS)
2714 PoolNode.add("Size", TmpSt, "bytes");
2715
2716 Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
2717 TmpBool);
2718 if (Status == HSA_STATUS_SUCCESS)
2719 PoolNode.add("Allocatable", TmpBool);
2720
2721 Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE,
2722 TmpSt);
2723 if (Status == HSA_STATUS_SUCCESS)
2724 PoolNode.add("Runtime Alloc Granule", TmpSt, "bytes");
2725
2726 Status = Pool->getAttrRaw(
2727 HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, TmpSt);
2728 if (Status == HSA_STATUS_SUCCESS)
2729 PoolNode.add("Runtime Alloc Alignment", TmpSt, "bytes");
2730
2731 Status =
2732 Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, TmpBool);
2733 if (Status == HSA_STATUS_SUCCESS)
2734 PoolNode.add("Accessible by all", TmpBool);
2735 }
2736
2737 auto &ISAs = *Info.add("ISAs");
2738 auto Err = hsa_utils::iterateAgentISAs(getAgent(), [&](hsa_isa_t ISA) {
2739 Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, TmpChar);
2740 if (Status == HSA_STATUS_SUCCESS)
2741 ISAs.add("Name", TmpChar);
2742
2743 return Status;
2744 });
2745
2746 // Silently consume the error.
2747 if (Err)
2748 consumeError(std::move(Err));
2749
2750 return Info;
2751 }
2752
2753 /// Returns true if auto zero-copy the best configuration for the current
2754 /// arch.
2755 /// On AMDGPUs, automatic zero-copy is turned on
2756 /// when running on an APU with XNACK (unified memory) support
2757 /// enabled. On discrete GPUs, automatic zero-copy is triggered
2758 /// if the user sets the environment variable OMPX_APU_MAPS=1
2759 /// and if XNACK is enabled. The rationale is that zero-copy
2760 /// is the best configuration (performance, memory footprint) on APUs,
2761 /// while it is often not the best on discrete GPUs.
2762 /// XNACK can be enabled with a kernel boot parameter or with
2763 /// the HSA_XNACK environment variable.
2764 bool useAutoZeroCopyImpl() override {
2765 return ((IsAPU || OMPX_ApuMaps) && IsXnackEnabled);
2766 }
2767
2768 /// Getters and setters for stack and heap sizes.
2769 Error getDeviceStackSize(uint64_t &Value) override {
2770 Value = StackSize;
2771 return Plugin::success();
2772 }
2773 Error setDeviceStackSize(uint64_t Value) override {
2774 StackSize = Value;
2775 return Plugin::success();
2776 }
2777 Error getDeviceHeapSize(uint64_t &Value) override {
2778 Value = DeviceMemoryPoolSize;
2779 return Plugin::success();
2780 }
2781 Error setDeviceHeapSize(uint64_t Value) override {
2782 for (DeviceImageTy *Image : LoadedImages)
2783 if (auto Err = setupDeviceMemoryPool(Plugin, *Image, Value))
2784 return Err;
2785 DeviceMemoryPoolSize = Value;
2786 return Plugin::success();
2787 }
2788 Error getDeviceMemorySize(uint64_t &Value) override {
2789 for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
2790 if (Pool->isGlobal()) {
2791 hsa_status_t Status =
2792 Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, Value);
2793 return Plugin::check(Status, "error in getting device memory size: %s");
2794 }
2795 }
2796 return Plugin::error(ErrorCode::UNSUPPORTED,
2797 "getDeviceMemorySize:: no global pool");
2798 }
2799
2800 /// AMDGPU-specific function to get device attributes.
2801 template <typename Ty> Error getDeviceAttr(uint32_t Kind, Ty &Value) {
2802 hsa_status_t Status =
2803 hsa_agent_get_info(Agent, (hsa_agent_info_t)Kind, &Value);
2804 return Plugin::check(Status, "Error in hsa_agent_get_info: %s");
2805 }
2806
2807 template <typename Ty>
2808 hsa_status_t getDeviceAttrRaw(uint32_t Kind, Ty &Value) {
2809 return hsa_agent_get_info(Agent, (hsa_agent_info_t)Kind, &Value);
2810 }
2811
2812 /// Get the device agent.
2813 hsa_agent_t getAgent() const override { return Agent; }
2814
2815 /// Get the signal manager.
2816 AMDGPUSignalManagerTy &getSignalManager() { return AMDGPUSignalManager; }
2817
2818 /// Retrieve and construct all memory pools of the device agent.
2819 Error retrieveAllMemoryPools() override {
2820 // Iterate through the available pools of the device agent.
2821 return hsa_utils::iterateAgentMemoryPools(
2822 Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) {
2823 AMDGPUMemoryPoolTy *MemoryPool =
2824 Plugin.allocate<AMDGPUMemoryPoolTy>();
2825 new (MemoryPool) AMDGPUMemoryPoolTy(HSAMemoryPool);
2826 AllMemoryPools.push_back(MemoryPool);
2827 return HSA_STATUS_SUCCESS;
2828 });
2829 }
2830
2831 bool useMultipleSdmaEngines() const { return OMPX_UseMultipleSdmaEngines; }
2832
2833private:
2834 using AMDGPUEventRef = AMDGPUResourceRef<AMDGPUEventTy>;
2835 using AMDGPUEventManagerTy = GenericDeviceResourceManagerTy<AMDGPUEventRef>;
2836
2837 /// Common method to invoke a single threaded constructor or destructor
2838 /// kernel by name.
2839 Error callGlobalCtorDtorCommon(GenericPluginTy &Plugin, DeviceImageTy &Image,
2840 bool IsCtor) {
2841 const char *KernelName =
2842 IsCtor ? "amdgcn.device.init" : "amdgcn.device.fini";
2843 // Perform a quick check for the named kernel in the image. The kernel
2844 // should be created by the 'amdgpu-lower-ctor-dtor' pass.
2845 GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
2846 if (IsCtor && !Handler.isSymbolInImage(*this, Image, KernelName))
2847 return Plugin::success();
2848
2849 // Allocate and construct the AMDGPU kernel.
2850 AMDGPUKernelTy AMDGPUKernel(KernelName);
2851 if (auto Err = AMDGPUKernel.init(*this, Image))
2852 return Err;
2853
2854 AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);
2855
2856 KernelArgsTy KernelArgs = {};
2857 uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u};
2858 if (auto Err = AMDGPUKernel.launchImpl(
2859 *this, NumBlocksAndThreads, NumBlocksAndThreads, KernelArgs,
2860 KernelLaunchParamsTy{}, AsyncInfoWrapper))
2861 return Err;
2862
2863 Error Err = Plugin::success();
2864 AsyncInfoWrapper.finalize(Err);
2865
2866 return Err;
2867 }
2868
2869 /// Detect if current architecture is an APU.
2870 Error checkIfAPU() {
2871 // TODO: replace with ROCr API once it becomes available.
2872 llvm::StringRef StrGfxName(ComputeUnitKind);
2873 bool MayBeAPU = llvm::StringSwitch<bool>(StrGfxName)
2874 .Case(S: "gfx942", Value: true)
2875 .Default(Value: false);
2876 if (!MayBeAPU)
2877 return Plugin::success();
2878
2879 // can be MI300A or MI300X
2880 uint32_t ChipID = 0;
2881 if (auto Err = getDeviceAttr(HSA_AMD_AGENT_INFO_CHIP_ID, ChipID))
2882 return Err;
2883
2884 if (!(ChipID & 0x1)) {
2885 IsAPU = true;
2886 return Plugin::success();
2887 }
2888 return Plugin::success();
2889 }
2890
  /// Envar for controlling the number of HSA queues per device. High number of
  /// queues may degrade performance.
  UInt32Envar OMPX_NumQueues;

  /// Envar for controlling the size of each HSA queue. The size is the number
  /// of HSA packets a queue is expected to hold. It is also the number of HSA
  /// packets that can be pushed into each queue without waiting for the driver
  /// to process them.
  UInt32Envar OMPX_QueueSize;

  /// Envar for controlling the default number of teams relative to the number
  /// of compute units (CUs) the device has:
  /// #default_teams = OMPX_DefaultTeamsPerCU * #CUs.
  UInt32Envar OMPX_DefaultTeamsPerCU;

  /// Envar specifying the maximum size in bytes where the memory copies are
  /// asynchronous operations. Up to this transfer size, the memory copies are
  /// asynchronous operations pushed to the corresponding stream. For larger
  /// transfers, they are synchronous transfers.
  UInt32Envar OMPX_MaxAsyncCopyBytes;

  /// Envar controlling the initial number of HSA signals per device. There is
  /// one manager of signals per device managing several pre-allocated signals.
  /// These signals are mainly used by AMDGPU streams. If needed, more signals
  /// will be created.
  UInt32Envar OMPX_InitialNumSignals;

  /// Environment variables to set the time to wait in active state before
  /// switching to blocked state. The default 2000000 busywaits for 2 seconds
  /// before going into a blocking HSA wait state. The unit for these variables
  /// is microseconds.
  UInt32Envar OMPX_StreamBusyWait;

  /// Use ROCm 5.7 interface for multiple SDMA engines.
  BoolEnvar OMPX_UseMultipleSdmaEngines;

  /// Value of OMPX_APU_MAPS env var used to force
  /// automatic zero-copy behavior on non-APU GPUs.
  BoolEnvar OMPX_ApuMaps;

  /// Stream manager for AMDGPU streams.
  AMDGPUStreamManagerTy AMDGPUStreamManager;

  /// Event manager for AMDGPU events.
  AMDGPUEventManagerTy AMDGPUEventManager;

  /// Signal manager for AMDGPU signals.
  AMDGPUSignalManagerTy AMDGPUSignalManager;

  /// The agent handle corresponding to the device.
  hsa_agent_t Agent;

  /// The GPU architecture (e.g., the gfx name reported by HSA).
  std::string ComputeUnitKind;

  /// The frequency of the steady clock inside the device.
  uint64_t ClockFrequency;

  /// The total number of concurrent work items that can be running on the GPU.
  uint64_t HardwareParallelism;

  /// Reference to the host device.
  AMDHostDeviceTy &HostDevice;

  /// The current size of the global device memory pool (managed by us).
  uint64_t DeviceMemoryPoolSize = 1L << 29L /* 512 MB */;

  /// The current size of the stack that will be used in cases where it could
  /// not be statically determined.
  uint64_t StackSize = 16 * 1024 /* 16 KB */;

  /// Is the plugin associated with an APU?
  bool IsAPU = false;

  /// True if the system is configured with XNACK enabled; false otherwise.
  bool IsXnackEnabled = false;
2968};
2969
2970Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
2971 hsa_code_object_reader_t Reader;
2972 hsa_status_t Status =
2973 hsa_code_object_reader_create_from_memory(getStart(), getSize(), &Reader);
2974 if (auto Err = Plugin::check(
2975 Status, "error in hsa_code_object_reader_create_from_memory: %s"))
2976 return Err;
2977
2978 Status = hsa_executable_create_alt(
2979 HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, "", &Executable);
2980 if (auto Err =
2981 Plugin::check(Status, "error in hsa_executable_create_alt: %s"))
2982 return Err;
2983
2984 hsa_loaded_code_object_t Object;
2985 Status = hsa_executable_load_agent_code_object(Executable, Device.getAgent(),
2986 Reader, "", &Object);
2987 if (auto Err = Plugin::check(
2988 Status, "error in hsa_executable_load_agent_code_object: %s"))
2989 return Err;
2990
2991 Status = hsa_executable_freeze(Executable, "");
2992 if (auto Err = Plugin::check(Status, "error in hsa_executable_freeze: %s"))
2993 return Err;
2994
2995 uint32_t Result;
2996 Status = hsa_executable_validate(Executable, &Result);
2997 if (auto Err = Plugin::check(Status, "error in hsa_executable_validate: %s"))
2998 return Err;
2999
3000 if (Result)
3001 return Plugin::error(ErrorCode::INVALID_BINARY,
3002 "loaded HSA executable does not validate");
3003
3004 Status = hsa_code_object_reader_destroy(Reader);
3005 if (auto Err =
3006 Plugin::check(Status, "error in hsa_code_object_reader_destroy: %s"))
3007 return Err;
3008
3009 if (auto Err = hsa_utils::readAMDGPUMetaDataFromImage(
3010 getMemoryBuffer(), KernelInfoMap, ELFABIVersion))
3011 return Err;
3012
3013 return Plugin::success();
3014}
3015
3016Expected<hsa_executable_symbol_t>
3017AMDGPUDeviceImageTy::findDeviceSymbol(GenericDeviceTy &Device,
3018 StringRef SymbolName) const {
3019
3020 AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(Device);
3021 hsa_agent_t Agent = AMDGPUDevice.getAgent();
3022
3023 hsa_executable_symbol_t Symbol;
3024 hsa_status_t Status = hsa_executable_get_symbol_by_name(
3025 Executable, SymbolName.data(), &Agent, &Symbol);
3026 if (auto Err = Plugin::check(
3027 Status, "error in hsa_executable_get_symbol_by_name(%s): %s",
3028 SymbolName.data()))
3029 return std::move(Err);
3030
3031 return Symbol;
3032}
3033
3034template <typename ResourceTy>
3035Error AMDGPUResourceRef<ResourceTy>::create(GenericDeviceTy &Device) {
3036 if (Resource)
3037 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
3038 "creating an existing resource");
3039
3040 AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(Device);
3041
3042 Resource = new ResourceTy(AMDGPUDevice);
3043
3044 return Resource->init();
3045}
3046
// Construct a stream bound to the given device's agent. The queue starts
// unassigned (nullptr). NOTE(review): member initializers must stay in the
// fields' declaration order.
AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
    : Agent(Device.getAgent()), Queue(nullptr),
      SignalManager(Device.getSignalManager()), Device(Device),
      // Initialize the std::deque with some empty positions.
      Slots(32), NextSlot(0), SyncCycle(0),
      StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
      UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()) {}
3054
3055/// Class implementing the AMDGPU-specific functionalities of the global
3056/// handler.
3057struct AMDGPUGlobalHandlerTy final : public GenericGlobalHandlerTy {
3058 /// Get the metadata of a global from the device. The name and size of the
3059 /// global is read from DeviceGlobal and the address of the global is written
3060 /// to DeviceGlobal.
3061 Error getGlobalMetadataFromDevice(GenericDeviceTy &Device,
3062 DeviceImageTy &Image,
3063 GlobalTy &DeviceGlobal) override {
3064 AMDGPUDeviceImageTy &AMDImage = static_cast<AMDGPUDeviceImageTy &>(Image);
3065
3066 // Find the symbol on the device executable.
3067 auto SymbolOrErr =
3068 AMDImage.findDeviceSymbol(Device, DeviceGlobal.getName());
3069 if (!SymbolOrErr)
3070 return SymbolOrErr.takeError();
3071
3072 hsa_executable_symbol_t Symbol = *SymbolOrErr;
3073 hsa_symbol_kind_t SymbolType;
3074 hsa_status_t Status;
3075 uint64_t SymbolAddr;
3076 uint32_t SymbolSize;
3077
3078 // Retrieve the type, address and size of the symbol.
3079 std::pair<hsa_executable_symbol_info_t, void *> RequiredInfos[] = {
3080 {HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &SymbolType},
3081 {HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &SymbolAddr},
3082 {HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &SymbolSize}};
3083
3084 for (auto &Info : RequiredInfos) {
3085 Status = hsa_executable_symbol_get_info(Symbol, Info.first, Info.second);
3086 if (auto Err = Plugin::check(
3087 Status, "error in hsa_executable_symbol_get_info: %s"))
3088 return Err;
3089 }
3090
3091 // Check the size of the symbol.
3092 if (DeviceGlobal.getSize() && SymbolSize != DeviceGlobal.getSize())
3093 return Plugin::error(
3094 ErrorCode::INVALID_BINARY,
3095 "failed to load global '%s' due to size mismatch (%zu != %zu)",
3096 DeviceGlobal.getName().data(), SymbolSize,
3097 (size_t)DeviceGlobal.getSize());
3098
3099 // Store the symbol address and size on the device global metadata.
3100 DeviceGlobal.setPtr(reinterpret_cast<void *>(SymbolAddr));
3101 DeviceGlobal.setSize(SymbolSize);
3102
3103 return Plugin::success();
3104 }
3105};
3106
3107/// Class implementing the AMDGPU-specific functionalities of the plugin.
3108struct AMDGPUPluginTy final : public GenericPluginTy {
3109 /// Create an AMDGPU plugin and initialize the AMDGPU driver.
3110 AMDGPUPluginTy()
3111 : GenericPluginTy(getTripleArch()), Initialized(false),
3112 HostDevice(nullptr) {}
3113
3114 /// This class should not be copied.
3115 AMDGPUPluginTy(const AMDGPUPluginTy &) = delete;
3116 AMDGPUPluginTy(AMDGPUPluginTy &&) = delete;
3117
3118 /// Initialize the plugin and return the number of devices.
3119 Expected<int32_t> initImpl() override {
3120 hsa_status_t Status = hsa_init();
3121 if (Status != HSA_STATUS_SUCCESS) {
3122 // Cannot call hsa_success_string.
3123 DP("Failed to initialize AMDGPU's HSA library\n");
3124 return 0;
3125 }
3126
3127 // The initialization of HSA was successful. It should be safe to call
3128 // HSA functions from now on, e.g., hsa_shut_down.
3129 Initialized = true;
3130
3131 // Register event handler to detect memory errors on the devices.
3132 Status = hsa_amd_register_system_event_handler(eventHandler, this);
3133 if (auto Err = Plugin::check(
3134 Status, "error in hsa_amd_register_system_event_handler: %s"))
3135 return std::move(Err);
3136
3137 // List of host (CPU) agents.
3138 llvm::SmallVector<hsa_agent_t> HostAgents;
3139
3140 // Count the number of available agents.
3141 auto Err = hsa_utils::iterateAgents(Callback: [&](hsa_agent_t Agent) {
3142 // Get the device type of the agent.
3143 hsa_device_type_t DeviceType;
3144 hsa_status_t Status =
3145 hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType);
3146 if (Status != HSA_STATUS_SUCCESS)
3147 return Status;
3148
3149 // Classify the agents into kernel (GPU) and host (CPU) kernels.
3150 if (DeviceType == HSA_DEVICE_TYPE_GPU) {
3151 // Ensure that the GPU agent supports kernel dispatch packets.
3152 hsa_agent_feature_t Features;
3153 Status = hsa_agent_get_info(Agent, HSA_AGENT_INFO_FEATURE, &Features);
3154 if (Features & HSA_AGENT_FEATURE_KERNEL_DISPATCH)
3155 KernelAgents.push_back(Agent);
3156 } else if (DeviceType == HSA_DEVICE_TYPE_CPU) {
3157 HostAgents.push_back(Agent);
3158 }
3159 return HSA_STATUS_SUCCESS;
3160 });
3161
3162 if (Err)
3163 return std::move(Err);
3164
3165 int32_t NumDevices = KernelAgents.size();
3166 if (NumDevices == 0) {
3167 // Do not initialize if there are no devices.
3168 DP("There are no devices supporting AMDGPU.\n");
3169 return 0;
3170 }
3171
3172 // There are kernel agents but there is no host agent. That should be
3173 // treated as an error.
3174 if (HostAgents.empty())
3175 return Plugin::error(ErrorCode::BACKEND_FAILURE, "no AMDGPU host agents");
3176
3177 // Initialize the host device using host agents.
3178 HostDevice = allocate<AMDHostDeviceTy>();
3179 new (HostDevice) AMDHostDeviceTy(*this, HostAgents);
3180
3181 // Setup the memory pools of available for the host.
3182 if (auto Err = HostDevice->init())
3183 return std::move(Err);
3184
3185 return NumDevices;
3186 }
3187
3188 /// Deinitialize the plugin.
3189 Error deinitImpl() override {
3190 // The HSA runtime was not initialized, so nothing from the plugin was
3191 // actually initialized.
3192 if (!Initialized)
3193 return Plugin::success();
3194
3195 if (HostDevice)
3196 if (auto Err = HostDevice->deinit())
3197 return Err;
3198
3199 // Finalize the HSA runtime.
3200 hsa_status_t Status = hsa_shut_down();
3201 return Plugin::check(Status, "error in hsa_shut_down: %s");
3202 }
3203
3204 /// Creates an AMDGPU device.
3205 GenericDeviceTy *createDevice(GenericPluginTy &Plugin, int32_t DeviceId,
3206 int32_t NumDevices) override {
3207 return new AMDGPUDeviceTy(Plugin, DeviceId, NumDevices, getHostDevice(),
3208 getKernelAgent(DeviceId));
3209 }
3210
3211 /// Creates an AMDGPU global handler.
3212 GenericGlobalHandlerTy *createGlobalHandler() override {
3213 return new AMDGPUGlobalHandlerTy();
3214 }
3215
3216 Triple::ArchType getTripleArch() const override { return Triple::amdgcn; }
3217
3218 const char *getName() const override { return GETNAME(TARGET_NAME); }
3219
3220 /// Get the ELF code for recognizing the compatible image binary.
3221 uint16_t getMagicElfBits() const override { return ELF::EM_AMDGPU; }
3222
3223 /// Check whether the image is compatible with an AMDGPU device.
3224 Expected<bool> isELFCompatible(uint32_t DeviceId,
3225 StringRef Image) const override {
3226 // Get the associated architecture and flags from the ELF.
3227 auto ElfOrErr = ELF64LEObjectFile::create(
3228 MemoryBufferRef(Image, /*Identifier=*/""), /*InitContent=*/false);
3229 if (!ElfOrErr)
3230 return ElfOrErr.takeError();
3231 std::optional<StringRef> Processor = ElfOrErr->tryGetCPUName();
3232 if (!Processor)
3233 return false;
3234
3235 SmallVector<SmallString<32>> Targets;
3236 if (auto Err = hsa_utils::getTargetTripleAndFeatures(
3237 getKernelAgent(DeviceId), Targets))
3238 return Err;
3239 for (auto &Target : Targets)
3240 if (offloading::amdgpu::isImageCompatibleWithEnv(
3241 Processor ? *Processor : "", ElfOrErr->getPlatformFlags(),
3242 Target.str()))
3243 return true;
3244 return false;
3245 }
3246
3247 bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) override {
3248 return true;
3249 }
3250
3251 /// Get the host device instance.
3252 AMDHostDeviceTy &getHostDevice() {
3253 assert(HostDevice && "Host device not initialized");
3254 return *HostDevice;
3255 }
3256
3257 /// Get the kernel agent with the corresponding agent id.
3258 hsa_agent_t getKernelAgent(int32_t AgentId) const {
3259 assert((uint32_t)AgentId < KernelAgents.size() && "Invalid agent id");
3260 return KernelAgents[AgentId];
3261 }
3262
3263 /// Get the list of the available kernel agents.
3264 const llvm::SmallVector<hsa_agent_t> &getKernelAgents() const {
3265 return KernelAgents;
3266 }
3267
3268private:
3269 /// Event handler that will be called by ROCr if an event is detected.
3270 static hsa_status_t eventHandler(const hsa_amd_event_t *Event,
3271 void *PluginPtr) {
3272 if (Event->event_type != HSA_AMD_GPU_MEMORY_FAULT_EVENT)
3273 return HSA_STATUS_SUCCESS;
3274
3275 SmallVector<std::string> Reasons;
3276 uint32_t ReasonsMask = Event->memory_fault.fault_reason_mask;
3277 if (ReasonsMask & HSA_AMD_MEMORY_FAULT_PAGE_NOT_PRESENT)
3278 Reasons.emplace_back(Args: "Page not present or supervisor privilege");
3279 if (ReasonsMask & HSA_AMD_MEMORY_FAULT_READ_ONLY)
3280 Reasons.emplace_back(Args: "Write access to a read-only page");
3281 if (ReasonsMask & HSA_AMD_MEMORY_FAULT_NX)
3282 Reasons.emplace_back(Args: "Execute access to a page marked NX");
3283 if (ReasonsMask & HSA_AMD_MEMORY_FAULT_HOST_ONLY)
3284 Reasons.emplace_back(Args: "GPU attempted access to a host only page");
3285 if (ReasonsMask & HSA_AMD_MEMORY_FAULT_DRAMECC)
3286 Reasons.emplace_back(Args: "DRAM ECC failure");
3287 if (ReasonsMask & HSA_AMD_MEMORY_FAULT_IMPRECISE)
3288 Reasons.emplace_back(Args: "Can't determine the exact fault address");
3289 if (ReasonsMask & HSA_AMD_MEMORY_FAULT_SRAMECC)
3290 Reasons.emplace_back(Args: "SRAM ECC failure (ie registers, no fault address)");
3291 if (ReasonsMask & HSA_AMD_MEMORY_FAULT_HANG)
3292 Reasons.emplace_back(Args: "GPU reset following unspecified hang");
3293
3294 // If we do not know the reason, say so, otherwise remove the trailing comma
3295 // and space.
3296 if (Reasons.empty())
3297 Reasons.emplace_back(Args: "Unknown (" + std::to_string(val: ReasonsMask) + ")");
3298
3299 uint32_t Node = -1;
3300 hsa_agent_get_info(Event->memory_fault.agent, HSA_AGENT_INFO_NODE, &Node);
3301
3302 AMDGPUPluginTy &Plugin = *reinterpret_cast<AMDGPUPluginTy *>(PluginPtr);
3303 for (uint32_t I = 0, E = Plugin.getNumDevices();
3304 Node != uint32_t(-1) && I < E; ++I) {
3305 AMDGPUDeviceTy &AMDGPUDevice =
3306 reinterpret_cast<AMDGPUDeviceTy &>(Plugin.getDevice(I));
3307 auto KernelTraceInfoRecord =
3308 AMDGPUDevice.KernelLaunchTraces.getExclusiveAccessor();
3309
3310 uint32_t DeviceNode = -1;
3311 if (auto Err =
3312 AMDGPUDevice.getDeviceAttr(HSA_AGENT_INFO_NODE, DeviceNode)) {
3313 consumeError(std::move(Err));
3314 continue;
3315 }
3316 if (DeviceNode != Node)
3317 continue;
3318 void *DevicePtr = (void *)Event->memory_fault.virtual_address;
3319 std::string S;
3320 llvm::raw_string_ostream OS(S);
3321 OS << llvm::format("memory access fault by GPU %" PRIu32
3322 " (agent 0x%" PRIx64
3323 ") at virtual address %p. Reasons: %s",
3324 Node, Event->memory_fault.agent.handle,
3325 (void *)Event->memory_fault.virtual_address,
3326 llvm::join(Reasons, ", ").c_str());
3327 ErrorReporter::reportKernelTraces(AMDGPUDevice, *KernelTraceInfoRecord);
3328 ErrorReporter::reportMemoryAccessError(AMDGPUDevice, DevicePtr, S,
3329 /*Abort*/ true);
3330 }
3331
3332 // Abort the execution since we do not recover from this error.
3333 FATAL_MESSAGE(1,
3334 "memory access fault by GPU %" PRIu32 " (agent 0x%" PRIx64
3335 ") at virtual address %p. Reasons: %s",
3336 Node, Event->memory_fault.agent.handle,
3337 (void *)Event->memory_fault.virtual_address,
3338 llvm::join(Reasons, ", ").c_str());
3339
3340 return HSA_STATUS_ERROR;
3341 }
3342
3343 /// Indicate whether the HSA runtime was correctly initialized. Even if there
3344 /// is no available devices this boolean will be true. It indicates whether
3345 /// we can safely call HSA functions (e.g., hsa_shut_down).
3346 bool Initialized;
3347
3348 /// Arrays of the available GPU and CPU agents. These arrays of handles should
3349 /// not be here but in the AMDGPUDeviceTy structures directly. However, the
3350 /// HSA standard does not provide API functions to retirve agents directly,
3351 /// only iterating functions. We cache the agents here for convenience.
3352 llvm::SmallVector<hsa_agent_t> KernelAgents;
3353
3354 /// The device representing all HSA host agents.
3355 AMDHostDeviceTy *HostDevice;
3356};
3357
3358Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
3359 uint32_t NumThreads[3], uint32_t NumBlocks[3],
3360 KernelArgsTy &KernelArgs,
3361 KernelLaunchParamsTy LaunchParams,
3362 AsyncInfoWrapperTy &AsyncInfoWrapper) const {
3363 if (ArgsSize != LaunchParams.Size &&
3364 ArgsSize > LaunchParams.Size + getImplicitArgsSize())
3365 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
3366 "invalid kernel arguments size");
3367
3368 AMDGPUPluginTy &AMDGPUPlugin =
3369 static_cast<AMDGPUPluginTy &>(GenericDevice.Plugin);
3370 AMDHostDeviceTy &HostDevice = AMDGPUPlugin.getHostDevice();
3371 AMDGPUMemoryManagerTy &ArgsMemoryManager = HostDevice.getArgsMemoryManager();
3372
3373 void *AllArgs = nullptr;
3374 if (auto Err = ArgsMemoryManager.allocate(ArgsSize, &AllArgs))
3375 return Err;
3376
3377 // Account for user requested dynamic shared memory.
3378 uint32_t GroupSize = getGroupSize();
3379 if (uint32_t MaxDynCGroupMem = std::max(
3380 KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize())) {
3381 GroupSize += MaxDynCGroupMem;
3382 }
3383
3384 uint64_t StackSize;
3385 if (auto Err = GenericDevice.getDeviceStackSize(StackSize))
3386 return Err;
3387
3388 // Copy the explicit arguments.
3389 // TODO: We should expose the args memory manager alloc to the common part as
3390 // alternative to copying them twice.
3391 if (LaunchParams.Size)
3392 std::memcpy(dest: AllArgs, src: LaunchParams.Data, n: LaunchParams.Size);
3393
3394 AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(GenericDevice);
3395
3396 AMDGPUStreamTy *Stream = nullptr;
3397 if (auto Err = AMDGPUDevice.getStream(AsyncInfoWrapper, Stream))
3398 return Err;
3399
3400 uint64_t ImplArgsOffset = utils::roundUp(
3401 LaunchParams.Size, alignof(hsa_utils::AMDGPUImplicitArgsTy));
3402 if (ArgsSize > ImplArgsOffset) {
3403 hsa_utils::AMDGPUImplicitArgsTy *ImplArgs =
3404 reinterpret_cast<hsa_utils::AMDGPUImplicitArgsTy *>(
3405 utils::advancePtr(AllArgs, ImplArgsOffset));
3406
3407 // Set the COV5+ implicit arguments to the appropriate values if present.
3408 uint64_t ImplArgsSize = ArgsSize - ImplArgsOffset;
3409 std::memset(s: ImplArgs, c: 0, n: ImplArgsSize);
3410
3411 using ImplArgsTy = hsa_utils::AMDGPUImplicitArgsTy;
3412 hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::BlockCountX, ImplArgsSize,
3413 NumBlocks[0]);
3414 hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::BlockCountY, ImplArgsSize,
3415 NumBlocks[1]);
3416 hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::BlockCountZ, ImplArgsSize,
3417 NumBlocks[2]);
3418
3419 hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::GroupSizeX, ImplArgsSize,
3420 NumThreads[0]);
3421 hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::GroupSizeY, ImplArgsSize,
3422 NumThreads[1]);
3423 hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::GroupSizeZ, ImplArgsSize,
3424 NumThreads[2]);
3425
3426 hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::GridDims, ImplArgsSize,
3427 NumBlocks[2] * NumThreads[2] > 1
3428 ? 3
3429 : 1 + (NumBlocks[1] * NumThreads[1] != 1));
3430
3431 hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::DynamicLdsSize, ImplArgsSize,
3432 KernelArgs.DynCGroupMem);
3433 }
3434
3435 // Push the kernel launch into the stream.
3436 return Stream->pushKernelLaunch(Kernel: *this, KernelArgs: AllArgs, NumThreads, NumBlocks,
3437 GroupSize, StackSize, MemoryManager&: ArgsMemoryManager);
3438}
3439
3440Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
3441 KernelArgsTy &KernelArgs,
3442 uint32_t NumThreads[3],
3443 uint32_t NumBlocks[3]) const {
3444 // Only do all this when the output is requested
3445 if (!(getInfoLevel() & OMP_INFOTYPE_PLUGIN_KERNEL))
3446 return Plugin::success();
3447
3448 // We don't have data to print additional info, but no hard error
3449 if (!KernelInfo.has_value())
3450 return Plugin::success();
3451
3452 // General Info
3453 auto NumGroups = NumBlocks;
3454 auto ThreadsPerGroup = NumThreads;
3455
3456 // Kernel Arguments Info
3457 auto ArgNum = KernelArgs.NumArgs;
3458 auto LoopTripCount = KernelArgs.Tripcount;
3459
3460 // Details for AMDGPU kernels (read from image)
3461 // https://www.llvm.org/docs/AMDGPUUsage.html#code-object-v4-metadata
3462 auto GroupSegmentSize = (*KernelInfo).GroupSegmentList;
3463 auto SGPRCount = (*KernelInfo).SGPRCount;
3464 auto VGPRCount = (*KernelInfo).VGPRCount;
3465 auto SGPRSpillCount = (*KernelInfo).SGPRSpillCount;
3466 auto VGPRSpillCount = (*KernelInfo).VGPRSpillCount;
3467 auto MaxFlatWorkgroupSize = (*KernelInfo).MaxFlatWorkgroupSize;
3468
3469 // Prints additional launch info that contains the following.
3470 // Num Args: The number of kernel arguments
3471 // Teams x Thrds: The number of teams and the number of threads actually
3472 // running.
3473 // MaxFlatWorkgroupSize: Maximum flat work-group size supported by the
3474 // kernel in work-items
3475 // LDS Usage: Amount of bytes used in LDS storage
3476 // S/VGPR Count: the number of S/V GPRs occupied by the kernel
3477 // S/VGPR Spill Count: how many S/VGPRs are spilled by the kernel
3478 // Tripcount: loop tripcount for the kernel
3479 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(),
3480 "#Args: %d Teams x Thrds: %4ux%4u (MaxFlatWorkGroupSize: %u) LDS "
3481 "Usage: %uB #SGPRs/VGPRs: %u/%u #SGPR/VGPR Spills: %u/%u Tripcount: "
3482 "%lu\n",
3483 ArgNum, NumGroups[0] * NumGroups[1] * NumGroups[2],
3484 ThreadsPerGroup[0] * ThreadsPerGroup[1] * ThreadsPerGroup[2],
3485 MaxFlatWorkgroupSize, GroupSegmentSize, SGPRCount, VGPRCount,
3486 SGPRSpillCount, VGPRSpillCount, LoopTripCount);
3487
3488 return Plugin::success();
3489}
3490
3491template <typename... ArgsTy>
3492static Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) {
3493 hsa_status_t ResultCode = static_cast<hsa_status_t>(Code);
3494 if (ResultCode == HSA_STATUS_SUCCESS || ResultCode == HSA_STATUS_INFO_BREAK)
3495 return Plugin::success();
3496
3497 const char *Desc = "unknown error";
3498 hsa_status_t Ret = hsa_status_string(ResultCode, &Desc);
3499 if (Ret != HSA_STATUS_SUCCESS)
3500 REPORT("Unrecognized " GETNAME(TARGET_NAME) " error code %d\n", Code);
3501
3502 // TODO: Add more entries to this switch
3503 ErrorCode OffloadErrCode;
3504 switch (ResultCode) {
3505 case HSA_STATUS_ERROR_INVALID_SYMBOL_NAME:
3506 OffloadErrCode = ErrorCode::NOT_FOUND;
3507 break;
3508 case HSA_STATUS_ERROR_INVALID_CODE_OBJECT:
3509 OffloadErrCode = ErrorCode::INVALID_BINARY;
3510 break;
3511 default:
3512 OffloadErrCode = ErrorCode::UNKNOWN;
3513 }
3514
3515 return Plugin::error(OffloadErrCode, ErrFmt, Args..., Desc);
3516}
3517
3518void *AMDGPUMemoryManagerTy::allocate(size_t Size, void *HstPtr,
3519 TargetAllocTy Kind) {
3520 // Allocate memory from the pool.
3521 void *Ptr = nullptr;
3522 if (auto Err = MemoryPool->allocate(Size, PtrStorage: &Ptr)) {
3523 consumeError(Err: std::move(Err));
3524 return nullptr;
3525 }
3526 assert(Ptr && "Invalid pointer");
3527
3528 // Get a list of agents that can access this memory pool.
3529 llvm::SmallVector<hsa_agent_t> Agents;
3530 llvm::copy_if(
3531 Plugin.getKernelAgents(), std::back_inserter(Agents),
3532 [&](hsa_agent_t Agent) { return MemoryPool->canAccess(Agent); });
3533
3534 // Allow all valid kernel agents to access the allocation.
3535 if (auto Err = MemoryPool->enableAccess(Ptr, Size, Agents)) {
3536 REPORT("%s\n", toString(std::move(Err)).data());
3537 return nullptr;
3538 }
3539 return Ptr;
3540}
3541
3542void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
3543 if (Size == 0)
3544 return nullptr;
3545
3546 // Find the correct memory pool.
3547 AMDGPUMemoryPoolTy *MemoryPool = nullptr;
3548 switch (Kind) {
3549 case TARGET_ALLOC_DEFAULT:
3550 case TARGET_ALLOC_DEVICE:
3551 case TARGET_ALLOC_DEVICE_NON_BLOCKING:
3552 MemoryPool = CoarseGrainedMemoryPools[0];
3553 break;
3554 case TARGET_ALLOC_HOST:
3555 MemoryPool = &HostDevice.getFineGrainedMemoryPool();
3556 break;
3557 case TARGET_ALLOC_SHARED:
3558 MemoryPool = &HostDevice.getFineGrainedMemoryPool();
3559 break;
3560 }
3561
3562 if (!MemoryPool) {
3563 REPORT("No memory pool for the specified allocation kind\n");
3564 return nullptr;
3565 }
3566
3567 // Allocate from the corresponding memory pool.
3568 void *Alloc = nullptr;
3569 if (Error Err = MemoryPool->allocate(Size, PtrStorage: &Alloc)) {
3570 REPORT("%s\n", toString(E: std::move(Err)).data());
3571 return nullptr;
3572 }
3573
3574 if (Alloc) {
3575 // Get a list of agents that can access this memory pool. Inherently
3576 // necessary for host or shared allocations Also enabled for device memory
3577 // to allow device to device memcpy
3578 llvm::SmallVector<hsa_agent_t> Agents;
3579 llvm::copy_if(static_cast<AMDGPUPluginTy &>(Plugin).getKernelAgents(),
3580 std::back_inserter(Agents), [&](hsa_agent_t Agent) {
3581 return MemoryPool->canAccess(Agent);
3582 });
3583
3584 // Enable all valid kernel agents to access the buffer.
3585 if (auto Err = MemoryPool->enableAccess(Alloc, Size, Agents)) {
3586 REPORT("%s\n", toString(std::move(Err)).data());
3587 return nullptr;
3588 }
3589 }
3590
3591 return Alloc;
3592}
3593
3594void AMDGPUQueueTy::callbackError(hsa_status_t Status, hsa_queue_t *Source,
3595 void *Data) {
3596 auto &AMDGPUDevice = *reinterpret_cast<AMDGPUDeviceTy *>(Data);
3597
3598 if (Status == HSA_STATUS_ERROR_EXCEPTION) {
3599 auto KernelTraceInfoRecord =
3600 AMDGPUDevice.KernelLaunchTraces.getExclusiveAccessor();
3601 std::function<bool(__tgt_async_info &)> AsyncInfoWrapperMatcher =
3602 [=](__tgt_async_info &AsyncInfo) {
3603 auto *Stream = reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
3604 if (!Stream || !Stream->getQueue())
3605 return false;
3606 return Stream->getQueue()->Queue == Source;
3607 };
3608 ErrorReporter::reportTrapInKernel(AMDGPUDevice, *KernelTraceInfoRecord,
3609 AsyncInfoWrapperMatcher);
3610 }
3611
3612 auto Err = Plugin::check(Status, "received error in queue %p: %s", Source);
3613 FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data());
3614}
3615
3616} // namespace plugin
3617} // namespace target
3618} // namespace omp
3619} // namespace llvm
3620
extern "C" {
/// C entry point used to instantiate the AMDGPU plugin; the caller owns the
/// returned instance.
llvm::omp::target::plugin::GenericPluginTy *createPlugin_amdgpu() {
  return new llvm::omp::target::plugin::AMDGPUPluginTy();
}
}
3626

// source code of offload/plugins-nextgen/amdgpu/src/rtl.cpp