1//===----RTLs/amdgpu/src/rtl.cpp - Target RTLs Implementation ----- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// RTL NextGen for AMDGPU machine
10//
11//===----------------------------------------------------------------------===//
12
13#include <atomic>
14#include <cassert>
15#include <cstddef>
16#include <cstdint>
17#include <deque>
18#include <functional>
19#include <mutex>
20#include <string>
21#include <system_error>
22#include <unistd.h>
23#include <unordered_map>
24
25#include "ErrorReporting.h"
26#include "Shared/APITypes.h"
27#include "Shared/Debug.h"
28#include "Shared/Environment.h"
29#include "Shared/RefCnt.h"
30#include "Shared/Utils.h"
31#include "Utils/ELF.h"
32
33#include "GlobalHandler.h"
34#include "OpenMP/OMPT/Callback.h"
35#include "PluginInterface.h"
36#include "UtilitiesRTL.h"
37#include "omptarget.h"
38
39#include "llvm/ADT/SmallString.h"
40#include "llvm/ADT/SmallVector.h"
41#include "llvm/ADT/StringRef.h"
42#include "llvm/BinaryFormat/ELF.h"
43#include "llvm/Frontend/OpenMP/OMPConstants.h"
44#include "llvm/Frontend/OpenMP/OMPGridValues.h"
45#include "llvm/Support/Error.h"
46#include "llvm/Support/FileOutputBuffer.h"
47#include "llvm/Support/FileSystem.h"
48#include "llvm/Support/MemoryBuffer.h"
49#include "llvm/Support/Program.h"
50#include "llvm/Support/Signals.h"
51#include "llvm/Support/raw_ostream.h"
52
53#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || \
54 !defined(__ORDER_BIG_ENDIAN__)
55#error "Missing preprocessor definitions for endianness detection."
56#endif
57
58// The HSA headers require these definitions.
59#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
60#define LITTLEENDIAN_CPU
61#elif defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
62#define BIGENDIAN_CPU
63#endif
64
65#if defined(__has_include)
66#if __has_include("hsa.h")
67#include "hsa.h"
68#include "hsa_ext_amd.h"
69#elif __has_include("hsa/hsa.h")
70#include "hsa/hsa.h"
71#include "hsa/hsa_ext_amd.h"
72#endif
73#else
74#include "hsa/hsa.h"
75#include "hsa/hsa_ext_amd.h"
76#endif
77
78using namespace error;
79
80namespace llvm {
81namespace omp {
82namespace target {
83namespace plugin {
84
85/// Forward declarations for all specialized data structures.
86struct AMDGPUKernelTy;
87struct AMDGPUDeviceTy;
88struct AMDGPUPluginTy;
89struct AMDGPUStreamTy;
90struct AMDGPUEventTy;
91struct AMDGPUStreamManagerTy;
92struct AMDGPUEventManagerTy;
93struct AMDGPUDeviceImageTy;
94struct AMDGPUMemoryManagerTy;
95struct AMDGPUMemoryPoolTy;
96
97namespace hsa_utils {
98
99/// Iterate elements using an HSA iterate function. Do not use this function
100/// directly but the specialized ones below instead.
101template <typename ElemTy, typename IterFuncTy, typename CallbackTy>
102hsa_status_t iterate(IterFuncTy Func, CallbackTy Cb) {
103 auto L = [](ElemTy Elem, void *Data) -> hsa_status_t {
104 CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data);
105 return (*Unwrapped)(Elem);
106 };
107 return Func(L, static_cast<void *>(&Cb));
108}
109
110/// Iterate elements using an HSA iterate function passing a parameter. Do not
111/// use this function directly but the specialized ones below instead.
112template <typename ElemTy, typename IterFuncTy, typename IterFuncArgTy,
113 typename CallbackTy>
114hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) {
115 auto L = [](ElemTy Elem, void *Data) -> hsa_status_t {
116 CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data);
117 return (*Unwrapped)(Elem);
118 };
119 return Func(FuncArg, L, static_cast<void *>(&Cb));
120}
121
122/// Iterate elements using an HSA iterate function passing a parameter. Do not
123/// use this function directly but the specialized ones below instead.
124template <typename Elem1Ty, typename Elem2Ty, typename IterFuncTy,
125 typename IterFuncArgTy, typename CallbackTy>
126hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) {
127 auto L = [](Elem1Ty Elem1, Elem2Ty Elem2, void *Data) -> hsa_status_t {
128 CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data);
129 return (*Unwrapped)(Elem1, Elem2);
130 };
131 return Func(FuncArg, L, static_cast<void *>(&Cb));
132}
133
134/// Iterate agents.
135template <typename CallbackTy> Error iterateAgents(CallbackTy Callback) {
136 hsa_status_t Status = iterate<hsa_agent_t>(hsa_iterate_agents, Callback);
137 return Plugin::check(Status, "error in hsa_iterate_agents: %s");
138}
139
140/// Iterate ISAs of an agent.
141template <typename CallbackTy>
142Error iterateAgentISAs(hsa_agent_t Agent, CallbackTy Cb) {
143 hsa_status_t Status = iterate<hsa_isa_t>(hsa_agent_iterate_isas, Agent, Cb);
144 return Plugin::check(Status, "error in hsa_agent_iterate_isas: %s");
145}
146
147/// Iterate memory pools of an agent.
148template <typename CallbackTy>
149Error iterateAgentMemoryPools(hsa_agent_t Agent, CallbackTy Cb) {
150 hsa_status_t Status = iterate<hsa_amd_memory_pool_t>(
151 hsa_amd_agent_iterate_memory_pools, Agent, Cb);
152 return Plugin::check(Status,
153 "error in hsa_amd_agent_iterate_memory_pools: %s");
154}
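
// Illustrative use of the iterators above (a sketch, not invoked anywhere in
// this file). The callback may capture state; iterate() forwards it through
// the HSA user-data pointer:
//
//   llvm::SmallVector<hsa_agent_t> GPUAgents;
//   auto Err = hsa_utils::iterateAgents([&](hsa_agent_t Agent) {
//     hsa_device_type_t DevType;
//     hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DevType);
//     if (DevType == HSA_DEVICE_TYPE_GPU)
//       GPUAgents.push_back(Agent);
//     return HSA_STATUS_SUCCESS;
//   });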
155
156/// Dispatches an asynchronous memory copy.
157/// Enables different SDMA engines for the dispatch in a round-robin fashion.
158Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent,
159 const void *Src, hsa_agent_t SrcAgent, size_t Size,
160 uint32_t NumDepSignals, const hsa_signal_t *DepSignals,
161 hsa_signal_t CompletionSignal) {
162 if (!UseMultipleSdmaEngines) {
163 hsa_status_t S =
164 hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, Size,
165 NumDepSignals, DepSignals, CompletionSignal);
166 return Plugin::check(S, "error in hsa_amd_memory_async_copy: %s");
167 }
168
169// This solution is probably not the best
170#if !(HSA_AMD_INTERFACE_VERSION_MAJOR >= 1 && \
171 HSA_AMD_INTERFACE_VERSION_MINOR >= 2)
172 return Plugin::error(ErrorCode::UNSUPPORTED,
173 "async copy on selected SDMA requires ROCm 5.7");
174#else
175 static std::atomic<int> SdmaEngine{1};
176
177 // This atomics solution is probably not the best, but should be sufficient
178 // for now.
  // In the worst case, multiple threads read the same value and dispatch to
  // the same SDMA engine. This may result in sub-optimal performance, but the
  // likelihood of that happening is fairly low.
182 int LocalSdmaEngine = SdmaEngine.load(std::memory_order_acquire);
  // This call is only available in ROCm >= 5.7.
184 hsa_status_t S = hsa_amd_memory_async_copy_on_engine(
185 Dst, DstAgent, Src, SrcAgent, Size, NumDepSignals, DepSignals,
186 CompletionSignal, (hsa_amd_sdma_engine_id_t)LocalSdmaEngine,
187 /*force_copy_on_sdma=*/true);
188 // Increment to use one of two SDMA engines: 0x1, 0x2
189 LocalSdmaEngine = (LocalSdmaEngine << 1) % 3;
190 SdmaEngine.store(LocalSdmaEngine, std::memory_order_relaxed);
191
192 return Plugin::check(S, "error in hsa_amd_memory_async_copy_on_engine: %s");
193#endif
194}
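
// Minimal sketch of how asyncMemCopy() composes with an HSA signal to obtain
// a blocking copy (illustrative only; the streams below chain dependent copies
// through signals instead of waiting inline):
//
//   hsa_signal_t Done;
//   hsa_amd_signal_create(/*initial_value=*/1, 0, nullptr, 0, &Done);
//   if (auto Err = asyncMemCopy(/*UseMultipleSdmaEngines=*/false, Dst, Agent,
//                               Src, Agent, Size, /*NumDepSignals=*/0,
//                               /*DepSignals=*/nullptr, Done))
//     return Err;
//   while (hsa_signal_wait_scacquire(Done, HSA_SIGNAL_CONDITION_EQ, 0,
//                                    UINT64_MAX, HSA_WAIT_STATE_BLOCKED) != 0)
//     ;
//   hsa_signal_destroy(Done);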
195
196Error getTargetTripleAndFeatures(hsa_agent_t Agent,
197 SmallVector<SmallString<32>> &Targets) {
198 auto Err = hsa_utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) {
199 uint32_t Length;
200 hsa_status_t Status;
201 Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME_LENGTH, &Length);
202 if (Status != HSA_STATUS_SUCCESS)
203 return Status;
204
205 llvm::SmallVector<char> ISAName(Length);
206 Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, ISAName.begin());
207 if (Status != HSA_STATUS_SUCCESS)
208 return Status;
209
210 llvm::StringRef TripleTarget(ISAName.begin(), Length);
    if (TripleTarget.consume_front("amdgcn-amd-amdhsa")) {
      auto Target = TripleTarget.ltrim('-').rtrim('\0');
213 Targets.push_back(Target);
214 }
215 return HSA_STATUS_SUCCESS;
216 });
217 return Err;
218}
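
// Illustrative use of getTargetTripleAndFeatures() (a sketch; each returned
// entry is the agent's ISA name with the "amdgcn-amd-amdhsa" prefix stripped,
// e.g. "gfx90a:sramecc+:xnack-"):
//
//   llvm::SmallVector<llvm::SmallString<32>> Targets;
//   if (auto Err = getTargetTripleAndFeatures(Agent, Targets))
//     return Err;
//   for (const auto &Target : Targets)
//     DP("Agent supports target '%s'\n", Target.c_str());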
219} // namespace hsa_utils
220
221/// Utility class representing generic resource references to AMDGPU resources.
222template <typename ResourceTy>
223struct AMDGPUResourceRef : public GenericDeviceResourceRef {
224 /// The underlying handle type for resources.
225 using HandleTy = ResourceTy *;
226
227 /// Create an empty reference to an invalid resource.
228 AMDGPUResourceRef() : Resource(nullptr) {}
229
230 /// Create a reference to an existing resource.
231 AMDGPUResourceRef(HandleTy Resource) : Resource(Resource) {}
232
233 virtual ~AMDGPUResourceRef() {}
234
235 /// Create a new resource and save the reference. The reference must be empty
  /// before calling this function.
237 Error create(GenericDeviceTy &Device) override;
238
239 /// Destroy the referenced resource and invalidate the reference. The
  /// reference must be to a valid resource before calling this function.
241 Error destroy(GenericDeviceTy &Device) override {
242 if (!Resource)
243 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
244 "destroying an invalid resource");
245
246 if (auto Err = Resource->deinit())
247 return Err;
248
249 delete Resource;
250
251 Resource = nullptr;
252 return Plugin::success();
253 }
254
255 /// Get the underlying resource handle.
256 operator HandleTy() const { return Resource; }
257
258private:
259 /// The handle to the actual resource.
260 HandleTy Resource;
261};
262
263/// Class holding an HSA memory pool.
264struct AMDGPUMemoryPoolTy {
265 /// Create a memory pool from an HSA memory pool.
266 AMDGPUMemoryPoolTy(hsa_amd_memory_pool_t MemoryPool)
267 : MemoryPool(MemoryPool), GlobalFlags(0) {}
268
269 /// Initialize the memory pool retrieving its properties.
270 Error init() {
271 if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_SEGMENT, Segment))
272 return Err;
273
274 if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, GlobalFlags))
275 return Err;
276
277 return Plugin::success();
278 }
279
280 /// Getter of the HSA memory pool.
281 hsa_amd_memory_pool_t get() const { return MemoryPool; }
282
  /// Indicate the segment to which the memory pool belongs.
284 bool isGlobal() const { return (Segment == HSA_AMD_SEGMENT_GLOBAL); }
285 bool isReadOnly() const { return (Segment == HSA_AMD_SEGMENT_READONLY); }
286 bool isPrivate() const { return (Segment == HSA_AMD_SEGMENT_PRIVATE); }
287 bool isGroup() const { return (Segment == HSA_AMD_SEGMENT_GROUP); }
288
289 /// Indicate if it is fine-grained memory. Valid only for global.
290 bool isFineGrained() const {
291 assert(isGlobal() && "Not global memory");
292 return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED);
293 }
294
295 /// Indicate if it is coarse-grained memory. Valid only for global.
296 bool isCoarseGrained() const {
297 assert(isGlobal() && "Not global memory");
298 return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED);
299 }
300
301 /// Indicate if it supports storing kernel arguments. Valid only for global.
302 bool supportsKernelArgs() const {
303 assert(isGlobal() && "Not global memory");
304 return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT);
305 }
306
307 /// Allocate memory on the memory pool.
308 Error allocate(size_t Size, void **PtrStorage) {
309 hsa_status_t Status =
310 hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, PtrStorage);
311 return Plugin::check(Status, "error in hsa_amd_memory_pool_allocate: %s");
312 }
313
314 /// Return memory to the memory pool.
315 Error deallocate(void *Ptr) {
316 hsa_status_t Status = hsa_amd_memory_pool_free(Ptr);
317 return Plugin::check(Status, "error in hsa_amd_memory_pool_free: %s");
318 }
319
  /// Returns whether the \p Agent can access the memory pool.
321 bool canAccess(hsa_agent_t Agent) {
322 hsa_amd_memory_pool_access_t Access;
323 if (hsa_amd_agent_memory_pool_get_info(
324 Agent, MemoryPool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &Access))
325 return false;
326 return Access != HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED;
327 }
328
329 /// Allow the device to access a specific allocation.
330 Error enableAccess(void *Ptr, int64_t Size,
331 const llvm::SmallVector<hsa_agent_t> &Agents) const {
332#ifdef OMPTARGET_DEBUG
333 for (hsa_agent_t Agent : Agents) {
334 hsa_amd_memory_pool_access_t Access;
335 if (auto Err =
336 getAttr(Agent, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, Access))
337 return Err;
338
      // The agent is not allowed to access the memory pool in any case. Do not
      // continue because otherwise it would result in undefined behavior.
341 if (Access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED)
342 return Plugin::error(ErrorCode::INVALID_VALUE,
343 "an agent is not allowed to access a memory pool");
344 }
345#endif
346
    // Access is allowed but disabled by default, so enable it now.
348 hsa_status_t Status =
349 hsa_amd_agents_allow_access(Agents.size(), Agents.data(), nullptr, Ptr);
350 return Plugin::check(Status, "error in hsa_amd_agents_allow_access: %s");
351 }
352
353 /// Get attribute from the memory pool.
354 template <typename Ty>
355 Error getAttr(hsa_amd_memory_pool_info_t Kind, Ty &Value) const {
356 hsa_status_t Status;
357 Status = hsa_amd_memory_pool_get_info(MemoryPool, Kind, &Value);
358 return Plugin::check(Status, "error in hsa_amd_memory_pool_get_info: %s");
359 }
360
361 template <typename Ty>
362 hsa_status_t getAttrRaw(hsa_amd_memory_pool_info_t Kind, Ty &Value) const {
363 return hsa_amd_memory_pool_get_info(MemoryPool, Kind, &Value);
364 }
365
366 /// Get attribute from the memory pool relating to an agent.
367 template <typename Ty>
368 Error getAttr(hsa_agent_t Agent, hsa_amd_agent_memory_pool_info_t Kind,
369 Ty &Value) const {
370 hsa_status_t Status;
371 Status =
372 hsa_amd_agent_memory_pool_get_info(Agent, MemoryPool, Kind, &Value);
373 return Plugin::check(Status,
374 "error in hsa_amd_agent_memory_pool_get_info: %s");
375 }
376
377private:
378 /// The HSA memory pool.
379 hsa_amd_memory_pool_t MemoryPool;
380
381 /// The segment where the memory pool belongs to.
382 hsa_amd_segment_t Segment;
383
  /// The global flags of the memory pool. Only valid if the memory pool
  /// belongs to the global segment.
386 uint32_t GlobalFlags;
387};
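
// Illustrative sketch of the memory pool workflow above (hypothetical
// variables; the device initialization later in this file performs the
// equivalent steps when discovering its pools):
//
//   AMDGPUMemoryPoolTy Pool(HsaPool);
//   if (auto Err = Pool.init())
//     return Err;
//   void *Ptr = nullptr;
//   if (Pool.isGlobal() && Pool.isCoarseGrained()) {
//     if (auto Err = Pool.allocate(Size, &Ptr))
//       return Err;
//     // Make the allocation visible to the agents that will touch it.
//     if (auto Err = Pool.enableAccess(Ptr, Size, {GPUAgent}))
//       return Err;
//   }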
388
389/// Class that implements a memory manager that gets memory from a specific
390/// memory pool.
391struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy {
392
393 /// Create an empty memory manager.
394 AMDGPUMemoryManagerTy(AMDGPUPluginTy &Plugin)
395 : Plugin(Plugin), MemoryPool(nullptr), MemoryManager(nullptr) {}
396
397 /// Initialize the memory manager from a memory pool.
398 Error init(AMDGPUMemoryPoolTy &MemoryPool) {
399 const uint32_t Threshold = 1 << 30;
400 this->MemoryManager = new MemoryManagerTy(*this, Threshold);
401 this->MemoryPool = &MemoryPool;
402 return Plugin::success();
403 }
404
405 /// Deinitialize the memory manager and free its allocations.
406 Error deinit() {
407 assert(MemoryManager && "Invalid memory manager");
408
409 // Delete and invalidate the memory manager. At this point, the memory
410 // manager will deallocate all its allocations.
411 delete MemoryManager;
412 MemoryManager = nullptr;
413
414 return Plugin::success();
415 }
416
417 /// Reuse or allocate memory through the memory manager.
418 Error allocate(size_t Size, void **PtrStorage) {
419 assert(MemoryManager && "Invalid memory manager");
420 assert(PtrStorage && "Invalid pointer storage");
421
422 *PtrStorage = MemoryManager->allocate(Size, nullptr);
423 if (Size && *PtrStorage == nullptr)
424 return Plugin::error(ErrorCode::OUT_OF_RESOURCES,
425 "failure to allocate from AMDGPU memory manager");
426
427 return Plugin::success();
428 }
429
430 /// Release an allocation to be reused.
431 Error deallocate(void *Ptr) {
432 if (MemoryManager->free(Ptr))
433 return Plugin::error(ErrorCode::UNKNOWN,
434 "failure to deallocate from AMDGPU memory manager");
435
436 return Plugin::success();
437 }
438
439private:
440 /// Allocation callback that will be called once the memory manager does not
441 /// have more previously allocated buffers.
442 void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) override;
443
444 /// Deallocation callback that will be called by the memory manager.
445 int free(void *TgtPtr, TargetAllocTy Kind) override {
    if (auto Err = MemoryPool->deallocate(TgtPtr)) {
      consumeError(std::move(Err));
448 return OFFLOAD_FAIL;
449 }
450 return OFFLOAD_SUCCESS;
451 }
452
453 /// The underlying plugin that owns this memory manager.
454 AMDGPUPluginTy &Plugin;
455
456 /// The memory pool used to allocate memory.
457 AMDGPUMemoryPoolTy *MemoryPool;
458
459 /// Reference to the actual memory manager.
460 MemoryManagerTy *MemoryManager;
461};
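
// Illustrative sketch of how the memory manager above is typically used for
// staging buffers (PinnedPool is a hypothetical name; the stream code below
// releases such buffers back through deallocate()):
//
//   AMDGPUMemoryManagerTy PinnedAllocator(Plugin);
//   if (auto Err = PinnedAllocator.init(PinnedPool))
//     return Err;
//   void *Staging = nullptr;
//   if (auto Err = PinnedAllocator.allocate(CopySize, &Staging))
//     return Err;
//   // ... use Staging as an intermediate pinned buffer ...
//   if (auto Err = PinnedAllocator.deallocate(Staging))
//     return Err;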
462
463/// Class implementing the AMDGPU device images' properties.
464struct AMDGPUDeviceImageTy : public DeviceImageTy {
465 /// Create the AMDGPU image with the id and the target image pointer.
466 AMDGPUDeviceImageTy(int32_t ImageId, GenericDeviceTy &Device,
467 const __tgt_device_image *TgtImage)
468 : DeviceImageTy(ImageId, Device, TgtImage) {}
469
470 /// Prepare and load the executable corresponding to the image.
471 Error loadExecutable(const AMDGPUDeviceTy &Device);
472
473 /// Unload the executable.
474 Error unloadExecutable() {
475 hsa_status_t Status = hsa_executable_destroy(Executable);
476 return Plugin::check(Status, "error in hsa_executable_destroy: %s");
477 }
478
479 /// Get the executable.
480 hsa_executable_t getExecutable() const { return Executable; }
481
  /// Get the Code Object Version of the ELF.
483 uint16_t getELFABIVersion() const { return ELFABIVersion; }
484
485 /// Find an HSA device symbol by its name on the executable.
486 Expected<hsa_executable_symbol_t>
487 findDeviceSymbol(GenericDeviceTy &Device, StringRef SymbolName) const;
488
489 /// Get additional info for kernel, e.g., register spill counts
490 std::optional<offloading::amdgpu::AMDGPUKernelMetaData>
491 getKernelInfo(StringRef Identifier) const {
492 auto It = KernelInfoMap.find(Identifier);
493
494 if (It == KernelInfoMap.end())
495 return {};
496
497 return It->second;
498 }
499
500private:
501 /// The executable loaded on the agent.
502 hsa_executable_t Executable;
503 StringMap<offloading::amdgpu::AMDGPUKernelMetaData> KernelInfoMap;
504 uint16_t ELFABIVersion;
505};
506
507/// Class implementing the AMDGPU kernel functionalities which derives from the
508/// generic kernel class.
509struct AMDGPUKernelTy : public GenericKernelTy {
510 /// Create an AMDGPU kernel with a name and an execution mode.
511 AMDGPUKernelTy(const char *Name) : GenericKernelTy(Name) {}
512
513 /// Initialize the AMDGPU kernel.
514 Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override {
515 AMDGPUDeviceImageTy &AMDImage = static_cast<AMDGPUDeviceImageTy &>(Image);
516
517 // Kernel symbols have a ".kd" suffix.
518 std::string KernelName(getName());
519 KernelName += ".kd";
520
521 // Find the symbol on the device executable.
522 auto SymbolOrErr = AMDImage.findDeviceSymbol(Device, KernelName);
523 if (!SymbolOrErr)
524 return SymbolOrErr.takeError();
525
526 hsa_executable_symbol_t Symbol = *SymbolOrErr;
527 hsa_symbol_kind_t SymbolType;
528 hsa_status_t Status;
529
530 // Retrieve different properties of the kernel symbol.
531 std::pair<hsa_executable_symbol_info_t, void *> RequiredInfos[] = {
532 {HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &SymbolType},
533 {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &KernelObject},
534 {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &ArgsSize},
535 {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &GroupSize},
536 {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &DynamicStack},
537 {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &PrivateSize}};
538
539 for (auto &Info : RequiredInfos) {
540 Status = hsa_executable_symbol_get_info(Symbol, Info.first, Info.second);
541 if (auto Err = Plugin::check(
542 Status, "error in hsa_executable_symbol_get_info: %s"))
543 return Err;
544 }
545
546 // Make sure it is a kernel symbol.
547 if (SymbolType != HSA_SYMBOL_KIND_KERNEL)
548 return Plugin::error(ErrorCode::INVALID_BINARY,
549 "symbol %s is not a kernel function");
550
551 // TODO: Read the kernel descriptor for the max threads per block. May be
552 // read from the image.
553
554 ImplicitArgsSize =
555 hsa_utils::getImplicitArgsSize(AMDImage.getELFABIVersion());
556 DP("ELFABIVersion: %d\n", AMDImage.getELFABIVersion());
557
558 // Get additional kernel info read from image
559 KernelInfo = AMDImage.getKernelInfo(getName());
560 if (!KernelInfo.has_value())
561 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device.getDeviceId(),
562 "Could not read extra information for kernel %s.", getName());
563
564 return Plugin::success();
565 }
566
567 /// Launch the AMDGPU kernel function.
568 Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
569 uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
570 KernelLaunchParamsTy LaunchParams,
571 AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
572
573 /// Print more elaborate kernel launch info for AMDGPU
574 Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
575 KernelArgsTy &KernelArgs, uint32_t NumThreads[3],
576 uint32_t NumBlocks[3]) const override;
577
578 /// Get group and private segment kernel size.
579 uint32_t getGroupSize() const { return GroupSize; }
580 uint32_t getPrivateSize() const { return PrivateSize; }
581
582 /// Get the HSA kernel object representing the kernel function.
583 uint64_t getKernelObject() const { return KernelObject; }
584
585 /// Get the size of implicitargs based on the code object version.
586 uint32_t getImplicitArgsSize() const { return ImplicitArgsSize; }
587
588 /// Indicates whether or not we need to set up our own private segment size.
589 bool usesDynamicStack() const { return DynamicStack; }
590
591private:
592 /// The kernel object to execute.
593 uint64_t KernelObject;
594
595 /// The args, group and private segments sizes required by a kernel instance.
596 uint32_t ArgsSize;
597 uint32_t GroupSize;
598 uint32_t PrivateSize;
599 bool DynamicStack;
600
601 /// The size of implicit kernel arguments.
602 uint32_t ImplicitArgsSize;
603
604 /// Additional Info for the AMD GPU Kernel
605 std::optional<offloading::amdgpu::AMDGPUKernelMetaData> KernelInfo;
606};
607
608/// Class representing an HSA signal. Signals are used to define dependencies
609/// between asynchronous operations: kernel launches and memory transfers.
610struct AMDGPUSignalTy {
611 /// Create an empty signal.
612 AMDGPUSignalTy() : HSASignal({0}), UseCount() {}
613 AMDGPUSignalTy(AMDGPUDeviceTy &Device) : HSASignal({0}), UseCount() {}
614
615 /// Initialize the signal with an initial value.
616 Error init(uint32_t InitialValue = 1) {
617 hsa_status_t Status =
618 hsa_amd_signal_create(InitialValue, 0, nullptr, 0, &HSASignal);
    return Plugin::check(Status, "error in hsa_amd_signal_create: %s");
620 }
621
622 /// Deinitialize the signal.
623 Error deinit() {
624 hsa_status_t Status = hsa_signal_destroy(HSASignal);
625 return Plugin::check(Status, "error in hsa_signal_destroy: %s");
626 }
627
628 /// Wait until the signal gets a zero value.
629 Error wait(const uint64_t ActiveTimeout = 0,
630 GenericDeviceTy *Device = nullptr) const {
631 if (ActiveTimeout) {
632 hsa_signal_value_t Got = 1;
633 Got = hsa_signal_wait_scacquire(HSASignal, HSA_SIGNAL_CONDITION_EQ, 0,
634 ActiveTimeout, HSA_WAIT_STATE_ACTIVE);
635 if (Got == 0)
636 return Plugin::success();
637 }
638
639 // If there is an RPC device attached to this stream we run it as a server.
640 uint64_t Timeout = UINT64_MAX;
641 auto WaitState = HSA_WAIT_STATE_BLOCKED;
642 while (hsa_signal_wait_scacquire(HSASignal, HSA_SIGNAL_CONDITION_EQ, 0,
643 Timeout, WaitState) != 0)
644 ;
645 return Plugin::success();
646 }
647
648 /// Load the value on the signal.
649 hsa_signal_value_t load() const {
650 return hsa_signal_load_scacquire(HSASignal);
651 }
652
653 /// Signal decrementing by one.
654 void signal() {
655 assert(load() > 0 && "Invalid signal value");
656 hsa_signal_subtract_screlease(HSASignal, 1);
657 }
658
659 /// Reset the signal value before reusing the signal. Do not call this
660 /// function if the signal is being currently used by any watcher, such as a
661 /// plugin thread or the HSA runtime.
662 void reset() { hsa_signal_store_screlease(HSASignal, 1); }
663
664 /// Increase the number of concurrent uses.
665 void increaseUseCount() { UseCount.increase(); }
666
  /// Decrease the number of concurrent uses and return whether it was the
  /// last use.
668 bool decreaseUseCount() { return UseCount.decrease(); }
669
670 hsa_signal_t get() const { return HSASignal; }
671
672private:
673 /// The underlying HSA signal.
674 hsa_signal_t HSASignal;
675
676 /// Reference counter for tracking the concurrent use count. This is mainly
677 /// used for knowing how many streams are using the signal.
678 RefCountTy<> UseCount;
679};
680
681/// Classes for holding AMDGPU signals and managing signals.
682using AMDGPUSignalRef = AMDGPUResourceRef<AMDGPUSignalTy>;
683using AMDGPUSignalManagerTy = GenericDeviceResourceManagerTy<AMDGPUSignalRef>;
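
// Illustrative life cycle of a signal obtained from the manager above (the
// streams below follow this pattern for every asynchronous operation):
//
//   AMDGPUSignalTy *Signal = nullptr;
//   if (auto Err = SignalManager.getResource(Signal))
//     return Err;
//   Signal->reset();            // Set the value back to 1 before reuse.
//   Signal->increaseUseCount(); // One more user (e.g., a stream slot).
//   // ... pass Signal->get() as the completion signal of an HSA operation ...
//   if (auto Err = Signal->wait())
//     return Err;
//   if (Signal->decreaseUseCount())
//     if (auto Err = SignalManager.returnResource(Signal))
//       return Err;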
684
685/// Class holding an HSA queue to submit kernel and barrier packets.
686struct AMDGPUQueueTy {
687 /// Create an empty queue.
688 AMDGPUQueueTy() : Queue(nullptr), Mutex(), NumUsers(0) {}
689
690 /// Lazily initialize a new queue belonging to a specific agent.
691 Error init(GenericDeviceTy &Device, hsa_agent_t Agent, int32_t QueueSize) {
692 if (Queue)
693 return Plugin::success();
694 hsa_status_t Status =
695 hsa_queue_create(Agent, QueueSize, HSA_QUEUE_TYPE_MULTI, callbackError,
696 &Device, UINT32_MAX, UINT32_MAX, &Queue);
697 return Plugin::check(Status, "error in hsa_queue_create: %s");
698 }
699
700 /// Deinitialize the queue and destroy its resources.
701 Error deinit() {
702 std::lock_guard<std::mutex> Lock(Mutex);
703 if (!Queue)
704 return Plugin::success();
705 hsa_status_t Status = hsa_queue_destroy(Queue);
706 return Plugin::check(Status, "error in hsa_queue_destroy: %s");
707 }
708
  /// Returns the number of streams this queue is currently assigned to.
  uint32_t getUserCount() const { return NumUsers; }
711
712 /// Returns if the underlying HSA queue is initialized.
713 bool isInitialized() { return Queue != nullptr; }
714
715 /// Decrement user count of the queue object.
716 void removeUser() { --NumUsers; }
717
718 /// Increase user count of the queue object.
719 void addUser() { ++NumUsers; }
720
721 /// Push a kernel launch to the queue. The kernel launch requires an output
722 /// signal and can define an optional input signal (nullptr if none).
723 Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
724 uint32_t NumThreads[3], uint32_t NumBlocks[3],
725 uint32_t GroupSize, uint64_t StackSize,
726 AMDGPUSignalTy *OutputSignal,
727 AMDGPUSignalTy *InputSignal) {
728 assert(OutputSignal && "Invalid kernel output signal");
729
730 // Lock the queue during the packet publishing process. Notice this blocks
731 // the addition of other packets to the queue. The following piece of code
732 // should be lightweight; do not block the thread, allocate memory, etc.
733 std::lock_guard<std::mutex> Lock(Mutex);
734 assert(Queue && "Interacted with a non-initialized queue!");
735
    // Add a barrier packet before the kernel packet in case there is a pending
    // preceding operation. The barrier packet will delay the processing of
    // subsequent queue packets until the barrier input signal is satisfied. No
    // output signal is needed because the dependency is already guaranteed by
    // the queue barrier itself.
    if (InputSignal && InputSignal->load())
      if (auto Err = pushBarrierImpl(/*OutputSignal=*/nullptr, InputSignal))
743 return Err;
744
745 // Now prepare the kernel packet.
746 uint64_t PacketId;
747 hsa_kernel_dispatch_packet_t *Packet = acquirePacket(PacketId);
748 assert(Packet && "Invalid packet");
749
750 // The first 32 bits of the packet are written after the other fields
751 uint16_t Dims = NumBlocks[2] * NumThreads[2] > 1
752 ? 3
753 : 1 + (NumBlocks[1] * NumThreads[1] != 1);
754 uint16_t Setup = UINT16_C(Dims)
755 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
756 Packet->workgroup_size_x = NumThreads[0];
757 Packet->workgroup_size_y = NumThreads[1];
758 Packet->workgroup_size_z = NumThreads[2];
759 Packet->reserved0 = 0;
760 Packet->grid_size_x = NumBlocks[0] * NumThreads[0];
761 Packet->grid_size_y = NumBlocks[1] * NumThreads[1];
762 Packet->grid_size_z = NumBlocks[2] * NumThreads[2];
763 Packet->private_segment_size =
764 Kernel.usesDynamicStack() ? StackSize : Kernel.getPrivateSize();
765 Packet->group_segment_size = GroupSize;
766 Packet->kernel_object = Kernel.getKernelObject();
767 Packet->kernarg_address = KernelArgs;
768 Packet->reserved2 = 0;
769 Packet->completion_signal = OutputSignal->get();
770
771 // Publish the packet. Do not modify the packet after this point.
772 publishKernelPacket(PacketId, Setup, Packet);
773
774 return Plugin::success();
775 }
776
  /// Push a barrier packet that will wait on up to two input signals. All
  /// signals are optional (nullptr if none).
779 Error pushBarrier(AMDGPUSignalTy *OutputSignal,
780 const AMDGPUSignalTy *InputSignal1,
781 const AMDGPUSignalTy *InputSignal2) {
782 // Lock the queue during the packet publishing process.
783 std::lock_guard<std::mutex> Lock(Mutex);
784 assert(Queue && "Interacted with a non-initialized queue!");
785
786 // Push the barrier with the lock acquired.
787 return pushBarrierImpl(OutputSignal, InputSignal1, InputSignal2);
788 }
789
790private:
  /// Push a barrier packet that will wait on up to two input signals. Assumes
  /// the queue lock is acquired.
793 Error pushBarrierImpl(AMDGPUSignalTy *OutputSignal,
794 const AMDGPUSignalTy *InputSignal1,
795 const AMDGPUSignalTy *InputSignal2 = nullptr) {
796 // Add a queue barrier waiting on both the other stream's operation and the
797 // last operation on the current stream (if any).
798 uint64_t PacketId;
799 hsa_barrier_and_packet_t *Packet =
800 (hsa_barrier_and_packet_t *)acquirePacket(PacketId);
801 assert(Packet && "Invalid packet");
802
803 Packet->reserved0 = 0;
804 Packet->reserved1 = 0;
805 Packet->dep_signal[0] = {0};
806 Packet->dep_signal[1] = {0};
807 Packet->dep_signal[2] = {0};
808 Packet->dep_signal[3] = {0};
809 Packet->dep_signal[4] = {0};
810 Packet->reserved2 = 0;
811 Packet->completion_signal = {0};
812
813 // Set input and output dependencies if needed.
814 if (OutputSignal)
815 Packet->completion_signal = OutputSignal->get();
816 if (InputSignal1)
817 Packet->dep_signal[0] = InputSignal1->get();
818 if (InputSignal2)
819 Packet->dep_signal[1] = InputSignal2->get();
820
821 // Publish the packet. Do not modify the packet after this point.
822 publishBarrierPacket(PacketId, Packet);
823
824 return Plugin::success();
825 }
826
827 /// Acquire a packet from the queue. This call may block the thread if there
828 /// is no space in the underlying HSA queue. It may need to wait until the HSA
829 /// runtime processes some packets. Assumes the queue lock is acquired.
830 hsa_kernel_dispatch_packet_t *acquirePacket(uint64_t &PacketId) {
831 // Increase the queue index with relaxed memory order. Notice this will need
832 // another subsequent atomic operation with acquire order.
833 PacketId = hsa_queue_add_write_index_relaxed(Queue, 1);
834
    // Wait for the packet to be available. Notice the atomic operation uses
836 // the acquire memory order.
837 while (PacketId - hsa_queue_load_read_index_scacquire(Queue) >= Queue->size)
838 ;
839
840 // Return the packet reference.
841 const uint32_t Mask = Queue->size - 1; // The size is a power of 2.
842 return (hsa_kernel_dispatch_packet_t *)Queue->base_address +
843 (PacketId & Mask);
844 }
845
846 /// Publish the kernel packet so that the HSA runtime can start processing
847 /// the kernel launch. Do not modify the packet once this function is called.
848 /// Assumes the queue lock is acquired.
849 void publishKernelPacket(uint64_t PacketId, uint16_t Setup,
850 hsa_kernel_dispatch_packet_t *Packet) {
851 uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet);
852
853 uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
854 Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
855 Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
856
    // Publish the packet. Do not modify the packet after this point.
858 uint32_t HeaderWord = Header | (Setup << 16u);
859 __atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE);
860
861 // Signal the doorbell about the published packet.
862 hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
863 }
864
865 /// Publish the barrier packet so that the HSA runtime can start processing
866 /// the barrier. Next packets in the queue will not be processed until all
867 /// barrier dependencies (signals) are satisfied. Assumes the queue is locked
868 void publishBarrierPacket(uint64_t PacketId,
869 hsa_barrier_and_packet_t *Packet) {
870 uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet);
871 uint16_t Setup = 0;
872 uint16_t Header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
873 Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
874 Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
875
    // Publish the packet. Do not modify the packet after this point.
877 uint32_t HeaderWord = Header | (Setup << 16u);
878 __atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE);
879
880 // Signal the doorbell about the published packet.
881 hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
882 }
883
884 /// Callback that will be called when an error is detected on the HSA queue.
885 static void callbackError(hsa_status_t Status, hsa_queue_t *Source,
886 void *Data);
887
888 /// The HSA queue.
889 hsa_queue_t *Queue;
890
891 /// Mutex to protect the acquiring and publishing of packets. For the moment,
892 /// we need this mutex to prevent publishing packets that are not ready to be
893 /// published in a multi-thread scenario. Without a queue lock, a thread T1
894 /// could acquire packet P and thread T2 acquire packet P+1. Thread T2 could
895 /// publish its packet P+1 (signaling the queue's doorbell) before packet P
896 /// from T1 is ready to be processed. That scenario should be invalid. Thus,
897 /// we use the following mutex to make packet acquiring and publishing atomic.
898 /// TODO: There are other more advanced approaches to avoid this mutex using
899 /// atomic operations. We can further investigate it if this is a bottleneck.
900 std::mutex Mutex;
901
  /// The number of streams this queue is currently assigned to. A queue is
  /// considered idle when this is zero and busy otherwise.
904 uint32_t NumUsers;
905};
906
907/// Struct that implements a stream of asynchronous operations for AMDGPU
908/// devices. This class relies on signals to implement streams and define the
909/// dependencies between asynchronous operations.
910struct AMDGPUStreamTy {
911private:
912 /// Utility struct holding arguments for async H2H memory copies.
913 struct MemcpyArgsTy {
914 void *Dst;
915 const void *Src;
916 size_t Size;
917 };
918
919 /// Utility struct holding arguments for freeing buffers to memory managers.
920 struct ReleaseBufferArgsTy {
921 void *Buffer;
922 AMDGPUMemoryManagerTy *MemoryManager;
923 };
924
925 /// Utility struct holding arguments for releasing signals to signal managers.
926 struct ReleaseSignalArgsTy {
927 AMDGPUSignalTy *Signal;
928 AMDGPUSignalManagerTy *SignalManager;
929 };
930
931 using AMDGPUStreamCallbackTy = Error(void *Data);
932
  /// The stream is composed of N stream slots. The struct below represents
934 /// the fields of each slot. Each slot has a signal and an optional action
935 /// function. When appending an HSA asynchronous operation to the stream, one
936 /// slot is consumed and used to store the operation's information. The
937 /// operation's output signal is set to the consumed slot's signal. If there
938 /// is a previous asynchronous operation on the previous slot, the HSA async
939 /// operation's input signal is set to the signal of the previous slot. This
940 /// way, we obtain a chain of dependent async operations. The action is a
941 /// function that will be executed eventually after the operation is
942 /// completed, e.g., for releasing a buffer.
943 struct StreamSlotTy {
944 /// The output signal of the stream operation. May be used by the subsequent
945 /// operation as input signal.
946 AMDGPUSignalTy *Signal;
947
    /// The actions that must be performed after the operation's completion.
    /// Empty when there are no actions to perform.
950 llvm::SmallVector<AMDGPUStreamCallbackTy *> Callbacks;
951
952 /// Space for the action's arguments. A pointer to these arguments is passed
953 /// to the action function. Notice the space of arguments is limited.
954 union ActionArgsTy {
955 MemcpyArgsTy MemcpyArgs;
956 ReleaseBufferArgsTy ReleaseBufferArgs;
957 ReleaseSignalArgsTy ReleaseSignalArgs;
958 void *CallbackArgs;
959 };
960
961 llvm::SmallVector<ActionArgsTy> ActionArgs;
962
963 /// Create an empty slot.
964 StreamSlotTy() : Signal(nullptr), Callbacks({}), ActionArgs({}) {}
965
966 /// Schedule a host memory copy action on the slot.
967 Error schedHostMemoryCopy(void *Dst, const void *Src, size_t Size) {
968 Callbacks.emplace_back(memcpyAction);
969 ActionArgs.emplace_back().MemcpyArgs = MemcpyArgsTy{Dst, Src, Size};
970 return Plugin::success();
971 }
972
973 /// Schedule a release buffer action on the slot.
974 Error schedReleaseBuffer(void *Buffer, AMDGPUMemoryManagerTy &Manager) {
975 Callbacks.emplace_back(releaseBufferAction);
976 ActionArgs.emplace_back().ReleaseBufferArgs =
977 ReleaseBufferArgsTy{Buffer, &Manager};
978 return Plugin::success();
979 }
980
981 /// Schedule a signal release action on the slot.
982 Error schedReleaseSignal(AMDGPUSignalTy *SignalToRelease,
983 AMDGPUSignalManagerTy *SignalManager) {
984 Callbacks.emplace_back(releaseSignalAction);
985 ActionArgs.emplace_back().ReleaseSignalArgs =
986 ReleaseSignalArgsTy{SignalToRelease, SignalManager};
987 return Plugin::success();
988 }
989
    /// Register a callback to be called on completion.
991 Error schedCallback(AMDGPUStreamCallbackTy *Func, void *Data) {
992 Callbacks.emplace_back(Func);
993 ActionArgs.emplace_back().CallbackArgs = Data;
994
995 return Plugin::success();
996 }
997
998 // Perform the action if needed.
999 Error performAction() {
1000 if (Callbacks.empty())
1001 return Plugin::success();
1002
1003 assert(Callbacks.size() == ActionArgs.size() && "Size mismatch");
1004 for (auto [Callback, ActionArg] : llvm::zip(Callbacks, ActionArgs)) {
1005 // Perform the action.
1006 if (Callback == memcpyAction) {
1007 if (auto Err = memcpyAction(&ActionArg))
1008 return Err;
1009 } else if (Callback == releaseBufferAction) {
1010 if (auto Err = releaseBufferAction(&ActionArg))
1011 return Err;
1012 } else if (Callback == releaseSignalAction) {
1013 if (auto Err = releaseSignalAction(&ActionArg))
1014 return Err;
1015 } else if (Callback) {
1016 if (auto Err = Callback(ActionArg.CallbackArgs))
1017 return Err;
1018 }
1019 }
1020
1021 // Invalidate the action.
1022 Callbacks.clear();
1023 ActionArgs.clear();
1024
1025 return Plugin::success();
1026 }
1027 };
1028
1029 /// The device agent where the stream was created.
1030 hsa_agent_t Agent;
1031
1032 /// The queue that the stream uses to launch kernels.
1033 AMDGPUQueueTy *Queue;
1034
1035 /// The manager of signals to reuse signals.
1036 AMDGPUSignalManagerTy &SignalManager;
1037
1038 /// A reference to the associated device.
1039 GenericDeviceTy &Device;
1040
1041 /// Array of stream slots. Use std::deque because it can dynamically grow
1042 /// without invalidating the already inserted elements. For instance, the
1043 /// std::vector may invalidate the elements by reallocating the internal
1044 /// array if there is not enough space on new insertions.
1045 std::deque<StreamSlotTy> Slots;
1046
1047 /// The next available slot on the queue. This is reset to zero each time the
1048 /// stream is synchronized. It also indicates the current number of consumed
1049 /// slots at a given time.
1050 uint32_t NextSlot;
1051
1052 /// The synchronization id. This number is increased each time the stream is
1053 /// synchronized. It is useful to detect if an AMDGPUEventTy points to an
  /// operation that was already finalized in a previous stream synchronization.
1055 uint32_t SyncCycle;
1056
1057 /// Mutex to protect stream's management.
1058 mutable std::mutex Mutex;
1059
1060 /// Timeout hint for HSA actively waiting for signal value to change
1061 const uint64_t StreamBusyWaitMicroseconds;
1062
  /// Indicates whether to spread data transfers across all available SDMA
  /// engines.
1064 bool UseMultipleSdmaEngines;
1065
1066 /// Return the current number of asynchronous operations on the stream.
1067 uint32_t size() const { return NextSlot; }
1068
1069 /// Return the last valid slot on the stream.
1070 uint32_t last() const { return size() - 1; }
1071
1072 /// Consume one slot from the stream. Since the stream uses signals on demand
1073 /// and releases them once the slot is no longer used, the function requires
1074 /// an idle signal for the new consumed slot.
1075 std::pair<uint32_t, AMDGPUSignalTy *> consume(AMDGPUSignalTy *OutputSignal) {
1076 // Double the stream size if needed. Since we use std::deque, this operation
1077 // does not invalidate the already added slots.
1078 if (Slots.size() == NextSlot)
      Slots.resize(Slots.size() * 2);
1080
1081 // Update the next available slot and the stream size.
1082 uint32_t Curr = NextSlot++;
1083
1084 // Retrieve the input signal, if any, of the current operation.
1085 AMDGPUSignalTy *InputSignal = (Curr > 0) ? Slots[Curr - 1].Signal : nullptr;
1086
1087 // Set the output signal of the current slot.
1088 Slots[Curr].Signal = OutputSignal;
1089
    return std::make_pair(Curr, InputSignal);
1091 }
1092
1093 /// Complete all pending post actions and reset the stream after synchronizing
1094 /// or positively querying the stream.
1095 Error complete() {
1096 for (uint32_t Slot = 0; Slot < NextSlot; ++Slot) {
1097 // Take the post action of the operation if any.
1098 if (auto Err = Slots[Slot].performAction())
1099 return Err;
1100
1101 // Release the slot's signal if possible. Otherwise, another user will.
1102 if (Slots[Slot].Signal->decreaseUseCount())
1103 if (auto Err = SignalManager.returnResource(Slots[Slot].Signal))
1104 return Err;
1105
1106 Slots[Slot].Signal = nullptr;
1107 }
1108
1109 // Reset the stream slots to zero.
1110 NextSlot = 0;
1111
1112 // Increase the synchronization id since the stream completed a sync cycle.
1113 SyncCycle += 1;
1114
1115 return Plugin::success();
1116 }
1117
1118 /// Make the current stream wait on a specific operation of another stream.
  /// The idea is to make the current stream wait on two signals: 1) the last
1120 /// signal of the current stream, and 2) the last signal of the other stream.
1121 /// Use a barrier packet with two input signals.
1122 Error waitOnStreamOperation(AMDGPUStreamTy &OtherStream, uint32_t Slot) {
1123 if (Queue == nullptr)
1124 return Plugin::error(ErrorCode::INVALID_NULL_POINTER,
1125 "target queue was nullptr");
1126
1127 /// The signal that we must wait from the other stream.
1128 AMDGPUSignalTy *OtherSignal = OtherStream.Slots[Slot].Signal;
1129
1130 // Prevent the release of the other stream's signal.
1131 OtherSignal->increaseUseCount();
1132
1133 // Retrieve an available signal for the operation's output.
1134 AMDGPUSignalTy *OutputSignal = nullptr;
1135 if (auto Err = SignalManager.getResource(OutputSignal))
1136 return Err;
1137 OutputSignal->reset();
1138 OutputSignal->increaseUseCount();
1139
1140 // Consume stream slot and compute dependencies.
1141 auto [Curr, InputSignal] = consume(OutputSignal);
1142
1143 // Setup the post action to release the signal.
1144 if (auto Err = Slots[Curr].schedReleaseSignal(OtherSignal, &SignalManager))
1145 return Err;
1146
1147 // Push a barrier into the queue with both input signals.
    return Queue->pushBarrier(OutputSignal, InputSignal, OtherSignal);
1149 }
1150
1151 /// Callback for running a specific asynchronous operation. This callback is
1152 /// used for hsa_amd_signal_async_handler. The argument is the operation that
1153 /// should be executed. Notice we use the post action mechanism to codify the
1154 /// asynchronous operation.
1155 static bool asyncActionCallback(hsa_signal_value_t Value, void *Args) {
1156 StreamSlotTy *Slot = reinterpret_cast<StreamSlotTy *>(Args);
1157 assert(Slot && "Invalid slot");
1158 assert(Slot->Signal && "Invalid signal");
1159
1160 // This thread is outside the stream mutex. Make sure the thread sees the
1161 // changes on the slot.
    std::atomic_thread_fence(std::memory_order_acquire);
1163
1164 // Perform the operation.
1165 if (auto Err = Slot->performAction())
      FATAL_MESSAGE(1, "Error performing post action: %s",
                    toString(std::move(Err)).data());
1168
1169 // Signal the output signal to notify the asynchronous operation finalized.
1170 Slot->Signal->signal();
1171
1172 // Unregister callback.
1173 return false;
1174 }
1175
1176 // Callback for host-to-host memory copies. This is an asynchronous action.
1177 static Error memcpyAction(void *Data) {
1178 MemcpyArgsTy *Args = reinterpret_cast<MemcpyArgsTy *>(Data);
1179 assert(Args && "Invalid arguments");
1180 assert(Args->Dst && "Invalid destination buffer");
1181 assert(Args->Src && "Invalid source buffer");
1182
    std::memcpy(Args->Dst, Args->Src, Args->Size);
1184
1185 return Plugin::success();
1186 }
1187
1188 /// Releasing a memory buffer to a memory manager. This is a post completion
  /// action. There are three kinds of memory buffers:
  /// 1. For kernel arguments. This buffer can be freed after receiving the
  ///    kernel completion signal.
  /// 2. For H2D transfers that need pinned memory space for staging. This
  ///    buffer can be freed after receiving the transfer completion signal.
  /// 3. For D2H transfers that need pinned memory space for staging. This
  ///    buffer cannot be freed after receiving the transfer completion signal
  ///    because of the following asynchronous H2H callback. For this reason,
  ///    this action can only be taken at AMDGPUStreamTy::complete().
  /// Because of case 3, all releaseBufferActions are taken at
  /// AMDGPUStreamTy::complete() in the current implementation.
1201 static Error releaseBufferAction(void *Data) {
1202 ReleaseBufferArgsTy *Args = reinterpret_cast<ReleaseBufferArgsTy *>(Data);
1203 assert(Args && "Invalid arguments");
1204 assert(Args->MemoryManager && "Invalid memory manager");
1205
1206 // Release the allocation to the memory manager.
    return Args->MemoryManager->deallocate(Args->Buffer);
1208 }
1209
1210 /// Releasing a signal object back to SignalManager. This is a post completion
1211 /// action. This action can only be taken at AMDGPUStreamTy::complete()
1212 static Error releaseSignalAction(void *Data) {
1213 ReleaseSignalArgsTy *Args = reinterpret_cast<ReleaseSignalArgsTy *>(Data);
1214 assert(Args && "Invalid arguments");
1215 assert(Args->Signal && "Invalid signal");
1216 assert(Args->SignalManager && "Invalid signal manager");
1217
1218 // Release the signal if needed.
1219 if (Args->Signal->decreaseUseCount())
1220 if (auto Err = Args->SignalManager->returnResource(Args->Signal))
1221 return Err;
1222
1223 return Plugin::success();
1224 }
1225
1226public:
1227 /// Create an empty stream associated with a specific device.
1228 AMDGPUStreamTy(AMDGPUDeviceTy &Device);
1229
1230 /// Initialize the stream's signals.
1231 Error init() { return Plugin::success(); }
1232
1233 /// Deinitialize the stream's signals.
1234 Error deinit() { return Plugin::success(); }
1235
  /// Push an asynchronous kernel to the stream. The kernel arguments must be
  /// placed in a special allocation for kernel args and must be kept alive
  /// until the kernel finalizes. Once the kernel is finished, the stream will
  /// release the kernel args buffer to the specified memory manager.
1240 Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
1241 uint32_t NumThreads[3], uint32_t NumBlocks[3],
1242 uint32_t GroupSize, uint64_t StackSize,
1243 AMDGPUMemoryManagerTy &MemoryManager) {
1244 if (Queue == nullptr)
1245 return Plugin::error(ErrorCode::INVALID_NULL_POINTER,
1246 "target queue was nullptr");
1247
1248 // Retrieve an available signal for the operation's output.
1249 AMDGPUSignalTy *OutputSignal = nullptr;
1250 if (auto Err = SignalManager.getResource(OutputSignal))
1251 return Err;
1252 OutputSignal->reset();
1253 OutputSignal->increaseUseCount();
1254
1255 std::lock_guard<std::mutex> StreamLock(Mutex);
1256
1257 // Consume stream slot and compute dependencies.
1258 auto [Curr, InputSignal] = consume(OutputSignal);
1259
1260 // Setup the post action to release the kernel args buffer.
    if (auto Err = Slots[Curr].schedReleaseBuffer(KernelArgs, MemoryManager))
1262 return Err;
1263
1264 // If we are running an RPC server we want to wake up the server thread
1265 // whenever there is a kernel running and let it sleep otherwise.
1266 if (Device.getRPCServer())
1267 Device.Plugin.getRPCServer().Thread->notify();
1268
1269 // Push the kernel with the output signal and an input signal (optional)
1270 if (auto Err = Queue->pushKernelLaunch(Kernel, KernelArgs, NumThreads,
1271 NumBlocks, GroupSize, StackSize,
1272 OutputSignal, InputSignal))
1273 return Err;
1274
1275 // Register a callback to indicate when the kernel is complete.
1276 if (Device.getRPCServer()) {
1277 if (auto Err = Slots[Curr].schedCallback(
1278 [](void *Data) -> llvm::Error {
1279 GenericPluginTy &Plugin =
1280 *reinterpret_cast<GenericPluginTy *>(Data);
1281 Plugin.getRPCServer().Thread->finish();
1282 return Error::success();
1283 },
1284 &Device.Plugin))
1285 return Err;
1286 }
1287 return Plugin::success();
1288 }
1289
1290 /// Push an asynchronous memory copy between pinned memory buffers.
1291 Error pushPinnedMemoryCopyAsync(void *Dst, const void *Src,
1292 uint64_t CopySize) {
1293 // Retrieve an available signal for the operation's output.
1294 AMDGPUSignalTy *OutputSignal = nullptr;
1295 if (auto Err = SignalManager.getResource(OutputSignal))
1296 return Err;
1297 OutputSignal->reset();
1298 OutputSignal->increaseUseCount();
1299
1300 std::lock_guard<std::mutex> Lock(Mutex);
1301
1302 // Consume stream slot and compute dependencies.
1303 auto [Curr, InputSignal] = consume(OutputSignal);
1304
1305 // Issue the async memory copy.
1306 if (InputSignal && InputSignal->load()) {
1307 hsa_signal_t InputSignalRaw = InputSignal->get();
1308 return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src,
1309 Agent, CopySize, 1, &InputSignalRaw,
1310 OutputSignal->get());
1311 }
1312
1313 return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src,
1314 Agent, CopySize, 0, nullptr,
1315 OutputSignal->get());
1316 }
1317
1318 /// Push an asynchronous memory copy device-to-host involving an unpinned
1319 /// memory buffer. The operation consists of a two-step copy from the
  /// device buffer to an intermediate pinned host buffer, and then, to an
1321 /// unpinned host buffer. Both operations are asynchronous and dependent.
1322 /// The intermediate pinned buffer will be released to the specified memory
1323 /// manager once the operation completes.
1324 Error pushMemoryCopyD2HAsync(void *Dst, const void *Src, void *Inter,
1325 uint64_t CopySize,
1326 AMDGPUMemoryManagerTy &MemoryManager) {
1327 // Retrieve available signals for the operation's outputs.
1328 AMDGPUSignalTy *OutputSignals[2] = {};
1329 if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals))
1330 return Err;
1331 for (auto *Signal : OutputSignals) {
1332 Signal->reset();
1333 Signal->increaseUseCount();
1334 }
1335
1336 std::lock_guard<std::mutex> Lock(Mutex);
1337
1338 // Consume stream slot and compute dependencies.
    auto [Curr, InputSignal] = consume(OutputSignals[0]);
1340
1341 // Setup the post action for releasing the intermediate buffer.
    if (auto Err = Slots[Curr].schedReleaseBuffer(Inter, MemoryManager))
1343 return Err;
1344
1345 // Issue the first step: device to host transfer. Avoid defining the input
1346 // dependency if already satisfied.
1347 if (InputSignal && InputSignal->load()) {
1348 hsa_signal_t InputSignalRaw = InputSignal->get();
1349 if (auto Err = hsa_utils::asyncMemCopy(
1350 UseMultipleSdmaEngines, Inter, Agent, Src, Agent, CopySize, 1,
1351 &InputSignalRaw, OutputSignals[0]->get()))
1352 return Err;
1353 } else {
1354 if (auto Err = hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Inter,
1355 Agent, Src, Agent, CopySize, 0,
1356 nullptr, OutputSignals[0]->get()))
1357 return Err;
1358 }
1359
1360 // Consume another stream slot and compute dependencies.
    std::tie(Curr, InputSignal) = consume(OutputSignals[1]);
1362 assert(InputSignal && "Invalid input signal");
1363
1364 // The std::memcpy is done asynchronously using an async handler. We store
    // the function's information in the action, but it is not actually a post
    // action.
    if (auto Err = Slots[Curr].schedHostMemoryCopy(Dst, Inter, CopySize))
1367 return Err;
1368
1369 // Make changes on this slot visible to the async handler's thread.
    std::atomic_thread_fence(std::memory_order_release);
1371
1372 // Issue the second step: host to host transfer.
1373 hsa_status_t Status = hsa_amd_signal_async_handler(
1374 InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback,
1375 (void *)&Slots[Curr]);
1376
1377 return Plugin::check(Status, "error in hsa_amd_signal_async_handler: %s");
1378 }
1379
1380 /// Push an asynchronous memory copy host-to-device involving an unpinned
1381 /// memory buffer. The operation consists of a two-step copy from the
  /// unpinned host buffer to an intermediate pinned host buffer, and then, to
  /// the device buffer. Both operations are asynchronous and dependent.
1384 /// The intermediate pinned buffer will be released to the specified memory
1385 /// manager once the operation completes.
1386 Error pushMemoryCopyH2DAsync(void *Dst, const void *Src, void *Inter,
1387 uint64_t CopySize,
1388 AMDGPUMemoryManagerTy &MemoryManager) {
1389 // Retrieve available signals for the operation's outputs.
1390 AMDGPUSignalTy *OutputSignals[2] = {};
1391 if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals))
1392 return Err;
1393 for (auto *Signal : OutputSignals) {
1394 Signal->reset();
1395 Signal->increaseUseCount();
1396 }
1397
1398 AMDGPUSignalTy *OutputSignal = OutputSignals[0];
1399
1400 std::lock_guard<std::mutex> Lock(Mutex);
1401
1402 // Consume stream slot and compute dependencies.
1403 auto [Curr, InputSignal] = consume(OutputSignal);
1404
1405 // Issue the first step: host to host transfer.
1406 if (InputSignal && InputSignal->load()) {
1407 // The std::memcpy is done asynchronously using an async handler. We store
1408 // the function's information in the action but it is not actually a
1409 // post action.
      if (auto Err = Slots[Curr].schedHostMemoryCopy(Inter, Src, CopySize))
1411 return Err;
1412
1413 // Make changes on this slot visible to the async handler's thread.
      std::atomic_thread_fence(std::memory_order_release);
1415
1416 hsa_status_t Status = hsa_amd_signal_async_handler(
1417 InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback,
1418 (void *)&Slots[Curr]);
1419
1420 if (auto Err = Plugin::check(Status,
1421 "error in hsa_amd_signal_async_handler: %s"))
1422 return Err;
1423
1424 // Let's use now the second output signal.
1425 OutputSignal = OutputSignals[1];
1426
1427 // Consume another stream slot and compute dependencies.
      std::tie(Curr, InputSignal) = consume(OutputSignal);
1429 } else {
1430 // All preceding operations completed, copy the memory synchronously.
      std::memcpy(Inter, Src, CopySize);
1432
1433 // Return the second signal because it will not be used.
1434 OutputSignals[1]->decreaseUseCount();
1435 if (auto Err = SignalManager.returnResource(OutputSignals[1]))
1436 return Err;
1437 }
1438
1439 // Setup the post action to release the intermediate pinned buffer.
1440 if (auto Err = Slots[Curr].schedReleaseBuffer(Buffer: Inter, Manager&: MemoryManager))
1441 return Err;
1442
1443 // Issue the second step: host to device transfer. Avoid defining the input
1444 // dependency if already satisfied.
1445 if (InputSignal && InputSignal->load()) {
1446 hsa_signal_t InputSignalRaw = InputSignal->get();
1447 return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter,
1448 Agent, CopySize, 1, &InputSignalRaw,
1449 OutputSignal->get());
1450 }
1451 return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter,
1452 Agent, CopySize, 0, nullptr,
1453 OutputSignal->get());
1454 }
1455
1456 // AMDGPUDeviceTy is incomplete here, passing the underlying agent instead
1457 Error pushMemoryCopyD2DAsync(void *Dst, hsa_agent_t DstAgent, const void *Src,
1458 hsa_agent_t SrcAgent, uint64_t CopySize) {
1459 AMDGPUSignalTy *OutputSignal;
1460 if (auto Err = SignalManager.getResources(/*Num=*/1, &OutputSignal))
1461 return Err;
1462 OutputSignal->reset();
1463 OutputSignal->increaseUseCount();
1464
1465 std::lock_guard<std::mutex> Lock(Mutex);
1466
1467 // Consume stream slot and compute dependencies.
1468 auto [Curr, InputSignal] = consume(OutputSignal);
1469
1470 // The agents need to have access to the corresponding memory
1471 // This is presently only true if the pointers were originally
1472 // allocated by this runtime or the caller made the appropriate
1473 // access calls.
1474
1475 if (InputSignal && InputSignal->load()) {
1476 hsa_signal_t InputSignalRaw = InputSignal->get();
1477 return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src,
1478 SrcAgent, CopySize, 1, &InputSignalRaw,
1479 OutputSignal->get());
1480 }
1481 return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src,
1482 SrcAgent, CopySize, 0, nullptr,
1483 OutputSignal->get());
1484 }
1485
1486 /// Synchronize with the stream. The current thread waits until all operations
1487 /// are finalized and it performs the pending post actions (i.e., releasing
1488 /// intermediate buffers).
1489 Error synchronize() {
1490 std::lock_guard<std::mutex> Lock(Mutex);
1491
1492 // No need to synchronize anything.
1493 if (size() == 0)
1494 return Plugin::success();
1495
1496 // Wait until all previous operations on the stream have completed.
1497 if (auto Err =
1498 Slots[last()].Signal->wait(StreamBusyWaitMicroseconds, &Device))
1499 return Err;
1500
1501 // Reset the stream and perform all pending post actions.
1502 return complete();
1503 }
1504
1505 /// Query the stream and complete pending post actions if operations finished.
1506 /// Return whether all the operations completed. This operation does not block
1507 /// the calling thread.
1508 Expected<bool> query() {
1509 std::lock_guard<std::mutex> Lock(Mutex);
1510
1511 // No need to query anything.
1512 if (size() == 0)
1513 return true;
1514
1515 // The last operation did not complete yet. Return directly.
1516 if (Slots[last()].Signal->load())
1517 return false;
1518
1519 // Reset the stream and perform all pending post actions.
1520 if (auto Err = complete())
1521 return std::move(Err);
1522
1523 return true;
1524 }
1525
1526 const AMDGPUQueueTy *getQueue() const { return Queue; }
1527
1528 /// Record the state of the stream on an event.
1529 Error recordEvent(AMDGPUEventTy &Event) const;
1530
1531 /// Make the stream wait on an event.
1532 Error waitEvent(const AMDGPUEventTy &Event);
1533
1534 friend struct AMDGPUStreamManagerTy;
1535};
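
// A minimal usage sketch of the stream abstraction above (illustrative only;
// `Dev`, `Wrapper`, `Dst`, `Src`, `Pinned`, `Size` and `PinnedMM` are
// hypothetical names, not part of this file). The device hands out a stream,
// operations are pushed onto it, and the stream is then either queried
// (non-blocking) or synchronized (blocking):
//
//   AMDGPUStreamTy *Stream = nullptr;
//   if (auto Err = Dev.getStream(Wrapper, Stream))
//     return Err;
//   if (auto Err = Stream->pushMemoryCopyH2DAsync(Dst, Src, Pinned, Size,
//                                                 PinnedMM))
//     return Err;
//   Expected<bool> DoneOrErr = Stream->query(); // Poll without blocking.
//   if (auto Err = Stream->synchronize())       // Or block until completion.
//     return Err;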
1536
/// Class representing an event on AMDGPU. The event stores information
/// regarding the state of the recorded stream.
1539struct AMDGPUEventTy {
1540 /// Create an empty event.
1541 AMDGPUEventTy(AMDGPUDeviceTy &Device)
1542 : RecordedStream(nullptr), RecordedSlot(-1), RecordedSyncCycle(-1) {}
1543
1544 /// Initialize and deinitialize.
1545 Error init() { return Plugin::success(); }
1546 Error deinit() { return Plugin::success(); }
1547
1548 /// Record the state of a stream on the event.
1549 Error record(AMDGPUStreamTy &Stream) {
1550 std::lock_guard<std::mutex> Lock(Mutex);
1551
    // Overwrite the previously recorded stream, if any.
1553 RecordedStream = &Stream;
1554
    return Stream.recordEvent(*this);
1556 }
1557
1558 /// Make a stream wait on the current event.
1559 Error wait(AMDGPUStreamTy &Stream) {
1560 std::lock_guard<std::mutex> Lock(Mutex);
1561
1562 if (!RecordedStream)
1563 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
1564 "event does not have any recorded stream");
1565
1566 // Synchronizing the same stream. Do nothing.
1567 if (RecordedStream == &Stream)
1568 return Plugin::success();
1569
    // No need to wait for anything; the recorded stream already finished the
    // corresponding operation.
1572 if (RecordedSlot < 0)
1573 return Plugin::success();
1574
    return Stream.waitEvent(*this);
1576 }
1577
1578protected:
1579 /// The stream registered in this event.
1580 AMDGPUStreamTy *RecordedStream;
1581
  /// The recorded operation (slot) on the recorded stream.
1583 int64_t RecordedSlot;
1584
1585 /// The sync cycle when the stream was recorded. Used to detect stale events.
1586 int64_t RecordedSyncCycle;
1587
1588 /// Mutex to safely access event fields.
1589 mutable std::mutex Mutex;
1590
1591 friend struct AMDGPUStreamTy;
1592};
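
// A minimal sketch of the record/wait protocol (illustrative only; `StreamA`
// and `StreamB` are hypothetical streams). An event snapshots the last
// operation of one stream, and another stream then waits only on that
// operation instead of on the whole stream:
//
//   AMDGPUEventTy Event(Device);
//   if (auto Err = Event.record(StreamA)) // Remember StreamA's last slot.
//     return Err;
//   if (auto Err = Event.wait(StreamB))   // StreamB depends on that slot.
//     return Err;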
1593
1594Error AMDGPUStreamTy::recordEvent(AMDGPUEventTy &Event) const {
1595 std::lock_guard<std::mutex> Lock(Mutex);
1596
1597 if (size() > 0) {
1598 // Record the synchronize identifier (to detect stale recordings) and
1599 // the last valid stream's operation.
1600 Event.RecordedSyncCycle = SyncCycle;
1601 Event.RecordedSlot = last();
1602
1603 assert(Event.RecordedSyncCycle >= 0 && "Invalid recorded sync cycle");
1604 assert(Event.RecordedSlot >= 0 && "Invalid recorded slot");
1605 } else {
1606 // The stream is empty, everything already completed, record nothing.
1607 Event.RecordedSyncCycle = -1;
1608 Event.RecordedSlot = -1;
1609 }
1610 return Plugin::success();
1611}
1612
1613Error AMDGPUStreamTy::waitEvent(const AMDGPUEventTy &Event) {
1614 // Retrieve the recorded stream on the event.
1615 AMDGPUStreamTy &RecordedStream = *Event.RecordedStream;
1616
1617 std::scoped_lock<std::mutex, std::mutex> Lock(Mutex, RecordedStream.Mutex);
1618
1619 // The recorded stream already completed the operation because the synchronize
1620 // identifier is already outdated.
1621 if (RecordedStream.SyncCycle != (uint32_t)Event.RecordedSyncCycle)
1622 return Plugin::success();
1623
1624 // Again, the recorded stream already completed the operation, the last
1625 // operation's output signal is satisfied.
1626 if (!RecordedStream.Slots[Event.RecordedSlot].Signal->load())
1627 return Plugin::success();
1628
1629 // Otherwise, make the current stream wait on the other stream's operation.
  return waitOnStreamOperation(RecordedStream, Event.RecordedSlot);
1631}
1632
1633struct AMDGPUStreamManagerTy final
1634 : GenericDeviceResourceManagerTy<AMDGPUResourceRef<AMDGPUStreamTy>> {
1635 using ResourceRef = AMDGPUResourceRef<AMDGPUStreamTy>;
1636 using ResourcePoolTy = GenericDeviceResourceManagerTy<ResourceRef>;
1637
1638 AMDGPUStreamManagerTy(GenericDeviceTy &Device, hsa_agent_t HSAAgent)
1639 : GenericDeviceResourceManagerTy(Device), Device(Device),
1640 OMPX_QueueTracking("LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING", true),
1641 NextQueue(0), Agent(HSAAgent) {}
1642
1643 Error init(uint32_t InitialSize, int NumHSAQueues, int HSAQueueSize) {
1644 Queues = std::vector<AMDGPUQueueTy>(NumHSAQueues);
1645 QueueSize = HSAQueueSize;
1646 MaxNumQueues = NumHSAQueues;
1647 // Initialize one queue eagerly
1648 if (auto Err = Queues.front().init(Device, Agent, QueueSize))
1649 return Err;
1650
1651 return GenericDeviceResourceManagerTy::init(InitialSize);
1652 }
1653
1654 /// Deinitialize the resource pool and delete all resources. This function
1655 /// must be called before the destructor.
1656 Error deinit() override {
1657 // De-init all queues
1658 for (AMDGPUQueueTy &Queue : Queues) {
1659 if (auto Err = Queue.deinit())
1660 return Err;
1661 }
1662
1663 return GenericDeviceResourceManagerTy::deinit();
1664 }
1665
1666 /// Get a single stream from the pool or create new resources.
1667 virtual Error getResource(AMDGPUStreamTy *&StreamHandle) override {
1668 return getResourcesImpl(1, &StreamHandle, [this](AMDGPUStreamTy *&Handle) {
      return assignNextQueue(Handle);
1670 });
1671 }
1672
1673 /// Return stream to the pool.
1674 virtual Error returnResource(AMDGPUStreamTy *StreamHandle) override {
1675 return returnResourceImpl(StreamHandle, [](AMDGPUStreamTy *Handle) {
1676 Handle->Queue->removeUser();
1677 return Plugin::success();
1678 });
1679 }
1680
1681private:
  /// Search for and assign a preferably idle queue to the given stream. If
  /// there is no queue without current users, choose the queue with the lowest
  /// user count. If utilization tracking is disabled, use round-robin
  /// selection.
1685 inline Error assignNextQueue(AMDGPUStreamTy *Stream) {
    // Start from zero when tracking utilization; otherwise, use a round-robin
    // policy.
1687 uint32_t Index = OMPX_QueueTracking ? 0 : NextQueue++ % MaxNumQueues;
1688
1689 if (OMPX_QueueTracking) {
1690 // Find the least used queue.
1691 for (uint32_t I = 0; I < MaxNumQueues; ++I) {
1692 // Early exit when an initialized queue is idle.
1693 if (Queues[I].isInitialized() && Queues[I].getUserCount() == 0) {
1694 Index = I;
1695 break;
1696 }
1697
1698 // Update the least used queue.
1699 if (Queues[Index].getUserCount() > Queues[I].getUserCount())
1700 Index = I;
1701 }
1702 }
1703
1704 // Make sure the queue is initialized, then add user & assign.
1705 if (auto Err = Queues[Index].init(Device, Agent, QueueSize))
1706 return Err;
1707 Queues[Index].addUser();
1708 Stream->Queue = &Queues[Index];
1709
1710 return Plugin::success();
1711 }
1712
  /// The device associated with this stream manager.
  GenericDeviceTy &Device;
1715
1716 /// Envar for controlling the tracking of busy HSA queues.
1717 BoolEnvar OMPX_QueueTracking;
1718
1719 /// The next queue index to use for round robin selection.
1720 uint32_t NextQueue;
1721
1722 /// The queues which are assigned to requested streams.
1723 std::vector<AMDGPUQueueTy> Queues;
1724
1725 /// The corresponding device as HSA agent.
1726 hsa_agent_t Agent;
1727
1728 /// The maximum number of queues.
1729 uint32_t MaxNumQueues;
1730
1731 /// The size of created queues.
1732 uint32_t QueueSize;
1733};
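
// Queue selection can be influenced from the environment. With
// LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING=1 (the default) the manager
// prefers idle or least-used queues; setting it to 0 falls back to plain
// round-robin over the available queues. For example (the application name is
// hypothetical):
//
//   LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING=0 \
//   LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES=8 ./offload-app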
1734
1735/// Abstract class that holds the common members of the actual kernel devices
1736/// and the host device. Both types should inherit from this class.
1737struct AMDGenericDeviceTy {
1738 AMDGenericDeviceTy() {}
1739
1740 virtual ~AMDGenericDeviceTy() {}
1741
1742 /// Create all memory pools which the device has access to and classify them.
1743 Error initMemoryPools() {
1744 // Retrieve all memory pools from the device agent(s).
1745 Error Err = retrieveAllMemoryPools();
1746 if (Err)
1747 return Err;
1748
1749 for (AMDGPUMemoryPoolTy *MemoryPool : AllMemoryPools) {
1750 // Initialize the memory pool and retrieve some basic info.
1751 Error Err = MemoryPool->init();
1752 if (Err)
1753 return Err;
1754
1755 if (!MemoryPool->isGlobal())
1756 continue;
1757
1758 // Classify the memory pools depending on their properties.
1759 if (MemoryPool->isFineGrained()) {
1760 FineGrainedMemoryPools.push_back(MemoryPool);
1761 if (MemoryPool->supportsKernelArgs())
1762 ArgsMemoryPools.push_back(MemoryPool);
1763 } else if (MemoryPool->isCoarseGrained()) {
1764 CoarseGrainedMemoryPools.push_back(MemoryPool);
1765 }
1766 }
1767 return Plugin::success();
1768 }
1769
1770 /// Destroy all memory pools.
1771 Error deinitMemoryPools() {
1772 for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools)
1773 delete Pool;
1774
1775 AllMemoryPools.clear();
1776 FineGrainedMemoryPools.clear();
1777 CoarseGrainedMemoryPools.clear();
1778 ArgsMemoryPools.clear();
1779
1780 return Plugin::success();
1781 }
1782
1783 /// Retrieve and construct all memory pools from the device agent(s).
1784 virtual Error retrieveAllMemoryPools() = 0;
1785
1786 /// Get the device agent.
1787 virtual hsa_agent_t getAgent() const = 0;
1788
1789protected:
1790 /// Array of all memory pools available to the host agents.
1791 llvm::SmallVector<AMDGPUMemoryPoolTy *> AllMemoryPools;
1792
1793 /// Array of fine-grained memory pools available to the host agents.
1794 llvm::SmallVector<AMDGPUMemoryPoolTy *> FineGrainedMemoryPools;
1795
1796 /// Array of coarse-grained memory pools available to the host agents.
1797 llvm::SmallVector<AMDGPUMemoryPoolTy *> CoarseGrainedMemoryPools;
1798
1799 /// Array of kernel args memory pools available to the host agents.
1800 llvm::SmallVector<AMDGPUMemoryPoolTy *> ArgsMemoryPools;
1801};
1802
1803/// Class representing the host device. This host device may have more than one
1804/// HSA host agent. We aggregate all its resources into the same instance.
1805struct AMDHostDeviceTy : public AMDGenericDeviceTy {
1806 /// Create a host device from an array of host agents.
1807 AMDHostDeviceTy(AMDGPUPluginTy &Plugin,
1808 const llvm::SmallVector<hsa_agent_t> &HostAgents)
1809 : AMDGenericDeviceTy(), Agents(HostAgents), ArgsMemoryManager(Plugin),
1810 PinnedMemoryManager(Plugin) {
1811 assert(HostAgents.size() && "No host agent found");
1812 }
1813
1814 /// Initialize the host device memory pools and the memory managers for
1815 /// kernel args and host pinned memory allocations.
1816 Error init() {
1817 if (auto Err = initMemoryPools())
1818 return Err;
1819
1820 if (auto Err = ArgsMemoryManager.init(getArgsMemoryPool()))
1821 return Err;
1822
1823 if (auto Err = PinnedMemoryManager.init(getFineGrainedMemoryPool()))
1824 return Err;
1825
1826 return Plugin::success();
1827 }
1828
1829 /// Deinitialize memory pools and managers.
1830 Error deinit() {
1831 if (auto Err = deinitMemoryPools())
1832 return Err;
1833
1834 if (auto Err = ArgsMemoryManager.deinit())
1835 return Err;
1836
1837 if (auto Err = PinnedMemoryManager.deinit())
1838 return Err;
1839
1840 return Plugin::success();
1841 }
1842
1843 /// Retrieve and construct all memory pools from the host agents.
1844 Error retrieveAllMemoryPools() override {
1845 // Iterate through the available pools across the host agents.
1846 for (hsa_agent_t Agent : Agents) {
1847 Error Err = hsa_utils::iterateAgentMemoryPools(
1848 Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) {
1849 AMDGPUMemoryPoolTy *MemoryPool =
1850 new AMDGPUMemoryPoolTy(HSAMemoryPool);
1851 AllMemoryPools.push_back(MemoryPool);
1852 return HSA_STATUS_SUCCESS;
1853 });
1854 if (Err)
1855 return Err;
1856 }
1857 return Plugin::success();
1858 }
1859
  /// Get one of the host agents. Always return the first agent.
1861 hsa_agent_t getAgent() const override { return Agents[0]; }
1862
1863 /// Get a memory pool for fine-grained allocations.
1864 AMDGPUMemoryPoolTy &getFineGrainedMemoryPool() {
1865 assert(!FineGrainedMemoryPools.empty() && "No fine-grained mempool");
1866 // Retrieve any memory pool.
1867 return *FineGrainedMemoryPools[0];
1868 }
1869
1870 AMDGPUMemoryPoolTy &getCoarseGrainedMemoryPool() {
1871 assert(!CoarseGrainedMemoryPools.empty() && "No coarse-grained mempool");
1872 // Retrieve any memory pool.
1873 return *CoarseGrainedMemoryPools[0];
1874 }
1875
1876 /// Get a memory pool for kernel args allocations.
1877 AMDGPUMemoryPoolTy &getArgsMemoryPool() {
1878 assert(!ArgsMemoryPools.empty() && "No kernelargs mempool");
1879 // Retrieve any memory pool.
1880 return *ArgsMemoryPools[0];
1881 }
1882
1883 /// Getters for kernel args and host pinned memory managers.
1884 AMDGPUMemoryManagerTy &getArgsMemoryManager() { return ArgsMemoryManager; }
1885 AMDGPUMemoryManagerTy &getPinnedMemoryManager() {
1886 return PinnedMemoryManager;
1887 }
1888
1889private:
1890 /// Array of agents on the host side.
1891 const llvm::SmallVector<hsa_agent_t> Agents;
1892
1893 // Memory manager for kernel arguments.
1894 AMDGPUMemoryManagerTy ArgsMemoryManager;
1895
1896 // Memory manager for pinned memory.
1897 AMDGPUMemoryManagerTy PinnedMemoryManager;
1898};
1899
1900/// Class implementing the AMDGPU device functionalities which derives from the
1901/// generic device class.
1902struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
1903 // Create an AMDGPU device with a device id and default AMDGPU grid values.
1904 AMDGPUDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices,
1905 AMDHostDeviceTy &HostDevice, hsa_agent_t Agent)
1906 : GenericDeviceTy(Plugin, DeviceId, NumDevices, {}), AMDGenericDeviceTy(),
1907 OMPX_NumQueues("LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES", 4),
1908 OMPX_QueueSize("LIBOMPTARGET_AMDGPU_HSA_QUEUE_SIZE", 512),
1909 OMPX_DefaultTeamsPerCU("LIBOMPTARGET_AMDGPU_TEAMS_PER_CU", 4),
1910 OMPX_MaxAsyncCopyBytes("LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES",
1911 1 * 1024 * 1024), // 1MB
1912 OMPX_InitialNumSignals("LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS",
1913 64),
1914 OMPX_StreamBusyWait("LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT", 2000000),
1915 OMPX_UseMultipleSdmaEngines(
1916 "LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES", false),
1917 OMPX_ApuMaps("OMPX_APU_MAPS", false), AMDGPUStreamManager(*this, Agent),
1918 AMDGPUEventManager(*this), AMDGPUSignalManager(*this), Agent(Agent),
1919 HostDevice(HostDevice) {}
1920
1921 ~AMDGPUDeviceTy() {}
1922
1923 /// Initialize the device, its resources and get its properties.
1924 Error initImpl(GenericPluginTy &Plugin) override {
1925 // First setup all the memory pools.
1926 if (auto Err = initMemoryPools())
1927 return Err;
1928
1929 char GPUName[64];
1930 if (auto Err = getDeviceAttr(HSA_AGENT_INFO_NAME, GPUName))
1931 return Err;
1932 ComputeUnitKind = GPUName;
1933
1934 // Get the wavefront size.
1935 uint32_t WavefrontSize = 0;
1936 if (auto Err = getDeviceAttr(HSA_AGENT_INFO_WAVEFRONT_SIZE, WavefrontSize))
1937 return Err;
1938 GridValues.GV_Warp_Size = WavefrontSize;
1939
    // Get the frequency of the steady clock. If the attribute is missing,
    // assume we are running on an older libhsa and default to 0; omp_get_wtime
    // will be inaccurate, but programs can otherwise still run.
1943 if (getDeviceAttrRaw(HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY,
1944 ClockFrequency) != HSA_STATUS_SUCCESS)
1945 ClockFrequency = 0;
1946
1947 // Load the grid values depending on the wavefront.
1948 if (WavefrontSize == 32)
1949 GridValues = getAMDGPUGridValues<32>();
1950 else if (WavefrontSize == 64)
1951 GridValues = getAMDGPUGridValues<64>();
1952 else
1953 return Plugin::error(ErrorCode::UNSUPPORTED,
1954 "unexpected AMDGPU wavefront %d", WavefrontSize);
1955
1956 // Get maximum number of workitems per workgroup.
1957 uint16_t WorkgroupMaxDim[3];
1958 if (auto Err =
1959 getDeviceAttr(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgroupMaxDim))
1960 return Err;
1961 GridValues.GV_Max_WG_Size = WorkgroupMaxDim[0];
1962
1963 // Get maximum number of workgroups.
1964 hsa_dim3_t GridMaxDim;
1965 if (auto Err = getDeviceAttr(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim))
1966 return Err;
1967
1968 GridValues.GV_Max_Teams = GridMaxDim.x / GridValues.GV_Max_WG_Size;
1969 if (GridValues.GV_Max_Teams == 0)
1970 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
1971 "maximum number of teams cannot be zero");
1972
1973 // Compute the default number of teams.
1974 uint32_t ComputeUnits = 0;
1975 if (auto Err =
1976 getDeviceAttr(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, ComputeUnits))
1977 return Err;
1978 GridValues.GV_Default_Num_Teams = ComputeUnits * OMPX_DefaultTeamsPerCU;
1979
1980 uint32_t WavesPerCU = 0;
1981 if (auto Err =
1982 getDeviceAttr(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, WavesPerCU))
1983 return Err;
1984 HardwareParallelism = ComputeUnits * WavesPerCU;
1985
1986 // Get maximum size of any device queues and maximum number of queues.
1987 uint32_t MaxQueueSize;
1988 if (auto Err = getDeviceAttr(HSA_AGENT_INFO_QUEUE_MAX_SIZE, MaxQueueSize))
1989 return Err;
1990
1991 uint32_t MaxQueues;
1992 if (auto Err = getDeviceAttr(HSA_AGENT_INFO_QUEUES_MAX, MaxQueues))
1993 return Err;
1994
1995 // Compute the number of queues and their size.
1996 OMPX_NumQueues = std::max(1U, std::min(OMPX_NumQueues.get(), MaxQueues));
1997 OMPX_QueueSize = std::min(OMPX_QueueSize.get(), MaxQueueSize);
1998
1999 // Initialize stream pool.
2000 if (auto Err = AMDGPUStreamManager.init(OMPX_InitialNumStreams,
2001 OMPX_NumQueues, OMPX_QueueSize))
2002 return Err;
2003
2004 // Initialize event pool.
2005 if (auto Err = AMDGPUEventManager.init(OMPX_InitialNumEvents))
2006 return Err;
2007
2008 // Initialize signal pool.
2009 if (auto Err = AMDGPUSignalManager.init(OMPX_InitialNumSignals))
2010 return Err;
2011
2012 // Detect if XNACK is enabled
2013 SmallVector<SmallString<32>> Targets;
2014 if (auto Err = hsa_utils::getTargetTripleAndFeatures(Agent, Targets))
2015 return Err;
2016 if (!Targets.empty() && Targets[0].str().contains("xnack+"))
2017 IsXnackEnabled = true;
2018
    // Detect if the device is an APU.
2020 if (auto Err = checkIfAPU())
2021 return Err;
2022
2023 return Plugin::success();
2024 }
2025
2026 /// Deinitialize the device and release its resources.
2027 Error deinitImpl() override {
2028 // Deinitialize the stream and event pools.
2029 if (auto Err = AMDGPUStreamManager.deinit())
2030 return Err;
2031
2032 if (auto Err = AMDGPUEventManager.deinit())
2033 return Err;
2034
2035 if (auto Err = AMDGPUSignalManager.deinit())
2036 return Err;
2037
2038 // Close modules if necessary.
2039 if (!LoadedImages.empty()) {
2040 // Each image has its own module.
2041 for (DeviceImageTy *Image : LoadedImages) {
2042 AMDGPUDeviceImageTy &AMDImage =
2043 static_cast<AMDGPUDeviceImageTy &>(*Image);
2044
2045 // Unload the executable of the image.
2046 if (auto Err = AMDImage.unloadExecutable())
2047 return Err;
2048 }
2049 }
2050
2051 // Invalidate agent reference.
2052 Agent = {0};
2053
2054 return Plugin::success();
2055 }
2056
2057 virtual Error callGlobalConstructors(GenericPluginTy &Plugin,
2058 DeviceImageTy &Image) override {
2059 GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
2060 if (Handler.isSymbolInImage(*this, Image, "amdgcn.device.fini"))
2061 Image.setPendingGlobalDtors();
2062
2063 return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/true);
2064 }
2065
2066 virtual Error callGlobalDestructors(GenericPluginTy &Plugin,
2067 DeviceImageTy &Image) override {
2068 if (Image.hasPendingGlobalDtors())
2069 return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/false);
2070 return Plugin::success();
2071 }
2072
2073 uint64_t getStreamBusyWaitMicroseconds() const { return OMPX_StreamBusyWait; }
2074
2075 Expected<std::unique_ptr<MemoryBuffer>>
2076 doJITPostProcessing(std::unique_ptr<MemoryBuffer> MB) const override {
2077
2078 // TODO: We should try to avoid materialization but there seems to be no
2079 // good linker interface w/o file i/o.
2080 SmallString<128> LinkerInputFilePath;
2081 std::error_code EC = sys::fs::createTemporaryFile("amdgpu-pre-link-jit",
2082 "o", LinkerInputFilePath);
2083 if (EC)
2084 return Plugin::error(ErrorCode::HOST_IO,
2085 "failed to create temporary file for linker");
2086
2087 // Write the file's contents to the output file.
2088 Expected<std::unique_ptr<FileOutputBuffer>> OutputOrErr =
2089 FileOutputBuffer::create(LinkerInputFilePath, MB->getBuffer().size());
2090 if (!OutputOrErr)
2091 return OutputOrErr.takeError();
2092 std::unique_ptr<FileOutputBuffer> Output = std::move(*OutputOrErr);
    llvm::copy(MB->getBuffer(), Output->getBufferStart());
2094 if (Error E = Output->commit())
2095 return std::move(E);
2096
2097 SmallString<128> LinkerOutputFilePath;
2098 EC = sys::fs::createTemporaryFile("amdgpu-pre-link-jit", "so",
2099 LinkerOutputFilePath);
2100 if (EC)
2101 return Plugin::error(ErrorCode::HOST_IO,
2102 "failed to create temporary file for linker");
2103
2104 const auto &ErrorOrPath = sys::findProgramByName("lld");
2105 if (!ErrorOrPath)
2106 return createStringError(ErrorCode::HOST_TOOL_NOT_FOUND,
2107 "failed to find `lld` on the PATH.");
2108
2109 std::string LLDPath = ErrorOrPath.get();
2110 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(),
2111 "Using `%s` to link JITed amdgcn output.", LLDPath.c_str());
2112
2113 std::string MCPU = "-plugin-opt=mcpu=" + getComputeUnitKind();
2114 StringRef Args[] = {LLDPath,
2115 "-flavor",
2116 "gnu",
2117 "--no-undefined",
2118 "-shared",
2119 MCPU,
2120 "-o",
2121 LinkerOutputFilePath.data(),
2122 LinkerInputFilePath.data()};
2123
2124 std::string Error;
2125 int RC = sys::ExecuteAndWait(LLDPath, Args, std::nullopt, {}, 0, 0, &Error);
2126 if (RC)
2127 return Plugin::error(ErrorCode::LINK_FAILURE,
2128 "linking optimized bitcode failed: %s",
2129 Error.c_str());
2130
2131 auto BufferOrErr = MemoryBuffer::getFileOrSTDIN(LinkerOutputFilePath);
2132 if (!BufferOrErr)
2133 return Plugin::error(ErrorCode::HOST_IO,
2134 "failed to open temporary file for lld");
2135
2136 // Clean up the temporary files afterwards.
2137 if (sys::fs::remove(LinkerOutputFilePath))
2138 return Plugin::error(ErrorCode::HOST_IO,
2139 "failed to remove temporary output file for lld");
2140 if (sys::fs::remove(LinkerInputFilePath))
2141 return Plugin::error(ErrorCode::HOST_IO,
2142 "failed to remove temporary input file for lld");
2143
2144 return std::move(*BufferOrErr);
2145 }
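
  // For reference, the link step above is equivalent to invoking lld as
  // follows (temporary file names are generated at runtime and the mcpu value
  // comes from getComputeUnitKind(), e.g. gfx942):
  //
  //   lld -flavor gnu --no-undefined -shared -plugin-opt=mcpu=gfx942 \
  //       -o <amdgpu-pre-link-jit>.so <amdgpu-pre-link-jit>.o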
2146
2147 /// See GenericDeviceTy::getComputeUnitKind().
2148 std::string getComputeUnitKind() const override { return ComputeUnitKind; }
2149
2150 /// Returns the clock frequency for the given AMDGPU device.
2151 uint64_t getClockFrequency() const override { return ClockFrequency; }
2152
2153 /// Allocate and construct an AMDGPU kernel.
2154 Expected<GenericKernelTy &> constructKernel(const char *Name) override {
2155 // Allocate and construct the AMDGPU kernel.
2156 AMDGPUKernelTy *AMDGPUKernel = Plugin.allocate<AMDGPUKernelTy>();
2157 if (!AMDGPUKernel)
2158 return Plugin::error(ErrorCode::OUT_OF_RESOURCES,
2159 "failed to allocate memory for AMDGPU kernel");
2160
2161 new (AMDGPUKernel) AMDGPUKernelTy(Name);
2162
2163 return *AMDGPUKernel;
2164 }
2165
2166 /// Set the current context to this device's context. Do nothing since the
2167 /// AMDGPU devices do not have the concept of contexts.
2168 Error setContext() override { return Plugin::success(); }
2169
2170 /// AMDGPU returns the product of the number of compute units and the waves
2171 /// per compute unit.
2172 uint64_t getHardwareParallelism() const override {
2173 return HardwareParallelism;
2174 }
2175
2176 /// We want to set up the RPC server for host services to the GPU if it is
2177 /// available.
2178 bool shouldSetupRPCServer() const override { return true; }
2179
2180 /// The RPC interface should have enough space for all available parallelism.
2181 uint64_t requestedRPCPortCount() const override {
2182 return getHardwareParallelism();
2183 }
2184
2185 /// Get the stream of the asynchronous info structure or get a new one.
2186 Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper,
2187 AMDGPUStreamTy *&Stream) {
2188 // Get the stream (if any) from the async info.
2189 Stream = AsyncInfoWrapper.getQueueAs<AMDGPUStreamTy *>();
2190 if (!Stream) {
2191 // There was no stream; get an idle one.
2192 if (auto Err = AMDGPUStreamManager.getResource(Stream))
2193 return Err;
2194
2195 // Modify the async info's stream.
2196 AsyncInfoWrapper.setQueueAs<AMDGPUStreamTy *>(Stream);
2197 }
2198 return Plugin::success();
2199 }
2200
2201 /// Load the binary image into the device and allocate an image object.
2202 Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage,
2203 int32_t ImageId) override {
2204 // Allocate and initialize the image object.
2205 AMDGPUDeviceImageTy *AMDImage = Plugin.allocate<AMDGPUDeviceImageTy>();
2206 new (AMDImage) AMDGPUDeviceImageTy(ImageId, *this, TgtImage);
2207
2208 // Load the HSA executable.
    if (Error Err = AMDImage->loadExecutable(*this))
2210 return std::move(Err);
2211
2212 return AMDImage;
2213 }
2214
2215 /// Allocate memory on the device or related to the device.
2216 void *allocate(size_t Size, void *, TargetAllocTy Kind) override;
2217
2218 /// Deallocate memory on the device or related to the device.
2219 int free(void *TgtPtr, TargetAllocTy Kind) override {
2220 if (TgtPtr == nullptr)
2221 return OFFLOAD_SUCCESS;
2222
2223 AMDGPUMemoryPoolTy *MemoryPool = nullptr;
2224 switch (Kind) {
2225 case TARGET_ALLOC_DEFAULT:
2226 case TARGET_ALLOC_DEVICE:
2227 case TARGET_ALLOC_DEVICE_NON_BLOCKING:
2228 MemoryPool = CoarseGrainedMemoryPools[0];
2229 break;
2230 case TARGET_ALLOC_HOST:
2231 MemoryPool = &HostDevice.getFineGrainedMemoryPool();
2232 break;
2233 case TARGET_ALLOC_SHARED:
2234 MemoryPool = &HostDevice.getFineGrainedMemoryPool();
2235 break;
2236 }
2237
2238 if (!MemoryPool) {
2239 REPORT("No memory pool for the specified allocation kind\n");
2240 return OFFLOAD_FAIL;
2241 }
2242
    if (Error Err = MemoryPool->deallocate(TgtPtr)) {
      REPORT("%s\n", toString(std::move(Err)).data());
2245 return OFFLOAD_FAIL;
2246 }
2247
2248 return OFFLOAD_SUCCESS;
2249 }
2250
2251 /// Synchronize current thread with the pending operations on the async info.
2252 Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
2253 AMDGPUStreamTy *Stream =
2254 reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
2255 assert(Stream && "Invalid stream");
2256
2257 if (auto Err = Stream->synchronize())
2258 return Err;
2259
2260 // Once the stream is synchronized, return it to stream pool and reset
2261 // AsyncInfo. This is to make sure the synchronization only works for its
2262 // own tasks.
2263 AsyncInfo.Queue = nullptr;
2264 return AMDGPUStreamManager.returnResource(Stream);
2265 }
2266
2267 /// Query for the completion of the pending operations on the async info.
2268 Error queryAsyncImpl(__tgt_async_info &AsyncInfo) override {
2269 AMDGPUStreamTy *Stream =
2270 reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
2271 assert(Stream && "Invalid stream");
2272
2273 auto CompletedOrErr = Stream->query();
2274 if (!CompletedOrErr)
2275 return CompletedOrErr.takeError();
2276
    // Return if the stream did not complete yet.
2278 if (!(*CompletedOrErr))
2279 return Plugin::success();
2280
2281 // Once the stream is completed, return it to stream pool and reset
2282 // AsyncInfo. This is to make sure the synchronization only works for its
2283 // own tasks.
2284 AsyncInfo.Queue = nullptr;
2285 return AMDGPUStreamManager.returnResource(Stream);
2286 }
2287
2288 /// Pin the host buffer and return the device pointer that should be used for
2289 /// device transfers.
2290 Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override {
2291 void *PinnedPtr = nullptr;
2292
2293 hsa_status_t Status =
2294 hsa_amd_memory_lock(HstPtr, Size, nullptr, 0, &PinnedPtr);
2295 if (auto Err = Plugin::check(Status, "error in hsa_amd_memory_lock: %s\n"))
2296 return std::move(Err);
2297
2298 return PinnedPtr;
2299 }
2300
2301 /// Unpin the host buffer.
2302 Error dataUnlockImpl(void *HstPtr) override {
2303 hsa_status_t Status = hsa_amd_memory_unlock(HstPtr);
2304 return Plugin::check(Status, "error in hsa_amd_memory_unlock: %s\n");
2305 }
2306
2307 /// Check through the HSA runtime whether the \p HstPtr buffer is pinned.
2308 Expected<bool> isPinnedPtrImpl(void *HstPtr, void *&BaseHstPtr,
2309 void *&BaseDevAccessiblePtr,
2310 size_t &BaseSize) const override {
2311 hsa_amd_pointer_info_t Info;
2312 Info.size = sizeof(hsa_amd_pointer_info_t);
2313
2314 hsa_status_t Status = hsa_amd_pointer_info(
2315 HstPtr, &Info, /*Allocator=*/nullptr, /*num_agents_accessible=*/nullptr,
2316 /*accessible=*/nullptr);
2317 if (auto Err = Plugin::check(Status, "error in hsa_amd_pointer_info: %s"))
2318 return std::move(Err);
2319
2320 // The buffer may be locked or allocated through HSA allocators. Assume that
    // the buffer is host pinned if the runtime reports an HSA type.
2322 if (Info.type != HSA_EXT_POINTER_TYPE_LOCKED &&
2323 Info.type != HSA_EXT_POINTER_TYPE_HSA)
2324 return false;
2325
2326 assert(Info.hostBaseAddress && "Invalid host pinned address");
2327 assert(Info.agentBaseAddress && "Invalid agent pinned address");
2328 assert(Info.sizeInBytes > 0 && "Invalid pinned allocation size");
2329
2330 // Save the allocation info in the output parameters.
2331 BaseHstPtr = Info.hostBaseAddress;
2332 BaseDevAccessiblePtr = Info.agentBaseAddress;
2333 BaseSize = Info.sizeInBytes;
2334
2335 return true;
2336 }
2337
2338 /// Submit data to the device (host to device transfer).
2339 Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
2340 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
2341 AMDGPUStreamTy *Stream = nullptr;
2342 void *PinnedPtr = nullptr;
2343
2344 // Use one-step asynchronous operation when host memory is already pinned.
2345 if (void *PinnedPtr =
2346 PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) {
2347 if (auto Err = getStream(AsyncInfoWrapper, Stream))
2348 return Err;
      return Stream->pushPinnedMemoryCopyAsync(TgtPtr, PinnedPtr, Size);
2350 }
2351
2352 // For large transfers use synchronous behavior.
2353 if (Size >= OMPX_MaxAsyncCopyBytes) {
2354 if (AsyncInfoWrapper.hasQueue())
2355 if (auto Err = synchronize(AsyncInfoWrapper))
2356 return Err;
2357
2358 hsa_status_t Status;
2359 Status = hsa_amd_memory_lock(const_cast<void *>(HstPtr), Size, nullptr, 0,
2360 &PinnedPtr);
2361 if (auto Err =
2362 Plugin::check(Status, "error in hsa_amd_memory_lock: %s\n"))
2363 return Err;
2364
2365 AMDGPUSignalTy Signal;
2366 if (auto Err = Signal.init())
2367 return Err;
2368
2369 if (auto Err = hsa_utils::asyncMemCopy(useMultipleSdmaEngines(), TgtPtr,
2370 Agent, PinnedPtr, Agent, Size, 0,
2371 nullptr, Signal.get()))
2372 return Err;
2373
2374 if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
2375 return Err;
2376
2377 if (auto Err = Signal.deinit())
2378 return Err;
2379
2380 Status = hsa_amd_memory_unlock(const_cast<void *>(HstPtr));
2381 return Plugin::check(Status, "error in hsa_amd_memory_unlock: %s\n");
2382 }
2383
2384 // Otherwise, use two-step copy with an intermediate pinned host buffer.
2385 AMDGPUMemoryManagerTy &PinnedMemoryManager =
2386 HostDevice.getPinnedMemoryManager();
    if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedPtr))
2388 return Err;
2389
2390 if (auto Err = getStream(AsyncInfoWrapper, Stream))
2391 return Err;
2392
    return Stream->pushMemoryCopyH2DAsync(TgtPtr, HstPtr, PinnedPtr, Size,
                                          PinnedMemoryManager);
2395 }
2396
2397 /// Retrieve data from the device (device to host transfer).
2398 Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
2399 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
2400 AMDGPUStreamTy *Stream = nullptr;
2401 void *PinnedPtr = nullptr;
2402
2403 // Use one-step asynchronous operation when host memory is already pinned.
2404 if (void *PinnedPtr =
2405 PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) {
2406 if (auto Err = getStream(AsyncInfoWrapper, Stream))
2407 return Err;
2408
      return Stream->pushPinnedMemoryCopyAsync(PinnedPtr, TgtPtr, Size);
2410 }
2411
2412 // For large transfers use synchronous behavior.
2413 if (Size >= OMPX_MaxAsyncCopyBytes) {
2414 if (AsyncInfoWrapper.hasQueue())
2415 if (auto Err = synchronize(AsyncInfoWrapper))
2416 return Err;
2417
2418 hsa_status_t Status;
2419 Status = hsa_amd_memory_lock(const_cast<void *>(HstPtr), Size, nullptr, 0,
2420 &PinnedPtr);
2421 if (auto Err =
2422 Plugin::check(Status, "error in hsa_amd_memory_lock: %s\n"))
2423 return Err;
2424
2425 AMDGPUSignalTy Signal;
2426 if (auto Err = Signal.init())
2427 return Err;
2428
2429 if (auto Err = hsa_utils::asyncMemCopy(useMultipleSdmaEngines(),
2430 PinnedPtr, Agent, TgtPtr, Agent,
2431 Size, 0, nullptr, Signal.get()))
2432 return Err;
2433
2434 if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
2435 return Err;
2436
2437 if (auto Err = Signal.deinit())
2438 return Err;
2439
2440 Status = hsa_amd_memory_unlock(const_cast<void *>(HstPtr));
2441 return Plugin::check(Status, "error in hsa_amd_memory_unlock: %s\n");
2442 }
2443
2444 // Otherwise, use two-step copy with an intermediate pinned host buffer.
2445 AMDGPUMemoryManagerTy &PinnedMemoryManager =
2446 HostDevice.getPinnedMemoryManager();
    if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedPtr))
2448 return Err;
2449
2450 if (auto Err = getStream(AsyncInfoWrapper, Stream))
2451 return Err;
2452
    return Stream->pushMemoryCopyD2HAsync(HstPtr, TgtPtr, PinnedPtr, Size,
                                          PinnedMemoryManager);
2455 }
2456
2457 /// Exchange data between two devices within the plugin.
2458 Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice,
2459 void *DstPtr, int64_t Size,
2460 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
2461 AMDGPUDeviceTy &DstDevice = static_cast<AMDGPUDeviceTy &>(DstGenericDevice);
2462
2463 // For large transfers use synchronous behavior.
2464 if (Size >= OMPX_MaxAsyncCopyBytes) {
2465 if (AsyncInfoWrapper.hasQueue())
2466 if (auto Err = synchronize(AsyncInfoWrapper))
2467 return Err;
2468
2469 AMDGPUSignalTy Signal;
2470 if (auto Err = Signal.init())
2471 return Err;
2472
2473 if (auto Err = hsa_utils::asyncMemCopy(
2474 useMultipleSdmaEngines(), DstPtr, DstDevice.getAgent(), SrcPtr,
2475 getAgent(), (uint64_t)Size, 0, nullptr, Signal.get()))
2476 return Err;
2477
2478 if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
2479 return Err;
2480
2481 return Signal.deinit();
2482 }
2483
2484 AMDGPUStreamTy *Stream = nullptr;
2485 if (auto Err = getStream(AsyncInfoWrapper, Stream))
2486 return Err;
2487 if (Size <= 0)
2488 return Plugin::success();
2489
2490 return Stream->pushMemoryCopyD2DAsync(DstPtr, DstDevice.getAgent(), SrcPtr,
2491 getAgent(), (uint64_t)Size);
2492 }
2493
2494 /// Initialize the async info for interoperability purposes.
2495 Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
2496 // TODO: Implement this function.
2497 return Plugin::success();
2498 }
2499
2500 /// Initialize the device info for interoperability purposes.
2501 Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override {
2502 DeviceInfo->Context = nullptr;
2503
2504 if (!DeviceInfo->Device)
2505 DeviceInfo->Device = reinterpret_cast<void *>(Agent.handle);
2506
2507 return Plugin::success();
2508 }
2509
2510 /// Create an event.
2511 Error createEventImpl(void **EventPtrStorage) override {
2512 AMDGPUEventTy **Event = reinterpret_cast<AMDGPUEventTy **>(EventPtrStorage);
2513 return AMDGPUEventManager.getResource(*Event);
2514 }
2515
2516 /// Destroy a previously created event.
2517 Error destroyEventImpl(void *EventPtr) override {
2518 AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);
2519 return AMDGPUEventManager.returnResource(Event);
2520 }
2521
2522 /// Record the event.
2523 Error recordEventImpl(void *EventPtr,
2524 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
2525 AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);
2526 assert(Event && "Invalid event");
2527
2528 AMDGPUStreamTy *Stream = nullptr;
2529 if (auto Err = getStream(AsyncInfoWrapper, Stream))
2530 return Err;
2531
    return Event->record(*Stream);
2533 }
2534
2535 /// Make the stream wait on the event.
2536 Error waitEventImpl(void *EventPtr,
2537 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
2538 AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);
2539
2540 AMDGPUStreamTy *Stream = nullptr;
2541 if (auto Err = getStream(AsyncInfoWrapper, Stream))
2542 return Err;
2543
    return Event->wait(*Stream);
2545 }
2546
2547 /// Synchronize the current thread with the event.
2548 Error syncEventImpl(void *EventPtr) override {
2549 return Plugin::error(ErrorCode::UNIMPLEMENTED,
2550 "synchronize event not implemented");
2551 }
2552
2553 /// Print information about the device.
2554 Error obtainInfoImpl(InfoQueueTy &Info) override {
2555 char TmpChar[1000];
2556 const char *TmpCharPtr = "Unknown";
2557 uint16_t Major, Minor;
2558 uint32_t TmpUInt, TmpUInt2;
2559 uint32_t CacheSize[4];
2560 size_t TmpSt;
2561 bool TmpBool;
2562 uint16_t WorkgrpMaxDim[3];
2563 hsa_dim3_t GridMaxDim;
2564 hsa_status_t Status, Status2;
2565
2566 Status = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major);
2567 Status2 = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor);
2568 if (Status == HSA_STATUS_SUCCESS && Status2 == HSA_STATUS_SUCCESS)
2569 Info.add("HSA Runtime Version",
               std::to_string(Major) + "." + std::to_string(Minor));
2571
2572 Info.add("HSA OpenMP Device Number", DeviceId);
2573
2574 Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar);
2575 if (Status == HSA_STATUS_SUCCESS)
2576 Info.add("Product Name", TmpChar);
2577
2578 Status = getDeviceAttrRaw(HSA_AGENT_INFO_NAME, TmpChar);
2579 if (Status == HSA_STATUS_SUCCESS)
2580 Info.add("Device Name", TmpChar);
2581
2582 Status = getDeviceAttrRaw(HSA_AGENT_INFO_VENDOR_NAME, TmpChar);
2583 if (Status == HSA_STATUS_SUCCESS)
2584 Info.add("Vendor Name", TmpChar);
2585
2586 hsa_device_type_t DevType;
2587 Status = getDeviceAttrRaw(HSA_AGENT_INFO_DEVICE, DevType);
2588 if (Status == HSA_STATUS_SUCCESS) {
2589 switch (DevType) {
2590 case HSA_DEVICE_TYPE_CPU:
2591 TmpCharPtr = "CPU";
2592 break;
2593 case HSA_DEVICE_TYPE_GPU:
2594 TmpCharPtr = "GPU";
2595 break;
2596 case HSA_DEVICE_TYPE_DSP:
2597 TmpCharPtr = "DSP";
2598 break;
2599 }
2600 Info.add("Device Type", TmpCharPtr);
2601 }
2602
2603 Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUES_MAX, TmpUInt);
2604 if (Status == HSA_STATUS_SUCCESS)
2605 Info.add("Max Queues", TmpUInt);
2606
2607 Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUE_MIN_SIZE, TmpUInt);
2608 if (Status == HSA_STATUS_SUCCESS)
2609 Info.add("Queue Min Size", TmpUInt);
2610
2611 Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUE_MAX_SIZE, TmpUInt);
2612 if (Status == HSA_STATUS_SUCCESS)
2613 Info.add("Queue Max Size", TmpUInt);
2614
2615 // FIXME: This is deprecated according to HSA documentation. But using
2616 // hsa_agent_iterate_caches and hsa_cache_get_info breaks execution during
2617 // runtime.
2618 Status = getDeviceAttrRaw(HSA_AGENT_INFO_CACHE_SIZE, CacheSize);
2619 if (Status == HSA_STATUS_SUCCESS) {
2620 Info.add("Cache");
2621
2622 for (int I = 0; I < 4; I++)
2623 if (CacheSize[I])
2624 Info.add<InfoLevel2>("L" + std::to_string(I), CacheSize[I]);
2625 }
2626
2627 Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_CACHELINE_SIZE, TmpUInt);
2628 if (Status == HSA_STATUS_SUCCESS)
2629 Info.add("Cacheline Size", TmpUInt);
2630
2631 Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, TmpUInt);
2632 if (Status == HSA_STATUS_SUCCESS)
2633 Info.add("Max Clock Freq", TmpUInt, "MHz");
2634
2635 Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, TmpUInt);
2636 if (Status == HSA_STATUS_SUCCESS)
2637 Info.add("Compute Units", TmpUInt);
2638
2639 Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU, TmpUInt);
2640 if (Status == HSA_STATUS_SUCCESS)
2641 Info.add("SIMD per CU", TmpUInt);
2642
2643 Status = getDeviceAttrRaw(HSA_AGENT_INFO_FAST_F16_OPERATION, TmpBool);
2644 if (Status == HSA_STATUS_SUCCESS)
2645 Info.add("Fast F16 Operation", TmpBool);
2646
2647 Status = getDeviceAttrRaw(HSA_AGENT_INFO_WAVEFRONT_SIZE, TmpUInt2);
2648 if (Status == HSA_STATUS_SUCCESS)
2649 Info.add("Wavefront Size", TmpUInt2);
2650
2651 Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, TmpUInt);
2652 if (Status == HSA_STATUS_SUCCESS)
2653 Info.add("Workgroup Max Size", TmpUInt);
2654
2655 Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgrpMaxDim);
2656 if (Status == HSA_STATUS_SUCCESS) {
2657 Info.add("Workgroup Max Size per Dimension");
2658 Info.add<InfoLevel2>("x", WorkgrpMaxDim[0]);
2659 Info.add<InfoLevel2>("y", WorkgrpMaxDim[1]);
2660 Info.add<InfoLevel2>("z", WorkgrpMaxDim[2]);
2661 }
2662
2663 Status = getDeviceAttrRaw(
2664 (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, TmpUInt);
2665 if (Status == HSA_STATUS_SUCCESS) {
2666 Info.add("Max Waves Per CU", TmpUInt);
2667 Info.add("Max Work-item Per CU", TmpUInt * TmpUInt2);
2668 }
2669
2670 Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_SIZE, TmpUInt);
2671 if (Status == HSA_STATUS_SUCCESS)
2672 Info.add("Grid Max Size", TmpUInt);
2673
2674 Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim);
2675 if (Status == HSA_STATUS_SUCCESS) {
2676 Info.add("Grid Max Size per Dimension");
2677 Info.add<InfoLevel2>("x", GridMaxDim.x);
2678 Info.add<InfoLevel2>("y", GridMaxDim.y);
2679 Info.add<InfoLevel2>("z", GridMaxDim.z);
2680 }
2681
2682 Status = getDeviceAttrRaw(HSA_AGENT_INFO_FBARRIER_MAX_SIZE, TmpUInt);
2683 if (Status == HSA_STATUS_SUCCESS)
2684 Info.add("Max fbarriers/Workgrp", TmpUInt);
2685
2686 Info.add("Memory Pools");
2687 for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
2688 std::string TmpStr, TmpStr2;
2689
2690 if (Pool->isGlobal())
2691 TmpStr = "Global";
2692 else if (Pool->isReadOnly())
2693 TmpStr = "ReadOnly";
2694 else if (Pool->isPrivate())
2695 TmpStr = "Private";
2696 else if (Pool->isGroup())
2697 TmpStr = "Group";
2698 else
2699 TmpStr = "Unknown";
2700
2701 Info.add<InfoLevel2>(std::string("Pool ") + TmpStr);
2702
2703 if (Pool->isGlobal()) {
2704 if (Pool->isFineGrained())
2705 TmpStr2 += "Fine Grained ";
2706 if (Pool->isCoarseGrained())
2707 TmpStr2 += "Coarse Grained ";
2708 if (Pool->supportsKernelArgs())
2709 TmpStr2 += "Kernarg ";
2710
2711 Info.add<InfoLevel3>("Flags", TmpStr2);
2712 }
2713
2714 Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, TmpSt);
2715 if (Status == HSA_STATUS_SUCCESS)
2716 Info.add<InfoLevel3>("Size", TmpSt, "bytes");
2717
2718 Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
2719 TmpBool);
2720 if (Status == HSA_STATUS_SUCCESS)
2721 Info.add<InfoLevel3>("Allocatable", TmpBool);
2722
2723 Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE,
2724 TmpSt);
2725 if (Status == HSA_STATUS_SUCCESS)
2726 Info.add<InfoLevel3>("Runtime Alloc Granule", TmpSt, "bytes");
2727
2728 Status = Pool->getAttrRaw(
2729 HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, TmpSt);
2730 if (Status == HSA_STATUS_SUCCESS)
2731 Info.add<InfoLevel3>("Runtime Alloc Alignment", TmpSt, "bytes");
2732
2733 Status =
2734 Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, TmpBool);
2735 if (Status == HSA_STATUS_SUCCESS)
2736 Info.add<InfoLevel3>("Accessible by all", TmpBool);
2737 }
2738
2739 Info.add("ISAs");
2740 auto Err = hsa_utils::iterateAgentISAs(getAgent(), [&](hsa_isa_t ISA) {
2741 Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, TmpChar);
2742 if (Status == HSA_STATUS_SUCCESS)
2743 Info.add<InfoLevel2>("Name", TmpChar);
2744
2745 return Status;
2746 });
2747
2748 // Silently consume the error.
2749 if (Err)
2750 consumeError(std::move(Err));
2751
2752 return Plugin::success();
2753 }
2754
  /// Returns true if auto zero-copy is the best configuration for the current
  /// arch.
2757 /// On AMDGPUs, automatic zero-copy is turned on
2758 /// when running on an APU with XNACK (unified memory) support
2759 /// enabled. On discrete GPUs, automatic zero-copy is triggered
2760 /// if the user sets the environment variable OMPX_APU_MAPS=1
2761 /// and if XNACK is enabled. The rationale is that zero-copy
2762 /// is the best configuration (performance, memory footprint) on APUs,
2763 /// while it is often not the best on discrete GPUs.
2764 /// XNACK can be enabled with a kernel boot parameter or with
2765 /// the HSA_XNACK environment variable.
2766 bool useAutoZeroCopyImpl() override {
2767 return ((IsAPU || OMPX_ApuMaps) && IsXnackEnabled);
2768 }
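
  // For example (illustrative; the application name is hypothetical), on a
  // discrete GPU automatic zero-copy can be requested by enabling XNACK and
  // setting the mapping override before launching:
  //
  //   HSA_XNACK=1 OMPX_APU_MAPS=1 ./offload-app
  //
  // On an XNACK-enabled APU, no extra environment variable is needed.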
2769
2770 /// Getters and setters for stack and heap sizes.
2771 Error getDeviceStackSize(uint64_t &Value) override {
2772 Value = StackSize;
2773 return Plugin::success();
2774 }
2775 Error setDeviceStackSize(uint64_t Value) override {
2776 StackSize = Value;
2777 return Plugin::success();
2778 }
2779 Error getDeviceHeapSize(uint64_t &Value) override {
2780 Value = DeviceMemoryPoolSize;
2781 return Plugin::success();
2782 }
2783 Error setDeviceHeapSize(uint64_t Value) override {
2784 for (DeviceImageTy *Image : LoadedImages)
2785 if (auto Err = setupDeviceMemoryPool(Plugin, *Image, Value))
2786 return Err;
2787 DeviceMemoryPoolSize = Value;
2788 return Plugin::success();
2789 }
2790 Error getDeviceMemorySize(uint64_t &Value) override {
2791 for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
2792 if (Pool->isGlobal()) {
2793 hsa_status_t Status =
2794 Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, Value);
2795 return Plugin::check(Status, "error in getting device memory size: %s");
2796 }
2797 }
2798 return Plugin::error(ErrorCode::UNSUPPORTED,
                         "getDeviceMemorySize: no global pool");
2800 }
2801
2802 /// AMDGPU-specific function to get device attributes.
2803 template <typename Ty> Error getDeviceAttr(uint32_t Kind, Ty &Value) {
2804 hsa_status_t Status =
2805 hsa_agent_get_info(Agent, (hsa_agent_info_t)Kind, &Value);
    return Plugin::check(Status, "error in hsa_agent_get_info: %s");
2807 }
2808
2809 template <typename Ty>
2810 hsa_status_t getDeviceAttrRaw(uint32_t Kind, Ty &Value) {
2811 return hsa_agent_get_info(Agent, (hsa_agent_info_t)Kind, &Value);
2812 }
2813
2814 /// Get the device agent.
2815 hsa_agent_t getAgent() const override { return Agent; }
2816
2817 /// Get the signal manager.
2818 AMDGPUSignalManagerTy &getSignalManager() { return AMDGPUSignalManager; }
2819
2820 /// Retrieve and construct all memory pools of the device agent.
2821 Error retrieveAllMemoryPools() override {
2822 // Iterate through the available pools of the device agent.
2823 return hsa_utils::iterateAgentMemoryPools(
2824 Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) {
2825 AMDGPUMemoryPoolTy *MemoryPool =
2826 Plugin.allocate<AMDGPUMemoryPoolTy>();
2827 new (MemoryPool) AMDGPUMemoryPoolTy(HSAMemoryPool);
2828 AllMemoryPools.push_back(MemoryPool);
2829 return HSA_STATUS_SUCCESS;
2830 });
2831 }
2832
2833 bool useMultipleSdmaEngines() const { return OMPX_UseMultipleSdmaEngines; }
2834
2835private:
2836 using AMDGPUEventRef = AMDGPUResourceRef<AMDGPUEventTy>;
2837 using AMDGPUEventManagerTy = GenericDeviceResourceManagerTy<AMDGPUEventRef>;
2838
2839 /// Common method to invoke a single threaded constructor or destructor
2840 /// kernel by name.
2841 Error callGlobalCtorDtorCommon(GenericPluginTy &Plugin, DeviceImageTy &Image,
2842 bool IsCtor) {
2843 const char *KernelName =
2844 IsCtor ? "amdgcn.device.init" : "amdgcn.device.fini";
2845 // Perform a quick check for the named kernel in the image. The kernel
2846 // should be created by the 'amdgpu-lower-ctor-dtor' pass.
2847 GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
2848 if (IsCtor && !Handler.isSymbolInImage(*this, Image, KernelName))
2849 return Plugin::success();
2850
2851 // Allocate and construct the AMDGPU kernel.
2852 AMDGPUKernelTy AMDGPUKernel(KernelName);
2853 if (auto Err = AMDGPUKernel.init(*this, Image))
2854 return Err;
2855
2856 AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);
2857
2858 KernelArgsTy KernelArgs = {};
2859 uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u};
2860 if (auto Err = AMDGPUKernel.launchImpl(
2861 *this, NumBlocksAndThreads, NumBlocksAndThreads, KernelArgs,
2862 KernelLaunchParamsTy{}, AsyncInfoWrapper))
2863 return Err;
2864
2865 Error Err = Plugin::success();
2866 AsyncInfoWrapper.finalize(Err);
2867
2868 return Err;
2869 }
2870
2871 /// Detect if current architecture is an APU.
2872 Error checkIfAPU() {
2873 // TODO: replace with ROCr API once it becomes available.
2874 llvm::StringRef StrGfxName(ComputeUnitKind);
2875 bool MayBeAPU = llvm::StringSwitch<bool>(StrGfxName)
                        .Case("gfx942", true)
                        .Default(false);
2878 if (!MayBeAPU)
2879 return Plugin::success();
2880
    // The chip can be either an MI300A or an MI300X.
2882 uint32_t ChipID = 0;
2883 if (auto Err = getDeviceAttr(HSA_AMD_AGENT_INFO_CHIP_ID, ChipID))
2884 return Err;
2885
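    // The parity of the chip ID is used here to tell the gfx942 variants
    // apart: an even ID is treated as the APU part (MI300A-class), while an
    // odd ID is treated as the discrete part (MI300X-class). This mirrors the
    // current device ID numbering and may need revisiting for future parts.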
2886 if (!(ChipID & 0x1)) {
2887 IsAPU = true;
2888 return Plugin::success();
2889 }
2890 return Plugin::success();
2891 }
2892
  /// Envar for controlling the number of HSA queues per device. A high number
  /// of queues may degrade performance.
2895 UInt32Envar OMPX_NumQueues;
2896
  /// Envar for controlling the size of each HSA queue. The size is the number
  /// of HSA packets a queue is expected to hold. It is also the number of HSA
  /// packets that can be pushed into each queue without waiting for the driver
  /// to process them.
2901 UInt32Envar OMPX_QueueSize;
2902
2903 /// Envar for controlling the default number of teams relative to the number
2904 /// of compute units (CUs) the device has:
2905 /// #default_teams = OMPX_DefaultTeamsPerCU * #CUs.
2906 UInt32Envar OMPX_DefaultTeamsPerCU;
2907
  /// Envar specifying the maximum transfer size in bytes for which memory
  /// copies are issued asynchronously. Up to this size, memory copies are
  /// asynchronous operations pushed to the corresponding stream. Larger
  /// transfers are performed synchronously.
2912 UInt32Envar OMPX_MaxAsyncCopyBytes;
2913
2914 /// Envar controlling the initial number of HSA signals per device. There is
2915 /// one manager of signals per device managing several pre-allocated signals.
2916 /// These signals are mainly used by AMDGPU streams. If needed, more signals
2917 /// will be created.
2918 UInt32Envar OMPX_InitialNumSignals;
2919
  /// Envar to set the time to wait in active state before switching to a
  /// blocked state. The default of 2000000 busy-waits for 2 seconds before
  /// going into a blocking HSA wait state. The unit for this variable is
  /// microseconds.
2924 UInt32Envar OMPX_StreamBusyWait;
2925
2926 /// Use ROCm 5.7 interface for multiple SDMA engines
2927 BoolEnvar OMPX_UseMultipleSdmaEngines;
2928
2929 /// Value of OMPX_APU_MAPS env var used to force
2930 /// automatic zero-copy behavior on non-APU GPUs.
2931 BoolEnvar OMPX_ApuMaps;
2932
2933 /// Stream manager for AMDGPU streams.
2934 AMDGPUStreamManagerTy AMDGPUStreamManager;
2935
2936 /// Event manager for AMDGPU events.
2937 AMDGPUEventManagerTy AMDGPUEventManager;
2938
2939 /// Signal manager for AMDGPU signals.
2940 AMDGPUSignalManagerTy AMDGPUSignalManager;
2941
2942 /// The agent handler corresponding to the device.
2943 hsa_agent_t Agent;
2944
2945 /// The GPU architecture.
2946 std::string ComputeUnitKind;
2947
2948 /// The frequency of the steady clock inside the device.
2949 uint64_t ClockFrequency;
2950
2951 /// The total number of concurrent work items that can be running on the GPU.
2952 uint64_t HardwareParallelism;
2953
2954 /// Reference to the host device.
2955 AMDHostDeviceTy &HostDevice;
2956
2957 /// The current size of the global device memory pool (managed by us).
  uint64_t DeviceMemoryPoolSize = 1L << 29L /* 512 MB */;
2959
2960 /// The current size of the stack that will be used in cases where it could
2961 /// not be statically determined.
2962 uint64_t StackSize = 16 * 1024 /* 16 KB */;
2963
2964 /// Is the plugin associated with an APU?
2965 bool IsAPU = false;
2966
  /// True if the system is configured with XNACK enabled, false otherwise.
2969 bool IsXnackEnabled = false;
2970};
2971
2972Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
2973 hsa_code_object_reader_t Reader;
2974 hsa_status_t Status =
2975 hsa_code_object_reader_create_from_memory(getStart(), getSize(), &Reader);
2976 if (auto Err = Plugin::check(
2977 Status, "error in hsa_code_object_reader_create_from_memory: %s"))
2978 return Err;
2979
2980 Status = hsa_executable_create_alt(
2981 HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, "", &Executable);
2982 if (auto Err =
2983 Plugin::check(Status, "error in hsa_executable_create_alt: %s"))
2984 return Err;
2985
2986 hsa_loaded_code_object_t Object;
2987 Status = hsa_executable_load_agent_code_object(Executable, Device.getAgent(),
2988 Reader, "", &Object);
2989 if (auto Err = Plugin::check(
2990 Status, "error in hsa_executable_load_agent_code_object: %s"))
2991 return Err;
2992
2993 Status = hsa_executable_freeze(Executable, "");
2994 if (auto Err = Plugin::check(Status, "error in hsa_executable_freeze: %s"))
2995 return Err;
2996
2997 uint32_t Result;
2998 Status = hsa_executable_validate(Executable, &Result);
2999 if (auto Err = Plugin::check(Status, "error in hsa_executable_validate: %s"))
3000 return Err;
3001
3002 if (Result)
3003 return Plugin::error(ErrorCode::INVALID_BINARY,
3004 "loaded HSA executable does not validate");
3005
3006 Status = hsa_code_object_reader_destroy(Reader);
3007 if (auto Err =
3008 Plugin::check(Status, "error in hsa_code_object_reader_destroy: %s"))
3009 return Err;
3010
3011 if (auto Err = hsa_utils::readAMDGPUMetaDataFromImage(
3012 getMemoryBuffer(), KernelInfoMap, ELFABIVersion))
3013 return Err;
3014
3015 return Plugin::success();
3016}
3017
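// Look up an executable symbol by name in the loaded image, restricted to the
// device's HSA agent.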
3018Expected<hsa_executable_symbol_t>
3019AMDGPUDeviceImageTy::findDeviceSymbol(GenericDeviceTy &Device,
3020 StringRef SymbolName) const {
3021
3022 AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(Device);
3023 hsa_agent_t Agent = AMDGPUDevice.getAgent();
3024
3025 hsa_executable_symbol_t Symbol;
3026 hsa_status_t Status = hsa_executable_get_symbol_by_name(
3027 Executable, SymbolName.data(), &Agent, &Symbol);
3028 if (auto Err = Plugin::check(
3029 Status, "error in hsa_executable_get_symbol_by_name(%s): %s",
3030 SymbolName.data()))
3031 return std::move(Err);
3032
3033 return Symbol;
3034}
3035
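// Create and initialize a new resource (e.g., a stream or an event) owned by
// this reference. Calling this while the reference already owns a resource is
// reported as an error.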
3036template <typename ResourceTy>
3037Error AMDGPUResourceRef<ResourceTy>::create(GenericDeviceTy &Device) {
3038 if (Resource)
3039 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
3040 "creating an existing resource");
3041
3042 AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(Device);
3043
3044 Resource = new ResourceTy(AMDGPUDevice);
3045
3046 return Resource->init();
3047}
3048
3049AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
3050 : Agent(Device.getAgent()), Queue(nullptr),
3051 SignalManager(Device.getSignalManager()), Device(Device),
3052 // Initialize the std::deque with some empty positions.
3053 Slots(32), NextSlot(0), SyncCycle(0),
3054 StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
3055 UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()) {}
3056
3057/// Class implementing the AMDGPU-specific functionalities of the global
3058/// handler.
3059struct AMDGPUGlobalHandlerTy final : public GenericGlobalHandlerTy {
3060  /// Get the metadata of a global from the device. The name and size of the
3061  /// global are read from DeviceGlobal and the address of the global is
3062  /// written to DeviceGlobal.
3063 Error getGlobalMetadataFromDevice(GenericDeviceTy &Device,
3064 DeviceImageTy &Image,
3065 GlobalTy &DeviceGlobal) override {
3066 AMDGPUDeviceImageTy &AMDImage = static_cast<AMDGPUDeviceImageTy &>(Image);
3067
3068 // Find the symbol on the device executable.
3069 auto SymbolOrErr =
3070 AMDImage.findDeviceSymbol(Device, DeviceGlobal.getName());
3071 if (!SymbolOrErr)
3072 return SymbolOrErr.takeError();
3073
3074 hsa_executable_symbol_t Symbol = *SymbolOrErr;
3075 hsa_symbol_kind_t SymbolType;
3076 hsa_status_t Status;
3077 uint64_t SymbolAddr;
3078 uint32_t SymbolSize;
3079
3080 // Retrieve the type, address and size of the symbol.
3081 std::pair<hsa_executable_symbol_info_t, void *> RequiredInfos[] = {
3082 {HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &SymbolType},
3083 {HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &SymbolAddr},
3084 {HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &SymbolSize}};
3085
3086 for (auto &Info : RequiredInfos) {
3087 Status = hsa_executable_symbol_get_info(Symbol, Info.first, Info.second);
3088 if (auto Err = Plugin::check(
3089 Status, "error in hsa_executable_symbol_get_info: %s"))
3090 return Err;
3091 }
3092
3093 // Check the size of the symbol.
3094 if (SymbolSize != DeviceGlobal.getSize())
3095 return Plugin::error(
3096 ErrorCode::INVALID_BINARY,
3097 "failed to load global '%s' due to size mismatch (%zu != %zu)",
3098          DeviceGlobal.getName().data(), (size_t)SymbolSize,
3099 (size_t)DeviceGlobal.getSize());
3100
3101 // Store the symbol address on the device global metadata.
3102 DeviceGlobal.setPtr(reinterpret_cast<void *>(SymbolAddr));
3103
3104 return Plugin::success();
3105 }
3106};
3107
3108/// Class implementing the AMDGPU-specific functionalities of the plugin.
3109struct AMDGPUPluginTy final : public GenericPluginTy {
3110 /// Create an AMDGPU plugin and initialize the AMDGPU driver.
3111 AMDGPUPluginTy()
3112 : GenericPluginTy(getTripleArch()), Initialized(false),
3113 HostDevice(nullptr) {}
3114
3115  /// This class should not be copied or moved.
3116 AMDGPUPluginTy(const AMDGPUPluginTy &) = delete;
3117 AMDGPUPluginTy(AMDGPUPluginTy &&) = delete;
3118
3119 /// Initialize the plugin and return the number of devices.
3120 Expected<int32_t> initImpl() override {
3121 hsa_status_t Status = hsa_init();
3122 if (Status != HSA_STATUS_SUCCESS) {
3123      // Cannot call hsa_status_string.
3124 DP("Failed to initialize AMDGPU's HSA library\n");
3125 return 0;
3126 }
3127
3128 // The initialization of HSA was successful. It should be safe to call
3129 // HSA functions from now on, e.g., hsa_shut_down.
3130 Initialized = true;
3131
3132 // Register event handler to detect memory errors on the devices.
3133 Status = hsa_amd_register_system_event_handler(eventHandler, this);
3134 if (auto Err = Plugin::check(
3135 Status, "error in hsa_amd_register_system_event_handler: %s"))
3136 return std::move(Err);
3137
3138 // List of host (CPU) agents.
3139 llvm::SmallVector<hsa_agent_t> HostAgents;
3140
3141 // Count the number of available agents.
3142    auto Err = hsa_utils::iterateAgents([&](hsa_agent_t Agent) {
3143 // Get the device type of the agent.
3144 hsa_device_type_t DeviceType;
3145 hsa_status_t Status =
3146 hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType);
3147 if (Status != HSA_STATUS_SUCCESS)
3148 return Status;
3149
3150      // Classify the agents into kernel (GPU) and host (CPU) agents.
3151 if (DeviceType == HSA_DEVICE_TYPE_GPU) {
3152 // Ensure that the GPU agent supports kernel dispatch packets.
3153 hsa_agent_feature_t Features;
3154 Status = hsa_agent_get_info(Agent, HSA_AGENT_INFO_FEATURE, &Features);
3155 if (Features & HSA_AGENT_FEATURE_KERNEL_DISPATCH)
3156 KernelAgents.push_back(Agent);
3157 } else if (DeviceType == HSA_DEVICE_TYPE_CPU) {
3158 HostAgents.push_back(Agent);
3159 }
3160 return HSA_STATUS_SUCCESS;
3161 });
3162
3163 if (Err)
3164 return std::move(Err);
3165
3166 int32_t NumDevices = KernelAgents.size();
3167 if (NumDevices == 0) {
3168 // Do not initialize if there are no devices.
3169 DP("There are no devices supporting AMDGPU.\n");
3170 return 0;
3171 }
3172
3173 // There are kernel agents but there is no host agent. That should be
3174 // treated as an error.
3175 if (HostAgents.empty())
3176 return Plugin::error(ErrorCode::BACKEND_FAILURE, "no AMDGPU host agents");
3177
3178 // Initialize the host device using host agents.
3179 HostDevice = allocate<AMDHostDeviceTy>();
3180 new (HostDevice) AMDHostDeviceTy(*this, HostAgents);
3181
3182    // Set up the memory pools available for the host.
3183 if (auto Err = HostDevice->init())
3184 return std::move(Err);
3185
3186 return NumDevices;
3187 }
3188
3189 /// Deinitialize the plugin.
3190 Error deinitImpl() override {
3191 // The HSA runtime was not initialized, so nothing from the plugin was
3192 // actually initialized.
3193 if (!Initialized)
3194 return Plugin::success();
3195
3196 if (HostDevice)
3197 if (auto Err = HostDevice->deinit())
3198 return Err;
3199
3200 // Finalize the HSA runtime.
3201 hsa_status_t Status = hsa_shut_down();
3202 return Plugin::check(Status, "error in hsa_shut_down: %s");
3203 }
3204
3205 /// Creates an AMDGPU device.
3206 GenericDeviceTy *createDevice(GenericPluginTy &Plugin, int32_t DeviceId,
3207 int32_t NumDevices) override {
3208 return new AMDGPUDeviceTy(Plugin, DeviceId, NumDevices, getHostDevice(),
3209 getKernelAgent(DeviceId));
3210 }
3211
3212 /// Creates an AMDGPU global handler.
3213 GenericGlobalHandlerTy *createGlobalHandler() override {
3214 return new AMDGPUGlobalHandlerTy();
3215 }
3216
3217 Triple::ArchType getTripleArch() const override { return Triple::amdgcn; }
3218
3219 const char *getName() const override { return GETNAME(TARGET_NAME); }
3220
3221 /// Get the ELF code for recognizing the compatible image binary.
3222 uint16_t getMagicElfBits() const override { return ELF::EM_AMDGPU; }
3223
3224 /// Check whether the image is compatible with an AMDGPU device.
3225 Expected<bool> isELFCompatible(uint32_t DeviceId,
3226 StringRef Image) const override {
3227 // Get the associated architecture and flags from the ELF.
3228 auto ElfOrErr = ELF64LEObjectFile::create(
3229 MemoryBufferRef(Image, /*Identifier=*/""), /*InitContent=*/false);
3230 if (!ElfOrErr)
3231 return ElfOrErr.takeError();
3232 std::optional<StringRef> Processor = ElfOrErr->tryGetCPUName();
3233 if (!Processor)
3234 return false;
3235
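    // Query the target triples and feature strings supported by the agent and
    // accept the image if any of them is compatible with the ELF's processor
    // and platform flags.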
3236 SmallVector<SmallString<32>> Targets;
3237 if (auto Err = hsa_utils::getTargetTripleAndFeatures(
3238 getKernelAgent(DeviceId), Targets))
3239 return Err;
3240 for (auto &Target : Targets)
3241 if (offloading::amdgpu::isImageCompatibleWithEnv(
3242 Processor ? *Processor : "", ElfOrErr->getPlatformFlags(),
3243 Target.str()))
3244 return true;
3245 return false;
3246 }
3247
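  /// Indicate whether data can be exchanged directly between two devices. All
  /// pairs of AMDGPU devices are reported as exchangeable.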
3248 bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) override {
3249 return true;
3250 }
3251
3252 /// Get the host device instance.
3253 AMDHostDeviceTy &getHostDevice() {
3254 assert(HostDevice && "Host device not initialized");
3255 return *HostDevice;
3256 }
3257
3258 /// Get the kernel agent with the corresponding agent id.
3259 hsa_agent_t getKernelAgent(int32_t AgentId) const {
3260 assert((uint32_t)AgentId < KernelAgents.size() && "Invalid agent id");
3261 return KernelAgents[AgentId];
3262 }
3263
3264 /// Get the list of the available kernel agents.
3265 const llvm::SmallVector<hsa_agent_t> &getKernelAgents() const {
3266 return KernelAgents;
3267 }
3268
3269private:
3270 /// Event handler that will be called by ROCr if an event is detected.
3271 static hsa_status_t eventHandler(const hsa_amd_event_t *Event,
3272 void *PluginPtr) {
3273 if (Event->event_type != HSA_AMD_GPU_MEMORY_FAULT_EVENT)
3274 return HSA_STATUS_SUCCESS;
3275
3276 SmallVector<std::string> Reasons;
3277 uint32_t ReasonsMask = Event->memory_fault.fault_reason_mask;
3278 if (ReasonsMask & HSA_AMD_MEMORY_FAULT_PAGE_NOT_PRESENT)
3279      Reasons.emplace_back("Page not present or supervisor privilege");
3280    if (ReasonsMask & HSA_AMD_MEMORY_FAULT_READ_ONLY)
3281      Reasons.emplace_back("Write access to a read-only page");
3282    if (ReasonsMask & HSA_AMD_MEMORY_FAULT_NX)
3283      Reasons.emplace_back("Execute access to a page marked NX");
3284    if (ReasonsMask & HSA_AMD_MEMORY_FAULT_HOST_ONLY)
3285      Reasons.emplace_back("GPU attempted access to a host only page");
3286    if (ReasonsMask & HSA_AMD_MEMORY_FAULT_DRAMECC)
3287      Reasons.emplace_back("DRAM ECC failure");
3288    if (ReasonsMask & HSA_AMD_MEMORY_FAULT_IMPRECISE)
3289      Reasons.emplace_back("Can't determine the exact fault address");
3290    if (ReasonsMask & HSA_AMD_MEMORY_FAULT_SRAMECC)
3291      Reasons.emplace_back("SRAM ECC failure (ie registers, no fault address)");
3292    if (ReasonsMask & HSA_AMD_MEMORY_FAULT_HANG)
3293      Reasons.emplace_back("GPU reset following unspecified hang");
3294
3295    // If we do not know the reason, report the raw fault reason mask value
3296    // instead.
3297    if (Reasons.empty())
3298      Reasons.emplace_back("Unknown (" + std::to_string(ReasonsMask) + ")");
3299
3300 uint32_t Node = -1;
3301 hsa_agent_get_info(Event->memory_fault.agent, HSA_AGENT_INFO_NODE, &Node);
3302
3303 AMDGPUPluginTy &Plugin = *reinterpret_cast<AMDGPUPluginTy *>(PluginPtr);
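    // Try to match the faulting node against one of the devices so that the
    // device's recent kernel launch traces can be reported alongside the
    // memory fault.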
3304 for (uint32_t I = 0, E = Plugin.getNumDevices();
3305 Node != uint32_t(-1) && I < E; ++I) {
3306 AMDGPUDeviceTy &AMDGPUDevice =
3307 reinterpret_cast<AMDGPUDeviceTy &>(Plugin.getDevice(I));
3308 auto KernelTraceInfoRecord =
3309 AMDGPUDevice.KernelLaunchTraces.getExclusiveAccessor();
3310
3311 uint32_t DeviceNode = -1;
3312 if (auto Err =
3313 AMDGPUDevice.getDeviceAttr(HSA_AGENT_INFO_NODE, DeviceNode)) {
3314 consumeError(std::move(Err));
3315 continue;
3316 }
3317 if (DeviceNode != Node)
3318 continue;
3319 void *DevicePtr = (void *)Event->memory_fault.virtual_address;
3320 std::string S;
3321 llvm::raw_string_ostream OS(S);
3322 OS << llvm::format("memory access fault by GPU %" PRIu32
3323 " (agent 0x%" PRIx64
3324 ") at virtual address %p. Reasons: %s",
3325 Node, Event->memory_fault.agent.handle,
3326 (void *)Event->memory_fault.virtual_address,
3327 llvm::join(Reasons, ", ").c_str());
3328 ErrorReporter::reportKernelTraces(AMDGPUDevice, *KernelTraceInfoRecord);
3329 ErrorReporter::reportMemoryAccessError(AMDGPUDevice, DevicePtr, S,
3330 /*Abort*/ true);
3331 }
3332
3333 // Abort the execution since we do not recover from this error.
3334 FATAL_MESSAGE(1,
3335 "memory access fault by GPU %" PRIu32 " (agent 0x%" PRIx64
3336 ") at virtual address %p. Reasons: %s",
3337 Node, Event->memory_fault.agent.handle,
3338 (void *)Event->memory_fault.virtual_address,
3339 llvm::join(Reasons, ", ").c_str());
3340
3341 return HSA_STATUS_ERROR;
3342 }
3343
3344  /// Indicate whether the HSA runtime was correctly initialized. Even if there
3345  /// are no available devices, this boolean will be true. It indicates whether
3346  /// we can safely call HSA functions (e.g., hsa_shut_down).
3347 bool Initialized;
3348
3349  /// Array of the available kernel (GPU) agents. These handles should not be
3350  /// here but in the AMDGPUDeviceTy structures directly. However, the HSA
3351  /// standard does not provide API functions to retrieve agents directly, only
3352  /// iterating functions. We cache the agents here for convenience.
3353 llvm::SmallVector<hsa_agent_t> KernelAgents;
3354
3355 /// The device representing all HSA host agents.
3356 AMDHostDeviceTy *HostDevice;
3357};
3358
3359Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
3360 uint32_t NumThreads[3], uint32_t NumBlocks[3],
3361 KernelArgsTy &KernelArgs,
3362 KernelLaunchParamsTy LaunchParams,
3363 AsyncInfoWrapperTy &AsyncInfoWrapper) const {
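  // Reject launches where the kernel's argument size exceeds the explicit
  // launch parameters by more than the implicit argument area.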
3364 if (ArgsSize != LaunchParams.Size &&
3365 ArgsSize > LaunchParams.Size + getImplicitArgsSize())
3366 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
3367 "invalid kernel arguments size");
3368
3369 AMDGPUPluginTy &AMDGPUPlugin =
3370 static_cast<AMDGPUPluginTy &>(GenericDevice.Plugin);
3371 AMDHostDeviceTy &HostDevice = AMDGPUPlugin.getHostDevice();
3372 AMDGPUMemoryManagerTy &ArgsMemoryManager = HostDevice.getArgsMemoryManager();
3373
3374 void *AllArgs = nullptr;
3375 if (auto Err = ArgsMemoryManager.allocate(ArgsSize, &AllArgs))
3376 return Err;
3377
3378 // Account for user requested dynamic shared memory.
3379 uint32_t GroupSize = getGroupSize();
3380 if (uint32_t MaxDynCGroupMem = std::max(
3381 KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize())) {
3382 GroupSize += MaxDynCGroupMem;
3383 }
3384
3385 uint64_t StackSize;
3386 if (auto Err = GenericDevice.getDeviceStackSize(StackSize))
3387 return Err;
3388
3389 // Copy the explicit arguments.
3390 // TODO: We should expose the args memory manager alloc to the common part as
3391 // alternative to copying them twice.
3392 if (LaunchParams.Size)
3393    std::memcpy(AllArgs, LaunchParams.Data, LaunchParams.Size);
3394
3395 AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(GenericDevice);
3396
3397 AMDGPUStreamTy *Stream = nullptr;
3398 if (auto Err = AMDGPUDevice.getStream(AsyncInfoWrapper, Stream))
3399 return Err;
3400
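  // The implicit (COV5) arguments, if the kernel expects any, start after the
  // explicit ones at an offset aligned to the implicit argument struct. They
  // are zero-initialized and filled in below only when the kernel's argument
  // size extends past that offset.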
3401 uint64_t ImplArgsOffset = utils::roundUp(
3402 LaunchParams.Size, alignof(hsa_utils::AMDGPUImplicitArgsTy));
3403 if (ArgsSize > ImplArgsOffset) {
3404 hsa_utils::AMDGPUImplicitArgsTy *ImplArgs =
3405 reinterpret_cast<hsa_utils::AMDGPUImplicitArgsTy *>(
3406 utils::advancePtr(AllArgs, ImplArgsOffset));
3407
3408 // Set the COV5+ implicit arguments to the appropriate values if present.
3409 uint64_t ImplArgsSize = ArgsSize - ImplArgsOffset;
3410    std::memset(ImplArgs, 0, ImplArgsSize);
3411
3412 using ImplArgsTy = hsa_utils::AMDGPUImplicitArgsTy;
3413 hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::BlockCountX, ImplArgsSize,
3414 NumBlocks[0]);
3415 hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::BlockCountY, ImplArgsSize,
3416 NumBlocks[1]);
3417 hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::BlockCountZ, ImplArgsSize,
3418 NumBlocks[2]);
3419
3420 hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::GroupSizeX, ImplArgsSize,
3421 NumThreads[0]);
3422 hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::GroupSizeY, ImplArgsSize,
3423 NumThreads[1]);
3424 hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::GroupSizeZ, ImplArgsSize,
3425 NumThreads[2]);
3426
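    // The grid dimensionality is 3 if the Z extent is larger than one,
    // otherwise 2 if the Y extent is larger than one, and 1 for a purely
    // one-dimensional launch; e.g., NumBlocks={4,1,1} with NumThreads={64,1,1}
    // yields a dimensionality of 1.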
3427 hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::GridDims, ImplArgsSize,
3428 NumBlocks[2] * NumThreads[2] > 1
3429 ? 3
3430 : 1 + (NumBlocks[1] * NumThreads[1] != 1));
3431
3432 hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::DynamicLdsSize, ImplArgsSize,
3433 KernelArgs.DynCGroupMem);
3434 }
3435
3436 // Push the kernel launch into the stream.
3437  return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
3438                                  GroupSize, StackSize, ArgsMemoryManager);
3439}
3440
3441Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
3442 KernelArgsTy &KernelArgs,
3443 uint32_t NumThreads[3],
3444 uint32_t NumBlocks[3]) const {
3445 // Only do all this when the output is requested
3446 if (!(getInfoLevel() & OMP_INFOTYPE_PLUGIN_KERNEL))
3447 return Plugin::success();
3448
3449  // We don't have the data to print additional info; this is not a hard error.
3450 if (!KernelInfo.has_value())
3451 return Plugin::success();
3452
3453 // General Info
3454 auto NumGroups = NumBlocks;
3455 auto ThreadsPerGroup = NumThreads;
3456
3457 // Kernel Arguments Info
3458 auto ArgNum = KernelArgs.NumArgs;
3459 auto LoopTripCount = KernelArgs.Tripcount;
3460
3461 // Details for AMDGPU kernels (read from image)
3462 // https://www.llvm.org/docs/AMDGPUUsage.html#code-object-v4-metadata
3463 auto GroupSegmentSize = (*KernelInfo).GroupSegmentList;
3464 auto SGPRCount = (*KernelInfo).SGPRCount;
3465 auto VGPRCount = (*KernelInfo).VGPRCount;
3466 auto SGPRSpillCount = (*KernelInfo).SGPRSpillCount;
3467 auto VGPRSpillCount = (*KernelInfo).VGPRSpillCount;
3468 auto MaxFlatWorkgroupSize = (*KernelInfo).MaxFlatWorkgroupSize;
3469
3470 // Prints additional launch info that contains the following.
3471 // Num Args: The number of kernel arguments
3472 // Teams x Thrds: The number of teams and the number of threads actually
3473 // running.
3474 // MaxFlatWorkgroupSize: Maximum flat work-group size supported by the
3475 // kernel in work-items
3476 // LDS Usage: Amount of bytes used in LDS storage
3477 // S/VGPR Count: the number of S/V GPRs occupied by the kernel
3478 // S/VGPR Spill Count: how many S/VGPRs are spilled by the kernel
3479 // Tripcount: loop tripcount for the kernel
3480 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(),
3481 "#Args: %d Teams x Thrds: %4ux%4u (MaxFlatWorkGroupSize: %u) LDS "
3482 "Usage: %uB #SGPRs/VGPRs: %u/%u #SGPR/VGPR Spills: %u/%u Tripcount: "
3483 "%lu\n",
3484 ArgNum, NumGroups[0] * NumGroups[1] * NumGroups[2],
3485 ThreadsPerGroup[0] * ThreadsPerGroup[1] * ThreadsPerGroup[2],
3486 MaxFlatWorkgroupSize, GroupSegmentSize, SGPRCount, VGPRCount,
3487 SGPRSpillCount, VGPRSpillCount, LoopTripCount);
3488
3489 return Plugin::success();
3490}
3491
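// Convert an HSA status code into a plugin error. HSA_STATUS_SUCCESS and
// HSA_STATUS_INFO_BREAK are treated as success; any other code is mapped to an
// offload error code and the HSA status description is appended to the
// formatted message.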
3492template <typename... ArgsTy>
3493static Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) {
3494 hsa_status_t ResultCode = static_cast<hsa_status_t>(Code);
3495 if (ResultCode == HSA_STATUS_SUCCESS || ResultCode == HSA_STATUS_INFO_BREAK)
3496 return Plugin::success();
3497
3498 const char *Desc = "unknown error";
3499 hsa_status_t Ret = hsa_status_string(ResultCode, &Desc);
3500 if (Ret != HSA_STATUS_SUCCESS)
3501 REPORT("Unrecognized " GETNAME(TARGET_NAME) " error code %d\n", Code);
3502
3503 // TODO: Add more entries to this switch
3504 ErrorCode OffloadErrCode;
3505 switch (ResultCode) {
3506 case HSA_STATUS_ERROR_INVALID_SYMBOL_NAME:
3507 OffloadErrCode = ErrorCode::NOT_FOUND;
3508 break;
3509 default:
3510 OffloadErrCode = ErrorCode::UNKNOWN;
3511 }
3512
3513 return Plugin::error(OffloadErrCode, ErrFmt, Args..., Desc);
3514}
3515
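// Allocate a buffer from the managed memory pool and make it accessible to
// every kernel (GPU) agent that can access that pool. The host pointer and
// allocation kind are not used by this manager.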
3516void *AMDGPUMemoryManagerTy::allocate(size_t Size, void *HstPtr,
3517 TargetAllocTy Kind) {
3518 // Allocate memory from the pool.
3519 void *Ptr = nullptr;
3520  if (auto Err = MemoryPool->allocate(Size, &Ptr)) {
3521    consumeError(std::move(Err));
3522 return nullptr;
3523 }
3524 assert(Ptr && "Invalid pointer");
3525
3526 // Get a list of agents that can access this memory pool.
3527 llvm::SmallVector<hsa_agent_t> Agents;
3528 llvm::copy_if(
3529 Plugin.getKernelAgents(), std::back_inserter(Agents),
3530 [&](hsa_agent_t Agent) { return MemoryPool->canAccess(Agent); });
3531
3532 // Allow all valid kernel agents to access the allocation.
3533 if (auto Err = MemoryPool->enableAccess(Ptr, Size, Agents)) {
3534 REPORT("%s\n", toString(std::move(Err)).data());
3535 return nullptr;
3536 }
3537 return Ptr;
3538}
3539
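// Select the memory pool for the requested allocation kind: device allocations
// come from the device's first coarse-grained pool, while host and shared
// allocations come from the host device's fine-grained pool. Successful
// allocations are then made visible to all kernel agents that can access the
// chosen pool.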
3540void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
3541 if (Size == 0)
3542 return nullptr;
3543
3544 // Find the correct memory pool.
3545 AMDGPUMemoryPoolTy *MemoryPool = nullptr;
3546 switch (Kind) {
3547 case TARGET_ALLOC_DEFAULT:
3548 case TARGET_ALLOC_DEVICE:
3549 case TARGET_ALLOC_DEVICE_NON_BLOCKING:
3550 MemoryPool = CoarseGrainedMemoryPools[0];
3551 break;
3552 case TARGET_ALLOC_HOST:
3553 MemoryPool = &HostDevice.getFineGrainedMemoryPool();
3554 break;
3555 case TARGET_ALLOC_SHARED:
3556 MemoryPool = &HostDevice.getFineGrainedMemoryPool();
3557 break;
3558 }
3559
3560 if (!MemoryPool) {
3561 REPORT("No memory pool for the specified allocation kind\n");
3562 return nullptr;
3563 }
3564
3565 // Allocate from the corresponding memory pool.
3566 void *Alloc = nullptr;
3567  if (Error Err = MemoryPool->allocate(Size, &Alloc)) {
3568    REPORT("%s\n", toString(std::move(Err)).data());
3569 return nullptr;
3570 }
3571
3572 if (Alloc) {
3573    // Get a list of agents that can access this memory pool. This is
3574    // inherently necessary for host or shared allocations, and it is also
3575    // enabled for device memory to allow device-to-device memcpy.
3576 llvm::SmallVector<hsa_agent_t> Agents;
3577 llvm::copy_if(static_cast<AMDGPUPluginTy &>(Plugin).getKernelAgents(),
3578 std::back_inserter(Agents), [&](hsa_agent_t Agent) {
3579 return MemoryPool->canAccess(Agent);
3580 });
3581
3582 // Enable all valid kernel agents to access the buffer.
3583 if (auto Err = MemoryPool->enableAccess(Alloc, Size, Agents)) {
3584 REPORT("%s\n", toString(std::move(Err)).data());
3585 return nullptr;
3586 }
3587 }
3588
3589 return Alloc;
3590}
3591
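// Callback invoked by the HSA runtime when an error is reported for a queue.
// For kernel exceptions, the traces of kernels recently launched on the stream
// owning the failing queue are reported before aborting with the HSA error
// description.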
3592void AMDGPUQueueTy::callbackError(hsa_status_t Status, hsa_queue_t *Source,
3593 void *Data) {
3594 auto &AMDGPUDevice = *reinterpret_cast<AMDGPUDeviceTy *>(Data);
3595
3596 if (Status == HSA_STATUS_ERROR_EXCEPTION) {
3597 auto KernelTraceInfoRecord =
3598 AMDGPUDevice.KernelLaunchTraces.getExclusiveAccessor();
3599 std::function<bool(__tgt_async_info &)> AsyncInfoWrapperMatcher =
3600 [=](__tgt_async_info &AsyncInfo) {
3601 auto *Stream = reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
3602 if (!Stream || !Stream->getQueue())
3603 return false;
3604 return Stream->getQueue()->Queue == Source;
3605 };
3606 ErrorReporter::reportTrapInKernel(AMDGPUDevice, *KernelTraceInfoRecord,
3607 AsyncInfoWrapperMatcher);
3608 }
3609
3610 auto Err = Plugin::check(Status, "received error in queue %p: %s", Source);
3611 FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data());
3612}
3613
3614} // namespace plugin
3615} // namespace target
3616} // namespace omp
3617} // namespace llvm
3618
3619extern "C" {
3620llvm::omp::target::plugin::GenericPluginTy *createPlugin_amdgpu() {
3621 return new llvm::omp::target::plugin::AMDGPUPluginTy();
3622}
3623}
3624
