1 | //===----RTLs/amdgpu/src/rtl.cpp - Target RTLs Implementation ----- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // RTL NextGen for AMDGPU machine |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #include <atomic> |
14 | #include <cassert> |
15 | #include <cstddef> |
16 | #include <cstdint> |
17 | #include <deque> |
18 | #include <functional> |
19 | #include <mutex> |
20 | #include <string> |
21 | #include <system_error> |
22 | #include <unistd.h> |
23 | #include <unordered_map> |
24 | |
25 | #include "ErrorReporting.h" |
26 | #include "Shared/APITypes.h" |
27 | #include "Shared/Debug.h" |
28 | #include "Shared/Environment.h" |
29 | #include "Shared/RefCnt.h" |
30 | #include "Shared/Utils.h" |
31 | #include "Utils/ELF.h" |
32 | |
33 | #include "GlobalHandler.h" |
34 | #include "OpenMP/OMPT/Callback.h" |
35 | #include "PluginInterface.h" |
36 | #include "UtilitiesRTL.h" |
37 | #include "omptarget.h" |
38 | |
39 | #include "llvm/ADT/SmallString.h" |
40 | #include "llvm/ADT/SmallVector.h" |
41 | #include "llvm/ADT/StringRef.h" |
42 | #include "llvm/BinaryFormat/ELF.h" |
43 | #include "llvm/Frontend/OpenMP/OMPConstants.h" |
44 | #include "llvm/Frontend/OpenMP/OMPGridValues.h" |
45 | #include "llvm/Support/Error.h" |
46 | #include "llvm/Support/FileOutputBuffer.h" |
47 | #include "llvm/Support/FileSystem.h" |
48 | #include "llvm/Support/MemoryBuffer.h" |
49 | #include "llvm/Support/Program.h" |
50 | #include "llvm/Support/Signals.h" |
51 | #include "llvm/Support/raw_ostream.h" |
52 | |
53 | #if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || \ |
54 | !defined(__ORDER_BIG_ENDIAN__) |
55 | #error "Missing preprocessor definitions for endianness detection." |
56 | #endif |
57 | |
58 | // The HSA headers require these definitions. |
59 | #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) |
60 | #define LITTLEENDIAN_CPU |
61 | #elif defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) |
62 | #define BIGENDIAN_CPU |
63 | #endif |
64 | |
65 | #if defined(__has_include) |
66 | #if __has_include("hsa.h") |
67 | #include "hsa.h" |
68 | #include "hsa_ext_amd.h" |
69 | #elif __has_include("hsa/hsa.h") |
70 | #include "hsa/hsa.h" |
71 | #include "hsa/hsa_ext_amd.h" |
72 | #endif |
73 | #else |
74 | #include "hsa/hsa.h" |
75 | #include "hsa/hsa_ext_amd.h" |
76 | #endif |
77 | |
78 | using namespace error; |
79 | |
80 | namespace llvm { |
81 | namespace omp { |
82 | namespace target { |
83 | namespace plugin { |
84 | |
85 | /// Forward declarations for all specialized data structures. |
86 | struct AMDGPUKernelTy; |
87 | struct AMDGPUDeviceTy; |
88 | struct AMDGPUPluginTy; |
89 | struct AMDGPUStreamTy; |
90 | struct AMDGPUEventTy; |
91 | struct AMDGPUStreamManagerTy; |
92 | struct AMDGPUEventManagerTy; |
93 | struct AMDGPUDeviceImageTy; |
94 | struct AMDGPUMemoryManagerTy; |
95 | struct AMDGPUMemoryPoolTy; |
96 | |
97 | namespace hsa_utils { |
98 | |
99 | /// Iterate elements using an HSA iterate function. Do not use this function |
100 | /// directly but the specialized ones below instead. |
101 | template <typename ElemTy, typename IterFuncTy, typename CallbackTy> |
102 | hsa_status_t iterate(IterFuncTy Func, CallbackTy Cb) { |
103 | auto L = [](ElemTy Elem, void *Data) -> hsa_status_t { |
104 | CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data); |
105 | return (*Unwrapped)(Elem); |
106 | }; |
107 | return Func(L, static_cast<void *>(&Cb)); |
108 | } |
109 | |
110 | /// Iterate elements using an HSA iterate function passing a parameter. Do not |
111 | /// use this function directly but the specialized ones below instead. |
112 | template <typename ElemTy, typename IterFuncTy, typename IterFuncArgTy, |
113 | typename CallbackTy> |
114 | hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) { |
115 | auto L = [](ElemTy Elem, void *Data) -> hsa_status_t { |
116 | CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data); |
117 | return (*Unwrapped)(Elem); |
118 | }; |
119 | return Func(FuncArg, L, static_cast<void *>(&Cb)); |
120 | } |
121 | |
122 | /// Iterate elements using an HSA iterate function passing a parameter. Do not |
123 | /// use this function directly but the specialized ones below instead. |
124 | template <typename Elem1Ty, typename Elem2Ty, typename IterFuncTy, |
125 | typename IterFuncArgTy, typename CallbackTy> |
126 | hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) { |
127 | auto L = [](Elem1Ty Elem1, Elem2Ty Elem2, void *Data) -> hsa_status_t { |
128 | CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data); |
129 | return (*Unwrapped)(Elem1, Elem2); |
130 | }; |
131 | return Func(FuncArg, L, static_cast<void *>(&Cb)); |
132 | } |
133 | |
134 | /// Iterate agents. |
135 | template <typename CallbackTy> Error iterateAgents(CallbackTy Callback) { |
136 | hsa_status_t Status = iterate<hsa_agent_t>(hsa_iterate_agents, Callback); |
137 | return Plugin::check(Status, "error in hsa_iterate_agents: %s" ); |
138 | } |
139 | |
140 | /// Iterate ISAs of an agent. |
141 | template <typename CallbackTy> |
142 | Error iterateAgentISAs(hsa_agent_t Agent, CallbackTy Cb) { |
143 | hsa_status_t Status = iterate<hsa_isa_t>(hsa_agent_iterate_isas, Agent, Cb); |
144 | return Plugin::check(Status, "error in hsa_agent_iterate_isas: %s" ); |
145 | } |
146 | |
147 | /// Iterate memory pools of an agent. |
148 | template <typename CallbackTy> |
149 | Error iterateAgentMemoryPools(hsa_agent_t Agent, CallbackTy Cb) { |
150 | hsa_status_t Status = iterate<hsa_amd_memory_pool_t>( |
151 | hsa_amd_agent_iterate_memory_pools, Agent, Cb); |
152 | return Plugin::check(Status, |
153 | "error in hsa_amd_agent_iterate_memory_pools: %s" ); |
154 | } |
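
// Illustrative sketch (not part of this plugin's API): the iterate helpers
// above accept any callable taking the element type and returning an
// hsa_status_t. For example, collecting all agents inside a function that
// returns llvm::Error could look like:
//
//   llvm::SmallVector<hsa_agent_t> AllAgents;
//   if (auto Err = hsa_utils::iterateAgents([&](hsa_agent_t Agent) {
//         AllAgents.push_back(Agent);
//         return HSA_STATUS_SUCCESS;
//       }))
//     return Err;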
155 | |
156 | /// Dispatches an asynchronous memory copy. |
157 | /// Enables different SDMA engines for the dispatch in a round-robin fashion. |
158 | Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent, |
159 | const void *Src, hsa_agent_t SrcAgent, size_t Size, |
160 | uint32_t NumDepSignals, const hsa_signal_t *DepSignals, |
161 | hsa_signal_t CompletionSignal) { |
162 | if (!UseMultipleSdmaEngines) { |
163 | hsa_status_t S = |
164 | hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, Size, |
165 | NumDepSignals, DepSignals, CompletionSignal); |
166 | return Plugin::check(S, "error in hsa_amd_memory_async_copy: %s" ); |
167 | } |
168 | |
169 | // This solution is probably not the best |
170 | #if !(HSA_AMD_INTERFACE_VERSION_MAJOR >= 1 && \ |
171 | HSA_AMD_INTERFACE_VERSION_MINOR >= 2) |
172 | return Plugin::error(ErrorCode::UNSUPPORTED, |
173 | "async copy on selected SDMA requires ROCm 5.7" ); |
174 | #else |
175 | static std::atomic<int> SdmaEngine{1}; |
176 | |
177 | // This atomics solution is probably not the best, but should be sufficient |
178 | // for now. |
  // In a worst-case scenario, in which threads read the same value, they will
  // dispatch to the same SDMA engine. This may result in sub-optimal
  // performance, but the probability of that happening is fairly low.
182 | int LocalSdmaEngine = SdmaEngine.load(std::memory_order_acquire); |
  // This call is only available in ROCm >= 5.7.
184 | hsa_status_t S = hsa_amd_memory_async_copy_on_engine( |
185 | Dst, DstAgent, Src, SrcAgent, Size, NumDepSignals, DepSignals, |
186 | CompletionSignal, (hsa_amd_sdma_engine_id_t)LocalSdmaEngine, |
187 | /*force_copy_on_sdma=*/true); |
188 | // Increment to use one of two SDMA engines: 0x1, 0x2 |
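  // The update below cycles the engine id between 1 and 2, since
  // (1 << 1) % 3 == 2 and (2 << 1) % 3 == 1.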
189 | LocalSdmaEngine = (LocalSdmaEngine << 1) % 3; |
190 | SdmaEngine.store(LocalSdmaEngine, std::memory_order_relaxed); |
191 | |
192 | return Plugin::check(S, "error in hsa_amd_memory_async_copy_on_engine: %s" ); |
193 | #endif |
194 | } |
195 | |
196 | Error getTargetTripleAndFeatures(hsa_agent_t Agent, |
197 | SmallVector<SmallString<32>> &Targets) { |
198 | auto Err = hsa_utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) { |
199 | uint32_t Length; |
200 | hsa_status_t Status; |
201 | Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME_LENGTH, &Length); |
202 | if (Status != HSA_STATUS_SUCCESS) |
203 | return Status; |
204 | |
205 | llvm::SmallVector<char> ISAName(Length); |
206 | Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, ISAName.begin()); |
207 | if (Status != HSA_STATUS_SUCCESS) |
208 | return Status; |
209 | |
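    // ISA names are expected to look like "amdgcn-amd-amdhsa--gfx90a",
    // possibly followed by target feature suffixes; keep only the part after
    // the triple.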
210 | llvm::StringRef TripleTarget(ISAName.begin(), Length); |
    if (TripleTarget.consume_front("amdgcn-amd-amdhsa")) {
      auto Target = TripleTarget.ltrim('-').rtrim('\0');
213 | Targets.push_back(Target); |
214 | } |
215 | return HSA_STATUS_SUCCESS; |
216 | }); |
217 | return Err; |
218 | } |
219 | } // namespace hsa_utils |
220 | |
221 | /// Utility class representing generic resource references to AMDGPU resources. |
222 | template <typename ResourceTy> |
223 | struct AMDGPUResourceRef : public GenericDeviceResourceRef { |
224 | /// The underlying handle type for resources. |
225 | using HandleTy = ResourceTy *; |
226 | |
227 | /// Create an empty reference to an invalid resource. |
228 | AMDGPUResourceRef() : Resource(nullptr) {} |
229 | |
230 | /// Create a reference to an existing resource. |
231 | AMDGPUResourceRef(HandleTy Resource) : Resource(Resource) {} |
232 | |
233 | virtual ~AMDGPUResourceRef() {} |
234 | |
  /// Create a new resource and save the reference. The reference must be empty
  /// before calling this function.
237 | Error create(GenericDeviceTy &Device) override; |
238 | |
  /// Destroy the referenced resource and invalidate the reference. The
  /// reference must be to a valid resource before calling this function.
241 | Error destroy(GenericDeviceTy &Device) override { |
242 | if (!Resource) |
243 | return Plugin::error(ErrorCode::INVALID_ARGUMENT, |
244 | "destroying an invalid resource" ); |
245 | |
246 | if (auto Err = Resource->deinit()) |
247 | return Err; |
248 | |
249 | delete Resource; |
250 | |
251 | Resource = nullptr; |
252 | return Plugin::success(); |
253 | } |
254 | |
255 | /// Get the underlying resource handle. |
256 | operator HandleTy() const { return Resource; } |
257 | |
258 | private: |
259 | /// The handle to the actual resource. |
260 | HandleTy Resource; |
261 | }; |
262 | |
263 | /// Class holding an HSA memory pool. |
264 | struct AMDGPUMemoryPoolTy { |
265 | /// Create a memory pool from an HSA memory pool. |
266 | AMDGPUMemoryPoolTy(hsa_amd_memory_pool_t MemoryPool) |
267 | : MemoryPool(MemoryPool), GlobalFlags(0) {} |
268 | |
269 | /// Initialize the memory pool retrieving its properties. |
270 | Error init() { |
271 | if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_SEGMENT, Segment)) |
272 | return Err; |
273 | |
274 | if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, GlobalFlags)) |
275 | return Err; |
276 | |
277 | return Plugin::success(); |
278 | } |
279 | |
280 | /// Getter of the HSA memory pool. |
281 | hsa_amd_memory_pool_t get() const { return MemoryPool; } |
282 | |
  /// Indicate the segment to which the memory pool belongs.
284 | bool isGlobal() const { return (Segment == HSA_AMD_SEGMENT_GLOBAL); } |
285 | bool isReadOnly() const { return (Segment == HSA_AMD_SEGMENT_READONLY); } |
286 | bool isPrivate() const { return (Segment == HSA_AMD_SEGMENT_PRIVATE); } |
287 | bool isGroup() const { return (Segment == HSA_AMD_SEGMENT_GROUP); } |
288 | |
289 | /// Indicate if it is fine-grained memory. Valid only for global. |
290 | bool isFineGrained() const { |
291 | assert(isGlobal() && "Not global memory" ); |
292 | return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED); |
293 | } |
294 | |
295 | /// Indicate if it is coarse-grained memory. Valid only for global. |
296 | bool isCoarseGrained() const { |
297 | assert(isGlobal() && "Not global memory" ); |
298 | return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED); |
299 | } |
300 | |
301 | /// Indicate if it supports storing kernel arguments. Valid only for global. |
302 | bool supportsKernelArgs() const { |
303 | assert(isGlobal() && "Not global memory" ); |
304 | return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT); |
305 | } |
306 | |
307 | /// Allocate memory on the memory pool. |
308 | Error allocate(size_t Size, void **PtrStorage) { |
309 | hsa_status_t Status = |
310 | hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, PtrStorage); |
311 | return Plugin::check(Status, "error in hsa_amd_memory_pool_allocate: %s" ); |
312 | } |
313 | |
314 | /// Return memory to the memory pool. |
315 | Error deallocate(void *Ptr) { |
316 | hsa_status_t Status = hsa_amd_memory_pool_free(Ptr); |
317 | return Plugin::check(Status, "error in hsa_amd_memory_pool_free: %s" ); |
318 | } |
319 | |
320 | /// Returns if the \p Agent can access the memory pool. |
321 | bool canAccess(hsa_agent_t Agent) { |
322 | hsa_amd_memory_pool_access_t Access; |
323 | if (hsa_amd_agent_memory_pool_get_info( |
324 | Agent, MemoryPool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &Access)) |
325 | return false; |
326 | return Access != HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED; |
327 | } |
328 | |
329 | /// Allow the device to access a specific allocation. |
330 | Error enableAccess(void *Ptr, int64_t Size, |
331 | const llvm::SmallVector<hsa_agent_t> &Agents) const { |
332 | #ifdef OMPTARGET_DEBUG |
333 | for (hsa_agent_t Agent : Agents) { |
334 | hsa_amd_memory_pool_access_t Access; |
335 | if (auto Err = |
336 | getAttr(Agent, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, Access)) |
337 | return Err; |
338 | |
      // The agent is not allowed to access the memory pool in any case. Do not
      // continue because otherwise it results in undefined behavior.
341 | if (Access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) |
342 | return Plugin::error(ErrorCode::INVALID_VALUE, |
343 | "an agent is not allowed to access a memory pool" ); |
344 | } |
345 | #endif |
346 | |
    // The agents can access the memory pool, but the access is disabled by
    // default. Enable it now.
348 | hsa_status_t Status = |
349 | hsa_amd_agents_allow_access(Agents.size(), Agents.data(), nullptr, Ptr); |
350 | return Plugin::check(Status, "error in hsa_amd_agents_allow_access: %s" ); |
351 | } |
352 | |
353 | /// Get attribute from the memory pool. |
354 | template <typename Ty> |
355 | Error getAttr(hsa_amd_memory_pool_info_t Kind, Ty &Value) const { |
356 | hsa_status_t Status; |
357 | Status = hsa_amd_memory_pool_get_info(MemoryPool, Kind, &Value); |
358 | return Plugin::check(Status, "error in hsa_amd_memory_pool_get_info: %s" ); |
359 | } |
360 | |
361 | template <typename Ty> |
362 | hsa_status_t getAttrRaw(hsa_amd_memory_pool_info_t Kind, Ty &Value) const { |
363 | return hsa_amd_memory_pool_get_info(MemoryPool, Kind, &Value); |
364 | } |
365 | |
366 | /// Get attribute from the memory pool relating to an agent. |
367 | template <typename Ty> |
368 | Error getAttr(hsa_agent_t Agent, hsa_amd_agent_memory_pool_info_t Kind, |
369 | Ty &Value) const { |
370 | hsa_status_t Status; |
371 | Status = |
372 | hsa_amd_agent_memory_pool_get_info(Agent, MemoryPool, Kind, &Value); |
373 | return Plugin::check(Status, |
374 | "error in hsa_amd_agent_memory_pool_get_info: %s" ); |
375 | } |
376 | |
377 | private: |
378 | /// The HSA memory pool. |
379 | hsa_amd_memory_pool_t MemoryPool; |
380 | |
381 | /// The segment where the memory pool belongs to. |
382 | hsa_amd_segment_t Segment; |
383 | |
384 | /// The global flags of memory pool. Only valid if the memory pool belongs to |
385 | /// the global segment. |
386 | uint32_t GlobalFlags; |
387 | }; |
388 | |
389 | /// Class that implements a memory manager that gets memory from a specific |
390 | /// memory pool. |
391 | struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy { |
392 | |
393 | /// Create an empty memory manager. |
394 | AMDGPUMemoryManagerTy(AMDGPUPluginTy &Plugin) |
395 | : Plugin(Plugin), MemoryPool(nullptr), MemoryManager(nullptr) {} |
396 | |
397 | /// Initialize the memory manager from a memory pool. |
398 | Error init(AMDGPUMemoryPoolTy &MemoryPool) { |
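    // The underlying memory manager is created with a 1 GiB (1 << 30 bytes)
    // size threshold.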
399 | const uint32_t Threshold = 1 << 30; |
400 | this->MemoryManager = new MemoryManagerTy(*this, Threshold); |
401 | this->MemoryPool = &MemoryPool; |
402 | return Plugin::success(); |
403 | } |
404 | |
405 | /// Deinitialize the memory manager and free its allocations. |
406 | Error deinit() { |
407 | assert(MemoryManager && "Invalid memory manager" ); |
408 | |
409 | // Delete and invalidate the memory manager. At this point, the memory |
410 | // manager will deallocate all its allocations. |
411 | delete MemoryManager; |
412 | MemoryManager = nullptr; |
413 | |
414 | return Plugin::success(); |
415 | } |
416 | |
417 | /// Reuse or allocate memory through the memory manager. |
418 | Error allocate(size_t Size, void **PtrStorage) { |
419 | assert(MemoryManager && "Invalid memory manager" ); |
420 | assert(PtrStorage && "Invalid pointer storage" ); |
421 | |
422 | *PtrStorage = MemoryManager->allocate(Size, nullptr); |
423 | if (Size && *PtrStorage == nullptr) |
424 | return Plugin::error(ErrorCode::OUT_OF_RESOURCES, |
425 | "failure to allocate from AMDGPU memory manager" ); |
426 | |
427 | return Plugin::success(); |
428 | } |
429 | |
430 | /// Release an allocation to be reused. |
431 | Error deallocate(void *Ptr) { |
432 | if (MemoryManager->free(Ptr)) |
433 | return Plugin::error(ErrorCode::UNKNOWN, |
434 | "failure to deallocate from AMDGPU memory manager" ); |
435 | |
436 | return Plugin::success(); |
437 | } |
438 | |
439 | private: |
440 | /// Allocation callback that will be called once the memory manager does not |
441 | /// have more previously allocated buffers. |
442 | void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) override; |
443 | |
444 | /// Deallocation callback that will be called by the memory manager. |
445 | int free(void *TgtPtr, TargetAllocTy Kind) override { |
    if (auto Err = MemoryPool->deallocate(TgtPtr)) {
      consumeError(std::move(Err));
448 | return OFFLOAD_FAIL; |
449 | } |
450 | return OFFLOAD_SUCCESS; |
451 | } |
452 | |
453 | /// The underlying plugin that owns this memory manager. |
454 | AMDGPUPluginTy &Plugin; |
455 | |
456 | /// The memory pool used to allocate memory. |
457 | AMDGPUMemoryPoolTy *MemoryPool; |
458 | |
459 | /// Reference to the actual memory manager. |
460 | MemoryManagerTy *MemoryManager; |
461 | }; |
462 | |
463 | /// Class implementing the AMDGPU device images' properties. |
464 | struct AMDGPUDeviceImageTy : public DeviceImageTy { |
465 | /// Create the AMDGPU image with the id and the target image pointer. |
466 | AMDGPUDeviceImageTy(int32_t ImageId, GenericDeviceTy &Device, |
467 | const __tgt_device_image *TgtImage) |
468 | : DeviceImageTy(ImageId, Device, TgtImage) {} |
469 | |
470 | /// Prepare and load the executable corresponding to the image. |
471 | Error loadExecutable(const AMDGPUDeviceTy &Device); |
472 | |
473 | /// Unload the executable. |
474 | Error unloadExecutable() { |
475 | hsa_status_t Status = hsa_executable_destroy(Executable); |
476 | return Plugin::check(Status, "error in hsa_executable_destroy: %s" ); |
477 | } |
478 | |
479 | /// Get the executable. |
480 | hsa_executable_t getExecutable() const { return Executable; } |
481 | |
  /// Get the code object version of the ELF.
483 | uint16_t getELFABIVersion() const { return ELFABIVersion; } |
484 | |
485 | /// Find an HSA device symbol by its name on the executable. |
486 | Expected<hsa_executable_symbol_t> |
487 | findDeviceSymbol(GenericDeviceTy &Device, StringRef SymbolName) const; |
488 | |
489 | /// Get additional info for kernel, e.g., register spill counts |
490 | std::optional<offloading::amdgpu::AMDGPUKernelMetaData> |
491 | getKernelInfo(StringRef Identifier) const { |
492 | auto It = KernelInfoMap.find(Identifier); |
493 | |
494 | if (It == KernelInfoMap.end()) |
495 | return {}; |
496 | |
497 | return It->second; |
498 | } |
499 | |
500 | private: |
501 | /// The executable loaded on the agent. |
502 | hsa_executable_t Executable; |
503 | StringMap<offloading::amdgpu::AMDGPUKernelMetaData> KernelInfoMap; |
504 | uint16_t ELFABIVersion; |
505 | }; |
506 | |
507 | /// Class implementing the AMDGPU kernel functionalities which derives from the |
508 | /// generic kernel class. |
509 | struct AMDGPUKernelTy : public GenericKernelTy { |
510 | /// Create an AMDGPU kernel with a name and an execution mode. |
511 | AMDGPUKernelTy(const char *Name) : GenericKernelTy(Name) {} |
512 | |
513 | /// Initialize the AMDGPU kernel. |
514 | Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override { |
515 | AMDGPUDeviceImageTy &AMDImage = static_cast<AMDGPUDeviceImageTy &>(Image); |
516 | |
517 | // Kernel symbols have a ".kd" suffix. |
518 | std::string KernelName(getName()); |
    KernelName += ".kd";
520 | |
521 | // Find the symbol on the device executable. |
522 | auto SymbolOrErr = AMDImage.findDeviceSymbol(Device, KernelName); |
523 | if (!SymbolOrErr) |
524 | return SymbolOrErr.takeError(); |
525 | |
526 | hsa_executable_symbol_t Symbol = *SymbolOrErr; |
527 | hsa_symbol_kind_t SymbolType; |
528 | hsa_status_t Status; |
529 | |
530 | // Retrieve different properties of the kernel symbol. |
531 | std::pair<hsa_executable_symbol_info_t, void *> RequiredInfos[] = { |
532 | {HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &SymbolType}, |
533 | {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &KernelObject}, |
534 | {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &ArgsSize}, |
535 | {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &GroupSize}, |
536 | {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &DynamicStack}, |
537 | {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &PrivateSize}}; |
538 | |
539 | for (auto &Info : RequiredInfos) { |
540 | Status = hsa_executable_symbol_get_info(Symbol, Info.first, Info.second); |
541 | if (auto Err = Plugin::check( |
542 | Status, "error in hsa_executable_symbol_get_info: %s" )) |
543 | return Err; |
544 | } |
545 | |
546 | // Make sure it is a kernel symbol. |
547 | if (SymbolType != HSA_SYMBOL_KIND_KERNEL) |
548 | return Plugin::error(ErrorCode::INVALID_BINARY, |
549 | "symbol %s is not a kernel function" ); |
550 | |
551 | // TODO: Read the kernel descriptor for the max threads per block. May be |
552 | // read from the image. |
553 | |
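    // Implicit (hidden) kernel arguments are appended after the explicit ones;
    // their size depends on the code object ABI version of the image.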
554 | ImplicitArgsSize = |
555 | hsa_utils::getImplicitArgsSize(AMDImage.getELFABIVersion()); |
556 | DP("ELFABIVersion: %d\n" , AMDImage.getELFABIVersion()); |
557 | |
558 | // Get additional kernel info read from image |
559 | KernelInfo = AMDImage.getKernelInfo(getName()); |
560 | if (!KernelInfo.has_value()) |
561 | INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device.getDeviceId(), |
562 | "Could not read extra information for kernel %s." , getName()); |
563 | |
564 | return Plugin::success(); |
565 | } |
566 | |
567 | /// Launch the AMDGPU kernel function. |
568 | Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3], |
569 | uint32_t NumBlocks[3], KernelArgsTy &KernelArgs, |
570 | KernelLaunchParamsTy LaunchParams, |
571 | AsyncInfoWrapperTy &AsyncInfoWrapper) const override; |
572 | |
573 | /// Print more elaborate kernel launch info for AMDGPU |
574 | Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice, |
575 | KernelArgsTy &KernelArgs, uint32_t NumThreads[3], |
576 | uint32_t NumBlocks[3]) const override; |
577 | |
578 | /// Get group and private segment kernel size. |
579 | uint32_t getGroupSize() const { return GroupSize; } |
580 | uint32_t getPrivateSize() const { return PrivateSize; } |
581 | |
582 | /// Get the HSA kernel object representing the kernel function. |
583 | uint64_t getKernelObject() const { return KernelObject; } |
584 | |
  /// Get the size of the implicit arguments based on the code object version.
586 | uint32_t getImplicitArgsSize() const { return ImplicitArgsSize; } |
587 | |
588 | /// Indicates whether or not we need to set up our own private segment size. |
589 | bool usesDynamicStack() const { return DynamicStack; } |
590 | |
591 | private: |
592 | /// The kernel object to execute. |
593 | uint64_t KernelObject; |
594 | |
595 | /// The args, group and private segments sizes required by a kernel instance. |
596 | uint32_t ArgsSize; |
597 | uint32_t GroupSize; |
598 | uint32_t PrivateSize; |
599 | bool DynamicStack; |
600 | |
601 | /// The size of implicit kernel arguments. |
602 | uint32_t ImplicitArgsSize; |
603 | |
604 | /// Additional Info for the AMD GPU Kernel |
605 | std::optional<offloading::amdgpu::AMDGPUKernelMetaData> KernelInfo; |
606 | }; |
607 | |
608 | /// Class representing an HSA signal. Signals are used to define dependencies |
609 | /// between asynchronous operations: kernel launches and memory transfers. |
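/// By convention here, a signal starts at 1 and is decremented to 0 once the
/// associated operation completes; wait() returns when the value reaches 0.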
610 | struct AMDGPUSignalTy { |
611 | /// Create an empty signal. |
612 | AMDGPUSignalTy() : HSASignal({0}), UseCount() {} |
613 | AMDGPUSignalTy(AMDGPUDeviceTy &Device) : HSASignal({0}), UseCount() {} |
614 | |
615 | /// Initialize the signal with an initial value. |
616 | Error init(uint32_t InitialValue = 1) { |
617 | hsa_status_t Status = |
618 | hsa_amd_signal_create(InitialValue, 0, nullptr, 0, &HSASignal); |
619 | return Plugin::check(Status, "error in hsa_signal_create: %s" ); |
620 | } |
621 | |
622 | /// Deinitialize the signal. |
623 | Error deinit() { |
624 | hsa_status_t Status = hsa_signal_destroy(HSASignal); |
625 | return Plugin::check(Status, "error in hsa_signal_destroy: %s" ); |
626 | } |
627 | |
628 | /// Wait until the signal gets a zero value. |
629 | Error wait(const uint64_t ActiveTimeout = 0, |
630 | GenericDeviceTy *Device = nullptr) const { |
631 | if (ActiveTimeout) { |
632 | hsa_signal_value_t Got = 1; |
633 | Got = hsa_signal_wait_scacquire(HSASignal, HSA_SIGNAL_CONDITION_EQ, 0, |
634 | ActiveTimeout, HSA_WAIT_STATE_ACTIVE); |
635 | if (Got == 0) |
636 | return Plugin::success(); |
637 | } |
638 | |
639 | // If there is an RPC device attached to this stream we run it as a server. |
640 | uint64_t Timeout = UINT64_MAX; |
641 | auto WaitState = HSA_WAIT_STATE_BLOCKED; |
642 | while (hsa_signal_wait_scacquire(HSASignal, HSA_SIGNAL_CONDITION_EQ, 0, |
643 | Timeout, WaitState) != 0) |
644 | ; |
645 | return Plugin::success(); |
646 | } |
647 | |
648 | /// Load the value on the signal. |
649 | hsa_signal_value_t load() const { |
650 | return hsa_signal_load_scacquire(HSASignal); |
651 | } |
652 | |
  /// Decrement the signal by one.
654 | void signal() { |
655 | assert(load() > 0 && "Invalid signal value" ); |
656 | hsa_signal_subtract_screlease(HSASignal, 1); |
657 | } |
658 | |
659 | /// Reset the signal value before reusing the signal. Do not call this |
660 | /// function if the signal is being currently used by any watcher, such as a |
661 | /// plugin thread or the HSA runtime. |
662 | void reset() { hsa_signal_store_screlease(HSASignal, 1); } |
663 | |
664 | /// Increase the number of concurrent uses. |
665 | void increaseUseCount() { UseCount.increase(); } |
666 | |
  /// Decrease the number of concurrent uses and return whether it was the
  /// last use.
668 | bool decreaseUseCount() { return UseCount.decrease(); } |
669 | |
670 | hsa_signal_t get() const { return HSASignal; } |
671 | |
672 | private: |
673 | /// The underlying HSA signal. |
674 | hsa_signal_t HSASignal; |
675 | |
676 | /// Reference counter for tracking the concurrent use count. This is mainly |
677 | /// used for knowing how many streams are using the signal. |
678 | RefCountTy<> UseCount; |
679 | }; |
680 | |
681 | /// Classes for holding AMDGPU signals and managing signals. |
682 | using AMDGPUSignalRef = AMDGPUResourceRef<AMDGPUSignalTy>; |
683 | using AMDGPUSignalManagerTy = GenericDeviceResourceManagerTy<AMDGPUSignalRef>; |
684 | |
685 | /// Class holding an HSA queue to submit kernel and barrier packets. |
686 | struct AMDGPUQueueTy { |
687 | /// Create an empty queue. |
688 | AMDGPUQueueTy() : Queue(nullptr), Mutex(), NumUsers(0) {} |
689 | |
690 | /// Lazily initialize a new queue belonging to a specific agent. |
691 | Error init(GenericDeviceTy &Device, hsa_agent_t Agent, int32_t QueueSize) { |
692 | if (Queue) |
693 | return Plugin::success(); |
694 | hsa_status_t Status = |
695 | hsa_queue_create(Agent, QueueSize, HSA_QUEUE_TYPE_MULTI, callbackError, |
696 | &Device, UINT32_MAX, UINT32_MAX, &Queue); |
697 | return Plugin::check(Status, "error in hsa_queue_create: %s" ); |
698 | } |
699 | |
700 | /// Deinitialize the queue and destroy its resources. |
701 | Error deinit() { |
702 | std::lock_guard<std::mutex> Lock(Mutex); |
703 | if (!Queue) |
704 | return Plugin::success(); |
705 | hsa_status_t Status = hsa_queue_destroy(Queue); |
706 | return Plugin::check(Status, "error in hsa_queue_destroy: %s" ); |
707 | } |
708 | |
  /// Returns the number of streams this queue is currently assigned to.
710 | bool getUserCount() const { return NumUsers; } |
711 | |
712 | /// Returns if the underlying HSA queue is initialized. |
713 | bool isInitialized() { return Queue != nullptr; } |
714 | |
715 | /// Decrement user count of the queue object. |
716 | void removeUser() { --NumUsers; } |
717 | |
718 | /// Increase user count of the queue object. |
719 | void addUser() { ++NumUsers; } |
720 | |
721 | /// Push a kernel launch to the queue. The kernel launch requires an output |
722 | /// signal and can define an optional input signal (nullptr if none). |
723 | Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs, |
724 | uint32_t NumThreads[3], uint32_t NumBlocks[3], |
725 | uint32_t GroupSize, uint64_t StackSize, |
726 | AMDGPUSignalTy *OutputSignal, |
727 | AMDGPUSignalTy *InputSignal) { |
728 | assert(OutputSignal && "Invalid kernel output signal" ); |
729 | |
730 | // Lock the queue during the packet publishing process. Notice this blocks |
731 | // the addition of other packets to the queue. The following piece of code |
732 | // should be lightweight; do not block the thread, allocate memory, etc. |
733 | std::lock_guard<std::mutex> Lock(Mutex); |
734 | assert(Queue && "Interacted with a non-initialized queue!" ); |
735 | |
736 | // Add a barrier packet before the kernel packet in case there is a pending |
737 | // preceding operation. The barrier packet will delay the processing of |
    // subsequent queue packets until the barrier input signals are satisfied.
    // No output signal is needed because the dependency is already guaranteed
    // by the queue barrier itself.
741 | if (InputSignal && InputSignal->load()) |
      if (auto Err = pushBarrierImpl(nullptr, InputSignal))
743 | return Err; |
744 | |
745 | // Now prepare the kernel packet. |
746 | uint64_t PacketId; |
747 | hsa_kernel_dispatch_packet_t *Packet = acquirePacket(PacketId); |
748 | assert(Packet && "Invalid packet" ); |
749 | |
750 | // The first 32 bits of the packet are written after the other fields |
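    // Compute the number of dimensions: 3 if the Z grid extent is larger than
    // one, otherwise 2 if the Y grid extent differs from one, otherwise 1.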
751 | uint16_t Dims = NumBlocks[2] * NumThreads[2] > 1 |
752 | ? 3 |
753 | : 1 + (NumBlocks[1] * NumThreads[1] != 1); |
754 | uint16_t Setup = UINT16_C(Dims) |
755 | << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; |
756 | Packet->workgroup_size_x = NumThreads[0]; |
757 | Packet->workgroup_size_y = NumThreads[1]; |
758 | Packet->workgroup_size_z = NumThreads[2]; |
759 | Packet->reserved0 = 0; |
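    // HSA expects the grid size in work-items, so multiply the number of
    // blocks by the number of threads in each dimension.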
760 | Packet->grid_size_x = NumBlocks[0] * NumThreads[0]; |
761 | Packet->grid_size_y = NumBlocks[1] * NumThreads[1]; |
762 | Packet->grid_size_z = NumBlocks[2] * NumThreads[2]; |
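    // When the kernel uses a dynamic call stack, honor the launch-time stack
    // size instead of the statically known private segment size.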
763 | Packet->private_segment_size = |
764 | Kernel.usesDynamicStack() ? StackSize : Kernel.getPrivateSize(); |
765 | Packet->group_segment_size = GroupSize; |
766 | Packet->kernel_object = Kernel.getKernelObject(); |
767 | Packet->kernarg_address = KernelArgs; |
768 | Packet->reserved2 = 0; |
769 | Packet->completion_signal = OutputSignal->get(); |
770 | |
771 | // Publish the packet. Do not modify the packet after this point. |
772 | publishKernelPacket(PacketId, Setup, Packet); |
773 | |
774 | return Plugin::success(); |
775 | } |
776 | |
  /// Push a barrier packet that will wait on up to two input signals. All
  /// signals are optional (nullptr if none).
779 | Error pushBarrier(AMDGPUSignalTy *OutputSignal, |
780 | const AMDGPUSignalTy *InputSignal1, |
781 | const AMDGPUSignalTy *InputSignal2) { |
782 | // Lock the queue during the packet publishing process. |
783 | std::lock_guard<std::mutex> Lock(Mutex); |
784 | assert(Queue && "Interacted with a non-initialized queue!" ); |
785 | |
786 | // Push the barrier with the lock acquired. |
787 | return pushBarrierImpl(OutputSignal, InputSignal1, InputSignal2); |
788 | } |
789 | |
790 | private: |
  /// Push a barrier packet that will wait on up to two input signals. Assumes
  /// the queue lock is acquired.
793 | Error pushBarrierImpl(AMDGPUSignalTy *OutputSignal, |
794 | const AMDGPUSignalTy *InputSignal1, |
795 | const AMDGPUSignalTy *InputSignal2 = nullptr) { |
796 | // Add a queue barrier waiting on both the other stream's operation and the |
797 | // last operation on the current stream (if any). |
798 | uint64_t PacketId; |
799 | hsa_barrier_and_packet_t *Packet = |
800 | (hsa_barrier_and_packet_t *)acquirePacket(PacketId); |
801 | assert(Packet && "Invalid packet" ); |
802 | |
803 | Packet->reserved0 = 0; |
804 | Packet->reserved1 = 0; |
805 | Packet->dep_signal[0] = {0}; |
806 | Packet->dep_signal[1] = {0}; |
807 | Packet->dep_signal[2] = {0}; |
808 | Packet->dep_signal[3] = {0}; |
809 | Packet->dep_signal[4] = {0}; |
810 | Packet->reserved2 = 0; |
811 | Packet->completion_signal = {0}; |
812 | |
813 | // Set input and output dependencies if needed. |
814 | if (OutputSignal) |
815 | Packet->completion_signal = OutputSignal->get(); |
816 | if (InputSignal1) |
817 | Packet->dep_signal[0] = InputSignal1->get(); |
818 | if (InputSignal2) |
819 | Packet->dep_signal[1] = InputSignal2->get(); |
820 | |
821 | // Publish the packet. Do not modify the packet after this point. |
822 | publishBarrierPacket(PacketId, Packet); |
823 | |
824 | return Plugin::success(); |
825 | } |
826 | |
827 | /// Acquire a packet from the queue. This call may block the thread if there |
828 | /// is no space in the underlying HSA queue. It may need to wait until the HSA |
829 | /// runtime processes some packets. Assumes the queue lock is acquired. |
830 | hsa_kernel_dispatch_packet_t *acquirePacket(uint64_t &PacketId) { |
831 | // Increase the queue index with relaxed memory order. Notice this will need |
832 | // another subsequent atomic operation with acquire order. |
833 | PacketId = hsa_queue_add_write_index_relaxed(Queue, 1); |
834 | |
    // Wait for the packet to be available. Notice the atomic operation uses
836 | // the acquire memory order. |
837 | while (PacketId - hsa_queue_load_read_index_scacquire(Queue) >= Queue->size) |
838 | ; |
839 | |
840 | // Return the packet reference. |
841 | const uint32_t Mask = Queue->size - 1; // The size is a power of 2. |
842 | return (hsa_kernel_dispatch_packet_t *)Queue->base_address + |
843 | (PacketId & Mask); |
844 | } |
845 | |
846 | /// Publish the kernel packet so that the HSA runtime can start processing |
847 | /// the kernel launch. Do not modify the packet once this function is called. |
848 | /// Assumes the queue lock is acquired. |
849 | void publishKernelPacket(uint64_t PacketId, uint16_t Setup, |
850 | hsa_kernel_dispatch_packet_t *Packet) { |
851 | uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet); |
852 | |
    uint16_t Header =
        HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
854 | Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; |
855 | Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; |
856 | |
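    // The first 32-bit word of the packet combines the 16-bit header (low
    // half) and the 16-bit setup field (high half).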
    // Publish the packet. Do not modify the packet after this point.
    uint32_t HeaderWord = Header | (Setup << 16u);
859 | __atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE); |
860 | |
861 | // Signal the doorbell about the published packet. |
862 | hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId); |
863 | } |
864 | |
865 | /// Publish the barrier packet so that the HSA runtime can start processing |
866 | /// the barrier. Next packets in the queue will not be processed until all |
867 | /// barrier dependencies (signals) are satisfied. Assumes the queue is locked |
868 | void publishBarrierPacket(uint64_t PacketId, |
869 | hsa_barrier_and_packet_t *Packet) { |
870 | uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet); |
871 | uint16_t Setup = 0; |
    uint16_t Header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
873 | Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; |
874 | Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; |
875 | |
    // Publish the packet. Do not modify the packet after this point.
    uint32_t HeaderWord = Header | (Setup << 16u);
878 | __atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE); |
879 | |
880 | // Signal the doorbell about the published packet. |
881 | hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId); |
882 | } |
883 | |
884 | /// Callback that will be called when an error is detected on the HSA queue. |
885 | static void callbackError(hsa_status_t Status, hsa_queue_t *Source, |
886 | void *Data); |
887 | |
888 | /// The HSA queue. |
889 | hsa_queue_t *Queue; |
890 | |
891 | /// Mutex to protect the acquiring and publishing of packets. For the moment, |
892 | /// we need this mutex to prevent publishing packets that are not ready to be |
893 | /// published in a multi-thread scenario. Without a queue lock, a thread T1 |
894 | /// could acquire packet P and thread T2 acquire packet P+1. Thread T2 could |
895 | /// publish its packet P+1 (signaling the queue's doorbell) before packet P |
896 | /// from T1 is ready to be processed. That scenario should be invalid. Thus, |
897 | /// we use the following mutex to make packet acquiring and publishing atomic. |
898 | /// TODO: There are other more advanced approaches to avoid this mutex using |
899 | /// atomic operations. We can further investigate it if this is a bottleneck. |
900 | std::mutex Mutex; |
901 | |
  /// The number of streams this queue is currently assigned to. A queue is
  /// considered idle when this is zero, otherwise busy.
904 | uint32_t NumUsers; |
905 | }; |
906 | |
907 | /// Struct that implements a stream of asynchronous operations for AMDGPU |
908 | /// devices. This class relies on signals to implement streams and define the |
909 | /// dependencies between asynchronous operations. |
910 | struct AMDGPUStreamTy { |
911 | private: |
912 | /// Utility struct holding arguments for async H2H memory copies. |
913 | struct MemcpyArgsTy { |
914 | void *Dst; |
915 | const void *Src; |
916 | size_t Size; |
917 | }; |
918 | |
919 | /// Utility struct holding arguments for freeing buffers to memory managers. |
920 | struct ReleaseBufferArgsTy { |
921 | void *Buffer; |
922 | AMDGPUMemoryManagerTy *MemoryManager; |
923 | }; |
924 | |
925 | /// Utility struct holding arguments for releasing signals to signal managers. |
926 | struct ReleaseSignalArgsTy { |
927 | AMDGPUSignalTy *Signal; |
928 | AMDGPUSignalManagerTy *SignalManager; |
929 | }; |
930 | |
931 | using AMDGPUStreamCallbackTy = Error(void *Data); |
932 | |
  /// The stream is composed of N stream slots. The struct below represents
934 | /// the fields of each slot. Each slot has a signal and an optional action |
935 | /// function. When appending an HSA asynchronous operation to the stream, one |
936 | /// slot is consumed and used to store the operation's information. The |
937 | /// operation's output signal is set to the consumed slot's signal. If there |
938 | /// is a previous asynchronous operation on the previous slot, the HSA async |
939 | /// operation's input signal is set to the signal of the previous slot. This |
940 | /// way, we obtain a chain of dependent async operations. The action is a |
941 | /// function that will be executed eventually after the operation is |
942 | /// completed, e.g., for releasing a buffer. |
943 | struct StreamSlotTy { |
944 | /// The output signal of the stream operation. May be used by the subsequent |
945 | /// operation as input signal. |
946 | AMDGPUSignalTy *Signal; |
947 | |
    /// The actions that must be performed after the operation's completion.
    /// Empty when there is no action to perform.
950 | llvm::SmallVector<AMDGPUStreamCallbackTy *> Callbacks; |
951 | |
952 | /// Space for the action's arguments. A pointer to these arguments is passed |
953 | /// to the action function. Notice the space of arguments is limited. |
954 | union ActionArgsTy { |
955 | MemcpyArgsTy MemcpyArgs; |
956 | ReleaseBufferArgsTy ReleaseBufferArgs; |
957 | ReleaseSignalArgsTy ReleaseSignalArgs; |
958 | void *CallbackArgs; |
959 | }; |
960 | |
961 | llvm::SmallVector<ActionArgsTy> ActionArgs; |
962 | |
963 | /// Create an empty slot. |
964 | StreamSlotTy() : Signal(nullptr), Callbacks({}), ActionArgs({}) {} |
965 | |
966 | /// Schedule a host memory copy action on the slot. |
967 | Error schedHostMemoryCopy(void *Dst, const void *Src, size_t Size) { |
968 | Callbacks.emplace_back(memcpyAction); |
969 | ActionArgs.emplace_back().MemcpyArgs = MemcpyArgsTy{Dst, Src, Size}; |
970 | return Plugin::success(); |
971 | } |
972 | |
973 | /// Schedule a release buffer action on the slot. |
974 | Error schedReleaseBuffer(void *Buffer, AMDGPUMemoryManagerTy &Manager) { |
975 | Callbacks.emplace_back(releaseBufferAction); |
976 | ActionArgs.emplace_back().ReleaseBufferArgs = |
977 | ReleaseBufferArgsTy{Buffer, &Manager}; |
978 | return Plugin::success(); |
979 | } |
980 | |
981 | /// Schedule a signal release action on the slot. |
982 | Error schedReleaseSignal(AMDGPUSignalTy *SignalToRelease, |
983 | AMDGPUSignalManagerTy *SignalManager) { |
984 | Callbacks.emplace_back(releaseSignalAction); |
985 | ActionArgs.emplace_back().ReleaseSignalArgs = |
986 | ReleaseSignalArgsTy{SignalToRelease, SignalManager}; |
987 | return Plugin::success(); |
988 | } |
989 | |
    /// Register a callback to be called on completion.
991 | Error schedCallback(AMDGPUStreamCallbackTy *Func, void *Data) { |
992 | Callbacks.emplace_back(Func); |
993 | ActionArgs.emplace_back().CallbackArgs = Data; |
994 | |
995 | return Plugin::success(); |
996 | } |
997 | |
998 | // Perform the action if needed. |
999 | Error performAction() { |
1000 | if (Callbacks.empty()) |
1001 | return Plugin::success(); |
1002 | |
1003 | assert(Callbacks.size() == ActionArgs.size() && "Size mismatch" ); |
1004 | for (auto [Callback, ActionArg] : llvm::zip(Callbacks, ActionArgs)) { |
1005 | // Perform the action. |
1006 | if (Callback == memcpyAction) { |
1007 | if (auto Err = memcpyAction(&ActionArg)) |
1008 | return Err; |
1009 | } else if (Callback == releaseBufferAction) { |
1010 | if (auto Err = releaseBufferAction(&ActionArg)) |
1011 | return Err; |
1012 | } else if (Callback == releaseSignalAction) { |
1013 | if (auto Err = releaseSignalAction(&ActionArg)) |
1014 | return Err; |
1015 | } else if (Callback) { |
1016 | if (auto Err = Callback(ActionArg.CallbackArgs)) |
1017 | return Err; |
1018 | } |
1019 | } |
1020 | |
1021 | // Invalidate the action. |
1022 | Callbacks.clear(); |
1023 | ActionArgs.clear(); |
1024 | |
1025 | return Plugin::success(); |
1026 | } |
1027 | }; |
1028 | |
1029 | /// The device agent where the stream was created. |
1030 | hsa_agent_t Agent; |
1031 | |
1032 | /// The queue that the stream uses to launch kernels. |
1033 | AMDGPUQueueTy *Queue; |
1034 | |
1035 | /// The manager of signals to reuse signals. |
1036 | AMDGPUSignalManagerTy &SignalManager; |
1037 | |
1038 | /// A reference to the associated device. |
1039 | GenericDeviceTy &Device; |
1040 | |
1041 | /// Array of stream slots. Use std::deque because it can dynamically grow |
1042 | /// without invalidating the already inserted elements. For instance, the |
1043 | /// std::vector may invalidate the elements by reallocating the internal |
1044 | /// array if there is not enough space on new insertions. |
1045 | std::deque<StreamSlotTy> Slots; |
1046 | |
1047 | /// The next available slot on the queue. This is reset to zero each time the |
1048 | /// stream is synchronized. It also indicates the current number of consumed |
1049 | /// slots at a given time. |
1050 | uint32_t NextSlot; |
1051 | |
1052 | /// The synchronization id. This number is increased each time the stream is |
1053 | /// synchronized. It is useful to detect if an AMDGPUEventTy points to an |
  /// operation that was already finalized in a previous stream synchronize.
1055 | uint32_t SyncCycle; |
1056 | |
1057 | /// Mutex to protect stream's management. |
1058 | mutable std::mutex Mutex; |
1059 | |
1060 | /// Timeout hint for HSA actively waiting for signal value to change |
1061 | const uint64_t StreamBusyWaitMicroseconds; |
1062 | |
1063 | /// Indicate to spread data transfers across all available SDMAs |
1064 | bool UseMultipleSdmaEngines; |
1065 | |
1066 | /// Return the current number of asynchronous operations on the stream. |
1067 | uint32_t size() const { return NextSlot; } |
1068 | |
1069 | /// Return the last valid slot on the stream. |
1070 | uint32_t last() const { return size() - 1; } |
1071 | |
1072 | /// Consume one slot from the stream. Since the stream uses signals on demand |
1073 | /// and releases them once the slot is no longer used, the function requires |
1074 | /// an idle signal for the new consumed slot. |
1075 | std::pair<uint32_t, AMDGPUSignalTy *> consume(AMDGPUSignalTy *OutputSignal) { |
1076 | // Double the stream size if needed. Since we use std::deque, this operation |
1077 | // does not invalidate the already added slots. |
1078 | if (Slots.size() == NextSlot) |
      Slots.resize(Slots.size() * 2);
1080 | |
1081 | // Update the next available slot and the stream size. |
1082 | uint32_t Curr = NextSlot++; |
1083 | |
1084 | // Retrieve the input signal, if any, of the current operation. |
1085 | AMDGPUSignalTy *InputSignal = (Curr > 0) ? Slots[Curr - 1].Signal : nullptr; |
1086 | |
1087 | // Set the output signal of the current slot. |
1088 | Slots[Curr].Signal = OutputSignal; |
1089 | |
    return std::make_pair(Curr, InputSignal);
1091 | } |
1092 | |
1093 | /// Complete all pending post actions and reset the stream after synchronizing |
1094 | /// or positively querying the stream. |
1095 | Error complete() { |
1096 | for (uint32_t Slot = 0; Slot < NextSlot; ++Slot) { |
1097 | // Take the post action of the operation if any. |
1098 | if (auto Err = Slots[Slot].performAction()) |
1099 | return Err; |
1100 | |
1101 | // Release the slot's signal if possible. Otherwise, another user will. |
1102 | if (Slots[Slot].Signal->decreaseUseCount()) |
1103 | if (auto Err = SignalManager.returnResource(Slots[Slot].Signal)) |
1104 | return Err; |
1105 | |
1106 | Slots[Slot].Signal = nullptr; |
1107 | } |
1108 | |
1109 | // Reset the stream slots to zero. |
1110 | NextSlot = 0; |
1111 | |
1112 | // Increase the synchronization id since the stream completed a sync cycle. |
1113 | SyncCycle += 1; |
1114 | |
1115 | return Plugin::success(); |
1116 | } |
1117 | |
1118 | /// Make the current stream wait on a specific operation of another stream. |
  /// The idea is to make the current stream wait on two signals: 1) the last
1120 | /// signal of the current stream, and 2) the last signal of the other stream. |
1121 | /// Use a barrier packet with two input signals. |
1122 | Error waitOnStreamOperation(AMDGPUStreamTy &OtherStream, uint32_t Slot) { |
1123 | if (Queue == nullptr) |
1124 | return Plugin::error(ErrorCode::INVALID_NULL_POINTER, |
1125 | "target queue was nullptr" ); |
1126 | |
    /// The signal from the other stream that we must wait on.
1128 | AMDGPUSignalTy *OtherSignal = OtherStream.Slots[Slot].Signal; |
1129 | |
1130 | // Prevent the release of the other stream's signal. |
1131 | OtherSignal->increaseUseCount(); |
1132 | |
1133 | // Retrieve an available signal for the operation's output. |
1134 | AMDGPUSignalTy *OutputSignal = nullptr; |
1135 | if (auto Err = SignalManager.getResource(OutputSignal)) |
1136 | return Err; |
1137 | OutputSignal->reset(); |
1138 | OutputSignal->increaseUseCount(); |
1139 | |
1140 | // Consume stream slot and compute dependencies. |
1141 | auto [Curr, InputSignal] = consume(OutputSignal); |
1142 | |
1143 | // Setup the post action to release the signal. |
1144 | if (auto Err = Slots[Curr].schedReleaseSignal(OtherSignal, &SignalManager)) |
1145 | return Err; |
1146 | |
1147 | // Push a barrier into the queue with both input signals. |
    return Queue->pushBarrier(OutputSignal, InputSignal, OtherSignal);
1149 | } |
1150 | |
1151 | /// Callback for running a specific asynchronous operation. This callback is |
1152 | /// used for hsa_amd_signal_async_handler. The argument is the operation that |
1153 | /// should be executed. Notice we use the post action mechanism to codify the |
1154 | /// asynchronous operation. |
1155 | static bool asyncActionCallback(hsa_signal_value_t Value, void *Args) { |
1156 | StreamSlotTy *Slot = reinterpret_cast<StreamSlotTy *>(Args); |
1157 | assert(Slot && "Invalid slot" ); |
1158 | assert(Slot->Signal && "Invalid signal" ); |
1159 | |
1160 | // This thread is outside the stream mutex. Make sure the thread sees the |
1161 | // changes on the slot. |
    std::atomic_thread_fence(std::memory_order_acquire);
1163 | |
1164 | // Perform the operation. |
1165 | if (auto Err = Slot->performAction()) |
      FATAL_MESSAGE(1, "Error performing post action: %s",
                    toString(std::move(Err)).data());
1168 | |
1169 | // Signal the output signal to notify the asynchronous operation finalized. |
1170 | Slot->Signal->signal(); |
1171 | |
1172 | // Unregister callback. |
1173 | return false; |
1174 | } |
1175 | |
1176 | // Callback for host-to-host memory copies. This is an asynchronous action. |
1177 | static Error memcpyAction(void *Data) { |
1178 | MemcpyArgsTy *Args = reinterpret_cast<MemcpyArgsTy *>(Data); |
1179 | assert(Args && "Invalid arguments" ); |
1180 | assert(Args->Dst && "Invalid destination buffer" ); |
1181 | assert(Args->Src && "Invalid source buffer" ); |
1182 | |
    std::memcpy(Args->Dst, Args->Src, Args->Size);
1184 | |
1185 | return Plugin::success(); |
1186 | } |
1187 | |
1188 | /// Releasing a memory buffer to a memory manager. This is a post completion |
  /// action. There are three kinds of memory buffers:
1190 | /// 1. For kernel arguments. This buffer can be freed after receiving the |
1191 | /// kernel completion signal. |
1192 | /// 2. For H2D transfers that need pinned memory space for staging. This |
1193 | /// buffer can be freed after receiving the transfer completion signal. |
1194 | /// 3. For D2H transfers that need pinned memory space for staging. This |
1195 | /// buffer cannot be freed after receiving the transfer completion signal |
  ///   because of the following asynchronous H2H callback.
  ///      For this reason, this action can only be taken at
  ///      AMDGPUStreamTy::complete().
  ///   Because of case 3, all releaseBufferActions are taken at
  ///   AMDGPUStreamTy::complete() in the current implementation.
1201 | static Error releaseBufferAction(void *Data) { |
1202 | ReleaseBufferArgsTy *Args = reinterpret_cast<ReleaseBufferArgsTy *>(Data); |
1203 | assert(Args && "Invalid arguments" ); |
1204 | assert(Args->MemoryManager && "Invalid memory manager" ); |
1205 | |
1206 | // Release the allocation to the memory manager. |
    return Args->MemoryManager->deallocate(Args->Buffer);
1208 | } |
1209 | |
1210 | /// Releasing a signal object back to SignalManager. This is a post completion |
1211 | /// action. This action can only be taken at AMDGPUStreamTy::complete() |
1212 | static Error releaseSignalAction(void *Data) { |
1213 | ReleaseSignalArgsTy *Args = reinterpret_cast<ReleaseSignalArgsTy *>(Data); |
1214 | assert(Args && "Invalid arguments" ); |
1215 | assert(Args->Signal && "Invalid signal" ); |
1216 | assert(Args->SignalManager && "Invalid signal manager" ); |
1217 | |
1218 | // Release the signal if needed. |
1219 | if (Args->Signal->decreaseUseCount()) |
1220 | if (auto Err = Args->SignalManager->returnResource(Args->Signal)) |
1221 | return Err; |
1222 | |
1223 | return Plugin::success(); |
1224 | } |
1225 | |
1226 | public: |
1227 | /// Create an empty stream associated with a specific device. |
1228 | AMDGPUStreamTy(AMDGPUDeviceTy &Device); |
1229 | |
1230 | /// Initialize the stream's signals. |
1231 | Error init() { return Plugin::success(); } |
1232 | |
1233 | /// Deinitialize the stream's signals. |
1234 | Error deinit() { return Plugin::success(); } |
1235 | |
  /// Push an asynchronous kernel to the stream. The kernel arguments must be
  /// placed in a special allocation for kernel args and must be kept alive
  /// until the kernel finishes. Once the kernel is finished, the stream will
  /// release the kernel args buffer to the specified memory manager.
1240 | Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs, |
1241 | uint32_t NumThreads[3], uint32_t NumBlocks[3], |
1242 | uint32_t GroupSize, uint64_t StackSize, |
1243 | AMDGPUMemoryManagerTy &MemoryManager) { |
1244 | if (Queue == nullptr) |
1245 | return Plugin::error(ErrorCode::INVALID_NULL_POINTER, |
1246 | "target queue was nullptr" ); |
1247 | |
1248 | // Retrieve an available signal for the operation's output. |
1249 | AMDGPUSignalTy *OutputSignal = nullptr; |
1250 | if (auto Err = SignalManager.getResource(OutputSignal)) |
1251 | return Err; |
1252 | OutputSignal->reset(); |
1253 | OutputSignal->increaseUseCount(); |
1254 | |
1255 | std::lock_guard<std::mutex> StreamLock(Mutex); |
1256 | |
1257 | // Consume stream slot and compute dependencies. |
1258 | auto [Curr, InputSignal] = consume(OutputSignal); |
1259 | |
1260 | // Setup the post action to release the kernel args buffer. |
    if (auto Err = Slots[Curr].schedReleaseBuffer(KernelArgs, MemoryManager))
1262 | return Err; |
1263 | |
1264 | // If we are running an RPC server we want to wake up the server thread |
1265 | // whenever there is a kernel running and let it sleep otherwise. |
1266 | if (Device.getRPCServer()) |
1267 | Device.Plugin.getRPCServer().Thread->notify(); |
1268 | |
1269 | // Push the kernel with the output signal and an input signal (optional) |
1270 | if (auto Err = Queue->pushKernelLaunch(Kernel, KernelArgs, NumThreads, |
1271 | NumBlocks, GroupSize, StackSize, |
1272 | OutputSignal, InputSignal)) |
1273 | return Err; |
1274 | |
1275 | // Register a callback to indicate when the kernel is complete. |
1276 | if (Device.getRPCServer()) { |
1277 | if (auto Err = Slots[Curr].schedCallback( |
1278 | [](void *Data) -> llvm::Error { |
1279 | GenericPluginTy &Plugin = |
1280 | *reinterpret_cast<GenericPluginTy *>(Data); |
1281 | Plugin.getRPCServer().Thread->finish(); |
1282 | return Error::success(); |
1283 | }, |
1284 | &Device.Plugin)) |
1285 | return Err; |
1286 | } |
1287 | return Plugin::success(); |
1288 | } |
1289 | |
1290 | /// Push an asynchronous memory copy between pinned memory buffers. |
1291 | Error pushPinnedMemoryCopyAsync(void *Dst, const void *Src, |
1292 | uint64_t CopySize) { |
1293 | // Retrieve an available signal for the operation's output. |
1294 | AMDGPUSignalTy *OutputSignal = nullptr; |
1295 | if (auto Err = SignalManager.getResource(OutputSignal)) |
1296 | return Err; |
1297 | OutputSignal->reset(); |
1298 | OutputSignal->increaseUseCount(); |
1299 | |
1300 | std::lock_guard<std::mutex> Lock(Mutex); |
1301 | |
1302 | // Consume stream slot and compute dependencies. |
1303 | auto [Curr, InputSignal] = consume(OutputSignal); |
1304 | |
1305 | // Issue the async memory copy. |
1306 | if (InputSignal && InputSignal->load()) { |
1307 | hsa_signal_t InputSignalRaw = InputSignal->get(); |
1308 | return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, |
1309 | Agent, CopySize, 1, &InputSignalRaw, |
1310 | OutputSignal->get()); |
1311 | } |
1312 | |
1313 | return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, |
1314 | Agent, CopySize, 0, nullptr, |
1315 | OutputSignal->get()); |
1316 | } |
1317 | |
1318 | /// Push an asynchronous memory copy device-to-host involving an unpinned |
1319 | /// memory buffer. The operation consists of a two-step copy from the |
  /// device buffer to an intermediate pinned host buffer, and then, to an
1321 | /// unpinned host buffer. Both operations are asynchronous and dependent. |
1322 | /// The intermediate pinned buffer will be released to the specified memory |
1323 | /// manager once the operation completes. |
1324 | Error pushMemoryCopyD2HAsync(void *Dst, const void *Src, void *Inter, |
1325 | uint64_t CopySize, |
1326 | AMDGPUMemoryManagerTy &MemoryManager) { |
1327 | // Retrieve available signals for the operation's outputs. |
1328 | AMDGPUSignalTy *OutputSignals[2] = {}; |
1329 | if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals)) |
1330 | return Err; |
1331 | for (auto *Signal : OutputSignals) { |
1332 | Signal->reset(); |
1333 | Signal->increaseUseCount(); |
1334 | } |
1335 | |
1336 | std::lock_guard<std::mutex> Lock(Mutex); |
1337 | |
1338 | // Consume stream slot and compute dependencies. |
    auto [Curr, InputSignal] = consume(OutputSignals[0]);
1340 | |
1341 | // Setup the post action for releasing the intermediate buffer. |
    if (auto Err = Slots[Curr].schedReleaseBuffer(Inter, MemoryManager))
1343 | return Err; |
1344 | |
1345 | // Issue the first step: device to host transfer. Avoid defining the input |
1346 | // dependency if already satisfied. |
1347 | if (InputSignal && InputSignal->load()) { |
1348 | hsa_signal_t InputSignalRaw = InputSignal->get(); |
1349 | if (auto Err = hsa_utils::asyncMemCopy( |
1350 | UseMultipleSdmaEngines, Inter, Agent, Src, Agent, CopySize, 1, |
1351 | &InputSignalRaw, OutputSignals[0]->get())) |
1352 | return Err; |
1353 | } else { |
1354 | if (auto Err = hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Inter, |
1355 | Agent, Src, Agent, CopySize, 0, |
1356 | nullptr, OutputSignals[0]->get())) |
1357 | return Err; |
1358 | } |
1359 | |
1360 | // Consume another stream slot and compute dependencies. |
    std::tie(Curr, InputSignal) = consume(OutputSignals[1]);
    assert(InputSignal && "Invalid input signal");
1363 | |
    // The std::memcpy is done asynchronously using an async handler. We store
    // the function's information in the action, but it is not actually a
    // post action.
    if (auto Err = Slots[Curr].schedHostMemoryCopy(Dst, Inter, CopySize))
1367 | return Err; |
1368 | |
1369 | // Make changes on this slot visible to the async handler's thread. |
    std::atomic_thread_fence(std::memory_order_release);
1371 | |
1372 | // Issue the second step: host to host transfer. |
1373 | hsa_status_t Status = hsa_amd_signal_async_handler( |
1374 | InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback, |
1375 | (void *)&Slots[Curr]); |
1376 | |
1377 | return Plugin::check(Status, "error in hsa_amd_signal_async_handler: %s" ); |
1378 | } |
1379 | |
  /// Push an asynchronous memory copy host-to-device involving an unpinned
  /// memory buffer. The operation consists of a two-step copy from the
  /// unpinned host buffer to an intermediate pinned host buffer, and then, to
  /// the device buffer. Both operations are asynchronous and dependent.
  /// The intermediate pinned buffer will be released to the specified memory
  /// manager once the operation completes.
1386 | Error pushMemoryCopyH2DAsync(void *Dst, const void *Src, void *Inter, |
1387 | uint64_t CopySize, |
1388 | AMDGPUMemoryManagerTy &MemoryManager) { |
1389 | // Retrieve available signals for the operation's outputs. |
1390 | AMDGPUSignalTy *OutputSignals[2] = {}; |
1391 | if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals)) |
1392 | return Err; |
1393 | for (auto *Signal : OutputSignals) { |
1394 | Signal->reset(); |
1395 | Signal->increaseUseCount(); |
1396 | } |
1397 | |
1398 | AMDGPUSignalTy *OutputSignal = OutputSignals[0]; |
1399 | |
1400 | std::lock_guard<std::mutex> Lock(Mutex); |
1401 | |
1402 | // Consume stream slot and compute dependencies. |
1403 | auto [Curr, InputSignal] = consume(OutputSignal); |
1404 | |
1405 | // Issue the first step: host to host transfer. |
1406 | if (InputSignal && InputSignal->load()) { |
1407 | // The std::memcpy is done asynchronously using an async handler. We store |
1408 | // the function's information in the action but it is not actually a |
1409 | // post action. |
      if (auto Err = Slots[Curr].schedHostMemoryCopy(Inter, Src, CopySize))
1411 | return Err; |
1412 | |
1413 | // Make changes on this slot visible to the async handler's thread. |
      std::atomic_thread_fence(std::memory_order_release);
1415 | |
1416 | hsa_status_t Status = hsa_amd_signal_async_handler( |
1417 | InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback, |
1418 | (void *)&Slots[Curr]); |
1419 | |
1420 | if (auto Err = Plugin::check(Status, |
1421 | "error in hsa_amd_signal_async_handler: %s" )) |
1422 | return Err; |
1423 | |
      // Now use the second output signal.
1425 | OutputSignal = OutputSignals[1]; |
1426 | |
1427 | // Consume another stream slot and compute dependencies. |
      std::tie(Curr, InputSignal) = consume(OutputSignal);
1429 | } else { |
1430 | // All preceding operations completed, copy the memory synchronously. |
      std::memcpy(Inter, Src, CopySize);
1432 | |
1433 | // Return the second signal because it will not be used. |
1434 | OutputSignals[1]->decreaseUseCount(); |
1435 | if (auto Err = SignalManager.returnResource(OutputSignals[1])) |
1436 | return Err; |
1437 | } |
1438 | |
1439 | // Setup the post action to release the intermediate pinned buffer. |
    if (auto Err = Slots[Curr].schedReleaseBuffer(Inter, MemoryManager))
1441 | return Err; |
1442 | |
1443 | // Issue the second step: host to device transfer. Avoid defining the input |
1444 | // dependency if already satisfied. |
1445 | if (InputSignal && InputSignal->load()) { |
1446 | hsa_signal_t InputSignalRaw = InputSignal->get(); |
1447 | return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, |
1448 | Agent, CopySize, 1, &InputSignalRaw, |
1449 | OutputSignal->get()); |
1450 | } |
1451 | return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, |
1452 | Agent, CopySize, 0, nullptr, |
1453 | OutputSignal->get()); |
1454 | } |
1455 | |
1456 | // AMDGPUDeviceTy is incomplete here, passing the underlying agent instead |
1457 | Error pushMemoryCopyD2DAsync(void *Dst, hsa_agent_t DstAgent, const void *Src, |
1458 | hsa_agent_t SrcAgent, uint64_t CopySize) { |
1459 | AMDGPUSignalTy *OutputSignal; |
1460 | if (auto Err = SignalManager.getResources(/*Num=*/1, &OutputSignal)) |
1461 | return Err; |
1462 | OutputSignal->reset(); |
1463 | OutputSignal->increaseUseCount(); |
1464 | |
1465 | std::lock_guard<std::mutex> Lock(Mutex); |
1466 | |
1467 | // Consume stream slot and compute dependencies. |
1468 | auto [Curr, InputSignal] = consume(OutputSignal); |
1469 | |
    // The agents need to have access to the corresponding memory. This is
    // presently only true if the pointers were originally allocated by this
    // runtime or the caller made the appropriate access calls.
1474 | |
1475 | if (InputSignal && InputSignal->load()) { |
1476 | hsa_signal_t InputSignalRaw = InputSignal->get(); |
1477 | return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src, |
1478 | SrcAgent, CopySize, 1, &InputSignalRaw, |
1479 | OutputSignal->get()); |
1480 | } |
1481 | return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src, |
1482 | SrcAgent, CopySize, 0, nullptr, |
1483 | OutputSignal->get()); |
1484 | } |
1485 | |
1486 | /// Synchronize with the stream. The current thread waits until all operations |
1487 | /// are finalized and it performs the pending post actions (i.e., releasing |
1488 | /// intermediate buffers). |
1489 | Error synchronize() { |
1490 | std::lock_guard<std::mutex> Lock(Mutex); |
1491 | |
1492 | // No need to synchronize anything. |
1493 | if (size() == 0) |
1494 | return Plugin::success(); |
1495 | |
1496 | // Wait until all previous operations on the stream have completed. |
1497 | if (auto Err = |
1498 | Slots[last()].Signal->wait(StreamBusyWaitMicroseconds, &Device)) |
1499 | return Err; |
1500 | |
1501 | // Reset the stream and perform all pending post actions. |
1502 | return complete(); |
1503 | } |
1504 | |
1505 | /// Query the stream and complete pending post actions if operations finished. |
1506 | /// Return whether all the operations completed. This operation does not block |
1507 | /// the calling thread. |
1508 | Expected<bool> query() { |
1509 | std::lock_guard<std::mutex> Lock(Mutex); |
1510 | |
1511 | // No need to query anything. |
1512 | if (size() == 0) |
1513 | return true; |
1514 | |
1515 | // The last operation did not complete yet. Return directly. |
1516 | if (Slots[last()].Signal->load()) |
1517 | return false; |
1518 | |
1519 | // Reset the stream and perform all pending post actions. |
1520 | if (auto Err = complete()) |
1521 | return std::move(Err); |
1522 | |
1523 | return true; |
1524 | } |
1525 | |
1526 | const AMDGPUQueueTy *getQueue() const { return Queue; } |
1527 | |
1528 | /// Record the state of the stream on an event. |
1529 | Error recordEvent(AMDGPUEventTy &Event) const; |
1530 | |
1531 | /// Make the stream wait on an event. |
1532 | Error waitEvent(const AMDGPUEventTy &Event); |
1533 | |
1534 | friend struct AMDGPUStreamManagerTy; |
1535 | }; |
1536 | |
/// Class representing an event on AMDGPU. The event stores information
/// regarding the state of the recorded stream.
1539 | struct AMDGPUEventTy { |
1540 | /// Create an empty event. |
1541 | AMDGPUEventTy(AMDGPUDeviceTy &Device) |
1542 | : RecordedStream(nullptr), RecordedSlot(-1), RecordedSyncCycle(-1) {} |
1543 | |
1544 | /// Initialize and deinitialize. |
1545 | Error init() { return Plugin::success(); } |
1546 | Error deinit() { return Plugin::success(); } |
1547 | |
1548 | /// Record the state of a stream on the event. |
1549 | Error record(AMDGPUStreamTy &Stream) { |
1550 | std::lock_guard<std::mutex> Lock(Mutex); |
1551 | |
    // Overwrite the previously recorded stream, if any.
1553 | RecordedStream = &Stream; |
1554 | |
1555 | return Stream.recordEvent(Event&: *this); |
1556 | } |
1557 | |
1558 | /// Make a stream wait on the current event. |
1559 | Error wait(AMDGPUStreamTy &Stream) { |
1560 | std::lock_guard<std::mutex> Lock(Mutex); |
1561 | |
1562 | if (!RecordedStream) |
1563 | return Plugin::error(ErrorCode::INVALID_ARGUMENT, |
1564 | "event does not have any recorded stream" ); |
1565 | |
1566 | // Synchronizing the same stream. Do nothing. |
1567 | if (RecordedStream == &Stream) |
1568 | return Plugin::success(); |
1569 | |
    // No need to wait for anything; the recorded stream already finished the
    // corresponding operation.
1572 | if (RecordedSlot < 0) |
1573 | return Plugin::success(); |
1574 | |
1575 | return Stream.waitEvent(Event: *this); |
1576 | } |
1577 | |
1578 | protected: |
1579 | /// The stream registered in this event. |
1580 | AMDGPUStreamTy *RecordedStream; |
1581 | |
  /// The slot of the recorded operation on the recorded stream.
1583 | int64_t RecordedSlot; |
1584 | |
1585 | /// The sync cycle when the stream was recorded. Used to detect stale events. |
1586 | int64_t RecordedSyncCycle; |
1587 | |
1588 | /// Mutex to safely access event fields. |
1589 | mutable std::mutex Mutex; |
1590 | |
1591 | friend struct AMDGPUStreamTy; |
1592 | }; |
1593 | |
1594 | Error AMDGPUStreamTy::recordEvent(AMDGPUEventTy &Event) const { |
1595 | std::lock_guard<std::mutex> Lock(Mutex); |
1596 | |
1597 | if (size() > 0) { |
1598 | // Record the synchronize identifier (to detect stale recordings) and |
1599 | // the last valid stream's operation. |
1600 | Event.RecordedSyncCycle = SyncCycle; |
1601 | Event.RecordedSlot = last(); |
1602 | |
1603 | assert(Event.RecordedSyncCycle >= 0 && "Invalid recorded sync cycle" ); |
1604 | assert(Event.RecordedSlot >= 0 && "Invalid recorded slot" ); |
1605 | } else { |
1606 | // The stream is empty, everything already completed, record nothing. |
1607 | Event.RecordedSyncCycle = -1; |
1608 | Event.RecordedSlot = -1; |
1609 | } |
1610 | return Plugin::success(); |
1611 | } |
1612 | |
1613 | Error AMDGPUStreamTy::waitEvent(const AMDGPUEventTy &Event) { |
1614 | // Retrieve the recorded stream on the event. |
1615 | AMDGPUStreamTy &RecordedStream = *Event.RecordedStream; |
1616 | |
1617 | std::scoped_lock<std::mutex, std::mutex> Lock(Mutex, RecordedStream.Mutex); |
1618 | |
1619 | // The recorded stream already completed the operation because the synchronize |
1620 | // identifier is already outdated. |
1621 | if (RecordedStream.SyncCycle != (uint32_t)Event.RecordedSyncCycle) |
1622 | return Plugin::success(); |
1623 | |
1624 | // Again, the recorded stream already completed the operation, the last |
1625 | // operation's output signal is satisfied. |
1626 | if (!RecordedStream.Slots[Event.RecordedSlot].Signal->load()) |
1627 | return Plugin::success(); |
1628 | |
1629 | // Otherwise, make the current stream wait on the other stream's operation. |
  return waitOnStreamOperation(RecordedStream, Event.RecordedSlot);
1631 | } |
1632 | |
1633 | struct AMDGPUStreamManagerTy final |
1634 | : GenericDeviceResourceManagerTy<AMDGPUResourceRef<AMDGPUStreamTy>> { |
1635 | using ResourceRef = AMDGPUResourceRef<AMDGPUStreamTy>; |
1636 | using ResourcePoolTy = GenericDeviceResourceManagerTy<ResourceRef>; |
1637 | |
1638 | AMDGPUStreamManagerTy(GenericDeviceTy &Device, hsa_agent_t HSAAgent) |
1639 | : GenericDeviceResourceManagerTy(Device), Device(Device), |
1640 | OMPX_QueueTracking("LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING" , true), |
1641 | NextQueue(0), Agent(HSAAgent) {} |
1642 | |
1643 | Error init(uint32_t InitialSize, int NumHSAQueues, int HSAQueueSize) { |
1644 | Queues = std::vector<AMDGPUQueueTy>(NumHSAQueues); |
1645 | QueueSize = HSAQueueSize; |
1646 | MaxNumQueues = NumHSAQueues; |
1647 | // Initialize one queue eagerly |
1648 | if (auto Err = Queues.front().init(Device, Agent, QueueSize)) |
1649 | return Err; |
1650 | |
1651 | return GenericDeviceResourceManagerTy::init(InitialSize); |
1652 | } |
1653 | |
1654 | /// Deinitialize the resource pool and delete all resources. This function |
1655 | /// must be called before the destructor. |
1656 | Error deinit() override { |
1657 | // De-init all queues |
1658 | for (AMDGPUQueueTy &Queue : Queues) { |
1659 | if (auto Err = Queue.deinit()) |
1660 | return Err; |
1661 | } |
1662 | |
1663 | return GenericDeviceResourceManagerTy::deinit(); |
1664 | } |
1665 | |
1666 | /// Get a single stream from the pool or create new resources. |
1667 | virtual Error getResource(AMDGPUStreamTy *&StreamHandle) override { |
1668 | return getResourcesImpl(1, &StreamHandle, [this](AMDGPUStreamTy *&Handle) { |
      return assignNextQueue(Handle);
1670 | }); |
1671 | } |
1672 | |
1673 | /// Return stream to the pool. |
1674 | virtual Error returnResource(AMDGPUStreamTy *StreamHandle) override { |
1675 | return returnResourceImpl(StreamHandle, [](AMDGPUStreamTy *Handle) { |
1676 | Handle->Queue->removeUser(); |
1677 | return Plugin::success(); |
1678 | }); |
1679 | } |
1680 | |
1681 | private: |
  /// Search for and assign a preferably idle queue to the given Stream. If
  /// there is no queue without current users, choose the queue with the lowest
  /// user count. If utilization is ignored, use round-robin selection.
1685 | inline Error assignNextQueue(AMDGPUStreamTy *Stream) { |
1686 | // Start from zero when tracking utilization, otherwise: round robin policy. |
1687 | uint32_t Index = OMPX_QueueTracking ? 0 : NextQueue++ % MaxNumQueues; |
1688 | |
1689 | if (OMPX_QueueTracking) { |
1690 | // Find the least used queue. |
1691 | for (uint32_t I = 0; I < MaxNumQueues; ++I) { |
1692 | // Early exit when an initialized queue is idle. |
1693 | if (Queues[I].isInitialized() && Queues[I].getUserCount() == 0) { |
1694 | Index = I; |
1695 | break; |
1696 | } |
1697 | |
1698 | // Update the least used queue. |
1699 | if (Queues[Index].getUserCount() > Queues[I].getUserCount()) |
1700 | Index = I; |
1701 | } |
1702 | } |
1703 | |
1704 | // Make sure the queue is initialized, then add user & assign. |
1705 | if (auto Err = Queues[Index].init(Device, Agent, QueueSize)) |
1706 | return Err; |
1707 | Queues[Index].addUser(); |
1708 | Stream->Queue = &Queues[Index]; |
1709 | |
1710 | return Plugin::success(); |
1711 | } |
1712 | |
  /// The device associated with this stream manager.
1714 | GenericDeviceTy &Device; |
1715 | |
1716 | /// Envar for controlling the tracking of busy HSA queues. |
1717 | BoolEnvar OMPX_QueueTracking; |
1718 | |
1719 | /// The next queue index to use for round robin selection. |
1720 | uint32_t NextQueue; |
1721 | |
1722 | /// The queues which are assigned to requested streams. |
1723 | std::vector<AMDGPUQueueTy> Queues; |
1724 | |
1725 | /// The corresponding device as HSA agent. |
1726 | hsa_agent_t Agent; |
1727 | |
1728 | /// The maximum number of queues. |
1729 | uint32_t MaxNumQueues; |
1730 | |
1731 | /// The size of created queues. |
1732 | uint32_t QueueSize; |
1733 | }; |
1734 | |
1735 | /// Abstract class that holds the common members of the actual kernel devices |
1736 | /// and the host device. Both types should inherit from this class. |
1737 | struct AMDGenericDeviceTy { |
1738 | AMDGenericDeviceTy() {} |
1739 | |
1740 | virtual ~AMDGenericDeviceTy() {} |
1741 | |
1742 | /// Create all memory pools which the device has access to and classify them. |
1743 | Error initMemoryPools() { |
1744 | // Retrieve all memory pools from the device agent(s). |
1745 | Error Err = retrieveAllMemoryPools(); |
1746 | if (Err) |
1747 | return Err; |
1748 | |
1749 | for (AMDGPUMemoryPoolTy *MemoryPool : AllMemoryPools) { |
1750 | // Initialize the memory pool and retrieve some basic info. |
1751 | Error Err = MemoryPool->init(); |
1752 | if (Err) |
1753 | return Err; |
1754 | |
1755 | if (!MemoryPool->isGlobal()) |
1756 | continue; |
1757 | |
1758 | // Classify the memory pools depending on their properties. |
1759 | if (MemoryPool->isFineGrained()) { |
1760 | FineGrainedMemoryPools.push_back(MemoryPool); |
1761 | if (MemoryPool->supportsKernelArgs()) |
1762 | ArgsMemoryPools.push_back(MemoryPool); |
1763 | } else if (MemoryPool->isCoarseGrained()) { |
1764 | CoarseGrainedMemoryPools.push_back(MemoryPool); |
1765 | } |
1766 | } |
1767 | return Plugin::success(); |
1768 | } |
1769 | |
1770 | /// Destroy all memory pools. |
1771 | Error deinitMemoryPools() { |
1772 | for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) |
1773 | delete Pool; |
1774 | |
1775 | AllMemoryPools.clear(); |
1776 | FineGrainedMemoryPools.clear(); |
1777 | CoarseGrainedMemoryPools.clear(); |
1778 | ArgsMemoryPools.clear(); |
1779 | |
1780 | return Plugin::success(); |
1781 | } |
1782 | |
1783 | /// Retrieve and construct all memory pools from the device agent(s). |
1784 | virtual Error retrieveAllMemoryPools() = 0; |
1785 | |
1786 | /// Get the device agent. |
1787 | virtual hsa_agent_t getAgent() const = 0; |
1788 | |
1789 | protected: |
1790 | /// Array of all memory pools available to the host agents. |
1791 | llvm::SmallVector<AMDGPUMemoryPoolTy *> AllMemoryPools; |
1792 | |
1793 | /// Array of fine-grained memory pools available to the host agents. |
1794 | llvm::SmallVector<AMDGPUMemoryPoolTy *> FineGrainedMemoryPools; |
1795 | |
1796 | /// Array of coarse-grained memory pools available to the host agents. |
1797 | llvm::SmallVector<AMDGPUMemoryPoolTy *> CoarseGrainedMemoryPools; |
1798 | |
1799 | /// Array of kernel args memory pools available to the host agents. |
1800 | llvm::SmallVector<AMDGPUMemoryPoolTy *> ArgsMemoryPools; |
1801 | }; |
1802 | |
1803 | /// Class representing the host device. This host device may have more than one |
1804 | /// HSA host agent. We aggregate all its resources into the same instance. |
1805 | struct AMDHostDeviceTy : public AMDGenericDeviceTy { |
1806 | /// Create a host device from an array of host agents. |
1807 | AMDHostDeviceTy(AMDGPUPluginTy &Plugin, |
1808 | const llvm::SmallVector<hsa_agent_t> &HostAgents) |
1809 | : AMDGenericDeviceTy(), Agents(HostAgents), ArgsMemoryManager(Plugin), |
1810 | PinnedMemoryManager(Plugin) { |
1811 | assert(HostAgents.size() && "No host agent found" ); |
1812 | } |
1813 | |
1814 | /// Initialize the host device memory pools and the memory managers for |
1815 | /// kernel args and host pinned memory allocations. |
1816 | Error init() { |
1817 | if (auto Err = initMemoryPools()) |
1818 | return Err; |
1819 | |
1820 | if (auto Err = ArgsMemoryManager.init(getArgsMemoryPool())) |
1821 | return Err; |
1822 | |
1823 | if (auto Err = PinnedMemoryManager.init(getFineGrainedMemoryPool())) |
1824 | return Err; |
1825 | |
1826 | return Plugin::success(); |
1827 | } |
1828 | |
1829 | /// Deinitialize memory pools and managers. |
1830 | Error deinit() { |
1831 | if (auto Err = deinitMemoryPools()) |
1832 | return Err; |
1833 | |
1834 | if (auto Err = ArgsMemoryManager.deinit()) |
1835 | return Err; |
1836 | |
1837 | if (auto Err = PinnedMemoryManager.deinit()) |
1838 | return Err; |
1839 | |
1840 | return Plugin::success(); |
1841 | } |
1842 | |
1843 | /// Retrieve and construct all memory pools from the host agents. |
1844 | Error retrieveAllMemoryPools() override { |
1845 | // Iterate through the available pools across the host agents. |
1846 | for (hsa_agent_t Agent : Agents) { |
1847 | Error Err = hsa_utils::iterateAgentMemoryPools( |
1848 | Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) { |
1849 | AMDGPUMemoryPoolTy *MemoryPool = |
1850 | new AMDGPUMemoryPoolTy(HSAMemoryPool); |
1851 | AllMemoryPools.push_back(MemoryPool); |
1852 | return HSA_STATUS_SUCCESS; |
1853 | }); |
1854 | if (Err) |
1855 | return Err; |
1856 | } |
1857 | return Plugin::success(); |
1858 | } |
1859 | |
  /// Get one of the host agents. Always return the first agent.
1861 | hsa_agent_t getAgent() const override { return Agents[0]; } |
1862 | |
1863 | /// Get a memory pool for fine-grained allocations. |
1864 | AMDGPUMemoryPoolTy &getFineGrainedMemoryPool() { |
1865 | assert(!FineGrainedMemoryPools.empty() && "No fine-grained mempool" ); |
1866 | // Retrieve any memory pool. |
1867 | return *FineGrainedMemoryPools[0]; |
1868 | } |
1869 | |
  /// Get a memory pool for coarse-grained allocations.
  AMDGPUMemoryPoolTy &getCoarseGrainedMemoryPool() {
1871 | assert(!CoarseGrainedMemoryPools.empty() && "No coarse-grained mempool" ); |
1872 | // Retrieve any memory pool. |
1873 | return *CoarseGrainedMemoryPools[0]; |
1874 | } |
1875 | |
1876 | /// Get a memory pool for kernel args allocations. |
1877 | AMDGPUMemoryPoolTy &getArgsMemoryPool() { |
1878 | assert(!ArgsMemoryPools.empty() && "No kernelargs mempool" ); |
1879 | // Retrieve any memory pool. |
1880 | return *ArgsMemoryPools[0]; |
1881 | } |
1882 | |
1883 | /// Getters for kernel args and host pinned memory managers. |
1884 | AMDGPUMemoryManagerTy &getArgsMemoryManager() { return ArgsMemoryManager; } |
1885 | AMDGPUMemoryManagerTy &getPinnedMemoryManager() { |
1886 | return PinnedMemoryManager; |
1887 | } |
1888 | |
1889 | private: |
1890 | /// Array of agents on the host side. |
1891 | const llvm::SmallVector<hsa_agent_t> Agents; |
1892 | |
1893 | // Memory manager for kernel arguments. |
1894 | AMDGPUMemoryManagerTy ArgsMemoryManager; |
1895 | |
1896 | // Memory manager for pinned memory. |
1897 | AMDGPUMemoryManagerTy PinnedMemoryManager; |
1898 | }; |
1899 | |
1900 | /// Class implementing the AMDGPU device functionalities which derives from the |
1901 | /// generic device class. |
1902 | struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { |
1903 | // Create an AMDGPU device with a device id and default AMDGPU grid values. |
1904 | AMDGPUDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices, |
1905 | AMDHostDeviceTy &HostDevice, hsa_agent_t Agent) |
1906 | : GenericDeviceTy(Plugin, DeviceId, NumDevices, {}), AMDGenericDeviceTy(), |
1907 | OMPX_NumQueues("LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES" , 4), |
1908 | OMPX_QueueSize("LIBOMPTARGET_AMDGPU_HSA_QUEUE_SIZE" , 512), |
1909 | OMPX_DefaultTeamsPerCU("LIBOMPTARGET_AMDGPU_TEAMS_PER_CU" , 4), |
1910 | OMPX_MaxAsyncCopyBytes("LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES" , |
1911 | 1 * 1024 * 1024), // 1MB |
1912 | OMPX_InitialNumSignals("LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS" , |
1913 | 64), |
1914 | OMPX_StreamBusyWait("LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT" , 2000000), |
1915 | OMPX_UseMultipleSdmaEngines( |
1916 | "LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES" , false), |
1917 | OMPX_ApuMaps("OMPX_APU_MAPS" , false), AMDGPUStreamManager(*this, Agent), |
1918 | AMDGPUEventManager(*this), AMDGPUSignalManager(*this), Agent(Agent), |
1919 | HostDevice(HostDevice) {} |
1920 | |
1921 | ~AMDGPUDeviceTy() {} |
1922 | |
1923 | /// Initialize the device, its resources and get its properties. |
1924 | Error initImpl(GenericPluginTy &Plugin) override { |
1925 | // First setup all the memory pools. |
1926 | if (auto Err = initMemoryPools()) |
1927 | return Err; |
1928 | |
1929 | char GPUName[64]; |
1930 | if (auto Err = getDeviceAttr(HSA_AGENT_INFO_NAME, GPUName)) |
1931 | return Err; |
1932 | ComputeUnitKind = GPUName; |
1933 | |
1934 | // Get the wavefront size. |
1935 | uint32_t WavefrontSize = 0; |
1936 | if (auto Err = getDeviceAttr(HSA_AGENT_INFO_WAVEFRONT_SIZE, WavefrontSize)) |
1937 | return Err; |
1938 | GridValues.GV_Warp_Size = WavefrontSize; |
1939 | |
    // Get the frequency of the steady clock. If the attribute is missing,
    // assume an older libhsa and default to 0; omp_get_wtime will be
    // inaccurate, but programs can otherwise still run.
1943 | if (getDeviceAttrRaw(HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY, |
1944 | ClockFrequency) != HSA_STATUS_SUCCESS) |
1945 | ClockFrequency = 0; |
1946 | |
1947 | // Load the grid values depending on the wavefront. |
1948 | if (WavefrontSize == 32) |
1949 | GridValues = getAMDGPUGridValues<32>(); |
1950 | else if (WavefrontSize == 64) |
1951 | GridValues = getAMDGPUGridValues<64>(); |
1952 | else |
1953 | return Plugin::error(ErrorCode::UNSUPPORTED, |
1954 | "unexpected AMDGPU wavefront %d" , WavefrontSize); |
1955 | |
1956 | // Get maximum number of workitems per workgroup. |
1957 | uint16_t WorkgroupMaxDim[3]; |
1958 | if (auto Err = |
1959 | getDeviceAttr(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgroupMaxDim)) |
1960 | return Err; |
1961 | GridValues.GV_Max_WG_Size = WorkgroupMaxDim[0]; |
1962 | |
1963 | // Get maximum number of workgroups. |
1964 | hsa_dim3_t GridMaxDim; |
1965 | if (auto Err = getDeviceAttr(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim)) |
1966 | return Err; |
1967 | |
1968 | GridValues.GV_Max_Teams = GridMaxDim.x / GridValues.GV_Max_WG_Size; |
1969 | if (GridValues.GV_Max_Teams == 0) |
1970 | return Plugin::error(ErrorCode::INVALID_ARGUMENT, |
1971 | "maximum number of teams cannot be zero" ); |
1972 | |
1973 | // Compute the default number of teams. |
1974 | uint32_t ComputeUnits = 0; |
1975 | if (auto Err = |
1976 | getDeviceAttr(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, ComputeUnits)) |
1977 | return Err; |
1978 | GridValues.GV_Default_Num_Teams = ComputeUnits * OMPX_DefaultTeamsPerCU; |
1979 | |
1980 | uint32_t WavesPerCU = 0; |
1981 | if (auto Err = |
1982 | getDeviceAttr(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, WavesPerCU)) |
1983 | return Err; |
1984 | HardwareParallelism = ComputeUnits * WavesPerCU; |
1985 | |
1986 | // Get maximum size of any device queues and maximum number of queues. |
1987 | uint32_t MaxQueueSize; |
1988 | if (auto Err = getDeviceAttr(HSA_AGENT_INFO_QUEUE_MAX_SIZE, MaxQueueSize)) |
1989 | return Err; |
1990 | |
1991 | uint32_t MaxQueues; |
1992 | if (auto Err = getDeviceAttr(HSA_AGENT_INFO_QUEUES_MAX, MaxQueues)) |
1993 | return Err; |
1994 | |
1995 | // Compute the number of queues and their size. |
1996 | OMPX_NumQueues = std::max(1U, std::min(OMPX_NumQueues.get(), MaxQueues)); |
1997 | OMPX_QueueSize = std::min(OMPX_QueueSize.get(), MaxQueueSize); |
1998 | |
1999 | // Initialize stream pool. |
2000 | if (auto Err = AMDGPUStreamManager.init(OMPX_InitialNumStreams, |
2001 | OMPX_NumQueues, OMPX_QueueSize)) |
2002 | return Err; |
2003 | |
2004 | // Initialize event pool. |
2005 | if (auto Err = AMDGPUEventManager.init(OMPX_InitialNumEvents)) |
2006 | return Err; |
2007 | |
2008 | // Initialize signal pool. |
2009 | if (auto Err = AMDGPUSignalManager.init(OMPX_InitialNumSignals)) |
2010 | return Err; |
2011 | |
2012 | // Detect if XNACK is enabled |
2013 | SmallVector<SmallString<32>> Targets; |
2014 | if (auto Err = hsa_utils::getTargetTripleAndFeatures(Agent, Targets)) |
2015 | return Err; |
2016 | if (!Targets.empty() && Targets[0].str().contains("xnack+" )) |
2017 | IsXnackEnabled = true; |
2018 | |
    // Detect whether the device is an APU.
2020 | if (auto Err = checkIfAPU()) |
2021 | return Err; |
2022 | |
2023 | return Plugin::success(); |
2024 | } |
2025 | |
2026 | /// Deinitialize the device and release its resources. |
2027 | Error deinitImpl() override { |
2028 | // Deinitialize the stream and event pools. |
2029 | if (auto Err = AMDGPUStreamManager.deinit()) |
2030 | return Err; |
2031 | |
2032 | if (auto Err = AMDGPUEventManager.deinit()) |
2033 | return Err; |
2034 | |
2035 | if (auto Err = AMDGPUSignalManager.deinit()) |
2036 | return Err; |
2037 | |
2038 | // Close modules if necessary. |
2039 | if (!LoadedImages.empty()) { |
2040 | // Each image has its own module. |
2041 | for (DeviceImageTy *Image : LoadedImages) { |
2042 | AMDGPUDeviceImageTy &AMDImage = |
2043 | static_cast<AMDGPUDeviceImageTy &>(*Image); |
2044 | |
2045 | // Unload the executable of the image. |
2046 | if (auto Err = AMDImage.unloadExecutable()) |
2047 | return Err; |
2048 | } |
2049 | } |
2050 | |
2051 | // Invalidate agent reference. |
2052 | Agent = {0}; |
2053 | |
2054 | return Plugin::success(); |
2055 | } |
2056 | |
2057 | virtual Error callGlobalConstructors(GenericPluginTy &Plugin, |
2058 | DeviceImageTy &Image) override { |
2059 | GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler(); |
2060 | if (Handler.isSymbolInImage(*this, Image, "amdgcn.device.fini" )) |
2061 | Image.setPendingGlobalDtors(); |
2062 | |
2063 | return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/true); |
2064 | } |
2065 | |
2066 | virtual Error callGlobalDestructors(GenericPluginTy &Plugin, |
2067 | DeviceImageTy &Image) override { |
2068 | if (Image.hasPendingGlobalDtors()) |
2069 | return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/false); |
2070 | return Plugin::success(); |
2071 | } |
2072 | |
2073 | uint64_t getStreamBusyWaitMicroseconds() const { return OMPX_StreamBusyWait; } |
2074 | |
2075 | Expected<std::unique_ptr<MemoryBuffer>> |
2076 | doJITPostProcessing(std::unique_ptr<MemoryBuffer> MB) const override { |
2077 | |
2078 | // TODO: We should try to avoid materialization but there seems to be no |
2079 | // good linker interface w/o file i/o. |
2080 | SmallString<128> LinkerInputFilePath; |
2081 | std::error_code EC = sys::fs::createTemporaryFile("amdgpu-pre-link-jit" , |
2082 | "o" , LinkerInputFilePath); |
2083 | if (EC) |
2084 | return Plugin::error(ErrorCode::HOST_IO, |
2085 | "failed to create temporary file for linker" ); |
2086 | |
2087 | // Write the file's contents to the output file. |
2088 | Expected<std::unique_ptr<FileOutputBuffer>> OutputOrErr = |
2089 | FileOutputBuffer::create(LinkerInputFilePath, MB->getBuffer().size()); |
2090 | if (!OutputOrErr) |
2091 | return OutputOrErr.takeError(); |
2092 | std::unique_ptr<FileOutputBuffer> Output = std::move(*OutputOrErr); |
    llvm::copy(MB->getBuffer(), Output->getBufferStart());
2094 | if (Error E = Output->commit()) |
2095 | return std::move(E); |
2096 | |
2097 | SmallString<128> LinkerOutputFilePath; |
2098 | EC = sys::fs::createTemporaryFile("amdgpu-pre-link-jit" , "so" , |
2099 | LinkerOutputFilePath); |
2100 | if (EC) |
2101 | return Plugin::error(ErrorCode::HOST_IO, |
2102 | "failed to create temporary file for linker" ); |
2103 | |
2104 | const auto &ErrorOrPath = sys::findProgramByName("lld" ); |
2105 | if (!ErrorOrPath) |
2106 | return createStringError(ErrorCode::HOST_TOOL_NOT_FOUND, |
2107 | "failed to find `lld` on the PATH." ); |
2108 | |
2109 | std::string LLDPath = ErrorOrPath.get(); |
2110 | INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(), |
2111 | "Using `%s` to link JITed amdgcn output." , LLDPath.c_str()); |
2112 | |
2113 | std::string MCPU = "-plugin-opt=mcpu=" + getComputeUnitKind(); |
2114 | StringRef Args[] = {LLDPath, |
2115 | "-flavor" , |
2116 | "gnu" , |
2117 | "--no-undefined" , |
2118 | "-shared" , |
2119 | MCPU, |
2120 | "-o" , |
2121 | LinkerOutputFilePath.data(), |
2122 | LinkerInputFilePath.data()}; |
2123 | |
2124 | std::string Error; |
2125 | int RC = sys::ExecuteAndWait(LLDPath, Args, std::nullopt, {}, 0, 0, &Error); |
2126 | if (RC) |
2127 | return Plugin::error(ErrorCode::LINK_FAILURE, |
2128 | "linking optimized bitcode failed: %s" , |
2129 | Error.c_str()); |
2130 | |
2131 | auto BufferOrErr = MemoryBuffer::getFileOrSTDIN(LinkerOutputFilePath); |
2132 | if (!BufferOrErr) |
2133 | return Plugin::error(ErrorCode::HOST_IO, |
2134 | "failed to open temporary file for lld" ); |
2135 | |
2136 | // Clean up the temporary files afterwards. |
2137 | if (sys::fs::remove(LinkerOutputFilePath)) |
2138 | return Plugin::error(ErrorCode::HOST_IO, |
2139 | "failed to remove temporary output file for lld" ); |
2140 | if (sys::fs::remove(LinkerInputFilePath)) |
2141 | return Plugin::error(ErrorCode::HOST_IO, |
2142 | "failed to remove temporary input file for lld" ); |
2143 | |
2144 | return std::move(*BufferOrErr); |
2145 | } |
2146 | |
2147 | /// See GenericDeviceTy::getComputeUnitKind(). |
2148 | std::string getComputeUnitKind() const override { return ComputeUnitKind; } |
2149 | |
2150 | /// Returns the clock frequency for the given AMDGPU device. |
2151 | uint64_t getClockFrequency() const override { return ClockFrequency; } |
2152 | |
2153 | /// Allocate and construct an AMDGPU kernel. |
2154 | Expected<GenericKernelTy &> constructKernel(const char *Name) override { |
2155 | // Allocate and construct the AMDGPU kernel. |
2156 | AMDGPUKernelTy *AMDGPUKernel = Plugin.allocate<AMDGPUKernelTy>(); |
2157 | if (!AMDGPUKernel) |
2158 | return Plugin::error(ErrorCode::OUT_OF_RESOURCES, |
2159 | "failed to allocate memory for AMDGPU kernel" ); |
2160 | |
2161 | new (AMDGPUKernel) AMDGPUKernelTy(Name); |
2162 | |
2163 | return *AMDGPUKernel; |
2164 | } |
2165 | |
2166 | /// Set the current context to this device's context. Do nothing since the |
2167 | /// AMDGPU devices do not have the concept of contexts. |
2168 | Error setContext() override { return Plugin::success(); } |
2169 | |
2170 | /// AMDGPU returns the product of the number of compute units and the waves |
2171 | /// per compute unit. |
2172 | uint64_t getHardwareParallelism() const override { |
2173 | return HardwareParallelism; |
2174 | } |
2175 | |
2176 | /// We want to set up the RPC server for host services to the GPU if it is |
2177 | /// available. |
2178 | bool shouldSetupRPCServer() const override { return true; } |
2179 | |
2180 | /// The RPC interface should have enough space for all available parallelism. |
2181 | uint64_t requestedRPCPortCount() const override { |
2182 | return getHardwareParallelism(); |
2183 | } |
2184 | |
2185 | /// Get the stream of the asynchronous info structure or get a new one. |
2186 | Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper, |
2187 | AMDGPUStreamTy *&Stream) { |
2188 | // Get the stream (if any) from the async info. |
2189 | Stream = AsyncInfoWrapper.getQueueAs<AMDGPUStreamTy *>(); |
2190 | if (!Stream) { |
2191 | // There was no stream; get an idle one. |
2192 | if (auto Err = AMDGPUStreamManager.getResource(Stream)) |
2193 | return Err; |
2194 | |
2195 | // Modify the async info's stream. |
2196 | AsyncInfoWrapper.setQueueAs<AMDGPUStreamTy *>(Stream); |
2197 | } |
2198 | return Plugin::success(); |
2199 | } |
2200 | |
2201 | /// Load the binary image into the device and allocate an image object. |
2202 | Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage, |
2203 | int32_t ImageId) override { |
2204 | // Allocate and initialize the image object. |
2205 | AMDGPUDeviceImageTy *AMDImage = Plugin.allocate<AMDGPUDeviceImageTy>(); |
2206 | new (AMDImage) AMDGPUDeviceImageTy(ImageId, *this, TgtImage); |
2207 | |
2208 | // Load the HSA executable. |
    if (Error Err = AMDImage->loadExecutable(*this))
2210 | return std::move(Err); |
2211 | |
2212 | return AMDImage; |
2213 | } |
2214 | |
2215 | /// Allocate memory on the device or related to the device. |
2216 | void *allocate(size_t Size, void *, TargetAllocTy Kind) override; |
2217 | |
2218 | /// Deallocate memory on the device or related to the device. |
2219 | int free(void *TgtPtr, TargetAllocTy Kind) override { |
2220 | if (TgtPtr == nullptr) |
2221 | return OFFLOAD_SUCCESS; |
2222 | |
2223 | AMDGPUMemoryPoolTy *MemoryPool = nullptr; |
2224 | switch (Kind) { |
2225 | case TARGET_ALLOC_DEFAULT: |
2226 | case TARGET_ALLOC_DEVICE: |
2227 | case TARGET_ALLOC_DEVICE_NON_BLOCKING: |
2228 | MemoryPool = CoarseGrainedMemoryPools[0]; |
2229 | break; |
2230 | case TARGET_ALLOC_HOST: |
2231 | MemoryPool = &HostDevice.getFineGrainedMemoryPool(); |
2232 | break; |
2233 | case TARGET_ALLOC_SHARED: |
2234 | MemoryPool = &HostDevice.getFineGrainedMemoryPool(); |
2235 | break; |
2236 | } |
2237 | |
2238 | if (!MemoryPool) { |
2239 | REPORT("No memory pool for the specified allocation kind\n" ); |
2240 | return OFFLOAD_FAIL; |
2241 | } |
2242 | |
    if (Error Err = MemoryPool->deallocate(TgtPtr)) {
      REPORT("%s\n", toString(std::move(Err)).data());
2245 | return OFFLOAD_FAIL; |
2246 | } |
2247 | |
2248 | return OFFLOAD_SUCCESS; |
2249 | } |
2250 | |
2251 | /// Synchronize current thread with the pending operations on the async info. |
2252 | Error synchronizeImpl(__tgt_async_info &AsyncInfo) override { |
2253 | AMDGPUStreamTy *Stream = |
2254 | reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue); |
2255 | assert(Stream && "Invalid stream" ); |
2256 | |
2257 | if (auto Err = Stream->synchronize()) |
2258 | return Err; |
2259 | |
2260 | // Once the stream is synchronized, return it to stream pool and reset |
2261 | // AsyncInfo. This is to make sure the synchronization only works for its |
2262 | // own tasks. |
2263 | AsyncInfo.Queue = nullptr; |
2264 | return AMDGPUStreamManager.returnResource(Stream); |
2265 | } |
2266 | |
2267 | /// Query for the completion of the pending operations on the async info. |
2268 | Error queryAsyncImpl(__tgt_async_info &AsyncInfo) override { |
2269 | AMDGPUStreamTy *Stream = |
2270 | reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue); |
2271 | assert(Stream && "Invalid stream" ); |
2272 | |
2273 | auto CompletedOrErr = Stream->query(); |
2274 | if (!CompletedOrErr) |
2275 | return CompletedOrErr.takeError(); |
2276 | |
    // Return if the stream did not complete yet.
2278 | if (!(*CompletedOrErr)) |
2279 | return Plugin::success(); |
2280 | |
2281 | // Once the stream is completed, return it to stream pool and reset |
2282 | // AsyncInfo. This is to make sure the synchronization only works for its |
2283 | // own tasks. |
2284 | AsyncInfo.Queue = nullptr; |
2285 | return AMDGPUStreamManager.returnResource(Stream); |
2286 | } |
2287 | |
2288 | /// Pin the host buffer and return the device pointer that should be used for |
2289 | /// device transfers. |
2290 | Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override { |
2291 | void *PinnedPtr = nullptr; |
2292 | |
2293 | hsa_status_t Status = |
2294 | hsa_amd_memory_lock(HstPtr, Size, nullptr, 0, &PinnedPtr); |
2295 | if (auto Err = Plugin::check(Status, "error in hsa_amd_memory_lock: %s\n" )) |
2296 | return std::move(Err); |
2297 | |
2298 | return PinnedPtr; |
2299 | } |
2300 | |
2301 | /// Unpin the host buffer. |
2302 | Error dataUnlockImpl(void *HstPtr) override { |
2303 | hsa_status_t Status = hsa_amd_memory_unlock(HstPtr); |
2304 | return Plugin::check(Status, "error in hsa_amd_memory_unlock: %s\n" ); |
2305 | } |
2306 | |
2307 | /// Check through the HSA runtime whether the \p HstPtr buffer is pinned. |
2308 | Expected<bool> isPinnedPtrImpl(void *HstPtr, void *&BaseHstPtr, |
2309 | void *&BaseDevAccessiblePtr, |
2310 | size_t &BaseSize) const override { |
2311 | hsa_amd_pointer_info_t Info; |
2312 | Info.size = sizeof(hsa_amd_pointer_info_t); |
2313 | |
2314 | hsa_status_t Status = hsa_amd_pointer_info( |
2315 | HstPtr, &Info, /*Allocator=*/nullptr, /*num_agents_accessible=*/nullptr, |
2316 | /*accessible=*/nullptr); |
2317 | if (auto Err = Plugin::check(Status, "error in hsa_amd_pointer_info: %s" )) |
2318 | return std::move(Err); |
2319 | |
    // The buffer may be locked or allocated through HSA allocators. Assume
    // that the buffer is host pinned if the runtime reports an HSA type.
2322 | if (Info.type != HSA_EXT_POINTER_TYPE_LOCKED && |
2323 | Info.type != HSA_EXT_POINTER_TYPE_HSA) |
2324 | return false; |
2325 | |
2326 | assert(Info.hostBaseAddress && "Invalid host pinned address" ); |
2327 | assert(Info.agentBaseAddress && "Invalid agent pinned address" ); |
2328 | assert(Info.sizeInBytes > 0 && "Invalid pinned allocation size" ); |
2329 | |
2330 | // Save the allocation info in the output parameters. |
2331 | BaseHstPtr = Info.hostBaseAddress; |
2332 | BaseDevAccessiblePtr = Info.agentBaseAddress; |
2333 | BaseSize = Info.sizeInBytes; |
2334 | |
2335 | return true; |
2336 | } |
2337 | |
2338 | /// Submit data to the device (host to device transfer). |
2339 | Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size, |
2340 | AsyncInfoWrapperTy &AsyncInfoWrapper) override { |
2341 | AMDGPUStreamTy *Stream = nullptr; |
2342 | void *PinnedPtr = nullptr; |
2343 | |
2344 | // Use one-step asynchronous operation when host memory is already pinned. |
2345 | if (void *PinnedPtr = |
2346 | PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) { |
2347 | if (auto Err = getStream(AsyncInfoWrapper, Stream)) |
2348 | return Err; |
      return Stream->pushPinnedMemoryCopyAsync(TgtPtr, PinnedPtr, Size);
2350 | } |
2351 | |
2352 | // For large transfers use synchronous behavior. |
2353 | if (Size >= OMPX_MaxAsyncCopyBytes) { |
2354 | if (AsyncInfoWrapper.hasQueue()) |
2355 | if (auto Err = synchronize(AsyncInfoWrapper)) |
2356 | return Err; |
2357 | |
2358 | hsa_status_t Status; |
2359 | Status = hsa_amd_memory_lock(const_cast<void *>(HstPtr), Size, nullptr, 0, |
2360 | &PinnedPtr); |
2361 | if (auto Err = |
2362 | Plugin::check(Status, "error in hsa_amd_memory_lock: %s\n" )) |
2363 | return Err; |
2364 | |
2365 | AMDGPUSignalTy Signal; |
2366 | if (auto Err = Signal.init()) |
2367 | return Err; |
2368 | |
2369 | if (auto Err = hsa_utils::asyncMemCopy(useMultipleSdmaEngines(), TgtPtr, |
2370 | Agent, PinnedPtr, Agent, Size, 0, |
2371 | nullptr, Signal.get())) |
2372 | return Err; |
2373 | |
2374 | if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds())) |
2375 | return Err; |
2376 | |
2377 | if (auto Err = Signal.deinit()) |
2378 | return Err; |
2379 | |
2380 | Status = hsa_amd_memory_unlock(const_cast<void *>(HstPtr)); |
2381 | return Plugin::check(Status, "error in hsa_amd_memory_unlock: %s\n" ); |
2382 | } |
2383 | |
2384 | // Otherwise, use two-step copy with an intermediate pinned host buffer. |
2385 | AMDGPUMemoryManagerTy &PinnedMemoryManager = |
2386 | HostDevice.getPinnedMemoryManager(); |
    if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedPtr))
2388 | return Err; |
2389 | |
2390 | if (auto Err = getStream(AsyncInfoWrapper, Stream)) |
2391 | return Err; |
2392 | |
    return Stream->pushMemoryCopyH2DAsync(TgtPtr, HstPtr, PinnedPtr, Size,
                                           PinnedMemoryManager);
2395 | } |
2396 | |
2397 | /// Retrieve data from the device (device to host transfer). |
2398 | Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size, |
2399 | AsyncInfoWrapperTy &AsyncInfoWrapper) override { |
2400 | AMDGPUStreamTy *Stream = nullptr; |
2401 | void *PinnedPtr = nullptr; |
2402 | |
2403 | // Use one-step asynchronous operation when host memory is already pinned. |
2404 | if (void *PinnedPtr = |
2405 | PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) { |
2406 | if (auto Err = getStream(AsyncInfoWrapper, Stream)) |
2407 | return Err; |
2408 | |
      return Stream->pushPinnedMemoryCopyAsync(PinnedPtr, TgtPtr, Size);
2410 | } |
2411 | |
2412 | // For large transfers use synchronous behavior. |
2413 | if (Size >= OMPX_MaxAsyncCopyBytes) { |
2414 | if (AsyncInfoWrapper.hasQueue()) |
2415 | if (auto Err = synchronize(AsyncInfoWrapper)) |
2416 | return Err; |
2417 | |
2418 | hsa_status_t Status; |
2419 | Status = hsa_amd_memory_lock(const_cast<void *>(HstPtr), Size, nullptr, 0, |
2420 | &PinnedPtr); |
2421 | if (auto Err = |
2422 | Plugin::check(Status, "error in hsa_amd_memory_lock: %s\n" )) |
2423 | return Err; |
2424 | |
2425 | AMDGPUSignalTy Signal; |
2426 | if (auto Err = Signal.init()) |
2427 | return Err; |
2428 | |
2429 | if (auto Err = hsa_utils::asyncMemCopy(useMultipleSdmaEngines(), |
2430 | PinnedPtr, Agent, TgtPtr, Agent, |
2431 | Size, 0, nullptr, Signal.get())) |
2432 | return Err; |
2433 | |
2434 | if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds())) |
2435 | return Err; |
2436 | |
2437 | if (auto Err = Signal.deinit()) |
2438 | return Err; |
2439 | |
2440 | Status = hsa_amd_memory_unlock(const_cast<void *>(HstPtr)); |
2441 | return Plugin::check(Status, "error in hsa_amd_memory_unlock: %s\n" ); |
2442 | } |
2443 | |
2444 | // Otherwise, use two-step copy with an intermediate pinned host buffer. |
2445 | AMDGPUMemoryManagerTy &PinnedMemoryManager = |
2446 | HostDevice.getPinnedMemoryManager(); |
    if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedPtr))
2448 | return Err; |
2449 | |
2450 | if (auto Err = getStream(AsyncInfoWrapper, Stream)) |
2451 | return Err; |
2452 | |
    return Stream->pushMemoryCopyD2HAsync(HstPtr, TgtPtr, PinnedPtr, Size,
                                           PinnedMemoryManager);
2455 | } |
2456 | |
2457 | /// Exchange data between two devices within the plugin. |
2458 | Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice, |
2459 | void *DstPtr, int64_t Size, |
2460 | AsyncInfoWrapperTy &AsyncInfoWrapper) override { |
2461 | AMDGPUDeviceTy &DstDevice = static_cast<AMDGPUDeviceTy &>(DstGenericDevice); |
2462 | |
2463 | // For large transfers use synchronous behavior. |
2464 | if (Size >= OMPX_MaxAsyncCopyBytes) { |
2465 | if (AsyncInfoWrapper.hasQueue()) |
2466 | if (auto Err = synchronize(AsyncInfoWrapper)) |
2467 | return Err; |
2468 | |
2469 | AMDGPUSignalTy Signal; |
2470 | if (auto Err = Signal.init()) |
2471 | return Err; |
2472 | |
2473 | if (auto Err = hsa_utils::asyncMemCopy( |
2474 | useMultipleSdmaEngines(), DstPtr, DstDevice.getAgent(), SrcPtr, |
2475 | getAgent(), (uint64_t)Size, 0, nullptr, Signal.get())) |
2476 | return Err; |
2477 | |
2478 | if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds())) |
2479 | return Err; |
2480 | |
2481 | return Signal.deinit(); |
2482 | } |
2483 | |
2484 | AMDGPUStreamTy *Stream = nullptr; |
2485 | if (auto Err = getStream(AsyncInfoWrapper, Stream)) |
2486 | return Err; |
2487 | if (Size <= 0) |
2488 | return Plugin::success(); |
2489 | |
2490 | return Stream->pushMemoryCopyD2DAsync(DstPtr, DstDevice.getAgent(), SrcPtr, |
2491 | getAgent(), (uint64_t)Size); |
2492 | } |
2493 | |
2494 | /// Initialize the async info for interoperability purposes. |
2495 | Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override { |
2496 | // TODO: Implement this function. |
2497 | return Plugin::success(); |
2498 | } |
2499 | |
2500 | /// Initialize the device info for interoperability purposes. |
2501 | Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override { |
2502 | DeviceInfo->Context = nullptr; |
2503 | |
2504 | if (!DeviceInfo->Device) |
2505 | DeviceInfo->Device = reinterpret_cast<void *>(Agent.handle); |
2506 | |
2507 | return Plugin::success(); |
2508 | } |
2509 | |
2510 | /// Create an event. |
2511 | Error createEventImpl(void **EventPtrStorage) override { |
2512 | AMDGPUEventTy **Event = reinterpret_cast<AMDGPUEventTy **>(EventPtrStorage); |
2513 | return AMDGPUEventManager.getResource(*Event); |
2514 | } |
2515 | |
2516 | /// Destroy a previously created event. |
2517 | Error destroyEventImpl(void *EventPtr) override { |
2518 | AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr); |
2519 | return AMDGPUEventManager.returnResource(Event); |
2520 | } |
2521 | |
2522 | /// Record the event. |
2523 | Error recordEventImpl(void *EventPtr, |
2524 | AsyncInfoWrapperTy &AsyncInfoWrapper) override { |
2525 | AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr); |
2526 | assert(Event && "Invalid event" ); |
2527 | |
2528 | AMDGPUStreamTy *Stream = nullptr; |
2529 | if (auto Err = getStream(AsyncInfoWrapper, Stream)) |
2530 | return Err; |
2531 | |
    return Event->record(*Stream);
2533 | } |
2534 | |
2535 | /// Make the stream wait on the event. |
2536 | Error waitEventImpl(void *EventPtr, |
2537 | AsyncInfoWrapperTy &AsyncInfoWrapper) override { |
2538 | AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr); |
2539 | |
2540 | AMDGPUStreamTy *Stream = nullptr; |
2541 | if (auto Err = getStream(AsyncInfoWrapper, Stream)) |
2542 | return Err; |
2543 | |
    return Event->wait(*Stream);
2545 | } |
2546 | |
2547 | /// Synchronize the current thread with the event. |
2548 | Error syncEventImpl(void *EventPtr) override { |
2549 | return Plugin::error(ErrorCode::UNIMPLEMENTED, |
2550 | "synchronize event not implemented" ); |
2551 | } |
2552 | |
2553 | /// Print information about the device. |
2554 | Error obtainInfoImpl(InfoQueueTy &Info) override { |
2555 | char TmpChar[1000]; |
2556 | const char *TmpCharPtr = "Unknown" ; |
2557 | uint16_t Major, Minor; |
2558 | uint32_t TmpUInt, TmpUInt2; |
2559 | uint32_t CacheSize[4]; |
2560 | size_t TmpSt; |
2561 | bool TmpBool; |
2562 | uint16_t WorkgrpMaxDim[3]; |
2563 | hsa_dim3_t GridMaxDim; |
2564 | hsa_status_t Status, Status2; |
2565 | |
2566 | Status = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major); |
2567 | Status2 = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor); |
2568 | if (Status == HSA_STATUS_SUCCESS && Status2 == HSA_STATUS_SUCCESS) |
      Info.add("HSA Runtime Version",
               std::to_string(Major) + "." + std::to_string(Minor));
2571 | |
2572 | Info.add("HSA OpenMP Device Number" , DeviceId); |
2573 | |
2574 | Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar); |
2575 | if (Status == HSA_STATUS_SUCCESS) |
2576 | Info.add("Product Name" , TmpChar); |
2577 | |
2578 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_NAME, TmpChar); |
2579 | if (Status == HSA_STATUS_SUCCESS) |
2580 | Info.add("Device Name" , TmpChar); |
2581 | |
2582 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_VENDOR_NAME, TmpChar); |
2583 | if (Status == HSA_STATUS_SUCCESS) |
2584 | Info.add("Vendor Name" , TmpChar); |
2585 | |
2586 | hsa_device_type_t DevType; |
2587 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_DEVICE, DevType); |
2588 | if (Status == HSA_STATUS_SUCCESS) { |
2589 | switch (DevType) { |
2590 | case HSA_DEVICE_TYPE_CPU: |
2591 | TmpCharPtr = "CPU" ; |
2592 | break; |
2593 | case HSA_DEVICE_TYPE_GPU: |
2594 | TmpCharPtr = "GPU" ; |
2595 | break; |
2596 | case HSA_DEVICE_TYPE_DSP: |
2597 | TmpCharPtr = "DSP" ; |
2598 | break; |
2599 | } |
2600 | Info.add("Device Type" , TmpCharPtr); |
2601 | } |
2602 | |
2603 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUES_MAX, TmpUInt); |
2604 | if (Status == HSA_STATUS_SUCCESS) |
2605 | Info.add("Max Queues" , TmpUInt); |
2606 | |
2607 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUE_MIN_SIZE, TmpUInt); |
2608 | if (Status == HSA_STATUS_SUCCESS) |
2609 | Info.add("Queue Min Size" , TmpUInt); |
2610 | |
2611 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUE_MAX_SIZE, TmpUInt); |
2612 | if (Status == HSA_STATUS_SUCCESS) |
2613 | Info.add("Queue Max Size" , TmpUInt); |
2614 | |
2615 | // FIXME: This is deprecated according to HSA documentation. But using |
2616 | // hsa_agent_iterate_caches and hsa_cache_get_info breaks execution during |
2617 | // runtime. |
2618 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_CACHE_SIZE, CacheSize); |
2619 | if (Status == HSA_STATUS_SUCCESS) { |
2620 | Info.add("Cache" ); |
2621 | |
2622 | for (int I = 0; I < 4; I++) |
2623 | if (CacheSize[I]) |
2624 | Info.add<InfoLevel2>("L" + std::to_string(I), CacheSize[I]); |
2625 | } |
2626 | |
2627 | Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_CACHELINE_SIZE, TmpUInt); |
2628 | if (Status == HSA_STATUS_SUCCESS) |
2629 | Info.add("Cacheline Size" , TmpUInt); |
2630 | |
2631 | Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, TmpUInt); |
2632 | if (Status == HSA_STATUS_SUCCESS) |
2633 | Info.add("Max Clock Freq" , TmpUInt, "MHz" ); |
2634 | |
2635 | Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, TmpUInt); |
2636 | if (Status == HSA_STATUS_SUCCESS) |
2637 | Info.add("Compute Units" , TmpUInt); |
2638 | |
2639 | Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU, TmpUInt); |
2640 | if (Status == HSA_STATUS_SUCCESS) |
2641 | Info.add("SIMD per CU" , TmpUInt); |
2642 | |
2643 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_FAST_F16_OPERATION, TmpBool); |
2644 | if (Status == HSA_STATUS_SUCCESS) |
2645 | Info.add("Fast F16 Operation" , TmpBool); |
2646 | |
2647 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_WAVEFRONT_SIZE, TmpUInt2); |
2648 | if (Status == HSA_STATUS_SUCCESS) |
2649 | Info.add("Wavefront Size" , TmpUInt2); |
2650 | |
2651 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, TmpUInt); |
2652 | if (Status == HSA_STATUS_SUCCESS) |
2653 | Info.add("Workgroup Max Size" , TmpUInt); |
2654 | |
2655 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgrpMaxDim); |
2656 | if (Status == HSA_STATUS_SUCCESS) { |
2657 | Info.add("Workgroup Max Size per Dimension" ); |
2658 | Info.add<InfoLevel2>("x" , WorkgrpMaxDim[0]); |
2659 | Info.add<InfoLevel2>("y" , WorkgrpMaxDim[1]); |
2660 | Info.add<InfoLevel2>("z" , WorkgrpMaxDim[2]); |
2661 | } |
2662 | |
2663 | Status = getDeviceAttrRaw( |
2664 | (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, TmpUInt); |
2665 | if (Status == HSA_STATUS_SUCCESS) { |
2666 | Info.add("Max Waves Per CU" , TmpUInt); |
2667 | Info.add("Max Work-item Per CU" , TmpUInt * TmpUInt2); |
2668 | } |
2669 | |
2670 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_SIZE, TmpUInt); |
2671 | if (Status == HSA_STATUS_SUCCESS) |
2672 | Info.add("Grid Max Size" , TmpUInt); |
2673 | |
2674 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim); |
2675 | if (Status == HSA_STATUS_SUCCESS) { |
2676 | Info.add("Grid Max Size per Dimension" ); |
2677 | Info.add<InfoLevel2>("x" , GridMaxDim.x); |
2678 | Info.add<InfoLevel2>("y" , GridMaxDim.y); |
2679 | Info.add<InfoLevel2>("z" , GridMaxDim.z); |
2680 | } |
2681 | |
2682 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_FBARRIER_MAX_SIZE, TmpUInt); |
2683 | if (Status == HSA_STATUS_SUCCESS) |
2684 | Info.add("Max fbarriers/Workgrp" , TmpUInt); |
2685 | |
2686 | Info.add("Memory Pools" ); |
2687 | for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) { |
2688 | std::string TmpStr, TmpStr2; |
2689 | |
2690 | if (Pool->isGlobal()) |
2691 | TmpStr = "Global" ; |
2692 | else if (Pool->isReadOnly()) |
2693 | TmpStr = "ReadOnly" ; |
2694 | else if (Pool->isPrivate()) |
2695 | TmpStr = "Private" ; |
2696 | else if (Pool->isGroup()) |
2697 | TmpStr = "Group" ; |
2698 | else |
2699 | TmpStr = "Unknown" ; |
2700 | |
2701 | Info.add<InfoLevel2>(std::string("Pool " ) + TmpStr); |
2702 | |
2703 | if (Pool->isGlobal()) { |
2704 | if (Pool->isFineGrained()) |
2705 | TmpStr2 += "Fine Grained " ; |
2706 | if (Pool->isCoarseGrained()) |
2707 | TmpStr2 += "Coarse Grained " ; |
2708 | if (Pool->supportsKernelArgs()) |
2709 | TmpStr2 += "Kernarg " ; |
2710 | |
2711 | Info.add<InfoLevel3>("Flags" , TmpStr2); |
2712 | } |
2713 | |
2714 | Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, TmpSt); |
2715 | if (Status == HSA_STATUS_SUCCESS) |
2716 | Info.add<InfoLevel3>("Size" , TmpSt, "bytes" ); |
2717 | |
2718 | Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, |
2719 | TmpBool); |
2720 | if (Status == HSA_STATUS_SUCCESS) |
2721 | Info.add<InfoLevel3>("Allocatable" , TmpBool); |
2722 | |
2723 | Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, |
2724 | TmpSt); |
2725 | if (Status == HSA_STATUS_SUCCESS) |
2726 | Info.add<InfoLevel3>("Runtime Alloc Granule" , TmpSt, "bytes" ); |
2727 | |
2728 | Status = Pool->getAttrRaw( |
2729 | HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, TmpSt); |
2730 | if (Status == HSA_STATUS_SUCCESS) |
2731 | Info.add<InfoLevel3>("Runtime Alloc Alignment" , TmpSt, "bytes" ); |
2732 | |
2733 | Status = |
2734 | Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, TmpBool); |
2735 | if (Status == HSA_STATUS_SUCCESS) |
2736 | Info.add<InfoLevel3>("Accessible by all" , TmpBool); |
2737 | } |
2738 | |
2739 | Info.add("ISAs" ); |
2740 | auto Err = hsa_utils::iterateAgentISAs(getAgent(), [&](hsa_isa_t ISA) { |
2741 | Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, TmpChar); |
2742 | if (Status == HSA_STATUS_SUCCESS) |
2743 | Info.add<InfoLevel2>("Name" , TmpChar); |
2744 | |
2745 | return Status; |
2746 | }); |
2747 | |
2748 | // Silently consume the error. |
2749 | if (Err) |
2750 | consumeError(std::move(Err)); |
2751 | |
2752 | return Plugin::success(); |
2753 | } |
2754 | |
  /// Returns true if auto zero-copy is the best configuration for the current
  /// architecture.
2757 | /// On AMDGPUs, automatic zero-copy is turned on |
2758 | /// when running on an APU with XNACK (unified memory) support |
2759 | /// enabled. On discrete GPUs, automatic zero-copy is triggered |
2760 | /// if the user sets the environment variable OMPX_APU_MAPS=1 |
2761 | /// and if XNACK is enabled. The rationale is that zero-copy |
2762 | /// is the best configuration (performance, memory footprint) on APUs, |
2763 | /// while it is often not the best on discrete GPUs. |
2764 | /// XNACK can be enabled with a kernel boot parameter or with |
2765 | /// the HSA_XNACK environment variable. |
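  /// Illustration (derived from the expression below; not an exhaustive
  /// specification):
  ///   APU with XNACK enabled                         -> auto zero-copy
  ///   discrete GPU, OMPX_APU_MAPS set, XNACK enabled -> auto zero-copy
  ///   XNACK disabled                                 -> no auto zero-copy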
2766 | bool useAutoZeroCopyImpl() override { |
2767 | return ((IsAPU || OMPX_ApuMaps) && IsXnackEnabled); |
2768 | } |
2769 | |
2770 | /// Getters and setters for stack and heap sizes. |
2771 | Error getDeviceStackSize(uint64_t &Value) override { |
2772 | Value = StackSize; |
2773 | return Plugin::success(); |
2774 | } |
2775 | Error setDeviceStackSize(uint64_t Value) override { |
2776 | StackSize = Value; |
2777 | return Plugin::success(); |
2778 | } |
2779 | Error getDeviceHeapSize(uint64_t &Value) override { |
2780 | Value = DeviceMemoryPoolSize; |
2781 | return Plugin::success(); |
2782 | } |
2783 | Error setDeviceHeapSize(uint64_t Value) override { |
2784 | for (DeviceImageTy *Image : LoadedImages) |
2785 | if (auto Err = setupDeviceMemoryPool(Plugin, *Image, Value)) |
2786 | return Err; |
2787 | DeviceMemoryPoolSize = Value; |
2788 | return Plugin::success(); |
2789 | } |
2790 | Error getDeviceMemorySize(uint64_t &Value) override { |
2791 | for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) { |
2792 | if (Pool->isGlobal()) { |
2793 | hsa_status_t Status = |
2794 | Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, Value); |
2795 | return Plugin::check(Status, "error in getting device memory size: %s" ); |
2796 | } |
2797 | } |
2798 | return Plugin::error(ErrorCode::UNSUPPORTED, |
2799 | "getDeviceMemorySize:: no global pool" ); |
2800 | } |
2801 | |
2802 | /// AMDGPU-specific function to get device attributes. |
2803 | template <typename Ty> Error getDeviceAttr(uint32_t Kind, Ty &Value) { |
2804 | hsa_status_t Status = |
2805 | hsa_agent_get_info(Agent, (hsa_agent_info_t)Kind, &Value); |
2806 | return Plugin::check(Status, "Error in hsa_agent_get_info: %s" ); |
2807 | } |
2808 | |
2809 | template <typename Ty> |
2810 | hsa_status_t getDeviceAttrRaw(uint32_t Kind, Ty &Value) { |
2811 | return hsa_agent_get_info(Agent, (hsa_agent_info_t)Kind, &Value); |
2812 | } |
2813 | |
2814 | /// Get the device agent. |
2815 | hsa_agent_t getAgent() const override { return Agent; } |
2816 | |
2817 | /// Get the signal manager. |
2818 | AMDGPUSignalManagerTy &getSignalManager() { return AMDGPUSignalManager; } |
2819 | |
2820 | /// Retrieve and construct all memory pools of the device agent. |
2821 | Error retrieveAllMemoryPools() override { |
2822 | // Iterate through the available pools of the device agent. |
2823 | return hsa_utils::iterateAgentMemoryPools( |
2824 | Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) { |
2825 | AMDGPUMemoryPoolTy *MemoryPool = |
2826 | Plugin.allocate<AMDGPUMemoryPoolTy>(); |
2827 | new (MemoryPool) AMDGPUMemoryPoolTy(HSAMemoryPool); |
2828 | AllMemoryPools.push_back(MemoryPool); |
2829 | return HSA_STATUS_SUCCESS; |
2830 | }); |
2831 | } |
2832 | |
2833 | bool useMultipleSdmaEngines() const { return OMPX_UseMultipleSdmaEngines; } |
2834 | |
2835 | private: |
2836 | using AMDGPUEventRef = AMDGPUResourceRef<AMDGPUEventTy>; |
2837 | using AMDGPUEventManagerTy = GenericDeviceResourceManagerTy<AMDGPUEventRef>; |
2838 | |
  /// Common method to invoke a single-threaded constructor or destructor
  /// kernel by name.
2841 | Error callGlobalCtorDtorCommon(GenericPluginTy &Plugin, DeviceImageTy &Image, |
2842 | bool IsCtor) { |
2843 | const char *KernelName = |
2844 | IsCtor ? "amdgcn.device.init" : "amdgcn.device.fini" ; |
2845 | // Perform a quick check for the named kernel in the image. The kernel |
2846 | // should be created by the 'amdgpu-lower-ctor-dtor' pass. |
2847 | GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler(); |
2848 | if (IsCtor && !Handler.isSymbolInImage(*this, Image, KernelName)) |
2849 | return Plugin::success(); |
2850 | |
2851 | // Allocate and construct the AMDGPU kernel. |
2852 | AMDGPUKernelTy AMDGPUKernel(KernelName); |
2853 | if (auto Err = AMDGPUKernel.init(*this, Image)) |
2854 | return Err; |
2855 | |
2856 | AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr); |
2857 | |
2858 | KernelArgsTy KernelArgs = {}; |
2859 | uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u}; |
2860 | if (auto Err = AMDGPUKernel.launchImpl( |
2861 | *this, NumBlocksAndThreads, NumBlocksAndThreads, KernelArgs, |
2862 | KernelLaunchParamsTy{}, AsyncInfoWrapper)) |
2863 | return Err; |
2864 | |
2865 | Error Err = Plugin::success(); |
2866 | AsyncInfoWrapper.finalize(Err); |
2867 | |
2868 | return Err; |
2869 | } |
2870 | |
  /// Detect if the current architecture is an APU.
2872 | Error checkIfAPU() { |
2873 | // TODO: replace with ROCr API once it becomes available. |
2874 | llvm::StringRef StrGfxName(ComputeUnitKind); |
2875 | bool MayBeAPU = llvm::StringSwitch<bool>(StrGfxName) |
2876 | .Case(S: "gfx942" , Value: true) |
2877 | .Default(Value: false); |
2878 | if (!MayBeAPU) |
2879 | return Plugin::success(); |
2880 | |
    // The chip can be either an MI300A or an MI300X.
2882 | uint32_t ChipID = 0; |
2883 | if (auto Err = getDeviceAttr(HSA_AMD_AGENT_INFO_CHIP_ID, ChipID)) |
2884 | return Err; |
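
    // Heuristic (assumption): an even chip ID is taken to indicate the APU
    // variant (MI300A), while an odd one indicates the discrete MI300X.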
2885 | |
2886 | if (!(ChipID & 0x1)) { |
2887 | IsAPU = true; |
2888 | return Plugin::success(); |
2889 | } |
2890 | return Plugin::success(); |
2891 | } |
2892 | |
  /// Envar for controlling the number of HSA queues per device. A high number
  /// of queues may degrade performance.
2895 | UInt32Envar OMPX_NumQueues; |
2896 | |
  /// Envar for controlling the size of each HSA queue. The size is the number
  /// of HSA packets a queue is expected to hold. It is also the number of HSA
  /// packets that can be pushed into each queue without waiting for the driver
  /// to process them.
2901 | UInt32Envar OMPX_QueueSize; |
2902 | |
2903 | /// Envar for controlling the default number of teams relative to the number |
2904 | /// of compute units (CUs) the device has: |
2905 | /// #default_teams = OMPX_DefaultTeamsPerCU * #CUs. |
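  /// For example, a value of 4 on a device with 104 compute units yields 416
  /// default teams.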
2906 | UInt32Envar OMPX_DefaultTeamsPerCU; |
2907 | |
  /// Envar specifying the maximum transfer size in bytes for which memory
  /// copies are issued as asynchronous operations pushed to the corresponding
  /// stream. Larger transfers are performed synchronously.
2912 | UInt32Envar OMPX_MaxAsyncCopyBytes; |
2913 | |
2914 | /// Envar controlling the initial number of HSA signals per device. There is |
2915 | /// one manager of signals per device managing several pre-allocated signals. |
2916 | /// These signals are mainly used by AMDGPU streams. If needed, more signals |
2917 | /// will be created. |
2918 | UInt32Envar OMPX_InitialNumSignals; |
2919 | |
  /// Envar to set the time to wait in active state before switching to a
  /// blocked state. The default of 2000000 busy-waits for 2 seconds before
  /// going into a blocking HSA wait state. The unit for this variable is
  /// microseconds.
2924 | UInt32Envar OMPX_StreamBusyWait; |
2925 | |
  /// Use the ROCm 5.7 interface for multiple SDMA engines.
2927 | BoolEnvar OMPX_UseMultipleSdmaEngines; |
2928 | |
2929 | /// Value of OMPX_APU_MAPS env var used to force |
2930 | /// automatic zero-copy behavior on non-APU GPUs. |
2931 | BoolEnvar OMPX_ApuMaps; |
2932 | |
2933 | /// Stream manager for AMDGPU streams. |
2934 | AMDGPUStreamManagerTy AMDGPUStreamManager; |
2935 | |
2936 | /// Event manager for AMDGPU events. |
2937 | AMDGPUEventManagerTy AMDGPUEventManager; |
2938 | |
2939 | /// Signal manager for AMDGPU signals. |
2940 | AMDGPUSignalManagerTy AMDGPUSignalManager; |
2941 | |
  /// The agent handle corresponding to the device.
2943 | hsa_agent_t Agent; |
2944 | |
2945 | /// The GPU architecture. |
2946 | std::string ComputeUnitKind; |
2947 | |
2948 | /// The frequency of the steady clock inside the device. |
2949 | uint64_t ClockFrequency; |
2950 | |
2951 | /// The total number of concurrent work items that can be running on the GPU. |
2952 | uint64_t HardwareParallelism; |
2953 | |
2954 | /// Reference to the host device. |
2955 | AMDHostDeviceTy &HostDevice; |
2956 | |
2957 | /// The current size of the global device memory pool (managed by us). |
  uint64_t DeviceMemoryPoolSize = 1L << 29L /* 512 MB */;
2959 | |
2960 | /// The current size of the stack that will be used in cases where it could |
2961 | /// not be statically determined. |
2962 | uint64_t StackSize = 16 * 1024 /* 16 KB */; |
2963 | |
2964 | /// Is the plugin associated with an APU? |
2965 | bool IsAPU = false; |
2966 | |
  /// True if the system is configured with XNACK enabled, false otherwise.
2969 | bool IsXnackEnabled = false; |
2970 | }; |
2971 | |
2972 | Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) { |
2973 | hsa_code_object_reader_t Reader; |
2974 | hsa_status_t Status = |
2975 | hsa_code_object_reader_create_from_memory(getStart(), getSize(), &Reader); |
2976 | if (auto Err = Plugin::check( |
2977 | Status, "error in hsa_code_object_reader_create_from_memory: %s" )) |
2978 | return Err; |
2979 | |
2980 | Status = hsa_executable_create_alt( |
2981 | HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, "" , &Executable); |
2982 | if (auto Err = |
2983 | Plugin::check(Status, "error in hsa_executable_create_alt: %s" )) |
2984 | return Err; |
2985 | |
2986 | hsa_loaded_code_object_t Object; |
2987 | Status = hsa_executable_load_agent_code_object(Executable, Device.getAgent(), |
2988 | Reader, "" , &Object); |
2989 | if (auto Err = Plugin::check( |
2990 | Status, "error in hsa_executable_load_agent_code_object: %s" )) |
2991 | return Err; |
2992 | |
2993 | Status = hsa_executable_freeze(Executable, "" ); |
2994 | if (auto Err = Plugin::check(Status, "error in hsa_executable_freeze: %s" )) |
2995 | return Err; |
2996 | |
2997 | uint32_t Result; |
2998 | Status = hsa_executable_validate(Executable, &Result); |
2999 | if (auto Err = Plugin::check(Status, "error in hsa_executable_validate: %s" )) |
3000 | return Err; |
3001 | |
3002 | if (Result) |
3003 | return Plugin::error(ErrorCode::INVALID_BINARY, |
3004 | "loaded HSA executable does not validate" ); |
3005 | |
3006 | Status = hsa_code_object_reader_destroy(Reader); |
3007 | if (auto Err = |
3008 | Plugin::check(Status, "error in hsa_code_object_reader_destroy: %s" )) |
3009 | return Err; |
3010 | |
3011 | if (auto Err = hsa_utils::readAMDGPUMetaDataFromImage( |
3012 | getMemoryBuffer(), KernelInfoMap, ELFABIVersion)) |
3013 | return Err; |
3014 | |
3015 | return Plugin::success(); |
3016 | } |
3017 | |
3018 | Expected<hsa_executable_symbol_t> |
3019 | AMDGPUDeviceImageTy::findDeviceSymbol(GenericDeviceTy &Device, |
3020 | StringRef SymbolName) const { |
3021 | |
3022 | AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(Device); |
3023 | hsa_agent_t Agent = AMDGPUDevice.getAgent(); |
3024 | |
3025 | hsa_executable_symbol_t Symbol; |
3026 | hsa_status_t Status = hsa_executable_get_symbol_by_name( |
3027 | Executable, SymbolName.data(), &Agent, &Symbol); |
3028 | if (auto Err = Plugin::check( |
3029 | Status, "error in hsa_executable_get_symbol_by_name(%s): %s" , |
3030 | SymbolName.data())) |
3031 | return std::move(Err); |
3032 | |
3033 | return Symbol; |
3034 | } |
3035 | |
3036 | template <typename ResourceTy> |
3037 | Error AMDGPUResourceRef<ResourceTy>::create(GenericDeviceTy &Device) { |
3038 | if (Resource) |
3039 | return Plugin::error(ErrorCode::INVALID_ARGUMENT, |
3040 | "creating an existing resource" ); |
3041 | |
3042 | AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(Device); |
3043 | |
3044 | Resource = new ResourceTy(AMDGPUDevice); |
3045 | |
3046 | return Resource->init(); |
3047 | } |
3048 | |
3049 | AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device) |
3050 | : Agent(Device.getAgent()), Queue(nullptr), |
3051 | SignalManager(Device.getSignalManager()), Device(Device), |
3052 | // Initialize the std::deque with some empty positions. |
3053 | Slots(32), NextSlot(0), SyncCycle(0), |
3054 | StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()), |
3055 | UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()) {} |
3056 | |
3057 | /// Class implementing the AMDGPU-specific functionalities of the global |
3058 | /// handler. |
3059 | struct AMDGPUGlobalHandlerTy final : public GenericGlobalHandlerTy { |
  /// Get the metadata of a global from the device. The name and size of the
  /// global are read from DeviceGlobal, and the address of the global is
  /// written back to DeviceGlobal.
3063 | Error getGlobalMetadataFromDevice(GenericDeviceTy &Device, |
3064 | DeviceImageTy &Image, |
3065 | GlobalTy &DeviceGlobal) override { |
3066 | AMDGPUDeviceImageTy &AMDImage = static_cast<AMDGPUDeviceImageTy &>(Image); |
3067 | |
3068 | // Find the symbol on the device executable. |
3069 | auto SymbolOrErr = |
3070 | AMDImage.findDeviceSymbol(Device, DeviceGlobal.getName()); |
3071 | if (!SymbolOrErr) |
3072 | return SymbolOrErr.takeError(); |
3073 | |
3074 | hsa_executable_symbol_t Symbol = *SymbolOrErr; |
3075 | hsa_symbol_kind_t SymbolType; |
3076 | hsa_status_t Status; |
3077 | uint64_t SymbolAddr; |
3078 | uint32_t SymbolSize; |
3079 | |
3080 | // Retrieve the type, address and size of the symbol. |
3081 | std::pair<hsa_executable_symbol_info_t, void *> RequiredInfos[] = { |
3082 | {HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &SymbolType}, |
3083 | {HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &SymbolAddr}, |
3084 | {HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &SymbolSize}}; |
3085 | |
3086 | for (auto &Info : RequiredInfos) { |
3087 | Status = hsa_executable_symbol_get_info(Symbol, Info.first, Info.second); |
3088 | if (auto Err = Plugin::check( |
3089 | Status, "error in hsa_executable_symbol_get_info: %s" )) |
3090 | return Err; |
3091 | } |
3092 | |
3093 | // Check the size of the symbol. |
3094 | if (SymbolSize != DeviceGlobal.getSize()) |
3095 | return Plugin::error( |
3096 | ErrorCode::INVALID_BINARY, |
3097 | "failed to load global '%s' due to size mismatch (%zu != %zu)" , |
3098 | DeviceGlobal.getName().data(), SymbolSize, |
3099 | (size_t)DeviceGlobal.getSize()); |
3100 | |
3101 | // Store the symbol address on the device global metadata. |
3102 | DeviceGlobal.setPtr(reinterpret_cast<void *>(SymbolAddr)); |
3103 | |
3104 | return Plugin::success(); |
3105 | } |
3106 | }; |
3107 | |
3108 | /// Class implementing the AMDGPU-specific functionalities of the plugin. |
3109 | struct AMDGPUPluginTy final : public GenericPluginTy { |
3110 | /// Create an AMDGPU plugin and initialize the AMDGPU driver. |
3111 | AMDGPUPluginTy() |
3112 | : GenericPluginTy(getTripleArch()), Initialized(false), |
3113 | HostDevice(nullptr) {} |
3114 | |
3115 | /// This class should not be copied. |
3116 | AMDGPUPluginTy(const AMDGPUPluginTy &) = delete; |
3117 | AMDGPUPluginTy(AMDGPUPluginTy &&) = delete; |
3118 | |
3119 | /// Initialize the plugin and return the number of devices. |
3120 | Expected<int32_t> initImpl() override { |
3121 | hsa_status_t Status = hsa_init(); |
3122 | if (Status != HSA_STATUS_SUCCESS) { |
      // Cannot call hsa_status_string since the HSA runtime failed to
      // initialize.
3124 | DP("Failed to initialize AMDGPU's HSA library\n" ); |
3125 | return 0; |
3126 | } |
3127 | |
3128 | // The initialization of HSA was successful. It should be safe to call |
3129 | // HSA functions from now on, e.g., hsa_shut_down. |
3130 | Initialized = true; |
3131 | |
3132 | // Register event handler to detect memory errors on the devices. |
3133 | Status = hsa_amd_register_system_event_handler(eventHandler, this); |
3134 | if (auto Err = Plugin::check( |
3135 | Status, "error in hsa_amd_register_system_event_handler: %s" )) |
3136 | return std::move(Err); |
3137 | |
3138 | // List of host (CPU) agents. |
3139 | llvm::SmallVector<hsa_agent_t> HostAgents; |
3140 | |
3141 | // Count the number of available agents. |
    auto Err = hsa_utils::iterateAgents([&](hsa_agent_t Agent) {
3143 | // Get the device type of the agent. |
3144 | hsa_device_type_t DeviceType; |
3145 | hsa_status_t Status = |
3146 | hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType); |
3147 | if (Status != HSA_STATUS_SUCCESS) |
3148 | return Status; |
3149 | |
      // Classify the agents into kernel (GPU) and host (CPU) agents.
3151 | if (DeviceType == HSA_DEVICE_TYPE_GPU) { |
3152 | // Ensure that the GPU agent supports kernel dispatch packets. |
3153 | hsa_agent_feature_t Features; |
3154 | Status = hsa_agent_get_info(Agent, HSA_AGENT_INFO_FEATURE, &Features); |
3155 | if (Features & HSA_AGENT_FEATURE_KERNEL_DISPATCH) |
3156 | KernelAgents.push_back(Agent); |
3157 | } else if (DeviceType == HSA_DEVICE_TYPE_CPU) { |
3158 | HostAgents.push_back(Agent); |
3159 | } |
3160 | return HSA_STATUS_SUCCESS; |
3161 | }); |
3162 | |
3163 | if (Err) |
3164 | return std::move(Err); |
3165 | |
3166 | int32_t NumDevices = KernelAgents.size(); |
3167 | if (NumDevices == 0) { |
3168 | // Do not initialize if there are no devices. |
3169 | DP("There are no devices supporting AMDGPU.\n" ); |
3170 | return 0; |
3171 | } |
3172 | |
3173 | // There are kernel agents but there is no host agent. That should be |
3174 | // treated as an error. |
3175 | if (HostAgents.empty()) |
3176 | return Plugin::error(ErrorCode::BACKEND_FAILURE, "no AMDGPU host agents" ); |
3177 | |
3178 | // Initialize the host device using host agents. |
3179 | HostDevice = allocate<AMDHostDeviceTy>(); |
3180 | new (HostDevice) AMDHostDeviceTy(*this, HostAgents); |
3181 | |
    // Set up the memory pools available to the host.
3183 | if (auto Err = HostDevice->init()) |
3184 | return std::move(Err); |
3185 | |
3186 | return NumDevices; |
3187 | } |
3188 | |
3189 | /// Deinitialize the plugin. |
3190 | Error deinitImpl() override { |
3191 | // The HSA runtime was not initialized, so nothing from the plugin was |
3192 | // actually initialized. |
3193 | if (!Initialized) |
3194 | return Plugin::success(); |
3195 | |
3196 | if (HostDevice) |
3197 | if (auto Err = HostDevice->deinit()) |
3198 | return Err; |
3199 | |
3200 | // Finalize the HSA runtime. |
3201 | hsa_status_t Status = hsa_shut_down(); |
3202 | return Plugin::check(Status, "error in hsa_shut_down: %s" ); |
3203 | } |
3204 | |
3205 | /// Creates an AMDGPU device. |
3206 | GenericDeviceTy *createDevice(GenericPluginTy &Plugin, int32_t DeviceId, |
3207 | int32_t NumDevices) override { |
3208 | return new AMDGPUDeviceTy(Plugin, DeviceId, NumDevices, getHostDevice(), |
3209 | getKernelAgent(DeviceId)); |
3210 | } |
3211 | |
3212 | /// Creates an AMDGPU global handler. |
3213 | GenericGlobalHandlerTy *createGlobalHandler() override { |
3214 | return new AMDGPUGlobalHandlerTy(); |
3215 | } |
3216 | |
3217 | Triple::ArchType getTripleArch() const override { return Triple::amdgcn; } |
3218 | |
3219 | const char *getName() const override { return GETNAME(TARGET_NAME); } |
3220 | |
3221 | /// Get the ELF code for recognizing the compatible image binary. |
3222 | uint16_t getMagicElfBits() const override { return ELF::EM_AMDGPU; } |
3223 | |
3224 | /// Check whether the image is compatible with an AMDGPU device. |
3225 | Expected<bool> isELFCompatible(uint32_t DeviceId, |
3226 | StringRef Image) const override { |
3227 | // Get the associated architecture and flags from the ELF. |
3228 | auto ElfOrErr = ELF64LEObjectFile::create( |
3229 | MemoryBufferRef(Image, /*Identifier=*/"" ), /*InitContent=*/false); |
3230 | if (!ElfOrErr) |
3231 | return ElfOrErr.takeError(); |
3232 | std::optional<StringRef> Processor = ElfOrErr->tryGetCPUName(); |
3233 | if (!Processor) |
3234 | return false; |
3235 | |
3236 | SmallVector<SmallString<32>> Targets; |
3237 | if (auto Err = hsa_utils::getTargetTripleAndFeatures( |
3238 | getKernelAgent(DeviceId), Targets)) |
3239 | return Err; |
3240 | for (auto &Target : Targets) |
3241 | if (offloading::amdgpu::isImageCompatibleWithEnv( |
3242 | Processor ? *Processor : "" , ElfOrErr->getPlatformFlags(), |
3243 | Target.str())) |
3244 | return true; |
3245 | return false; |
3246 | } |
3247 | |
3248 | bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) override { |
3249 | return true; |
3250 | } |
3251 | |
3252 | /// Get the host device instance. |
3253 | AMDHostDeviceTy &getHostDevice() { |
3254 | assert(HostDevice && "Host device not initialized" ); |
3255 | return *HostDevice; |
3256 | } |
3257 | |
3258 | /// Get the kernel agent with the corresponding agent id. |
3259 | hsa_agent_t getKernelAgent(int32_t AgentId) const { |
3260 | assert((uint32_t)AgentId < KernelAgents.size() && "Invalid agent id" ); |
3261 | return KernelAgents[AgentId]; |
3262 | } |
3263 | |
3264 | /// Get the list of the available kernel agents. |
3265 | const llvm::SmallVector<hsa_agent_t> &getKernelAgents() const { |
3266 | return KernelAgents; |
3267 | } |
3268 | |
3269 | private: |
3270 | /// Event handler that will be called by ROCr if an event is detected. |
3271 | static hsa_status_t eventHandler(const hsa_amd_event_t *Event, |
3272 | void *PluginPtr) { |
3273 | if (Event->event_type != HSA_AMD_GPU_MEMORY_FAULT_EVENT) |
3274 | return HSA_STATUS_SUCCESS; |
3275 | |
3276 | SmallVector<std::string> Reasons; |
3277 | uint32_t ReasonsMask = Event->memory_fault.fault_reason_mask; |
3278 | if (ReasonsMask & HSA_AMD_MEMORY_FAULT_PAGE_NOT_PRESENT) |
3279 | Reasons.emplace_back(Args: "Page not present or supervisor privilege" ); |
3280 | if (ReasonsMask & HSA_AMD_MEMORY_FAULT_READ_ONLY) |
3281 | Reasons.emplace_back(Args: "Write access to a read-only page" ); |
3282 | if (ReasonsMask & HSA_AMD_MEMORY_FAULT_NX) |
3283 | Reasons.emplace_back(Args: "Execute access to a page marked NX" ); |
3284 | if (ReasonsMask & HSA_AMD_MEMORY_FAULT_HOST_ONLY) |
3285 | Reasons.emplace_back(Args: "GPU attempted access to a host only page" ); |
3286 | if (ReasonsMask & HSA_AMD_MEMORY_FAULT_DRAMECC) |
3287 | Reasons.emplace_back(Args: "DRAM ECC failure" ); |
3288 | if (ReasonsMask & HSA_AMD_MEMORY_FAULT_IMPRECISE) |
3289 | Reasons.emplace_back(Args: "Can't determine the exact fault address" ); |
3290 | if (ReasonsMask & HSA_AMD_MEMORY_FAULT_SRAMECC) |
3291 | Reasons.emplace_back(Args: "SRAM ECC failure (ie registers, no fault address)" ); |
3292 | if (ReasonsMask & HSA_AMD_MEMORY_FAULT_HANG) |
3293 | Reasons.emplace_back(Args: "GPU reset following unspecified hang" ); |
3294 | |
3295 | // If we do not know the reason, say so, otherwise remove the trailing comma |
3296 | // and space. |
3297 | if (Reasons.empty()) |
3298 | Reasons.emplace_back(Args: "Unknown (" + std::to_string(val: ReasonsMask) + ")" ); |
3299 | |
3300 | uint32_t Node = -1; |
3301 | hsa_agent_get_info(Event->memory_fault.agent, HSA_AGENT_INFO_NODE, &Node); |
3302 | |
3303 | AMDGPUPluginTy &Plugin = *reinterpret_cast<AMDGPUPluginTy *>(PluginPtr); |
3304 | for (uint32_t I = 0, E = Plugin.getNumDevices(); |
3305 | Node != uint32_t(-1) && I < E; ++I) { |
3306 | AMDGPUDeviceTy &AMDGPUDevice = |
3307 | reinterpret_cast<AMDGPUDeviceTy &>(Plugin.getDevice(I)); |
3308 | auto KernelTraceInfoRecord = |
3309 | AMDGPUDevice.KernelLaunchTraces.getExclusiveAccessor(); |
3310 | |
3311 | uint32_t DeviceNode = -1; |
3312 | if (auto Err = |
3313 | AMDGPUDevice.getDeviceAttr(HSA_AGENT_INFO_NODE, DeviceNode)) { |
3314 | consumeError(std::move(Err)); |
3315 | continue; |
3316 | } |
3317 | if (DeviceNode != Node) |
3318 | continue; |
3319 | void *DevicePtr = (void *)Event->memory_fault.virtual_address; |
3320 | std::string S; |
3321 | llvm::raw_string_ostream OS(S); |
3322 | OS << llvm::format("memory access fault by GPU %" PRIu32 |
3323 | " (agent 0x%" PRIx64 |
3324 | ") at virtual address %p. Reasons: %s" , |
3325 | Node, Event->memory_fault.agent.handle, |
3326 | (void *)Event->memory_fault.virtual_address, |
3327 | llvm::join(Reasons, ", " ).c_str()); |
3328 | ErrorReporter::reportKernelTraces(AMDGPUDevice, *KernelTraceInfoRecord); |
3329 | ErrorReporter::reportMemoryAccessError(AMDGPUDevice, DevicePtr, S, |
3330 | /*Abort*/ true); |
3331 | } |
3332 | |
3333 | // Abort the execution since we do not recover from this error. |
3334 | FATAL_MESSAGE(1, |
3335 | "memory access fault by GPU %" PRIu32 " (agent 0x%" PRIx64 |
3336 | ") at virtual address %p. Reasons: %s" , |
3337 | Node, Event->memory_fault.agent.handle, |
3338 | (void *)Event->memory_fault.virtual_address, |
3339 | llvm::join(Reasons, ", " ).c_str()); |
3340 | |
3341 | return HSA_STATUS_ERROR; |
3342 | } |
3343 | |
  /// Indicate whether the HSA runtime was correctly initialized. Even if there
  /// are no available devices this boolean will be true. It indicates whether
  /// we can safely call HSA functions (e.g., hsa_shut_down).
3347 | bool Initialized; |
3348 | |
  /// Array of the available kernel (GPU) agents. These handles should not be
  /// stored here but in the AMDGPUDeviceTy structures directly. However, the
  /// HSA standard does not provide API functions to retrieve agents directly,
  /// only iterating functions. We cache the agents here for convenience.
3353 | llvm::SmallVector<hsa_agent_t> KernelAgents; |
3354 | |
3355 | /// The device representing all HSA host agents. |
3356 | AMDHostDeviceTy *HostDevice; |
3357 | }; |
3358 | |
3359 | Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, |
3360 | uint32_t NumThreads[3], uint32_t NumBlocks[3], |
3361 | KernelArgsTy &KernelArgs, |
3362 | KernelLaunchParamsTy LaunchParams, |
3363 | AsyncInfoWrapperTy &AsyncInfoWrapper) const { |
3364 | if (ArgsSize != LaunchParams.Size && |
3365 | ArgsSize > LaunchParams.Size + getImplicitArgsSize()) |
3366 | return Plugin::error(ErrorCode::INVALID_ARGUMENT, |
3367 | "invalid kernel arguments size" ); |
3368 | |
3369 | AMDGPUPluginTy &AMDGPUPlugin = |
3370 | static_cast<AMDGPUPluginTy &>(GenericDevice.Plugin); |
3371 | AMDHostDeviceTy &HostDevice = AMDGPUPlugin.getHostDevice(); |
3372 | AMDGPUMemoryManagerTy &ArgsMemoryManager = HostDevice.getArgsMemoryManager(); |
3373 | |
3374 | void *AllArgs = nullptr; |
3375 | if (auto Err = ArgsMemoryManager.allocate(ArgsSize, &AllArgs)) |
3376 | return Err; |
3377 | |
3378 | // Account for user requested dynamic shared memory. |
3379 | uint32_t GroupSize = getGroupSize(); |
3380 | if (uint32_t MaxDynCGroupMem = std::max( |
3381 | KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize())) { |
3382 | GroupSize += MaxDynCGroupMem; |
3383 | } |
3384 | |
3385 | uint64_t StackSize; |
3386 | if (auto Err = GenericDevice.getDeviceStackSize(StackSize)) |
3387 | return Err; |
3388 | |
  // Copy the explicit arguments.
  // TODO: We should expose the args memory manager alloc to the common part as
  // an alternative to copying them twice.
3392 | if (LaunchParams.Size) |
    std::memcpy(AllArgs, LaunchParams.Data, LaunchParams.Size);
3394 | |
3395 | AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(GenericDevice); |
3396 | |
3397 | AMDGPUStreamTy *Stream = nullptr; |
3398 | if (auto Err = AMDGPUDevice.getStream(AsyncInfoWrapper, Stream)) |
3399 | return Err; |
3400 | |
3401 | uint64_t ImplArgsOffset = utils::roundUp( |
3402 | LaunchParams.Size, alignof(hsa_utils::AMDGPUImplicitArgsTy)); |
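  // Note: the implicit (hidden) arguments start right after the explicit ones,
  // rounded up to the alignment of the implicit-args struct; they are only
  // initialized below if the kernel actually reserves space for them.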
3403 | if (ArgsSize > ImplArgsOffset) { |
3404 | hsa_utils::AMDGPUImplicitArgsTy *ImplArgs = |
3405 | reinterpret_cast<hsa_utils::AMDGPUImplicitArgsTy *>( |
3406 | utils::advancePtr(AllArgs, ImplArgsOffset)); |
3407 | |
3408 | // Set the COV5+ implicit arguments to the appropriate values if present. |
3409 | uint64_t ImplArgsSize = ArgsSize - ImplArgsOffset; |
    std::memset(ImplArgs, 0, ImplArgsSize);
3411 | |
3412 | using ImplArgsTy = hsa_utils::AMDGPUImplicitArgsTy; |
3413 | hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::BlockCountX, ImplArgsSize, |
3414 | NumBlocks[0]); |
3415 | hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::BlockCountY, ImplArgsSize, |
3416 | NumBlocks[1]); |
3417 | hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::BlockCountZ, ImplArgsSize, |
3418 | NumBlocks[2]); |
3419 | |
3420 | hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::GroupSizeX, ImplArgsSize, |
3421 | NumThreads[0]); |
3422 | hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::GroupSizeY, ImplArgsSize, |
3423 | NumThreads[1]); |
3424 | hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::GroupSizeZ, ImplArgsSize, |
3425 | NumThreads[2]); |
3426 | |
3427 | hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::GridDims, ImplArgsSize, |
3428 | NumBlocks[2] * NumThreads[2] > 1 |
3429 | ? 3 |
3430 | : 1 + (NumBlocks[1] * NumThreads[1] != 1)); |
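    // The grid dimensionality set above is 3 if the Z extent is non-trivial,
    // otherwise 2 if the Y extent is non-trivial, otherwise 1.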
3431 | |
3432 | hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::DynamicLdsSize, ImplArgsSize, |
3433 | KernelArgs.DynCGroupMem); |
3434 | } |
3435 | |
3436 | // Push the kernel launch into the stream. |
  return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
                                  GroupSize, StackSize, ArgsMemoryManager);
3439 | } |
3440 | |
3441 | Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice, |
3442 | KernelArgsTy &KernelArgs, |
3443 | uint32_t NumThreads[3], |
3444 | uint32_t NumBlocks[3]) const { |
3445 | // Only do all this when the output is requested |
3446 | if (!(getInfoLevel() & OMP_INFOTYPE_PLUGIN_KERNEL)) |
3447 | return Plugin::success(); |
3448 | |
  // We don't have the data to print additional info; this is not a hard
  // error.
3450 | if (!KernelInfo.has_value()) |
3451 | return Plugin::success(); |
3452 | |
3453 | // General Info |
3454 | auto NumGroups = NumBlocks; |
3455 | auto ThreadsPerGroup = NumThreads; |
3456 | |
3457 | // Kernel Arguments Info |
3458 | auto ArgNum = KernelArgs.NumArgs; |
3459 | auto LoopTripCount = KernelArgs.Tripcount; |
3460 | |
3461 | // Details for AMDGPU kernels (read from image) |
3462 | // https://www.llvm.org/docs/AMDGPUUsage.html#code-object-v4-metadata |
3463 | auto GroupSegmentSize = (*KernelInfo).GroupSegmentList; |
3464 | auto SGPRCount = (*KernelInfo).SGPRCount; |
3465 | auto VGPRCount = (*KernelInfo).VGPRCount; |
3466 | auto SGPRSpillCount = (*KernelInfo).SGPRSpillCount; |
3467 | auto VGPRSpillCount = (*KernelInfo).VGPRSpillCount; |
3468 | auto MaxFlatWorkgroupSize = (*KernelInfo).MaxFlatWorkgroupSize; |
3469 | |
3470 | // Prints additional launch info that contains the following. |
3471 | // Num Args: The number of kernel arguments |
3472 | // Teams x Thrds: The number of teams and the number of threads actually |
3473 | // running. |
3474 | // MaxFlatWorkgroupSize: Maximum flat work-group size supported by the |
3475 | // kernel in work-items |
3476 | // LDS Usage: Amount of bytes used in LDS storage |
3477 | // S/VGPR Count: the number of S/V GPRs occupied by the kernel |
3478 | // S/VGPR Spill Count: how many S/VGPRs are spilled by the kernel |
3479 | // Tripcount: loop tripcount for the kernel |
3480 | INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(), |
3481 | "#Args: %d Teams x Thrds: %4ux%4u (MaxFlatWorkGroupSize: %u) LDS " |
3482 | "Usage: %uB #SGPRs/VGPRs: %u/%u #SGPR/VGPR Spills: %u/%u Tripcount: " |
3483 | "%lu\n" , |
3484 | ArgNum, NumGroups[0] * NumGroups[1] * NumGroups[2], |
3485 | ThreadsPerGroup[0] * ThreadsPerGroup[1] * ThreadsPerGroup[2], |
3486 | MaxFlatWorkgroupSize, GroupSegmentSize, SGPRCount, VGPRCount, |
3487 | SGPRSpillCount, VGPRSpillCount, LoopTripCount); |
3488 | |
3489 | return Plugin::success(); |
3490 | } |
3491 | |
3492 | template <typename... ArgsTy> |
3493 | static Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) { |
3494 | hsa_status_t ResultCode = static_cast<hsa_status_t>(Code); |
3495 | if (ResultCode == HSA_STATUS_SUCCESS || ResultCode == HSA_STATUS_INFO_BREAK) |
3496 | return Plugin::success(); |
3497 | |
3498 | const char *Desc = "unknown error" ; |
3499 | hsa_status_t Ret = hsa_status_string(ResultCode, &Desc); |
3500 | if (Ret != HSA_STATUS_SUCCESS) |
3501 | REPORT("Unrecognized " GETNAME(TARGET_NAME) " error code %d\n" , Code); |
3502 | |
3503 | // TODO: Add more entries to this switch |
3504 | ErrorCode OffloadErrCode; |
3505 | switch (ResultCode) { |
3506 | case HSA_STATUS_ERROR_INVALID_SYMBOL_NAME: |
3507 | OffloadErrCode = ErrorCode::NOT_FOUND; |
3508 | break; |
3509 | default: |
3510 | OffloadErrCode = ErrorCode::UNKNOWN; |
3511 | } |
3512 | |
3513 | return Plugin::error(OffloadErrCode, ErrFmt, Args..., Desc); |
3514 | } |
3515 | |
3516 | void *AMDGPUMemoryManagerTy::allocate(size_t Size, void *HstPtr, |
3517 | TargetAllocTy Kind) { |
3518 | // Allocate memory from the pool. |
3519 | void *Ptr = nullptr; |
  if (auto Err = MemoryPool->allocate(Size, &Ptr)) {
    consumeError(std::move(Err));
3522 | return nullptr; |
3523 | } |
3524 | assert(Ptr && "Invalid pointer" ); |
3525 | |
3526 | // Get a list of agents that can access this memory pool. |
3527 | llvm::SmallVector<hsa_agent_t> Agents; |
3528 | llvm::copy_if( |
3529 | Plugin.getKernelAgents(), std::back_inserter(Agents), |
3530 | [&](hsa_agent_t Agent) { return MemoryPool->canAccess(Agent); }); |
3531 | |
3532 | // Allow all valid kernel agents to access the allocation. |
3533 | if (auto Err = MemoryPool->enableAccess(Ptr, Size, Agents)) { |
3534 | REPORT("%s\n" , toString(std::move(Err)).data()); |
3535 | return nullptr; |
3536 | } |
3537 | return Ptr; |
3538 | } |
3539 | |
3540 | void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) { |
3541 | if (Size == 0) |
3542 | return nullptr; |
3543 | |
3544 | // Find the correct memory pool. |
3545 | AMDGPUMemoryPoolTy *MemoryPool = nullptr; |
3546 | switch (Kind) { |
3547 | case TARGET_ALLOC_DEFAULT: |
3548 | case TARGET_ALLOC_DEVICE: |
3549 | case TARGET_ALLOC_DEVICE_NON_BLOCKING: |
3550 | MemoryPool = CoarseGrainedMemoryPools[0]; |
3551 | break; |
3552 | case TARGET_ALLOC_HOST: |
3553 | MemoryPool = &HostDevice.getFineGrainedMemoryPool(); |
3554 | break; |
3555 | case TARGET_ALLOC_SHARED: |
3556 | MemoryPool = &HostDevice.getFineGrainedMemoryPool(); |
3557 | break; |
3558 | } |
3559 | |
3560 | if (!MemoryPool) { |
3561 | REPORT("No memory pool for the specified allocation kind\n" ); |
3562 | return nullptr; |
3563 | } |
3564 | |
3565 | // Allocate from the corresponding memory pool. |
3566 | void *Alloc = nullptr; |
  if (Error Err = MemoryPool->allocate(Size, &Alloc)) {
    REPORT("%s\n", toString(std::move(Err)).data());
3569 | return nullptr; |
3570 | } |
3571 | |
3572 | if (Alloc) { |
    // Get a list of agents that can access this memory pool. This is
    // inherently necessary for host or shared allocations. It is also enabled
    // for device memory to allow device-to-device memcpy.
3576 | llvm::SmallVector<hsa_agent_t> Agents; |
3577 | llvm::copy_if(static_cast<AMDGPUPluginTy &>(Plugin).getKernelAgents(), |
3578 | std::back_inserter(Agents), [&](hsa_agent_t Agent) { |
3579 | return MemoryPool->canAccess(Agent); |
3580 | }); |
3581 | |
3582 | // Enable all valid kernel agents to access the buffer. |
3583 | if (auto Err = MemoryPool->enableAccess(Alloc, Size, Agents)) { |
3584 | REPORT("%s\n" , toString(std::move(Err)).data()); |
3585 | return nullptr; |
3586 | } |
3587 | } |
3588 | |
3589 | return Alloc; |
3590 | } |
3591 | |
3592 | void AMDGPUQueueTy::callbackError(hsa_status_t Status, hsa_queue_t *Source, |
3593 | void *Data) { |
3594 | auto &AMDGPUDevice = *reinterpret_cast<AMDGPUDeviceTy *>(Data); |
3595 | |
3596 | if (Status == HSA_STATUS_ERROR_EXCEPTION) { |
3597 | auto KernelTraceInfoRecord = |
3598 | AMDGPUDevice.KernelLaunchTraces.getExclusiveAccessor(); |
3599 | std::function<bool(__tgt_async_info &)> AsyncInfoWrapperMatcher = |
3600 | [=](__tgt_async_info &AsyncInfo) { |
3601 | auto *Stream = reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue); |
3602 | if (!Stream || !Stream->getQueue()) |
3603 | return false; |
3604 | return Stream->getQueue()->Queue == Source; |
3605 | }; |
3606 | ErrorReporter::reportTrapInKernel(AMDGPUDevice, *KernelTraceInfoRecord, |
3607 | AsyncInfoWrapperMatcher); |
3608 | } |
3609 | |
3610 | auto Err = Plugin::check(Status, "received error in queue %p: %s" , Source); |
3611 | FATAL_MESSAGE(1, "%s" , toString(std::move(Err)).data()); |
3612 | } |
3613 | |
3614 | } // namespace plugin |
3615 | } // namespace target |
3616 | } // namespace omp |
3617 | } // namespace llvm |
3618 | |
3619 | extern "C" { |
3620 | llvm::omp::target::plugin::GenericPluginTy *createPlugin_amdgpu() { |
3621 | return new llvm::omp::target::plugin::AMDGPUPluginTy(); |
3622 | } |
3623 | } |
3624 | |