1 | //===----RTLs/amdgpu/src/rtl.cpp - Target RTLs Implementation ----- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // RTL NextGen for AMDGPU machine |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #include <atomic> |
14 | #include <cassert> |
15 | #include <cstddef> |
16 | #include <deque> |
17 | #include <mutex> |
18 | #include <string> |
19 | #include <system_error> |
20 | #include <unistd.h> |
21 | #include <unordered_map> |
22 | |
23 | #include "Shared/Debug.h" |
24 | #include "Shared/Environment.h" |
25 | #include "Shared/Utils.h" |
26 | #include "Utils/ELF.h" |
27 | |
28 | #include "GlobalHandler.h" |
29 | #include "OpenMP/OMPT/Callback.h" |
30 | #include "PluginInterface.h" |
31 | #include "UtilitiesRTL.h" |
32 | #include "omptarget.h" |
33 | |
34 | #include "llvm/ADT/SmallString.h" |
35 | #include "llvm/ADT/SmallVector.h" |
36 | #include "llvm/ADT/StringRef.h" |
37 | #include "llvm/BinaryFormat/ELF.h" |
38 | #include "llvm/Frontend/OpenMP/OMPConstants.h" |
39 | #include "llvm/Frontend/OpenMP/OMPGridValues.h" |
40 | #include "llvm/Support/Error.h" |
41 | #include "llvm/Support/FileOutputBuffer.h" |
42 | #include "llvm/Support/FileSystem.h" |
43 | #include "llvm/Support/MemoryBuffer.h" |
44 | #include "llvm/Support/Program.h" |
45 | #include "llvm/Support/raw_ostream.h" |
46 | |
47 | #if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || \ |
48 | !defined(__ORDER_BIG_ENDIAN__) |
49 | #error "Missing preprocessor definitions for endianness detection." |
50 | #endif |
51 | |
52 | // The HSA headers require these definitions. |
53 | #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) |
54 | #define LITTLEENDIAN_CPU |
55 | #elif defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) |
56 | #define BIGENDIAN_CPU |
57 | #endif |
58 | |
59 | #if defined(__has_include) |
60 | #if __has_include("hsa/hsa.h") |
61 | #include "hsa/hsa.h" |
62 | #include "hsa/hsa_ext_amd.h" |
63 | #elif __has_include("hsa.h") |
64 | #include "hsa.h" |
65 | #include "hsa_ext_amd.h" |
66 | #endif |
67 | #else |
68 | #include "hsa/hsa.h" |
69 | #include "hsa/hsa_ext_amd.h" |
70 | #endif |
71 | |
72 | namespace llvm { |
73 | namespace omp { |
74 | namespace target { |
75 | namespace plugin { |
76 | |
77 | /// Forward declarations for all specialized data structures. |
78 | struct AMDGPUKernelTy; |
79 | struct AMDGPUDeviceTy; |
80 | struct AMDGPUPluginTy; |
81 | struct AMDGPUStreamTy; |
82 | struct AMDGPUEventTy; |
83 | struct AMDGPUStreamManagerTy; |
84 | struct AMDGPUEventManagerTy; |
85 | struct AMDGPUDeviceImageTy; |
86 | struct AMDGPUMemoryManagerTy; |
87 | struct AMDGPUMemoryPoolTy; |
88 | |
89 | namespace utils { |
90 | |
91 | /// Iterate elements using an HSA iterate function. Do not use this function |
92 | /// directly but the specialized ones below instead. |
93 | template <typename ElemTy, typename IterFuncTy, typename CallbackTy> |
94 | hsa_status_t iterate(IterFuncTy Func, CallbackTy Cb) { |
95 | auto L = [](ElemTy Elem, void *Data) -> hsa_status_t { |
96 | CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data); |
97 | return (*Unwrapped)(Elem); |
98 | }; |
99 | return Func(L, static_cast<void *>(&Cb)); |
100 | } |
101 | |
102 | /// Iterate elements using an HSA iterate function passing a parameter. Do not |
103 | /// use this function directly but the specialized ones below instead. |
104 | template <typename ElemTy, typename IterFuncTy, typename IterFuncArgTy, |
105 | typename CallbackTy> |
106 | hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) { |
107 | auto L = [](ElemTy Elem, void *Data) -> hsa_status_t { |
108 | CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data); |
109 | return (*Unwrapped)(Elem); |
110 | }; |
111 | return Func(FuncArg, L, static_cast<void *>(&Cb)); |
112 | } |
113 | |
114 | /// Iterate elements using an HSA iterate function passing a parameter. Do not |
115 | /// use this function directly but the specialized ones below instead. |
116 | template <typename Elem1Ty, typename Elem2Ty, typename IterFuncTy, |
117 | typename IterFuncArgTy, typename CallbackTy> |
118 | hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) { |
119 | auto L = [](Elem1Ty Elem1, Elem2Ty Elem2, void *Data) -> hsa_status_t { |
120 | CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data); |
121 | return (*Unwrapped)(Elem1, Elem2); |
122 | }; |
123 | return Func(FuncArg, L, static_cast<void *>(&Cb)); |
124 | } |
125 | |
126 | /// Iterate agents. |
127 | template <typename CallbackTy> Error iterateAgents(CallbackTy Callback) { |
128 | hsa_status_t Status = iterate<hsa_agent_t>(hsa_iterate_agents, Callback); |
129 | return Plugin::check(Status, "Error in hsa_iterate_agents: %s" ); |
130 | } |
131 | |
132 | /// Iterate ISAs of an agent. |
133 | template <typename CallbackTy> |
134 | Error iterateAgentISAs(hsa_agent_t Agent, CallbackTy Cb) { |
135 | hsa_status_t Status = iterate<hsa_isa_t>(hsa_agent_iterate_isas, Agent, Cb); |
136 | return Plugin::check(Status, "Error in hsa_agent_iterate_isas: %s" ); |
137 | } |
138 | |
139 | /// Iterate memory pools of an agent. |
140 | template <typename CallbackTy> |
141 | Error iterateAgentMemoryPools(hsa_agent_t Agent, CallbackTy Cb) { |
142 | hsa_status_t Status = iterate<hsa_amd_memory_pool_t>( |
143 | hsa_amd_agent_iterate_memory_pools, Agent, Cb); |
144 | return Plugin::check(Status, |
145 | "Error in hsa_amd_agent_iterate_memory_pools: %s" ); |
146 | } |
147 | |
148 | /// Dispatches an asynchronous memory copy. |
149 | /// Enables different SDMA engines for the dispatch in a round-robin fashion. |
/// Dispatches an asynchronous memory copy.
/// Enables different SDMA engines for the dispatch in a round-robin fashion.
///
/// \param UseMultipleSdmaEngines When false, issue a plain async copy;
///        otherwise alternate between two SDMA engines (ROCm >= 5.7 only).
/// \param Dst / DstAgent Destination pointer and the agent owning it.
/// \param Src / SrcAgent Source pointer and the agent owning it.
/// \param Size Number of bytes to copy.
/// \param NumDepSignals / DepSignals Input-dependency signals to wait on.
/// \param CompletionSignal Signal decremented by the runtime on completion.
Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent,
                   const void *Src, hsa_agent_t SrcAgent, size_t Size,
                   uint32_t NumDepSignals, const hsa_signal_t *DepSignals,
                   hsa_signal_t CompletionSignal) {
  if (!UseMultipleSdmaEngines) {
    hsa_status_t S =
        hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, Size,
                                  NumDepSignals, DepSignals, CompletionSignal);
    return Plugin::check(S, "Error in hsa_amd_memory_async_copy: %s");
  }

  // This solution is probably not the best
#if !(HSA_AMD_INTERFACE_VERSION_MAJOR >= 1 &&                                  \
      HSA_AMD_INTERFACE_VERSION_MINOR >= 2)
  return Plugin::error("Async copy on selected SDMA requires ROCm 5.7");
#else
  // Engine selector shared by all callers; starts at engine id 0x1.
  static std::atomic<int> SdmaEngine{1};

  // This atomics solution is probably not the best, but should be sufficient
  // for now.
  // In a worst case scenario, in which threads read the same value, they will
  // dispatch to the same SDMA engine. This may result in sub-optimal
  // performance. However, I think the possibility to be fairly low.
  int LocalSdmaEngine = SdmaEngine.load(std::memory_order_acquire);
  // This call is only avail in ROCm >= 5.7
  hsa_status_t S = hsa_amd_memory_async_copy_on_engine(
      Dst, DstAgent, Src, SrcAgent, Size, NumDepSignals, DepSignals,
      CompletionSignal, (hsa_amd_sdma_engine_id_t)LocalSdmaEngine,
      /*force_copy_on_sdma=*/true);
  // Increment to use one of two SDMA engines: 0x1, 0x2
  // (1 << 1) % 3 == 2 and (2 << 1) % 3 == 1, so the value ping-pongs between
  // the two engine ids. Note the load/modify/store sequence above is not a
  // single atomic RMW, so concurrent callers may pick the same engine; that
  // is the benign race described in the comment above.
  LocalSdmaEngine = (LocalSdmaEngine << 1) % 3;
  SdmaEngine.store(LocalSdmaEngine, std::memory_order_relaxed);

  return Plugin::check(S, "Error in hsa_amd_memory_async_copy_on_engine: %s");
#endif
}
186 | |
187 | Expected<std::string> getTargetTripleAndFeatures(hsa_agent_t Agent) { |
188 | std::string Target; |
189 | auto Err = utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) { |
190 | uint32_t Length; |
191 | hsa_status_t Status; |
192 | Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME_LENGTH, &Length); |
193 | if (Status != HSA_STATUS_SUCCESS) |
194 | return Status; |
195 | |
196 | llvm::SmallVector<char> ISAName(Length); |
197 | Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, ISAName.begin()); |
198 | if (Status != HSA_STATUS_SUCCESS) |
199 | return Status; |
200 | |
201 | llvm::StringRef TripleTarget(ISAName.begin(), Length); |
202 | if (TripleTarget.consume_front(Prefix: "amdgcn-amd-amdhsa" )) |
203 | Target = TripleTarget.ltrim(Char: '-').rtrim(Char: '\0').str(); |
204 | return HSA_STATUS_SUCCESS; |
205 | }); |
206 | if (Err) |
207 | return Err; |
208 | return Target; |
209 | } |
210 | } // namespace utils |
211 | |
212 | /// Utility class representing generic resource references to AMDGPU resources. |
/// Utility class representing generic resource references to AMDGPU resources.
/// Owns the referenced resource: destroy() both deinitializes and deletes it.
template <typename ResourceTy>
struct AMDGPUResourceRef : public GenericDeviceResourceRef {
  /// The underlying handle type for resources.
  using HandleTy = ResourceTy *;

  /// Create an empty reference to an invalid resource.
  AMDGPUResourceRef() : Resource(nullptr) {}

  /// Create a reference to an existing resource. Takes ownership of it.
  AMDGPUResourceRef(HandleTy Resource) : Resource(Resource) {}

  virtual ~AMDGPUResourceRef() {}

  /// Create a new resource and save the reference. The reference must be empty
  /// before calling to this function.
  Error create(GenericDeviceTy &Device) override;

  /// Destroy the referenced resource and invalidate the reference. The
  /// reference must be to a valid resource before calling to this function.
  Error destroy(GenericDeviceTy &Device) override {
    if (!Resource)
      return Plugin::error("Destroying an invalid resource" );

    // Deinitialize the resource before releasing its memory; a deinit
    // failure leaves the reference (and the allocation) intact.
    if (auto Err = Resource->deinit())
      return Err;

    delete Resource;

    Resource = nullptr;
    return Plugin::success();
  }

  /// Get the underlying resource handle (may be null for an empty reference).
  operator HandleTy() const { return Resource; }

private:
  /// The handle to the actual resource.
  HandleTy Resource;
};
252 | |
253 | /// Class holding an HSA memory pool. |
254 | struct AMDGPUMemoryPoolTy { |
255 | /// Create a memory pool from an HSA memory pool. |
256 | AMDGPUMemoryPoolTy(hsa_amd_memory_pool_t MemoryPool) |
257 | : MemoryPool(MemoryPool), GlobalFlags(0) {} |
258 | |
259 | /// Initialize the memory pool retrieving its properties. |
260 | Error init() { |
261 | if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_SEGMENT, Segment)) |
262 | return Err; |
263 | |
264 | if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, GlobalFlags)) |
265 | return Err; |
266 | |
267 | return Plugin::success(); |
268 | } |
269 | |
270 | /// Getter of the HSA memory pool. |
271 | hsa_amd_memory_pool_t get() const { return MemoryPool; } |
272 | |
273 | /// Indicate the segment which belongs to. |
274 | bool isGlobal() const { return (Segment == HSA_AMD_SEGMENT_GLOBAL); } |
275 | bool isReadOnly() const { return (Segment == HSA_AMD_SEGMENT_READONLY); } |
276 | bool isPrivate() const { return (Segment == HSA_AMD_SEGMENT_PRIVATE); } |
277 | bool isGroup() const { return (Segment == HSA_AMD_SEGMENT_GROUP); } |
278 | |
279 | /// Indicate if it is fine-grained memory. Valid only for global. |
280 | bool isFineGrained() const { |
281 | assert(isGlobal() && "Not global memory" ); |
282 | return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED); |
283 | } |
284 | |
285 | /// Indicate if it is coarse-grained memory. Valid only for global. |
286 | bool isCoarseGrained() const { |
287 | assert(isGlobal() && "Not global memory" ); |
288 | return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED); |
289 | } |
290 | |
291 | /// Indicate if it supports storing kernel arguments. Valid only for global. |
292 | bool supportsKernelArgs() const { |
293 | assert(isGlobal() && "Not global memory" ); |
294 | return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT); |
295 | } |
296 | |
297 | /// Allocate memory on the memory pool. |
298 | Error allocate(size_t Size, void **PtrStorage) { |
299 | hsa_status_t Status = |
300 | hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, PtrStorage); |
301 | return Plugin::check(Status, "Error in hsa_amd_memory_pool_allocate: %s" ); |
302 | } |
303 | |
304 | /// Return memory to the memory pool. |
305 | Error deallocate(void *Ptr) { |
306 | hsa_status_t Status = hsa_amd_memory_pool_free(Ptr); |
307 | return Plugin::check(Status, "Error in hsa_amd_memory_pool_free: %s" ); |
308 | } |
309 | |
310 | /// Allow the device to access a specific allocation. |
311 | Error enableAccess(void *Ptr, int64_t Size, |
312 | const llvm::SmallVector<hsa_agent_t> &Agents) const { |
313 | #ifdef OMPTARGET_DEBUG |
314 | for (hsa_agent_t Agent : Agents) { |
315 | hsa_amd_memory_pool_access_t Access; |
316 | if (auto Err = |
317 | getAttr(Agent, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, Access)) |
318 | return Err; |
319 | |
320 | // The agent is not allowed to access the memory pool in any case. Do not |
321 | // continue because otherwise it result in undefined behavior. |
322 | if (Access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) |
323 | return Plugin::error("An agent is not allowed to access a memory pool" ); |
324 | } |
325 | #endif |
326 | |
327 | // We can access but it is disabled by default. Enable the access then. |
328 | hsa_status_t Status = |
329 | hsa_amd_agents_allow_access(Agents.size(), Agents.data(), nullptr, Ptr); |
330 | return Plugin::check(Status, "Error in hsa_amd_agents_allow_access: %s" ); |
331 | } |
332 | |
333 | /// Get attribute from the memory pool. |
334 | template <typename Ty> |
335 | Error getAttr(hsa_amd_memory_pool_info_t Kind, Ty &Value) const { |
336 | hsa_status_t Status; |
337 | Status = hsa_amd_memory_pool_get_info(MemoryPool, Kind, &Value); |
338 | return Plugin::check(Status, "Error in hsa_amd_memory_pool_get_info: %s" ); |
339 | } |
340 | |
341 | template <typename Ty> |
342 | hsa_status_t getAttrRaw(hsa_amd_memory_pool_info_t Kind, Ty &Value) const { |
343 | return hsa_amd_memory_pool_get_info(MemoryPool, Kind, &Value); |
344 | } |
345 | |
346 | /// Get attribute from the memory pool relating to an agent. |
347 | template <typename Ty> |
348 | Error getAttr(hsa_agent_t Agent, hsa_amd_agent_memory_pool_info_t Kind, |
349 | Ty &Value) const { |
350 | hsa_status_t Status; |
351 | Status = |
352 | hsa_amd_agent_memory_pool_get_info(Agent, MemoryPool, Kind, &Value); |
353 | return Plugin::check(Status, |
354 | "Error in hsa_amd_agent_memory_pool_get_info: %s" ); |
355 | } |
356 | |
357 | private: |
358 | /// The HSA memory pool. |
359 | hsa_amd_memory_pool_t MemoryPool; |
360 | |
361 | /// The segment where the memory pool belongs to. |
362 | hsa_amd_segment_t Segment; |
363 | |
364 | /// The global flags of memory pool. Only valid if the memory pool belongs to |
365 | /// the global segment. |
366 | uint32_t GlobalFlags; |
367 | }; |
368 | |
369 | /// Class that implements a memory manager that gets memory from a specific |
370 | /// memory pool. |
371 | struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy { |
372 | |
373 | /// Create an empty memory manager. |
374 | AMDGPUMemoryManagerTy(AMDGPUPluginTy &Plugin) |
375 | : Plugin(Plugin), MemoryPool(nullptr), MemoryManager(nullptr) {} |
376 | |
377 | /// Initialize the memory manager from a memory pool. |
378 | Error init(AMDGPUMemoryPoolTy &MemoryPool) { |
379 | const uint32_t Threshold = 1 << 30; |
380 | this->MemoryManager = new MemoryManagerTy(*this, Threshold); |
381 | this->MemoryPool = &MemoryPool; |
382 | return Plugin::success(); |
383 | } |
384 | |
385 | /// Deinitialize the memory manager and free its allocations. |
386 | Error deinit() { |
387 | assert(MemoryManager && "Invalid memory manager" ); |
388 | |
389 | // Delete and invalidate the memory manager. At this point, the memory |
390 | // manager will deallocate all its allocations. |
391 | delete MemoryManager; |
392 | MemoryManager = nullptr; |
393 | |
394 | return Plugin::success(); |
395 | } |
396 | |
397 | /// Reuse or allocate memory through the memory manager. |
398 | Error allocate(size_t Size, void **PtrStorage) { |
399 | assert(MemoryManager && "Invalid memory manager" ); |
400 | assert(PtrStorage && "Invalid pointer storage" ); |
401 | |
402 | *PtrStorage = MemoryManager->allocate(Size, nullptr); |
403 | if (*PtrStorage == nullptr) |
404 | return Plugin::error("Failure to allocate from AMDGPU memory manager" ); |
405 | |
406 | return Plugin::success(); |
407 | } |
408 | |
409 | /// Release an allocation to be reused. |
410 | Error deallocate(void *Ptr) { |
411 | assert(Ptr && "Invalid pointer" ); |
412 | |
413 | if (MemoryManager->free(Ptr)) |
414 | return Plugin::error("Failure to deallocate from AMDGPU memory manager" ); |
415 | |
416 | return Plugin::success(); |
417 | } |
418 | |
419 | private: |
420 | /// Allocation callback that will be called once the memory manager does not |
421 | /// have more previously allocated buffers. |
422 | void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) override; |
423 | |
424 | /// Deallocation callack that will be called by the memory manager. |
425 | int free(void *TgtPtr, TargetAllocTy Kind) override { |
426 | if (auto Err = MemoryPool->deallocate(Ptr: TgtPtr)) { |
427 | consumeError(Err: std::move(Err)); |
428 | return OFFLOAD_FAIL; |
429 | } |
430 | return OFFLOAD_SUCCESS; |
431 | } |
432 | |
433 | /// The underlying plugin that owns this memory manager. |
434 | AMDGPUPluginTy &Plugin; |
435 | |
436 | /// The memory pool used to allocate memory. |
437 | AMDGPUMemoryPoolTy *MemoryPool; |
438 | |
439 | /// Reference to the actual memory manager. |
440 | MemoryManagerTy *MemoryManager; |
441 | }; |
442 | |
443 | /// Class implementing the AMDGPU device images' properties. |
444 | struct AMDGPUDeviceImageTy : public DeviceImageTy { |
445 | /// Create the AMDGPU image with the id and the target image pointer. |
446 | AMDGPUDeviceImageTy(int32_t ImageId, GenericDeviceTy &Device, |
447 | const __tgt_device_image *TgtImage) |
448 | : DeviceImageTy(ImageId, Device, TgtImage) {} |
449 | |
450 | /// Prepare and load the executable corresponding to the image. |
451 | Error loadExecutable(const AMDGPUDeviceTy &Device); |
452 | |
453 | /// Unload the executable. |
454 | Error unloadExecutable() { |
455 | hsa_status_t Status = hsa_executable_destroy(Executable); |
456 | if (auto Err = Plugin::check(Status, "Error in hsa_executable_destroy: %s" )) |
457 | return Err; |
458 | |
459 | Status = hsa_code_object_destroy(CodeObject); |
460 | return Plugin::check(Status, "Error in hsa_code_object_destroy: %s" ); |
461 | } |
462 | |
463 | /// Get the executable. |
464 | hsa_executable_t getExecutable() const { return Executable; } |
465 | |
466 | /// Get to Code Object Version of the ELF |
467 | uint16_t getELFABIVersion() const { return ELFABIVersion; } |
468 | |
469 | /// Find an HSA device symbol by its name on the executable. |
470 | Expected<hsa_executable_symbol_t> |
471 | findDeviceSymbol(GenericDeviceTy &Device, StringRef SymbolName) const; |
472 | |
473 | /// Get additional info for kernel, e.g., register spill counts |
474 | std::optional<utils::KernelMetaDataTy> |
475 | getKernelInfo(StringRef Identifier) const { |
476 | auto It = KernelInfoMap.find(Identifier); |
477 | |
478 | if (It == KernelInfoMap.end()) |
479 | return {}; |
480 | |
481 | return It->second; |
482 | } |
483 | |
484 | private: |
485 | /// The exectuable loaded on the agent. |
486 | hsa_executable_t Executable; |
487 | hsa_code_object_t CodeObject; |
488 | StringMap<utils::KernelMetaDataTy> KernelInfoMap; |
489 | uint16_t ELFABIVersion; |
490 | }; |
491 | |
492 | /// Class implementing the AMDGPU kernel functionalities which derives from the |
493 | /// generic kernel class. |
494 | struct AMDGPUKernelTy : public GenericKernelTy { |
495 | /// Create an AMDGPU kernel with a name and an execution mode. |
496 | AMDGPUKernelTy(const char *Name) : GenericKernelTy(Name) {} |
497 | |
498 | /// Initialize the AMDGPU kernel. |
499 | Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override { |
500 | AMDGPUDeviceImageTy &AMDImage = static_cast<AMDGPUDeviceImageTy &>(Image); |
501 | |
502 | // Kernel symbols have a ".kd" suffix. |
503 | std::string KernelName(getName()); |
504 | KernelName += ".kd" ; |
505 | |
506 | // Find the symbol on the device executable. |
507 | auto SymbolOrErr = AMDImage.findDeviceSymbol(Device, KernelName); |
508 | if (!SymbolOrErr) |
509 | return SymbolOrErr.takeError(); |
510 | |
511 | hsa_executable_symbol_t Symbol = *SymbolOrErr; |
512 | hsa_symbol_kind_t SymbolType; |
513 | hsa_status_t Status; |
514 | |
515 | // Retrieve different properties of the kernel symbol. |
516 | std::pair<hsa_executable_symbol_info_t, void *> RequiredInfos[] = { |
517 | {HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &SymbolType}, |
518 | {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &KernelObject}, |
519 | {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &ArgsSize}, |
520 | {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &GroupSize}, |
521 | {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &DynamicStack}, |
522 | {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &PrivateSize}}; |
523 | |
524 | for (auto &Info : RequiredInfos) { |
525 | Status = hsa_executable_symbol_get_info(Symbol, Info.first, Info.second); |
526 | if (auto Err = Plugin::check( |
527 | Status, "Error in hsa_executable_symbol_get_info: %s" )) |
528 | return Err; |
529 | } |
530 | |
531 | // Make sure it is a kernel symbol. |
532 | if (SymbolType != HSA_SYMBOL_KIND_KERNEL) |
533 | return Plugin::error("Symbol %s is not a kernel function" ); |
534 | |
535 | // TODO: Read the kernel descriptor for the max threads per block. May be |
536 | // read from the image. |
537 | |
538 | ImplicitArgsSize = utils::getImplicitArgsSize(AMDImage.getELFABIVersion()); |
539 | DP("ELFABIVersion: %d\n" , AMDImage.getELFABIVersion()); |
540 | |
541 | // Get additional kernel info read from image |
542 | KernelInfo = AMDImage.getKernelInfo(getName()); |
543 | if (!KernelInfo.has_value()) |
544 | INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device.getDeviceId(), |
545 | "Could not read extra information for kernel %s." , getName()); |
546 | |
547 | return Plugin::success(); |
548 | } |
549 | |
550 | /// Launch the AMDGPU kernel function. |
551 | Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads, |
552 | uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args, |
553 | AsyncInfoWrapperTy &AsyncInfoWrapper) const override; |
554 | |
555 | /// Print more elaborate kernel launch info for AMDGPU |
556 | Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice, |
557 | KernelArgsTy &KernelArgs, uint32_t NumThreads, |
558 | uint64_t NumBlocks) const override; |
559 | |
560 | /// Get group and private segment kernel size. |
561 | uint32_t getGroupSize() const { return GroupSize; } |
562 | uint32_t getPrivateSize() const { return PrivateSize; } |
563 | |
564 | /// Get the HSA kernel object representing the kernel function. |
565 | uint64_t getKernelObject() const { return KernelObject; } |
566 | |
567 | /// Get the size of implicitargs based on the code object version |
568 | /// @return 56 for cov4 and 256 for cov5 |
569 | uint32_t getImplicitArgsSize() const { return ImplicitArgsSize; } |
570 | |
571 | /// Indicates whether or not we need to set up our own private segment size. |
572 | bool usesDynamicStack() const { return DynamicStack; } |
573 | |
574 | private: |
575 | /// The kernel object to execute. |
576 | uint64_t KernelObject; |
577 | |
578 | /// The args, group and private segments sizes required by a kernel instance. |
579 | uint32_t ArgsSize; |
580 | uint32_t GroupSize; |
581 | uint32_t PrivateSize; |
582 | bool DynamicStack; |
583 | |
584 | /// The size of implicit kernel arguments. |
585 | uint32_t ImplicitArgsSize; |
586 | |
587 | /// Additional Info for the AMD GPU Kernel |
588 | std::optional<utils::KernelMetaDataTy> KernelInfo; |
589 | }; |
590 | |
591 | /// Class representing an HSA signal. Signals are used to define dependencies |
592 | /// between asynchronous operations: kernel launches and memory transfers. |
593 | struct AMDGPUSignalTy { |
594 | /// Create an empty signal. |
595 | AMDGPUSignalTy() : HSASignal({0}), UseCount() {} |
596 | AMDGPUSignalTy(AMDGPUDeviceTy &Device) : HSASignal({0}), UseCount() {} |
597 | |
598 | /// Initialize the signal with an initial value. |
599 | Error init(uint32_t InitialValue = 1) { |
600 | hsa_status_t Status = |
601 | hsa_amd_signal_create(InitialValue, 0, nullptr, 0, &HSASignal); |
602 | return Plugin::check(Status, "Error in hsa_signal_create: %s" ); |
603 | } |
604 | |
605 | /// Deinitialize the signal. |
606 | Error deinit() { |
607 | hsa_status_t Status = hsa_signal_destroy(HSASignal); |
608 | return Plugin::check(Status, "Error in hsa_signal_destroy: %s" ); |
609 | } |
610 | |
611 | /// Wait until the signal gets a zero value. |
612 | Error wait(const uint64_t ActiveTimeout = 0, RPCServerTy *RPCServer = nullptr, |
613 | GenericDeviceTy *Device = nullptr) const { |
614 | if (ActiveTimeout && !RPCServer) { |
615 | hsa_signal_value_t Got = 1; |
616 | Got = hsa_signal_wait_scacquire(HSASignal, HSA_SIGNAL_CONDITION_EQ, 0, |
617 | ActiveTimeout, HSA_WAIT_STATE_ACTIVE); |
618 | if (Got == 0) |
619 | return Plugin::success(); |
620 | } |
621 | |
622 | // If there is an RPC device attached to this stream we run it as a server. |
623 | uint64_t Timeout = RPCServer ? 8192 : UINT64_MAX; |
624 | auto WaitState = RPCServer ? HSA_WAIT_STATE_ACTIVE : HSA_WAIT_STATE_BLOCKED; |
625 | while (hsa_signal_wait_scacquire(HSASignal, HSA_SIGNAL_CONDITION_EQ, 0, |
626 | Timeout, WaitState) != 0) { |
627 | if (RPCServer && Device) |
628 | if (auto Err = RPCServer->runServer(*Device)) |
629 | return Err; |
630 | } |
631 | return Plugin::success(); |
632 | } |
633 | |
634 | /// Load the value on the signal. |
635 | hsa_signal_value_t load() const { |
636 | return hsa_signal_load_scacquire(HSASignal); |
637 | } |
638 | |
639 | /// Signal decrementing by one. |
640 | void signal() { |
641 | assert(load() > 0 && "Invalid signal value" ); |
642 | hsa_signal_subtract_screlease(HSASignal, 1); |
643 | } |
644 | |
645 | /// Reset the signal value before reusing the signal. Do not call this |
646 | /// function if the signal is being currently used by any watcher, such as a |
647 | /// plugin thread or the HSA runtime. |
648 | void reset() { hsa_signal_store_screlease(HSASignal, 1); } |
649 | |
650 | /// Increase the number of concurrent uses. |
651 | void increaseUseCount() { UseCount.increase(); } |
652 | |
653 | /// Decrease the number of concurrent uses and return whether was the last. |
654 | bool decreaseUseCount() { return UseCount.decrease(); } |
655 | |
656 | hsa_signal_t get() const { return HSASignal; } |
657 | |
658 | private: |
659 | /// The underlying HSA signal. |
660 | hsa_signal_t HSASignal; |
661 | |
662 | /// Reference counter for tracking the concurrent use count. This is mainly |
663 | /// used for knowing how many streams are using the signal. |
664 | RefCountTy<> UseCount; |
665 | }; |
666 | |
667 | /// Classes for holding AMDGPU signals and managing signals. |
668 | using AMDGPUSignalRef = AMDGPUResourceRef<AMDGPUSignalTy>; |
669 | using AMDGPUSignalManagerTy = GenericDeviceResourceManagerTy<AMDGPUSignalRef>; |
670 | |
671 | /// Class holding an HSA queue to submit kernel and barrier packets. |
672 | struct AMDGPUQueueTy { |
673 | /// Create an empty queue. |
674 | AMDGPUQueueTy() : Queue(nullptr), Mutex(), NumUsers(0) {} |
675 | |
676 | /// Lazily initialize a new queue belonging to a specific agent. |
677 | Error init(hsa_agent_t Agent, int32_t QueueSize) { |
678 | if (Queue) |
679 | return Plugin::success(); |
680 | hsa_status_t Status = |
681 | hsa_queue_create(Agent, QueueSize, HSA_QUEUE_TYPE_MULTI, callbackError, |
682 | nullptr, UINT32_MAX, UINT32_MAX, &Queue); |
683 | return Plugin::check(Status, "Error in hsa_queue_create: %s" ); |
684 | } |
685 | |
686 | /// Deinitialize the queue and destroy its resources. |
687 | Error deinit() { |
688 | std::lock_guard<std::mutex> Lock(Mutex); |
689 | if (!Queue) |
690 | return Plugin::success(); |
691 | hsa_status_t Status = hsa_queue_destroy(Queue); |
692 | return Plugin::check(Status, "Error in hsa_queue_destroy: %s" ); |
693 | } |
694 | |
695 | /// Returns the number of streams, this queue is currently assigned to. |
696 | bool getUserCount() const { return NumUsers; } |
697 | |
698 | /// Returns if the underlying HSA queue is initialized. |
699 | bool isInitialized() { return Queue != nullptr; } |
700 | |
  /// Decrement the user (stream) count of the queue object.
  void removeUser() { --NumUsers; }
703 | |
  /// Increment the user (stream) count of the queue object.
  void addUser() { ++NumUsers; }
706 | |
707 | /// Push a kernel launch to the queue. The kernel launch requires an output |
708 | /// signal and can define an optional input signal (nullptr if none). |
709 | Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs, |
710 | uint32_t NumThreads, uint64_t NumBlocks, |
711 | uint32_t GroupSize, uint64_t StackSize, |
712 | AMDGPUSignalTy *OutputSignal, |
713 | AMDGPUSignalTy *InputSignal) { |
714 | assert(OutputSignal && "Invalid kernel output signal" ); |
715 | |
716 | // Lock the queue during the packet publishing process. Notice this blocks |
717 | // the addition of other packets to the queue. The following piece of code |
718 | // should be lightweight; do not block the thread, allocate memory, etc. |
719 | std::lock_guard<std::mutex> Lock(Mutex); |
720 | assert(Queue && "Interacted with a non-initialized queue!" ); |
721 | |
722 | // Add a barrier packet before the kernel packet in case there is a pending |
723 | // preceding operation. The barrier packet will delay the processing of |
724 | // subsequent queue's packets until the barrier input signal are satisfied. |
725 | // No need output signal needed because the dependency is already guaranteed |
726 | // by the queue barrier itself. |
727 | if (InputSignal && InputSignal->load()) |
728 | if (auto Err = pushBarrierImpl(OutputSignal: nullptr, InputSignal1: InputSignal)) |
729 | return Err; |
730 | |
731 | // Now prepare the kernel packet. |
732 | uint64_t PacketId; |
733 | hsa_kernel_dispatch_packet_t *Packet = acquirePacket(PacketId); |
734 | assert(Packet && "Invalid packet" ); |
735 | |
736 | // The first 32 bits of the packet are written after the other fields |
737 | uint16_t Setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; |
738 | Packet->workgroup_size_x = NumThreads; |
739 | Packet->workgroup_size_y = 1; |
740 | Packet->workgroup_size_z = 1; |
741 | Packet->reserved0 = 0; |
742 | Packet->grid_size_x = NumBlocks * NumThreads; |
743 | Packet->grid_size_y = 1; |
744 | Packet->grid_size_z = 1; |
745 | Packet->private_segment_size = |
746 | Kernel.usesDynamicStack() ? StackSize : Kernel.getPrivateSize(); |
747 | Packet->group_segment_size = GroupSize; |
748 | Packet->kernel_object = Kernel.getKernelObject(); |
749 | Packet->kernarg_address = KernelArgs; |
750 | Packet->reserved2 = 0; |
751 | Packet->completion_signal = OutputSignal->get(); |
752 | |
753 | // Publish the packet. Do not modify the packet after this point. |
754 | publishKernelPacket(PacketId, Setup, Packet); |
755 | |
756 | return Plugin::success(); |
757 | } |
758 | |
759 | /// Push a barrier packet that will wait up to two input signals. All signals |
760 | /// are optional (nullptr if none). |
761 | Error pushBarrier(AMDGPUSignalTy *OutputSignal, |
762 | const AMDGPUSignalTy *InputSignal1, |
763 | const AMDGPUSignalTy *InputSignal2) { |
764 | // Lock the queue during the packet publishing process. |
765 | std::lock_guard<std::mutex> Lock(Mutex); |
766 | assert(Queue && "Interacted with a non-initialized queue!" ); |
767 | |
768 | // Push the barrier with the lock acquired. |
769 | return pushBarrierImpl(OutputSignal, InputSignal1, InputSignal2); |
770 | } |
771 | |
772 | private: |
773 | /// Push a barrier packet that will wait up to two input signals. Assumes the |
774 | /// the queue lock is acquired. |
775 | Error pushBarrierImpl(AMDGPUSignalTy *OutputSignal, |
776 | const AMDGPUSignalTy *InputSignal1, |
777 | const AMDGPUSignalTy *InputSignal2 = nullptr) { |
778 | // Add a queue barrier waiting on both the other stream's operation and the |
779 | // last operation on the current stream (if any). |
780 | uint64_t PacketId; |
781 | hsa_barrier_and_packet_t *Packet = |
782 | (hsa_barrier_and_packet_t *)acquirePacket(PacketId); |
783 | assert(Packet && "Invalid packet" ); |
784 | |
785 | Packet->reserved0 = 0; |
786 | Packet->reserved1 = 0; |
787 | Packet->dep_signal[0] = {0}; |
788 | Packet->dep_signal[1] = {0}; |
789 | Packet->dep_signal[2] = {0}; |
790 | Packet->dep_signal[3] = {0}; |
791 | Packet->dep_signal[4] = {0}; |
792 | Packet->reserved2 = 0; |
793 | Packet->completion_signal = {0}; |
794 | |
795 | // Set input and output dependencies if needed. |
796 | if (OutputSignal) |
797 | Packet->completion_signal = OutputSignal->get(); |
798 | if (InputSignal1) |
799 | Packet->dep_signal[0] = InputSignal1->get(); |
800 | if (InputSignal2) |
801 | Packet->dep_signal[1] = InputSignal2->get(); |
802 | |
803 | // Publish the packet. Do not modify the packet after this point. |
804 | publishBarrierPacket(PacketId, Packet); |
805 | |
806 | return Plugin::success(); |
807 | } |
808 | |
809 | /// Acquire a packet from the queue. This call may block the thread if there |
810 | /// is no space in the underlying HSA queue. It may need to wait until the HSA |
811 | /// runtime processes some packets. Assumes the queue lock is acquired. |
812 | hsa_kernel_dispatch_packet_t *acquirePacket(uint64_t &PacketId) { |
813 | // Increase the queue index with relaxed memory order. Notice this will need |
814 | // another subsequent atomic operation with acquire order. |
815 | PacketId = hsa_queue_add_write_index_relaxed(Queue, 1); |
816 | |
817 | // Wait for the package to be available. Notice the atomic operation uses |
818 | // the acquire memory order. |
819 | while (PacketId - hsa_queue_load_read_index_scacquire(Queue) >= Queue->size) |
820 | ; |
821 | |
822 | // Return the packet reference. |
823 | const uint32_t Mask = Queue->size - 1; // The size is a power of 2. |
824 | return (hsa_kernel_dispatch_packet_t *)Queue->base_address + |
825 | (PacketId & Mask); |
826 | } |
827 | |
828 | /// Publish the kernel packet so that the HSA runtime can start processing |
829 | /// the kernel launch. Do not modify the packet once this function is called. |
830 | /// Assumes the queue lock is acquired. |
831 | void publishKernelPacket(uint64_t PacketId, uint16_t Setup, |
832 | hsa_kernel_dispatch_packet_t *Packet) { |
833 | uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet); |
834 | |
835 | uint16_t = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE; |
836 | Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; |
837 | Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; |
838 | |
839 | // Publish the packet. Do not modify the package after this point. |
840 | uint32_t = Header | (Setup << 16u); |
841 | __atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE); |
842 | |
843 | // Signal the doorbell about the published packet. |
844 | hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId); |
845 | } |
846 | |
847 | /// Publish the barrier packet so that the HSA runtime can start processing |
848 | /// the barrier. Next packets in the queue will not be processed until all |
849 | /// barrier dependencies (signals) are satisfied. Assumes the queue is locked |
850 | void publishBarrierPacket(uint64_t PacketId, |
851 | hsa_barrier_and_packet_t *Packet) { |
852 | uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet); |
853 | uint16_t Setup = 0; |
854 | uint16_t = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE; |
855 | Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; |
856 | Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; |
857 | |
858 | // Publish the packet. Do not modify the package after this point. |
859 | uint32_t = Header | (Setup << 16u); |
860 | __atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE); |
861 | |
862 | // Signal the doorbell about the published packet. |
863 | hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId); |
864 | } |
865 | |
/// Callback that the HSA runtime invokes when an error is detected on the
/// HSA queue. Converts the HSA status into a plugin error and aborts the
/// process, since a broken queue is not recoverable.
static void callbackError(hsa_status_t Status, hsa_queue_t *Source, void *) {
  auto Err = Plugin::check(Status, "Received error in queue %p: %s" , Source);
  FATAL_MESSAGE(1, "%s" , toString(std::move(Err)).data());
}
871 | |
/// The underlying HSA queue handle. Set during initialization; nullptr while
/// the queue is uninitialized.
hsa_queue_t *Queue;

/// Mutex to protect the acquiring and publishing of packets. For the moment,
/// we need this mutex to prevent publishing packets that are not ready to be
/// published in a multi-thread scenario. Without a queue lock, a thread T1
/// could acquire packet P and thread T2 acquire packet P+1. Thread T2 could
/// publish its packet P+1 (signaling the queue's doorbell) before packet P
/// from T1 is ready to be processed. That scenario should be invalid. Thus,
/// we use the following mutex to make packet acquiring and publishing atomic.
/// TODO: There are other more advanced approaches to avoid this mutex using
/// atomic operations. We can further investigate it if this is a bottleneck.
std::mutex Mutex;

/// The number of streams this queue is currently assigned to. A queue is
/// considered idle when this is zero, otherwise: busy.
uint32_t NumUsers;
};
890 | |
891 | /// Struct that implements a stream of asynchronous operations for AMDGPU |
892 | /// devices. This class relies on signals to implement streams and define the |
893 | /// dependencies between asynchronous operations. |
894 | struct AMDGPUStreamTy { |
895 | private: |
896 | /// Utility struct holding arguments for async H2H memory copies. |
897 | struct MemcpyArgsTy { |
898 | void *Dst; |
899 | const void *Src; |
900 | size_t Size; |
901 | }; |
902 | |
903 | /// Utility struct holding arguments for freeing buffers to memory managers. |
904 | struct ReleaseBufferArgsTy { |
905 | void *Buffer; |
906 | AMDGPUMemoryManagerTy *MemoryManager; |
907 | }; |
908 | |
909 | /// Utility struct holding arguments for releasing signals to signal managers. |
910 | struct ReleaseSignalArgsTy { |
911 | AMDGPUSignalTy *Signal; |
912 | AMDGPUSignalManagerTy *SignalManager; |
913 | }; |
914 | |
915 | /// The stream is composed of N stream's slots. The struct below represents |
916 | /// the fields of each slot. Each slot has a signal and an optional action |
917 | /// function. When appending an HSA asynchronous operation to the stream, one |
918 | /// slot is consumed and used to store the operation's information. The |
919 | /// operation's output signal is set to the consumed slot's signal. If there |
920 | /// is a previous asynchronous operation on the previous slot, the HSA async |
921 | /// operation's input signal is set to the signal of the previous slot. This |
922 | /// way, we obtain a chain of dependant async operations. The action is a |
923 | /// function that will be executed eventually after the operation is |
924 | /// completed, e.g., for releasing a buffer. |
925 | struct StreamSlotTy { |
926 | /// The output signal of the stream operation. May be used by the subsequent |
927 | /// operation as input signal. |
928 | AMDGPUSignalTy *Signal; |
929 | |
930 | /// The action that must be performed after the operation's completion. Set |
931 | /// to nullptr when there is no action to perform. |
932 | Error (*ActionFunction)(void *); |
933 | |
934 | /// Space for the action's arguments. A pointer to these arguments is passed |
935 | /// to the action function. Notice the space of arguments is limited. |
936 | union { |
937 | MemcpyArgsTy MemcpyArgs; |
938 | ReleaseBufferArgsTy ReleaseBufferArgs; |
939 | ReleaseSignalArgsTy ReleaseSignalArgs; |
940 | } ActionArgs; |
941 | |
942 | /// Create an empty slot. |
943 | StreamSlotTy() : Signal(nullptr), ActionFunction(nullptr) {} |
944 | |
945 | /// Schedule a host memory copy action on the slot. |
946 | Error schedHostMemoryCopy(void *Dst, const void *Src, size_t Size) { |
947 | ActionFunction = memcpyAction; |
948 | ActionArgs.MemcpyArgs = MemcpyArgsTy{Dst, Src, Size}; |
949 | return Plugin::success(); |
950 | } |
951 | |
952 | /// Schedule a release buffer action on the slot. |
953 | Error schedReleaseBuffer(void *Buffer, AMDGPUMemoryManagerTy &Manager) { |
954 | ActionFunction = releaseBufferAction; |
955 | ActionArgs.ReleaseBufferArgs = ReleaseBufferArgsTy{Buffer, &Manager}; |
956 | return Plugin::success(); |
957 | } |
958 | |
959 | /// Schedule a signal release action on the slot. |
960 | Error schedReleaseSignal(AMDGPUSignalTy *SignalToRelease, |
961 | AMDGPUSignalManagerTy *SignalManager) { |
962 | ActionFunction = releaseSignalAction; |
963 | ActionArgs.ReleaseSignalArgs = |
964 | ReleaseSignalArgsTy{SignalToRelease, SignalManager}; |
965 | return Plugin::success(); |
966 | } |
967 | |
968 | // Perform the action if needed. |
969 | Error performAction() { |
970 | if (!ActionFunction) |
971 | return Plugin::success(); |
972 | |
973 | // Perform the action. |
974 | if (ActionFunction == memcpyAction) { |
975 | if (auto Err = memcpyAction(&ActionArgs)) |
976 | return Err; |
977 | } else if (ActionFunction == releaseBufferAction) { |
978 | if (auto Err = releaseBufferAction(&ActionArgs)) |
979 | return Err; |
980 | } else if (ActionFunction == releaseSignalAction) { |
981 | if (auto Err = releaseSignalAction(&ActionArgs)) |
982 | return Err; |
983 | } else { |
984 | return Plugin::error("Unknown action function!" ); |
985 | } |
986 | |
987 | // Invalidate the action. |
988 | ActionFunction = nullptr; |
989 | |
990 | return Plugin::success(); |
991 | } |
992 | }; |
993 | |
994 | /// The device agent where the stream was created. |
995 | hsa_agent_t Agent; |
996 | |
997 | /// The queue that the stream uses to launch kernels. |
998 | AMDGPUQueueTy *Queue; |
999 | |
1000 | /// The manager of signals to reuse signals. |
1001 | AMDGPUSignalManagerTy &SignalManager; |
1002 | |
1003 | /// A reference to the associated device. |
1004 | GenericDeviceTy &Device; |
1005 | |
1006 | /// Array of stream slots. Use std::deque because it can dynamically grow |
1007 | /// without invalidating the already inserted elements. For instance, the |
1008 | /// std::vector may invalidate the elements by reallocating the internal |
1009 | /// array if there is not enough space on new insertions. |
1010 | std::deque<StreamSlotTy> Slots; |
1011 | |
1012 | /// The next available slot on the queue. This is reset to zero each time the |
1013 | /// stream is synchronized. It also indicates the current number of consumed |
1014 | /// slots at a given time. |
1015 | uint32_t NextSlot; |
1016 | |
1017 | /// The synchronization id. This number is increased each time the stream is |
1018 | /// synchronized. It is useful to detect if an AMDGPUEventTy points to an |
1019 | /// operation that was already finalized in a previous stream sycnhronize. |
1020 | uint32_t SyncCycle; |
1021 | |
1022 | /// A pointer associated with an RPC server running on the given device. If |
1023 | /// RPC is not being used this will be a null pointer. Otherwise, this |
1024 | /// indicates that an RPC server is expected to be run on this stream. |
1025 | RPCServerTy *RPCServer; |
1026 | |
1027 | /// Mutex to protect stream's management. |
1028 | mutable std::mutex Mutex; |
1029 | |
1030 | /// Timeout hint for HSA actively waiting for signal value to change |
1031 | const uint64_t StreamBusyWaitMicroseconds; |
1032 | |
1033 | /// Indicate to spread data transfers across all avilable SDMAs |
1034 | bool UseMultipleSdmaEngines; |
1035 | |
1036 | /// Return the current number of asychronous operations on the stream. |
1037 | uint32_t size() const { return NextSlot; } |
1038 | |
1039 | /// Return the last valid slot on the stream. |
1040 | uint32_t last() const { return size() - 1; } |
1041 | |
1042 | /// Consume one slot from the stream. Since the stream uses signals on demand |
1043 | /// and releases them once the slot is no longer used, the function requires |
1044 | /// an idle signal for the new consumed slot. |
1045 | std::pair<uint32_t, AMDGPUSignalTy *> consume(AMDGPUSignalTy *OutputSignal) { |
1046 | // Double the stream size if needed. Since we use std::deque, this operation |
1047 | // does not invalidate the already added slots. |
1048 | if (Slots.size() == NextSlot) |
1049 | Slots.resize(new_size: Slots.size() * 2); |
1050 | |
1051 | // Update the next available slot and the stream size. |
1052 | uint32_t Curr = NextSlot++; |
1053 | |
1054 | // Retrieve the input signal, if any, of the current operation. |
1055 | AMDGPUSignalTy *InputSignal = (Curr > 0) ? Slots[Curr - 1].Signal : nullptr; |
1056 | |
1057 | // Set the output signal of the current slot. |
1058 | Slots[Curr].Signal = OutputSignal; |
1059 | |
1060 | return std::make_pair(x&: Curr, y&: InputSignal); |
1061 | } |
1062 | |
1063 | /// Complete all pending post actions and reset the stream after synchronizing |
1064 | /// or positively querying the stream. |
1065 | Error complete() { |
1066 | for (uint32_t Slot = 0; Slot < NextSlot; ++Slot) { |
1067 | // Take the post action of the operation if any. |
1068 | if (auto Err = Slots[Slot].performAction()) |
1069 | return Err; |
1070 | |
1071 | // Release the slot's signal if possible. Otherwise, another user will. |
1072 | if (Slots[Slot].Signal->decreaseUseCount()) |
1073 | if (auto Err = SignalManager.returnResource(Slots[Slot].Signal)) |
1074 | return Err; |
1075 | |
1076 | Slots[Slot].Signal = nullptr; |
1077 | } |
1078 | |
1079 | // Reset the stream slots to zero. |
1080 | NextSlot = 0; |
1081 | |
1082 | // Increase the synchronization id since the stream completed a sync cycle. |
1083 | SyncCycle += 1; |
1084 | |
1085 | return Plugin::success(); |
1086 | } |
1087 | |
1088 | /// Make the current stream wait on a specific operation of another stream. |
1089 | /// The idea is to make the current stream waiting on two signals: 1) the last |
1090 | /// signal of the current stream, and 2) the last signal of the other stream. |
1091 | /// Use a barrier packet with two input signals. |
1092 | Error waitOnStreamOperation(AMDGPUStreamTy &OtherStream, uint32_t Slot) { |
1093 | if (Queue == nullptr) |
1094 | return Plugin::error("Target queue was nullptr" ); |
1095 | |
1096 | /// The signal that we must wait from the other stream. |
1097 | AMDGPUSignalTy *OtherSignal = OtherStream.Slots[Slot].Signal; |
1098 | |
1099 | // Prevent the release of the other stream's signal. |
1100 | OtherSignal->increaseUseCount(); |
1101 | |
1102 | // Retrieve an available signal for the operation's output. |
1103 | AMDGPUSignalTy *OutputSignal = nullptr; |
1104 | if (auto Err = SignalManager.getResource(OutputSignal)) |
1105 | return Err; |
1106 | OutputSignal->reset(); |
1107 | OutputSignal->increaseUseCount(); |
1108 | |
1109 | // Consume stream slot and compute dependencies. |
1110 | auto [Curr, InputSignal] = consume(OutputSignal); |
1111 | |
1112 | // Setup the post action to release the signal. |
1113 | if (auto Err = Slots[Curr].schedReleaseSignal(OtherSignal, &SignalManager)) |
1114 | return Err; |
1115 | |
1116 | // Push a barrier into the queue with both input signals. |
1117 | return Queue->pushBarrier(OutputSignal, InputSignal1: InputSignal, InputSignal2: OtherSignal); |
1118 | } |
1119 | |
1120 | /// Callback for running a specific asynchronous operation. This callback is |
1121 | /// used for hsa_amd_signal_async_handler. The argument is the operation that |
1122 | /// should be executed. Notice we use the post action mechanism to codify the |
1123 | /// asynchronous operation. |
1124 | static bool asyncActionCallback(hsa_signal_value_t Value, void *Args) { |
1125 | StreamSlotTy *Slot = reinterpret_cast<StreamSlotTy *>(Args); |
1126 | assert(Slot && "Invalid slot" ); |
1127 | assert(Slot->Signal && "Invalid signal" ); |
1128 | |
1129 | // This thread is outside the stream mutex. Make sure the thread sees the |
1130 | // changes on the slot. |
1131 | std::atomic_thread_fence(m: std::memory_order_acquire); |
1132 | |
1133 | // Peform the operation. |
1134 | if (auto Err = Slot->performAction()) |
1135 | FATAL_MESSAGE(1, "Error peforming post action: %s" , |
1136 | toString(E: std::move(Err)).data()); |
1137 | |
1138 | // Signal the output signal to notify the asycnhronous operation finalized. |
1139 | Slot->Signal->signal(); |
1140 | |
1141 | // Unregister callback. |
1142 | return false; |
1143 | } |
1144 | |
1145 | // Callback for host-to-host memory copies. This is an asynchronous action. |
1146 | static Error memcpyAction(void *Data) { |
1147 | MemcpyArgsTy *Args = reinterpret_cast<MemcpyArgsTy *>(Data); |
1148 | assert(Args && "Invalid arguments" ); |
1149 | assert(Args->Dst && "Invalid destination buffer" ); |
1150 | assert(Args->Src && "Invalid source buffer" ); |
1151 | |
1152 | std::memcpy(dest: Args->Dst, src: Args->Src, n: Args->Size); |
1153 | |
1154 | return Plugin::success(); |
1155 | } |
1156 | |
1157 | /// Releasing a memory buffer to a memory manager. This is a post completion |
1158 | /// action. There are two kinds of memory buffers: |
1159 | /// 1. For kernel arguments. This buffer can be freed after receiving the |
1160 | /// kernel completion signal. |
1161 | /// 2. For H2D tranfers that need pinned memory space for staging. This |
1162 | /// buffer can be freed after receiving the transfer completion signal. |
1163 | /// 3. For D2H tranfers that need pinned memory space for staging. This |
1164 | /// buffer cannot be freed after receiving the transfer completion signal |
1165 | /// because of the following asynchronous H2H callback. |
1166 | /// For this reason, This action can only be taken at |
1167 | /// AMDGPUStreamTy::complete() |
1168 | /// Because of the case 3, all releaseBufferActions are taken at |
1169 | /// AMDGPUStreamTy::complete() in the current implementation. |
1170 | static Error releaseBufferAction(void *Data) { |
1171 | ReleaseBufferArgsTy *Args = reinterpret_cast<ReleaseBufferArgsTy *>(Data); |
1172 | assert(Args && "Invalid arguments" ); |
1173 | assert(Args->MemoryManager && "Invalid memory manager" ); |
1174 | assert(Args->Buffer && "Invalid buffer" ); |
1175 | |
1176 | // Release the allocation to the memory manager. |
1177 | return Args->MemoryManager->deallocate(Ptr: Args->Buffer); |
1178 | } |
1179 | |
1180 | /// Releasing a signal object back to SignalManager. This is a post completion |
1181 | /// action. This action can only be taken at AMDGPUStreamTy::complete() |
1182 | static Error releaseSignalAction(void *Data) { |
1183 | ReleaseSignalArgsTy *Args = reinterpret_cast<ReleaseSignalArgsTy *>(Data); |
1184 | assert(Args && "Invalid arguments" ); |
1185 | assert(Args->Signal && "Invalid signal" ); |
1186 | assert(Args->SignalManager && "Invalid signal manager" ); |
1187 | |
1188 | // Release the signal if needed. |
1189 | if (Args->Signal->decreaseUseCount()) |
1190 | if (auto Err = Args->SignalManager->returnResource(Args->Signal)) |
1191 | return Err; |
1192 | |
1193 | return Plugin::success(); |
1194 | } |
1195 | |
1196 | public: |
1197 | /// Create an empty stream associated with a specific device. |
1198 | AMDGPUStreamTy(AMDGPUDeviceTy &Device); |
1199 | |
1200 | /// Intialize the stream's signals. |
1201 | Error init() { return Plugin::success(); } |
1202 | |
1203 | /// Deinitialize the stream's signals. |
1204 | Error deinit() { return Plugin::success(); } |
1205 | |
1206 | /// Attach an RPC server to this stream. |
1207 | void setRPCServer(RPCServerTy *Server) { RPCServer = Server; } |
1208 | |
1209 | /// Push a asynchronous kernel to the stream. The kernel arguments must be |
1210 | /// placed in a special allocation for kernel args and must keep alive until |
1211 | /// the kernel finalizes. Once the kernel is finished, the stream will release |
1212 | /// the kernel args buffer to the specified memory manager. |
1213 | Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs, |
1214 | uint32_t NumThreads, uint64_t NumBlocks, |
1215 | uint32_t GroupSize, uint64_t StackSize, |
1216 | AMDGPUMemoryManagerTy &MemoryManager) { |
1217 | if (Queue == nullptr) |
1218 | return Plugin::error("Target queue was nullptr" ); |
1219 | |
1220 | // Retrieve an available signal for the operation's output. |
1221 | AMDGPUSignalTy *OutputSignal = nullptr; |
1222 | if (auto Err = SignalManager.getResource(OutputSignal)) |
1223 | return Err; |
1224 | OutputSignal->reset(); |
1225 | OutputSignal->increaseUseCount(); |
1226 | |
1227 | std::lock_guard<std::mutex> StreamLock(Mutex); |
1228 | |
1229 | // Consume stream slot and compute dependencies. |
1230 | auto [Curr, InputSignal] = consume(OutputSignal); |
1231 | |
1232 | // Setup the post action to release the kernel args buffer. |
1233 | if (auto Err = Slots[Curr].schedReleaseBuffer(Buffer: KernelArgs, Manager&: MemoryManager)) |
1234 | return Err; |
1235 | |
1236 | // Push the kernel with the output signal and an input signal (optional) |
1237 | return Queue->pushKernelLaunch(Kernel, KernelArgs, NumThreads, NumBlocks, |
1238 | GroupSize, StackSize, OutputSignal, |
1239 | InputSignal); |
1240 | } |
1241 | |
1242 | /// Push an asynchronous memory copy between pinned memory buffers. |
1243 | Error pushPinnedMemoryCopyAsync(void *Dst, const void *Src, |
1244 | uint64_t CopySize) { |
1245 | // Retrieve an available signal for the operation's output. |
1246 | AMDGPUSignalTy *OutputSignal = nullptr; |
1247 | if (auto Err = SignalManager.getResource(OutputSignal)) |
1248 | return Err; |
1249 | OutputSignal->reset(); |
1250 | OutputSignal->increaseUseCount(); |
1251 | |
1252 | std::lock_guard<std::mutex> Lock(Mutex); |
1253 | |
1254 | // Consume stream slot and compute dependencies. |
1255 | auto [Curr, InputSignal] = consume(OutputSignal); |
1256 | |
1257 | // Issue the async memory copy. |
1258 | if (InputSignal && InputSignal->load()) { |
1259 | hsa_signal_t InputSignalRaw = InputSignal->get(); |
1260 | return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, Agent, |
1261 | CopySize, 1, &InputSignalRaw, |
1262 | OutputSignal->get()); |
1263 | } |
1264 | |
1265 | return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, Agent, |
1266 | CopySize, 0, nullptr, OutputSignal->get()); |
1267 | } |
1268 | |
1269 | /// Push an asynchronous memory copy device-to-host involving an unpinned |
1270 | /// memory buffer. The operation consists of a two-step copy from the |
1271 | /// device buffer to an intermediate pinned host buffer, and then, to a |
1272 | /// unpinned host buffer. Both operations are asynchronous and dependant. |
1273 | /// The intermediate pinned buffer will be released to the specified memory |
1274 | /// manager once the operation completes. |
1275 | Error pushMemoryCopyD2HAsync(void *Dst, const void *Src, void *Inter, |
1276 | uint64_t CopySize, |
1277 | AMDGPUMemoryManagerTy &MemoryManager) { |
1278 | // Retrieve available signals for the operation's outputs. |
1279 | AMDGPUSignalTy *OutputSignals[2] = {}; |
1280 | if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals)) |
1281 | return Err; |
1282 | for (auto *Signal : OutputSignals) { |
1283 | Signal->reset(); |
1284 | Signal->increaseUseCount(); |
1285 | } |
1286 | |
1287 | std::lock_guard<std::mutex> Lock(Mutex); |
1288 | |
1289 | // Consume stream slot and compute dependencies. |
1290 | auto [Curr, InputSignal] = consume(OutputSignal: OutputSignals[0]); |
1291 | |
1292 | // Setup the post action for releasing the intermediate buffer. |
1293 | if (auto Err = Slots[Curr].schedReleaseBuffer(Buffer: Inter, Manager&: MemoryManager)) |
1294 | return Err; |
1295 | |
1296 | // Issue the first step: device to host transfer. Avoid defining the input |
1297 | // dependency if already satisfied. |
1298 | if (InputSignal && InputSignal->load()) { |
1299 | hsa_signal_t InputSignalRaw = InputSignal->get(); |
1300 | if (auto Err = utils::asyncMemCopy( |
1301 | UseMultipleSdmaEngines, Inter, Agent, Src, Agent, CopySize, 1, |
1302 | &InputSignalRaw, OutputSignals[0]->get())) |
1303 | return Err; |
1304 | } else { |
1305 | if (auto Err = utils::asyncMemCopy(UseMultipleSdmaEngines, Inter, Agent, |
1306 | Src, Agent, CopySize, 0, nullptr, |
1307 | OutputSignals[0]->get())) |
1308 | return Err; |
1309 | } |
1310 | |
1311 | // Consume another stream slot and compute dependencies. |
1312 | std::tie(args&: Curr, args&: InputSignal) = consume(OutputSignal: OutputSignals[1]); |
1313 | assert(InputSignal && "Invalid input signal" ); |
1314 | |
1315 | // The std::memcpy is done asynchronously using an async handler. We store |
1316 | // the function's information in the action but it's not actually an action. |
1317 | if (auto Err = Slots[Curr].schedHostMemoryCopy(Dst, Src: Inter, Size: CopySize)) |
1318 | return Err; |
1319 | |
1320 | // Make changes on this slot visible to the async handler's thread. |
1321 | std::atomic_thread_fence(m: std::memory_order_release); |
1322 | |
1323 | // Issue the second step: host to host transfer. |
1324 | hsa_status_t Status = hsa_amd_signal_async_handler( |
1325 | InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback, |
1326 | (void *)&Slots[Curr]); |
1327 | |
1328 | return Plugin::check(Status, "Error in hsa_amd_signal_async_handler: %s" ); |
1329 | } |
1330 | |
1331 | /// Push an asynchronous memory copy host-to-device involving an unpinned |
1332 | /// memory buffer. The operation consists of a two-step copy from the |
1333 | /// unpinned host buffer to an intermediate pinned host buffer, and then, to |
1334 | /// the pinned host buffer. Both operations are asynchronous and dependant. |
1335 | /// The intermediate pinned buffer will be released to the specified memory |
1336 | /// manager once the operation completes. |
1337 | Error pushMemoryCopyH2DAsync(void *Dst, const void *Src, void *Inter, |
1338 | uint64_t CopySize, |
1339 | AMDGPUMemoryManagerTy &MemoryManager) { |
1340 | // Retrieve available signals for the operation's outputs. |
1341 | AMDGPUSignalTy *OutputSignals[2] = {}; |
1342 | if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals)) |
1343 | return Err; |
1344 | for (auto *Signal : OutputSignals) { |
1345 | Signal->reset(); |
1346 | Signal->increaseUseCount(); |
1347 | } |
1348 | |
1349 | AMDGPUSignalTy *OutputSignal = OutputSignals[0]; |
1350 | |
1351 | std::lock_guard<std::mutex> Lock(Mutex); |
1352 | |
1353 | // Consume stream slot and compute dependencies. |
1354 | auto [Curr, InputSignal] = consume(OutputSignal); |
1355 | |
1356 | // Issue the first step: host to host transfer. |
1357 | if (InputSignal && InputSignal->load()) { |
1358 | // The std::memcpy is done asynchronously using an async handler. We store |
1359 | // the function's information in the action but it is not actually a |
1360 | // post action. |
1361 | if (auto Err = Slots[Curr].schedHostMemoryCopy(Dst: Inter, Src, Size: CopySize)) |
1362 | return Err; |
1363 | |
1364 | // Make changes on this slot visible to the async handler's thread. |
1365 | std::atomic_thread_fence(m: std::memory_order_release); |
1366 | |
1367 | hsa_status_t Status = hsa_amd_signal_async_handler( |
1368 | InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback, |
1369 | (void *)&Slots[Curr]); |
1370 | |
1371 | if (auto Err = Plugin::check(Status, |
1372 | "Error in hsa_amd_signal_async_handler: %s" )) |
1373 | return Err; |
1374 | |
1375 | // Let's use now the second output signal. |
1376 | OutputSignal = OutputSignals[1]; |
1377 | |
1378 | // Consume another stream slot and compute dependencies. |
1379 | std::tie(args&: Curr, args&: InputSignal) = consume(OutputSignal); |
1380 | } else { |
1381 | // All preceding operations completed, copy the memory synchronously. |
1382 | std::memcpy(dest: Inter, src: Src, n: CopySize); |
1383 | |
1384 | // Return the second signal because it will not be used. |
1385 | OutputSignals[1]->decreaseUseCount(); |
1386 | if (auto Err = SignalManager.returnResource(OutputSignals[1])) |
1387 | return Err; |
1388 | } |
1389 | |
1390 | // Setup the post action to release the intermediate pinned buffer. |
1391 | if (auto Err = Slots[Curr].schedReleaseBuffer(Buffer: Inter, Manager&: MemoryManager)) |
1392 | return Err; |
1393 | |
1394 | // Issue the second step: host to device transfer. Avoid defining the input |
1395 | // dependency if already satisfied. |
1396 | if (InputSignal && InputSignal->load()) { |
1397 | hsa_signal_t InputSignalRaw = InputSignal->get(); |
1398 | return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, |
1399 | Agent, CopySize, 1, &InputSignalRaw, |
1400 | OutputSignal->get()); |
1401 | } |
1402 | return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, Agent, |
1403 | CopySize, 0, nullptr, OutputSignal->get()); |
1404 | } |
1405 | |
1406 | // AMDGPUDeviceTy is incomplete here, passing the underlying agent instead |
1407 | Error pushMemoryCopyD2DAsync(void *Dst, hsa_agent_t DstAgent, const void *Src, |
1408 | hsa_agent_t SrcAgent, uint64_t CopySize) { |
1409 | AMDGPUSignalTy *OutputSignal; |
1410 | if (auto Err = SignalManager.getResources(/*Num=*/1, &OutputSignal)) |
1411 | return Err; |
1412 | OutputSignal->reset(); |
1413 | OutputSignal->increaseUseCount(); |
1414 | |
1415 | std::lock_guard<std::mutex> Lock(Mutex); |
1416 | |
1417 | // Consume stream slot and compute dependencies. |
1418 | auto [Curr, InputSignal] = consume(OutputSignal); |
1419 | |
1420 | // The agents need to have access to the corresponding memory |
1421 | // This is presently only true if the pointers were originally |
1422 | // allocated by this runtime or the caller made the appropriate |
1423 | // access calls. |
1424 | |
1425 | if (InputSignal && InputSignal->load()) { |
1426 | hsa_signal_t InputSignalRaw = InputSignal->get(); |
1427 | return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src, |
1428 | SrcAgent, CopySize, 1, &InputSignalRaw, |
1429 | OutputSignal->get()); |
1430 | } |
1431 | return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src, |
1432 | SrcAgent, CopySize, 0, nullptr, |
1433 | OutputSignal->get()); |
1434 | } |
1435 | |
1436 | /// Synchronize with the stream. The current thread waits until all operations |
1437 | /// are finalized and it performs the pending post actions (i.e., releasing |
1438 | /// intermediate buffers). |
1439 | Error synchronize() { |
1440 | std::lock_guard<std::mutex> Lock(Mutex); |
1441 | |
1442 | // No need to synchronize anything. |
1443 | if (size() == 0) |
1444 | return Plugin::success(); |
1445 | |
1446 | // Wait until all previous operations on the stream have completed. |
1447 | if (auto Err = Slots[last()].Signal->wait(StreamBusyWaitMicroseconds, |
1448 | RPCServer, &Device)) |
1449 | return Err; |
1450 | |
1451 | // Reset the stream and perform all pending post actions. |
1452 | return complete(); |
1453 | } |
1454 | |
1455 | /// Query the stream and complete pending post actions if operations finished. |
1456 | /// Return whether all the operations completed. This operation does not block |
1457 | /// the calling thread. |
1458 | Expected<bool> query() { |
1459 | std::lock_guard<std::mutex> Lock(Mutex); |
1460 | |
1461 | // No need to query anything. |
1462 | if (size() == 0) |
1463 | return true; |
1464 | |
1465 | // The last operation did not complete yet. Return directly. |
1466 | if (Slots[last()].Signal->load()) |
1467 | return false; |
1468 | |
1469 | // Reset the stream and perform all pending post actions. |
1470 | if (auto Err = complete()) |
1471 | return std::move(Err); |
1472 | |
1473 | return true; |
1474 | } |
1475 | |
1476 | /// Record the state of the stream on an event. |
1477 | Error recordEvent(AMDGPUEventTy &Event) const; |
1478 | |
1479 | /// Make the stream wait on an event. |
1480 | Error waitEvent(const AMDGPUEventTy &Event); |
1481 | |
1482 | friend struct AMDGPUStreamManagerTy; |
1483 | }; |
1484 | |
1485 | /// Class representing an event on AMDGPU. The event basically stores some |
1486 | /// information regarding the state of the recorded stream. |
1487 | struct AMDGPUEventTy { |
1488 | /// Create an empty event. |
1489 | AMDGPUEventTy(AMDGPUDeviceTy &Device) |
1490 | : RecordedStream(nullptr), RecordedSlot(-1), RecordedSyncCycle(-1) {} |
1491 | |
1492 | /// Initialize and deinitialize. |
1493 | Error init() { return Plugin::success(); } |
1494 | Error deinit() { return Plugin::success(); } |
1495 | |
1496 | /// Record the state of a stream on the event. |
1497 | Error record(AMDGPUStreamTy &Stream) { |
1498 | std::lock_guard<std::mutex> Lock(Mutex); |
1499 | |
1500 | // Ignore the last recorded stream. |
1501 | RecordedStream = &Stream; |
1502 | |
1503 | return Stream.recordEvent(Event&: *this); |
1504 | } |
1505 | |
1506 | /// Make a stream wait on the current event. |
1507 | Error wait(AMDGPUStreamTy &Stream) { |
1508 | std::lock_guard<std::mutex> Lock(Mutex); |
1509 | |
1510 | if (!RecordedStream) |
1511 | return Plugin::error("Event does not have any recorded stream" ); |
1512 | |
1513 | // Synchronizing the same stream. Do nothing. |
1514 | if (RecordedStream == &Stream) |
1515 | return Plugin::success(); |
1516 | |
1517 | // No need to wait anything, the recorded stream already finished the |
1518 | // corresponding operation. |
1519 | if (RecordedSlot < 0) |
1520 | return Plugin::success(); |
1521 | |
1522 | return Stream.waitEvent(Event: *this); |
1523 | } |
1524 | |
1525 | protected: |
1526 | /// The stream registered in this event. |
1527 | AMDGPUStreamTy *RecordedStream; |
1528 | |
1529 | /// The recordered operation on the recorded stream. |
1530 | int64_t RecordedSlot; |
1531 | |
1532 | /// The sync cycle when the stream was recorded. Used to detect stale events. |
1533 | int64_t RecordedSyncCycle; |
1534 | |
1535 | /// Mutex to safely access event fields. |
1536 | mutable std::mutex Mutex; |
1537 | |
1538 | friend struct AMDGPUStreamTy; |
1539 | }; |
1540 | |
1541 | Error AMDGPUStreamTy::recordEvent(AMDGPUEventTy &Event) const { |
1542 | std::lock_guard<std::mutex> Lock(Mutex); |
1543 | |
1544 | if (size() > 0) { |
1545 | // Record the synchronize identifier (to detect stale recordings) and |
1546 | // the last valid stream's operation. |
1547 | Event.RecordedSyncCycle = SyncCycle; |
1548 | Event.RecordedSlot = last(); |
1549 | |
1550 | assert(Event.RecordedSyncCycle >= 0 && "Invalid recorded sync cycle" ); |
1551 | assert(Event.RecordedSlot >= 0 && "Invalid recorded slot" ); |
1552 | } else { |
1553 | // The stream is empty, everything already completed, record nothing. |
1554 | Event.RecordedSyncCycle = -1; |
1555 | Event.RecordedSlot = -1; |
1556 | } |
1557 | return Plugin::success(); |
1558 | } |
1559 | |
1560 | Error AMDGPUStreamTy::waitEvent(const AMDGPUEventTy &Event) { |
1561 | // Retrieve the recorded stream on the event. |
1562 | AMDGPUStreamTy &RecordedStream = *Event.RecordedStream; |
1563 | |
1564 | std::scoped_lock<std::mutex, std::mutex> Lock(Mutex, RecordedStream.Mutex); |
1565 | |
1566 | // The recorded stream already completed the operation because the synchronize |
1567 | // identifier is already outdated. |
1568 | if (RecordedStream.SyncCycle != (uint32_t)Event.RecordedSyncCycle) |
1569 | return Plugin::success(); |
1570 | |
1571 | // Again, the recorded stream already completed the operation, the last |
1572 | // operation's output signal is satisfied. |
1573 | if (!RecordedStream.Slots[Event.RecordedSlot].Signal->load()) |
1574 | return Plugin::success(); |
1575 | |
1576 | // Otherwise, make the current stream wait on the other stream's operation. |
1577 | return waitOnStreamOperation(OtherStream&: RecordedStream, Slot: Event.RecordedSlot); |
1578 | } |
1579 | |
1580 | struct AMDGPUStreamManagerTy final |
1581 | : GenericDeviceResourceManagerTy<AMDGPUResourceRef<AMDGPUStreamTy>> { |
1582 | using ResourceRef = AMDGPUResourceRef<AMDGPUStreamTy>; |
1583 | using ResourcePoolTy = GenericDeviceResourceManagerTy<ResourceRef>; |
1584 | |
1585 | AMDGPUStreamManagerTy(GenericDeviceTy &Device, hsa_agent_t HSAAgent) |
1586 | : GenericDeviceResourceManagerTy(Device), |
1587 | OMPX_QueueTracking("LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING" , true), |
1588 | NextQueue(0), Agent(HSAAgent) {} |
1589 | |
1590 | Error init(uint32_t InitialSize, int NumHSAQueues, int HSAQueueSize) { |
1591 | Queues = std::vector<AMDGPUQueueTy>(NumHSAQueues); |
1592 | QueueSize = HSAQueueSize; |
1593 | MaxNumQueues = NumHSAQueues; |
1594 | // Initialize one queue eagerly |
1595 | if (auto Err = Queues.front().init(Agent, QueueSize)) |
1596 | return Err; |
1597 | |
1598 | return GenericDeviceResourceManagerTy::init(InitialSize); |
1599 | } |
1600 | |
1601 | /// Deinitialize the resource pool and delete all resources. This function |
1602 | /// must be called before the destructor. |
1603 | Error deinit() override { |
1604 | // De-init all queues |
1605 | for (AMDGPUQueueTy &Queue : Queues) { |
1606 | if (auto Err = Queue.deinit()) |
1607 | return Err; |
1608 | } |
1609 | |
1610 | return GenericDeviceResourceManagerTy::deinit(); |
1611 | } |
1612 | |
1613 | /// Get a single stream from the pool or create new resources. |
1614 | virtual Error getResource(AMDGPUStreamTy *&StreamHandle) override { |
1615 | return getResourcesImpl(1, &StreamHandle, [this](AMDGPUStreamTy *&Handle) { |
1616 | return assignNextQueue(Stream: Handle); |
1617 | }); |
1618 | } |
1619 | |
1620 | /// Return stream to the pool. |
1621 | virtual Error returnResource(AMDGPUStreamTy *StreamHandle) override { |
1622 | return returnResourceImpl(StreamHandle, [](AMDGPUStreamTy *Handle) { |
1623 | Handle->Queue->removeUser(); |
1624 | return Plugin::success(); |
1625 | }); |
1626 | } |
1627 | |
1628 | private: |
1629 | /// Search for and assign an prefereably idle queue to the given Stream. If |
1630 | /// there is no queue without current users, choose the queue with the lowest |
1631 | /// user count. If utilization is ignored: use round robin selection. |
1632 | inline Error assignNextQueue(AMDGPUStreamTy *Stream) { |
1633 | // Start from zero when tracking utilization, otherwise: round robin policy. |
1634 | uint32_t Index = OMPX_QueueTracking ? 0 : NextQueue++ % MaxNumQueues; |
1635 | |
1636 | if (OMPX_QueueTracking) { |
1637 | // Find the least used queue. |
1638 | for (uint32_t I = 0; I < MaxNumQueues; ++I) { |
1639 | // Early exit when an initialized queue is idle. |
1640 | if (Queues[I].isInitialized() && Queues[I].getUserCount() == 0) { |
1641 | Index = I; |
1642 | break; |
1643 | } |
1644 | |
1645 | // Update the least used queue. |
1646 | if (Queues[Index].getUserCount() > Queues[I].getUserCount()) |
1647 | Index = I; |
1648 | } |
1649 | } |
1650 | |
1651 | // Make sure the queue is initialized, then add user & assign. |
1652 | if (auto Err = Queues[Index].init(Agent, QueueSize)) |
1653 | return Err; |
1654 | Queues[Index].addUser(); |
1655 | Stream->Queue = &Queues[Index]; |
1656 | |
1657 | return Plugin::success(); |
1658 | } |
1659 | |
1660 | /// Envar for controlling the tracking of busy HSA queues. |
1661 | BoolEnvar OMPX_QueueTracking; |
1662 | |
1663 | /// The next queue index to use for round robin selection. |
1664 | uint32_t NextQueue; |
1665 | |
1666 | /// The queues which are assigned to requested streams. |
1667 | std::vector<AMDGPUQueueTy> Queues; |
1668 | |
1669 | /// The corresponding device as HSA agent. |
1670 | hsa_agent_t Agent; |
1671 | |
1672 | /// The maximum number of queues. |
1673 | int MaxNumQueues; |
1674 | |
1675 | /// The size of created queues. |
1676 | int QueueSize; |
1677 | }; |
1678 | |
1679 | /// Abstract class that holds the common members of the actual kernel devices |
1680 | /// and the host device. Both types should inherit from this class. |
1681 | struct AMDGenericDeviceTy { |
1682 | AMDGenericDeviceTy() {} |
1683 | |
1684 | virtual ~AMDGenericDeviceTy() {} |
1685 | |
1686 | /// Create all memory pools which the device has access to and classify them. |
1687 | Error initMemoryPools() { |
1688 | // Retrieve all memory pools from the device agent(s). |
1689 | Error Err = retrieveAllMemoryPools(); |
1690 | if (Err) |
1691 | return Err; |
1692 | |
1693 | for (AMDGPUMemoryPoolTy *MemoryPool : AllMemoryPools) { |
1694 | // Initialize the memory pool and retrieve some basic info. |
1695 | Error Err = MemoryPool->init(); |
1696 | if (Err) |
1697 | return Err; |
1698 | |
1699 | if (!MemoryPool->isGlobal()) |
1700 | continue; |
1701 | |
1702 | // Classify the memory pools depending on their properties. |
1703 | if (MemoryPool->isFineGrained()) { |
1704 | FineGrainedMemoryPools.push_back(MemoryPool); |
1705 | if (MemoryPool->supportsKernelArgs()) |
1706 | ArgsMemoryPools.push_back(MemoryPool); |
1707 | } else if (MemoryPool->isCoarseGrained()) { |
1708 | CoarseGrainedMemoryPools.push_back(MemoryPool); |
1709 | } |
1710 | } |
1711 | return Plugin::success(); |
1712 | } |
1713 | |
1714 | /// Destroy all memory pools. |
1715 | Error deinitMemoryPools() { |
1716 | for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) |
1717 | delete Pool; |
1718 | |
1719 | AllMemoryPools.clear(); |
1720 | FineGrainedMemoryPools.clear(); |
1721 | CoarseGrainedMemoryPools.clear(); |
1722 | ArgsMemoryPools.clear(); |
1723 | |
1724 | return Plugin::success(); |
1725 | } |
1726 | |
1727 | /// Retrieve and construct all memory pools from the device agent(s). |
1728 | virtual Error retrieveAllMemoryPools() = 0; |
1729 | |
1730 | /// Get the device agent. |
1731 | virtual hsa_agent_t getAgent() const = 0; |
1732 | |
1733 | protected: |
1734 | /// Array of all memory pools available to the host agents. |
1735 | llvm::SmallVector<AMDGPUMemoryPoolTy *> AllMemoryPools; |
1736 | |
1737 | /// Array of fine-grained memory pools available to the host agents. |
1738 | llvm::SmallVector<AMDGPUMemoryPoolTy *> FineGrainedMemoryPools; |
1739 | |
1740 | /// Array of coarse-grained memory pools available to the host agents. |
1741 | llvm::SmallVector<AMDGPUMemoryPoolTy *> CoarseGrainedMemoryPools; |
1742 | |
1743 | /// Array of kernel args memory pools available to the host agents. |
1744 | llvm::SmallVector<AMDGPUMemoryPoolTy *> ArgsMemoryPools; |
1745 | }; |
1746 | |
1747 | /// Class representing the host device. This host device may have more than one |
1748 | /// HSA host agent. We aggregate all its resources into the same instance. |
1749 | struct AMDHostDeviceTy : public AMDGenericDeviceTy { |
1750 | /// Create a host device from an array of host agents. |
1751 | AMDHostDeviceTy(AMDGPUPluginTy &Plugin, |
1752 | const llvm::SmallVector<hsa_agent_t> &HostAgents) |
1753 | : AMDGenericDeviceTy(), Agents(HostAgents), ArgsMemoryManager(Plugin), |
1754 | PinnedMemoryManager(Plugin) { |
1755 | assert(HostAgents.size() && "No host agent found" ); |
1756 | } |
1757 | |
1758 | /// Initialize the host device memory pools and the memory managers for |
1759 | /// kernel args and host pinned memory allocations. |
1760 | Error init() { |
1761 | if (auto Err = initMemoryPools()) |
1762 | return Err; |
1763 | |
1764 | if (auto Err = ArgsMemoryManager.init(getArgsMemoryPool())) |
1765 | return Err; |
1766 | |
1767 | if (auto Err = PinnedMemoryManager.init(getFineGrainedMemoryPool())) |
1768 | return Err; |
1769 | |
1770 | return Plugin::success(); |
1771 | } |
1772 | |
1773 | /// Deinitialize memory pools and managers. |
1774 | Error deinit() { |
1775 | if (auto Err = deinitMemoryPools()) |
1776 | return Err; |
1777 | |
1778 | if (auto Err = ArgsMemoryManager.deinit()) |
1779 | return Err; |
1780 | |
1781 | if (auto Err = PinnedMemoryManager.deinit()) |
1782 | return Err; |
1783 | |
1784 | return Plugin::success(); |
1785 | } |
1786 | |
1787 | /// Retrieve and construct all memory pools from the host agents. |
1788 | Error retrieveAllMemoryPools() override { |
1789 | // Iterate through the available pools across the host agents. |
1790 | for (hsa_agent_t Agent : Agents) { |
1791 | Error Err = utils::iterateAgentMemoryPools( |
1792 | Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) { |
1793 | AMDGPUMemoryPoolTy *MemoryPool = |
1794 | new AMDGPUMemoryPoolTy(HSAMemoryPool); |
1795 | AllMemoryPools.push_back(MemoryPool); |
1796 | return HSA_STATUS_SUCCESS; |
1797 | }); |
1798 | if (Err) |
1799 | return Err; |
1800 | } |
1801 | return Plugin::success(); |
1802 | } |
1803 | |
1804 | /// Get one of the host agents. Return always the first agent. |
1805 | hsa_agent_t getAgent() const override { return Agents[0]; } |
1806 | |
1807 | /// Get a memory pool for fine-grained allocations. |
1808 | AMDGPUMemoryPoolTy &getFineGrainedMemoryPool() { |
1809 | assert(!FineGrainedMemoryPools.empty() && "No fine-grained mempool" ); |
1810 | // Retrive any memory pool. |
1811 | return *FineGrainedMemoryPools[0]; |
1812 | } |
1813 | |
1814 | AMDGPUMemoryPoolTy &getCoarseGrainedMemoryPool() { |
1815 | assert(!CoarseGrainedMemoryPools.empty() && "No coarse-grained mempool" ); |
1816 | // Retrive any memory pool. |
1817 | return *CoarseGrainedMemoryPools[0]; |
1818 | } |
1819 | |
1820 | /// Get a memory pool for kernel args allocations. |
1821 | AMDGPUMemoryPoolTy &getArgsMemoryPool() { |
1822 | assert(!ArgsMemoryPools.empty() && "No kernelargs mempool" ); |
1823 | // Retrieve any memory pool. |
1824 | return *ArgsMemoryPools[0]; |
1825 | } |
1826 | |
1827 | /// Getters for kernel args and host pinned memory managers. |
1828 | AMDGPUMemoryManagerTy &getArgsMemoryManager() { return ArgsMemoryManager; } |
1829 | AMDGPUMemoryManagerTy &getPinnedMemoryManager() { |
1830 | return PinnedMemoryManager; |
1831 | } |
1832 | |
1833 | private: |
1834 | /// Array of agents on the host side. |
1835 | const llvm::SmallVector<hsa_agent_t> Agents; |
1836 | |
1837 | // Memory manager for kernel arguments. |
1838 | AMDGPUMemoryManagerTy ArgsMemoryManager; |
1839 | |
1840 | // Memory manager for pinned memory. |
1841 | AMDGPUMemoryManagerTy PinnedMemoryManager; |
1842 | }; |
1843 | |
1844 | /// Class implementing the AMDGPU device functionalities which derives from the |
1845 | /// generic device class. |
1846 | struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { |
  // Create an AMDGPU device with a device id and default AMDGPU grid values.
  //
  // Each OMPX_* member is an environment-variable-backed knob; the second
  // constructor argument is its default: 4 HSA queues of 512 packets, 4 teams
  // per compute unit, a 1MB threshold for asynchronous copies, 64 initial HSA
  // signals, a 2,000,000us stream busy-wait, single SDMA engine, and APU maps
  // disabled.
  AMDGPUDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices,
                 AMDHostDeviceTy &HostDevice, hsa_agent_t Agent)
      : GenericDeviceTy(Plugin, DeviceId, NumDevices, {0}),
        AMDGenericDeviceTy(),
        OMPX_NumQueues("LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES" , 4),
        OMPX_QueueSize("LIBOMPTARGET_AMDGPU_HSA_QUEUE_SIZE" , 512),
        OMPX_DefaultTeamsPerCU("LIBOMPTARGET_AMDGPU_TEAMS_PER_CU" , 4),
        OMPX_MaxAsyncCopyBytes("LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES" ,
                               1 * 1024 * 1024), // 1MB
        OMPX_InitialNumSignals("LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS" ,
                               64),
        OMPX_StreamBusyWait("LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT" , 2000000),
        OMPX_UseMultipleSdmaEngines(
            "LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES" , false),
        OMPX_ApuMaps("OMPX_APU_MAPS" , false), AMDGPUStreamManager(*this, Agent),
        AMDGPUEventManager(*this), AMDGPUSignalManager(*this), Agent(Agent),
        HostDevice(HostDevice) {}
1865 | |
  /// Trivial destructor; device resources are released in deinitImpl().
  ~AMDGPUDeviceTy() {}
1867 | |
1868 | /// Initialize the device, its resources and get its properties. |
1869 | Error initImpl(GenericPluginTy &Plugin) override { |
1870 | // First setup all the memory pools. |
1871 | if (auto Err = initMemoryPools()) |
1872 | return Err; |
1873 | |
1874 | char GPUName[64]; |
1875 | if (auto Err = getDeviceAttr(HSA_AGENT_INFO_NAME, GPUName)) |
1876 | return Err; |
1877 | ComputeUnitKind = GPUName; |
1878 | |
1879 | // Get the wavefront size. |
1880 | uint32_t WavefrontSize = 0; |
1881 | if (auto Err = getDeviceAttr(HSA_AGENT_INFO_WAVEFRONT_SIZE, WavefrontSize)) |
1882 | return Err; |
1883 | GridValues.GV_Warp_Size = WavefrontSize; |
1884 | |
1885 | // Get the frequency of the steady clock. If the attribute is missing |
1886 | // assume running on an older libhsa and default to 0, omp_get_wtime |
1887 | // will be inaccurate but otherwise programs can still run. |
1888 | if (getDeviceAttrRaw(HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY, |
1889 | ClockFrequency) != HSA_STATUS_SUCCESS) |
1890 | ClockFrequency = 0; |
1891 | |
1892 | // Load the grid values dependending on the wavefront. |
1893 | if (WavefrontSize == 32) |
1894 | GridValues = getAMDGPUGridValues<32>(); |
1895 | else if (WavefrontSize == 64) |
1896 | GridValues = getAMDGPUGridValues<64>(); |
1897 | else |
1898 | return Plugin::error("Unexpected AMDGPU wavefront %d" , WavefrontSize); |
1899 | |
1900 | // Get maximum number of workitems per workgroup. |
1901 | uint16_t WorkgroupMaxDim[3]; |
1902 | if (auto Err = |
1903 | getDeviceAttr(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgroupMaxDim)) |
1904 | return Err; |
1905 | GridValues.GV_Max_WG_Size = WorkgroupMaxDim[0]; |
1906 | |
1907 | // Get maximum number of workgroups. |
1908 | hsa_dim3_t GridMaxDim; |
1909 | if (auto Err = getDeviceAttr(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim)) |
1910 | return Err; |
1911 | |
1912 | GridValues.GV_Max_Teams = GridMaxDim.x / GridValues.GV_Max_WG_Size; |
1913 | if (GridValues.GV_Max_Teams == 0) |
1914 | return Plugin::error("Maximum number of teams cannot be zero" ); |
1915 | |
1916 | // Compute the default number of teams. |
1917 | uint32_t ComputeUnits = 0; |
1918 | if (auto Err = |
1919 | getDeviceAttr(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, ComputeUnits)) |
1920 | return Err; |
1921 | GridValues.GV_Default_Num_Teams = ComputeUnits * OMPX_DefaultTeamsPerCU; |
1922 | |
1923 | uint32_t WavesPerCU = 0; |
1924 | if (auto Err = |
1925 | getDeviceAttr(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, WavesPerCU)) |
1926 | return Err; |
1927 | HardwareParallelism = ComputeUnits * WavesPerCU; |
1928 | |
1929 | // Get maximum size of any device queues and maximum number of queues. |
1930 | uint32_t MaxQueueSize; |
1931 | if (auto Err = getDeviceAttr(HSA_AGENT_INFO_QUEUE_MAX_SIZE, MaxQueueSize)) |
1932 | return Err; |
1933 | |
1934 | uint32_t MaxQueues; |
1935 | if (auto Err = getDeviceAttr(HSA_AGENT_INFO_QUEUES_MAX, MaxQueues)) |
1936 | return Err; |
1937 | |
1938 | // Compute the number of queues and their size. |
1939 | OMPX_NumQueues = std::max(1U, std::min(OMPX_NumQueues.get(), MaxQueues)); |
1940 | OMPX_QueueSize = std::min(OMPX_QueueSize.get(), MaxQueueSize); |
1941 | |
1942 | // Initialize stream pool. |
1943 | if (auto Err = AMDGPUStreamManager.init(OMPX_InitialNumStreams, |
1944 | OMPX_NumQueues, OMPX_QueueSize)) |
1945 | return Err; |
1946 | |
1947 | // Initialize event pool. |
1948 | if (auto Err = AMDGPUEventManager.init(OMPX_InitialNumEvents)) |
1949 | return Err; |
1950 | |
1951 | // Initialize signal pool. |
1952 | if (auto Err = AMDGPUSignalManager.init(OMPX_InitialNumSignals)) |
1953 | return Err; |
1954 | |
1955 | // Detect if XNACK is enabled |
1956 | auto TargeTripleAndFeaturesOrError = |
1957 | utils::getTargetTripleAndFeatures(Agent); |
1958 | if (!TargeTripleAndFeaturesOrError) |
1959 | return TargeTripleAndFeaturesOrError.takeError(); |
1960 | if (static_cast<StringRef>(*TargeTripleAndFeaturesOrError) |
1961 | .contains(Other: "xnack+" )) |
1962 | IsXnackEnabled = true; |
1963 | |
1964 | // detect if device is an APU. |
1965 | if (auto Err = checkIfAPU()) |
1966 | return Err; |
1967 | |
1968 | return Plugin::success(); |
1969 | } |
1970 | |
1971 | /// Deinitialize the device and release its resources. |
1972 | Error deinitImpl() override { |
1973 | // Deinitialize the stream and event pools. |
1974 | if (auto Err = AMDGPUStreamManager.deinit()) |
1975 | return Err; |
1976 | |
1977 | if (auto Err = AMDGPUEventManager.deinit()) |
1978 | return Err; |
1979 | |
1980 | if (auto Err = AMDGPUSignalManager.deinit()) |
1981 | return Err; |
1982 | |
1983 | // Close modules if necessary. |
1984 | if (!LoadedImages.empty()) { |
1985 | // Each image has its own module. |
1986 | for (DeviceImageTy *Image : LoadedImages) { |
1987 | AMDGPUDeviceImageTy &AMDImage = |
1988 | static_cast<AMDGPUDeviceImageTy &>(*Image); |
1989 | |
1990 | // Unload the executable of the image. |
1991 | if (auto Err = AMDImage.unloadExecutable()) |
1992 | return Err; |
1993 | } |
1994 | } |
1995 | |
1996 | // Invalidate agent reference. |
1997 | Agent = {0}; |
1998 | |
1999 | return Plugin::success(); |
2000 | } |
2001 | |
2002 | virtual Error callGlobalConstructors(GenericPluginTy &Plugin, |
2003 | DeviceImageTy &Image) override { |
2004 | GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler(); |
2005 | if (Handler.isSymbolInImage(*this, Image, "amdgcn.device.fini" )) |
2006 | Image.setPendingGlobalDtors(); |
2007 | |
2008 | return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/true); |
2009 | } |
2010 | |
2011 | virtual Error callGlobalDestructors(GenericPluginTy &Plugin, |
2012 | DeviceImageTy &Image) override { |
2013 | if (Image.hasPendingGlobalDtors()) |
2014 | return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/false); |
2015 | return Plugin::success(); |
2016 | } |
2017 | |
2018 | const uint64_t getStreamBusyWaitMicroseconds() const { |
2019 | return OMPX_StreamBusyWait; |
2020 | } |
2021 | |
2022 | Expected<std::unique_ptr<MemoryBuffer>> |
2023 | doJITPostProcessing(std::unique_ptr<MemoryBuffer> MB) const override { |
2024 | |
2025 | // TODO: We should try to avoid materialization but there seems to be no |
2026 | // good linker interface w/o file i/o. |
2027 | SmallString<128> LinkerInputFilePath; |
2028 | std::error_code EC = sys::fs::createTemporaryFile("amdgpu-pre-link-jit" , |
2029 | "o" , LinkerInputFilePath); |
2030 | if (EC) |
2031 | return Plugin::error("Failed to create temporary file for linker" ); |
2032 | |
2033 | // Write the file's contents to the output file. |
2034 | Expected<std::unique_ptr<FileOutputBuffer>> OutputOrErr = |
2035 | FileOutputBuffer::create(LinkerInputFilePath, MB->getBuffer().size()); |
2036 | if (!OutputOrErr) |
2037 | return OutputOrErr.takeError(); |
2038 | std::unique_ptr<FileOutputBuffer> Output = std::move(*OutputOrErr); |
2039 | llvm::copy(Range: MB->getBuffer(), Out: Output->getBufferStart()); |
2040 | if (Error E = Output->commit()) |
2041 | return std::move(E); |
2042 | |
2043 | SmallString<128> LinkerOutputFilePath; |
2044 | EC = sys::fs::createTemporaryFile("amdgpu-pre-link-jit" , "so" , |
2045 | LinkerOutputFilePath); |
2046 | if (EC) |
2047 | return Plugin::error("Failed to create temporary file for linker" ); |
2048 | |
2049 | const auto &ErrorOrPath = sys::findProgramByName("lld" ); |
2050 | if (!ErrorOrPath) |
2051 | return createStringError(EC: inconvertibleErrorCode(), |
2052 | Msg: "Failed to find `lld` on the PATH." ); |
2053 | |
2054 | std::string LLDPath = ErrorOrPath.get(); |
2055 | INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(), |
2056 | "Using `%s` to link JITed amdgcn ouput." , LLDPath.c_str()); |
2057 | |
2058 | std::string MCPU = "-plugin-opt=mcpu=" + getComputeUnitKind(); |
2059 | StringRef Args[] = {LLDPath, |
2060 | "-flavor" , |
2061 | "gnu" , |
2062 | "--no-undefined" , |
2063 | "-shared" , |
2064 | MCPU, |
2065 | "-o" , |
2066 | LinkerOutputFilePath.data(), |
2067 | LinkerInputFilePath.data()}; |
2068 | |
2069 | std::string Error; |
2070 | int RC = sys::ExecuteAndWait(LLDPath, Args, std::nullopt, {}, 0, 0, &Error); |
2071 | if (RC) |
2072 | return Plugin::error("Linking optimized bitcode failed: %s" , |
2073 | Error.c_str()); |
2074 | |
2075 | auto BufferOrErr = MemoryBuffer::getFileOrSTDIN(LinkerOutputFilePath); |
2076 | if (!BufferOrErr) |
2077 | return Plugin::error("Failed to open temporary file for lld" ); |
2078 | |
2079 | // Clean up the temporary files afterwards. |
2080 | if (sys::fs::remove(LinkerOutputFilePath)) |
2081 | return Plugin::error("Failed to remove temporary output file for lld" ); |
2082 | if (sys::fs::remove(LinkerInputFilePath)) |
2083 | return Plugin::error("Failed to remove temporary input file for lld" ); |
2084 | |
2085 | return std::move(*BufferOrErr); |
2086 | } |
2087 | |
  /// See GenericDeviceTy::getComputeUnitKind().
  ///
  /// Returns the agent name captured from HSA_AGENT_INFO_NAME in initImpl().
  std::string getComputeUnitKind() const override { return ComputeUnitKind; }
2090 | |
  /// Returns the clock frequency for the given AMDGPU device.
  ///
  /// The value is the steady-clock timestamp frequency queried in initImpl();
  /// it is 0 when the HSA runtime does not expose
  /// HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY.
  uint64_t getClockFrequency() const override { return ClockFrequency; }
2093 | |
2094 | /// Allocate and construct an AMDGPU kernel. |
2095 | Expected<GenericKernelTy &> constructKernel(const char *Name) override { |
2096 | // Allocate and construct the AMDGPU kernel. |
2097 | AMDGPUKernelTy *AMDGPUKernel = Plugin.allocate<AMDGPUKernelTy>(); |
2098 | if (!AMDGPUKernel) |
2099 | return Plugin::error("Failed to allocate memory for AMDGPU kernel" ); |
2100 | |
2101 | new (AMDGPUKernel) AMDGPUKernelTy(Name); |
2102 | |
2103 | return *AMDGPUKernel; |
2104 | } |
2105 | |
  /// Set the current context to this device's context. Do nothing since the
  /// AMDGPU devices do not have the concept of contexts; always succeeds.
  Error setContext() override { return Plugin::success(); }
2109 | |
  /// AMDGPU returns the product of the number of compute units and the waves
  /// per compute unit.
  ///
  /// The cached value (HardwareParallelism) is computed once in initImpl() as
  /// ComputeUnits * WavesPerCU.
  uint64_t getHardwareParallelism() const override {
    return HardwareParallelism;
  }
2115 | |
  /// We want to set up the RPC server for host services to the GPU if it is
  /// available.
  bool shouldSetupRPCServer() const override {
    return libomptargetSupportsRPC();
  }
2121 | |
  /// The RPC interface should have enough space for all available parallelism,
  /// i.e., one port per concurrent wave slot (see getHardwareParallelism()).
  uint64_t requestedRPCPortCount() const override {
    return getHardwareParallelism();
  }
2126 | |
2127 | /// Get the stream of the asynchronous info sructure or get a new one. |
2128 | Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper, |
2129 | AMDGPUStreamTy *&Stream) { |
2130 | // Get the stream (if any) from the async info. |
2131 | Stream = AsyncInfoWrapper.getQueueAs<AMDGPUStreamTy *>(); |
2132 | if (!Stream) { |
2133 | // There was no stream; get an idle one. |
2134 | if (auto Err = AMDGPUStreamManager.getResource(Stream)) |
2135 | return Err; |
2136 | |
2137 | // Modify the async info's stream. |
2138 | AsyncInfoWrapper.setQueueAs<AMDGPUStreamTy *>(Stream); |
2139 | } |
2140 | return Plugin::success(); |
2141 | } |
2142 | |
2143 | /// Load the binary image into the device and allocate an image object. |
2144 | Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage, |
2145 | int32_t ImageId) override { |
2146 | // Allocate and initialize the image object. |
2147 | AMDGPUDeviceImageTy *AMDImage = Plugin.allocate<AMDGPUDeviceImageTy>(); |
2148 | new (AMDImage) AMDGPUDeviceImageTy(ImageId, *this, TgtImage); |
2149 | |
2150 | // Load the HSA executable. |
2151 | if (Error Err = AMDImage->loadExecutable(Device: *this)) |
2152 | return std::move(Err); |
2153 | |
2154 | return AMDImage; |
2155 | } |
2156 | |
2157 | /// Allocate memory on the device or related to the device. |
2158 | void *allocate(size_t Size, void *, TargetAllocTy Kind) override; |
2159 | |
2160 | /// Deallocate memory on the device or related to the device. |
2161 | int free(void *TgtPtr, TargetAllocTy Kind) override { |
2162 | if (TgtPtr == nullptr) |
2163 | return OFFLOAD_SUCCESS; |
2164 | |
2165 | AMDGPUMemoryPoolTy *MemoryPool = nullptr; |
2166 | switch (Kind) { |
2167 | case TARGET_ALLOC_DEFAULT: |
2168 | case TARGET_ALLOC_DEVICE: |
2169 | case TARGET_ALLOC_DEVICE_NON_BLOCKING: |
2170 | MemoryPool = CoarseGrainedMemoryPools[0]; |
2171 | break; |
2172 | case TARGET_ALLOC_HOST: |
2173 | MemoryPool = &HostDevice.getFineGrainedMemoryPool(); |
2174 | break; |
2175 | case TARGET_ALLOC_SHARED: |
2176 | MemoryPool = &HostDevice.getFineGrainedMemoryPool(); |
2177 | break; |
2178 | } |
2179 | |
2180 | if (!MemoryPool) { |
2181 | REPORT("No memory pool for the specified allocation kind\n" ); |
2182 | return OFFLOAD_FAIL; |
2183 | } |
2184 | |
2185 | if (Error Err = MemoryPool->deallocate(Ptr: TgtPtr)) { |
2186 | REPORT("%s\n" , toString(E: std::move(Err)).data()); |
2187 | return OFFLOAD_FAIL; |
2188 | } |
2189 | |
2190 | return OFFLOAD_SUCCESS; |
2191 | } |
2192 | |
2193 | /// Synchronize current thread with the pending operations on the async info. |
2194 | Error synchronizeImpl(__tgt_async_info &AsyncInfo) override { |
2195 | AMDGPUStreamTy *Stream = |
2196 | reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue); |
2197 | assert(Stream && "Invalid stream" ); |
2198 | |
2199 | if (auto Err = Stream->synchronize()) |
2200 | return Err; |
2201 | |
2202 | // Once the stream is synchronized, return it to stream pool and reset |
2203 | // AsyncInfo. This is to make sure the synchronization only works for its |
2204 | // own tasks. |
2205 | AsyncInfo.Queue = nullptr; |
2206 | return AMDGPUStreamManager.returnResource(Stream); |
2207 | } |
2208 | |
2209 | /// Query for the completion of the pending operations on the async info. |
2210 | Error queryAsyncImpl(__tgt_async_info &AsyncInfo) override { |
2211 | AMDGPUStreamTy *Stream = |
2212 | reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue); |
2213 | assert(Stream && "Invalid stream" ); |
2214 | |
2215 | auto CompletedOrErr = Stream->query(); |
2216 | if (!CompletedOrErr) |
2217 | return CompletedOrErr.takeError(); |
2218 | |
2219 | // Return if it the stream did not complete yet. |
2220 | if (!(*CompletedOrErr)) |
2221 | return Plugin::success(); |
2222 | |
2223 | // Once the stream is completed, return it to stream pool and reset |
2224 | // AsyncInfo. This is to make sure the synchronization only works for its |
2225 | // own tasks. |
2226 | AsyncInfo.Queue = nullptr; |
2227 | return AMDGPUStreamManager.returnResource(Stream); |
2228 | } |
2229 | |
2230 | /// Pin the host buffer and return the device pointer that should be used for |
2231 | /// device transfers. |
2232 | Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override { |
2233 | void *PinnedPtr = nullptr; |
2234 | |
2235 | hsa_status_t Status = |
2236 | hsa_amd_memory_lock(HstPtr, Size, nullptr, 0, &PinnedPtr); |
2237 | if (auto Err = Plugin::check(Status, "Error in hsa_amd_memory_lock: %s\n" )) |
2238 | return std::move(Err); |
2239 | |
2240 | return PinnedPtr; |
2241 | } |
2242 | |
2243 | /// Unpin the host buffer. |
2244 | Error dataUnlockImpl(void *HstPtr) override { |
2245 | hsa_status_t Status = hsa_amd_memory_unlock(HstPtr); |
2246 | return Plugin::check(Status, "Error in hsa_amd_memory_unlock: %s\n" ); |
2247 | } |
2248 | |
2249 | /// Check through the HSA runtime whether the \p HstPtr buffer is pinned. |
2250 | Expected<bool> isPinnedPtrImpl(void *HstPtr, void *&BaseHstPtr, |
2251 | void *&BaseDevAccessiblePtr, |
2252 | size_t &BaseSize) const override { |
2253 | hsa_amd_pointer_info_t Info; |
2254 | Info.size = sizeof(hsa_amd_pointer_info_t); |
2255 | |
2256 | hsa_status_t Status = hsa_amd_pointer_info( |
2257 | HstPtr, &Info, /*Allocator=*/nullptr, /*num_agents_accessible=*/nullptr, |
2258 | /*accessible=*/nullptr); |
2259 | if (auto Err = Plugin::check(Status, "Error in hsa_amd_pointer_info: %s" )) |
2260 | return std::move(Err); |
2261 | |
2262 | // The buffer may be locked or allocated through HSA allocators. Assume that |
2263 | // the buffer is host pinned if the runtime reports a HSA type. |
2264 | if (Info.type != HSA_EXT_POINTER_TYPE_LOCKED && |
2265 | Info.type != HSA_EXT_POINTER_TYPE_HSA) |
2266 | return false; |
2267 | |
2268 | assert(Info.hostBaseAddress && "Invalid host pinned address" ); |
2269 | assert(Info.agentBaseAddress && "Invalid agent pinned address" ); |
2270 | assert(Info.sizeInBytes > 0 && "Invalid pinned allocation size" ); |
2271 | |
2272 | // Save the allocation info in the output parameters. |
2273 | BaseHstPtr = Info.hostBaseAddress; |
2274 | BaseDevAccessiblePtr = Info.agentBaseAddress; |
2275 | BaseSize = Info.sizeInBytes; |
2276 | |
2277 | return true; |
2278 | } |
2279 | |
2280 | /// Submit data to the device (host to device transfer). |
2281 | Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size, |
2282 | AsyncInfoWrapperTy &AsyncInfoWrapper) override { |
2283 | AMDGPUStreamTy *Stream = nullptr; |
2284 | void *PinnedPtr = nullptr; |
2285 | |
2286 | // Use one-step asynchronous operation when host memory is already pinned. |
2287 | if (void *PinnedPtr = |
2288 | PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) { |
2289 | if (auto Err = getStream(AsyncInfoWrapper, Stream)) |
2290 | return Err; |
2291 | return Stream->pushPinnedMemoryCopyAsync(Dst: TgtPtr, Src: PinnedPtr, CopySize: Size); |
2292 | } |
2293 | |
2294 | // For large transfers use synchronous behavior. |
2295 | if (Size >= OMPX_MaxAsyncCopyBytes) { |
2296 | if (AsyncInfoWrapper.hasQueue()) |
2297 | if (auto Err = synchronize(AsyncInfoWrapper)) |
2298 | return Err; |
2299 | |
2300 | hsa_status_t Status; |
2301 | Status = hsa_amd_memory_lock(const_cast<void *>(HstPtr), Size, nullptr, 0, |
2302 | &PinnedPtr); |
2303 | if (auto Err = |
2304 | Plugin::check(Status, "Error in hsa_amd_memory_lock: %s\n" )) |
2305 | return Err; |
2306 | |
2307 | AMDGPUSignalTy Signal; |
2308 | if (auto Err = Signal.init()) |
2309 | return Err; |
2310 | |
2311 | if (auto Err = utils::asyncMemCopy(useMultipleSdmaEngines(), TgtPtr, |
2312 | Agent, PinnedPtr, Agent, Size, 0, |
2313 | nullptr, Signal.get())) |
2314 | return Err; |
2315 | |
2316 | if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds())) |
2317 | return Err; |
2318 | |
2319 | if (auto Err = Signal.deinit()) |
2320 | return Err; |
2321 | |
2322 | Status = hsa_amd_memory_unlock(const_cast<void *>(HstPtr)); |
2323 | return Plugin::check(Status, "Error in hsa_amd_memory_unlock: %s\n" ); |
2324 | } |
2325 | |
2326 | // Otherwise, use two-step copy with an intermediate pinned host buffer. |
2327 | AMDGPUMemoryManagerTy &PinnedMemoryManager = |
2328 | HostDevice.getPinnedMemoryManager(); |
2329 | if (auto Err = PinnedMemoryManager.allocate(Size, PtrStorage: &PinnedPtr)) |
2330 | return Err; |
2331 | |
2332 | if (auto Err = getStream(AsyncInfoWrapper, Stream)) |
2333 | return Err; |
2334 | |
2335 | return Stream->pushMemoryCopyH2DAsync(Dst: TgtPtr, Src: HstPtr, Inter: PinnedPtr, CopySize: Size, |
2336 | MemoryManager&: PinnedMemoryManager); |
2337 | } |
2338 | |
2339 | /// Retrieve data from the device (device to host transfer). |
2340 | Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size, |
2341 | AsyncInfoWrapperTy &AsyncInfoWrapper) override { |
2342 | AMDGPUStreamTy *Stream = nullptr; |
2343 | void *PinnedPtr = nullptr; |
2344 | |
2345 | // Use one-step asynchronous operation when host memory is already pinned. |
2346 | if (void *PinnedPtr = |
2347 | PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) { |
2348 | if (auto Err = getStream(AsyncInfoWrapper, Stream)) |
2349 | return Err; |
2350 | |
2351 | return Stream->pushPinnedMemoryCopyAsync(Dst: PinnedPtr, Src: TgtPtr, CopySize: Size); |
2352 | } |
2353 | |
2354 | // For large transfers use synchronous behavior. |
2355 | if (Size >= OMPX_MaxAsyncCopyBytes) { |
2356 | if (AsyncInfoWrapper.hasQueue()) |
2357 | if (auto Err = synchronize(AsyncInfoWrapper)) |
2358 | return Err; |
2359 | |
2360 | hsa_status_t Status; |
2361 | Status = hsa_amd_memory_lock(const_cast<void *>(HstPtr), Size, nullptr, 0, |
2362 | &PinnedPtr); |
2363 | if (auto Err = |
2364 | Plugin::check(Status, "Error in hsa_amd_memory_lock: %s\n" )) |
2365 | return Err; |
2366 | |
2367 | AMDGPUSignalTy Signal; |
2368 | if (auto Err = Signal.init()) |
2369 | return Err; |
2370 | |
2371 | if (auto Err = utils::asyncMemCopy(useMultipleSdmaEngines(), PinnedPtr, |
2372 | Agent, TgtPtr, Agent, Size, 0, nullptr, |
2373 | Signal.get())) |
2374 | return Err; |
2375 | |
2376 | if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds())) |
2377 | return Err; |
2378 | |
2379 | if (auto Err = Signal.deinit()) |
2380 | return Err; |
2381 | |
2382 | Status = hsa_amd_memory_unlock(const_cast<void *>(HstPtr)); |
2383 | return Plugin::check(Status, "Error in hsa_amd_memory_unlock: %s\n" ); |
2384 | } |
2385 | |
2386 | // Otherwise, use two-step copy with an intermediate pinned host buffer. |
2387 | AMDGPUMemoryManagerTy &PinnedMemoryManager = |
2388 | HostDevice.getPinnedMemoryManager(); |
2389 | if (auto Err = PinnedMemoryManager.allocate(Size, PtrStorage: &PinnedPtr)) |
2390 | return Err; |
2391 | |
2392 | if (auto Err = getStream(AsyncInfoWrapper, Stream)) |
2393 | return Err; |
2394 | |
2395 | return Stream->pushMemoryCopyD2HAsync(Dst: HstPtr, Src: TgtPtr, Inter: PinnedPtr, CopySize: Size, |
2396 | MemoryManager&: PinnedMemoryManager); |
2397 | } |
2398 | |
2399 | /// Exchange data between two devices within the plugin. |
2400 | Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice, |
2401 | void *DstPtr, int64_t Size, |
2402 | AsyncInfoWrapperTy &AsyncInfoWrapper) override { |
2403 | AMDGPUDeviceTy &DstDevice = static_cast<AMDGPUDeviceTy &>(DstGenericDevice); |
2404 | |
2405 | // For large transfers use synchronous behavior. |
2406 | if (Size >= OMPX_MaxAsyncCopyBytes) { |
2407 | if (AsyncInfoWrapper.hasQueue()) |
2408 | if (auto Err = synchronize(AsyncInfoWrapper)) |
2409 | return Err; |
2410 | |
2411 | AMDGPUSignalTy Signal; |
2412 | if (auto Err = Signal.init()) |
2413 | return Err; |
2414 | |
2415 | if (auto Err = utils::asyncMemCopy( |
2416 | useMultipleSdmaEngines(), DstPtr, DstDevice.getAgent(), SrcPtr, |
2417 | getAgent(), (uint64_t)Size, 0, nullptr, Signal.get())) |
2418 | return Err; |
2419 | |
2420 | if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds())) |
2421 | return Err; |
2422 | |
2423 | return Signal.deinit(); |
2424 | } |
2425 | |
2426 | AMDGPUStreamTy *Stream = nullptr; |
2427 | if (auto Err = getStream(AsyncInfoWrapper, Stream)) |
2428 | return Err; |
2429 | if (Size <= 0) |
2430 | return Plugin::success(); |
2431 | |
2432 | return Stream->pushMemoryCopyD2DAsync(DstPtr, DstDevice.getAgent(), SrcPtr, |
2433 | getAgent(), (uint64_t)Size); |
2434 | } |
2435 | |
  /// Initialize the async info for interoperability purposes.
  /// Currently a stub: performs no work and always reports success.
  Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
    // TODO: Implement this function.
    return Plugin::success();
  }
2441 | |
2442 | /// Initialize the device info for interoperability purposes. |
2443 | Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override { |
2444 | DeviceInfo->Context = nullptr; |
2445 | |
2446 | if (!DeviceInfo->Device) |
2447 | DeviceInfo->Device = reinterpret_cast<void *>(Agent.handle); |
2448 | |
2449 | return Plugin::success(); |
2450 | } |
2451 | |
2452 | /// Create an event. |
2453 | Error createEventImpl(void **EventPtrStorage) override { |
2454 | AMDGPUEventTy **Event = reinterpret_cast<AMDGPUEventTy **>(EventPtrStorage); |
2455 | return AMDGPUEventManager.getResource(*Event); |
2456 | } |
2457 | |
2458 | /// Destroy a previously created event. |
2459 | Error destroyEventImpl(void *EventPtr) override { |
2460 | AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr); |
2461 | return AMDGPUEventManager.returnResource(Event); |
2462 | } |
2463 | |
2464 | /// Record the event. |
2465 | Error recordEventImpl(void *EventPtr, |
2466 | AsyncInfoWrapperTy &AsyncInfoWrapper) override { |
2467 | AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr); |
2468 | assert(Event && "Invalid event" ); |
2469 | |
2470 | AMDGPUStreamTy *Stream = nullptr; |
2471 | if (auto Err = getStream(AsyncInfoWrapper, Stream)) |
2472 | return Err; |
2473 | |
2474 | return Event->record(Stream&: *Stream); |
2475 | } |
2476 | |
2477 | /// Make the stream wait on the event. |
2478 | Error waitEventImpl(void *EventPtr, |
2479 | AsyncInfoWrapperTy &AsyncInfoWrapper) override { |
2480 | AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr); |
2481 | |
2482 | AMDGPUStreamTy *Stream = nullptr; |
2483 | if (auto Err = getStream(AsyncInfoWrapper, Stream)) |
2484 | return Err; |
2485 | |
2486 | return Event->wait(Stream&: *Stream); |
2487 | } |
2488 | |
  /// Synchronize the current thread with the event.
  /// Not supported on this plugin yet; unconditionally returns an error.
  Error syncEventImpl(void *EventPtr) override {
    return Plugin::error("Synchronize event not implemented" );
  }
2493 | |
2494 | /// Print information about the device. |
2495 | Error obtainInfoImpl(InfoQueueTy &Info) override { |
2496 | char TmpChar[1000]; |
2497 | const char *TmpCharPtr = "Unknown" ; |
2498 | uint16_t Major, Minor; |
2499 | uint32_t TmpUInt, TmpUInt2; |
2500 | uint32_t CacheSize[4]; |
2501 | size_t TmpSt; |
2502 | bool TmpBool; |
2503 | uint16_t WorkgrpMaxDim[3]; |
2504 | hsa_dim3_t GridMaxDim; |
2505 | hsa_status_t Status, Status2; |
2506 | |
2507 | Status = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major); |
2508 | Status2 = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor); |
2509 | if (Status == HSA_STATUS_SUCCESS && Status2 == HSA_STATUS_SUCCESS) |
2510 | Info.add("HSA Runtime Version" , |
2511 | std::to_string(val: Major) + "." + std::to_string(val: Minor)); |
2512 | |
2513 | Info.add("HSA OpenMP Device Number" , DeviceId); |
2514 | |
2515 | Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar); |
2516 | if (Status == HSA_STATUS_SUCCESS) |
2517 | Info.add("Product Name" , TmpChar); |
2518 | |
2519 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_NAME, TmpChar); |
2520 | if (Status == HSA_STATUS_SUCCESS) |
2521 | Info.add("Device Name" , TmpChar); |
2522 | |
2523 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_VENDOR_NAME, TmpChar); |
2524 | if (Status == HSA_STATUS_SUCCESS) |
2525 | Info.add("Vendor Name" , TmpChar); |
2526 | |
2527 | hsa_device_type_t DevType; |
2528 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_DEVICE, DevType); |
2529 | if (Status == HSA_STATUS_SUCCESS) { |
2530 | switch (DevType) { |
2531 | case HSA_DEVICE_TYPE_CPU: |
2532 | TmpCharPtr = "CPU" ; |
2533 | break; |
2534 | case HSA_DEVICE_TYPE_GPU: |
2535 | TmpCharPtr = "GPU" ; |
2536 | break; |
2537 | case HSA_DEVICE_TYPE_DSP: |
2538 | TmpCharPtr = "DSP" ; |
2539 | break; |
2540 | } |
2541 | Info.add("Device Type" , TmpCharPtr); |
2542 | } |
2543 | |
2544 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUES_MAX, TmpUInt); |
2545 | if (Status == HSA_STATUS_SUCCESS) |
2546 | Info.add("Max Queues" , TmpUInt); |
2547 | |
2548 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUE_MIN_SIZE, TmpUInt); |
2549 | if (Status == HSA_STATUS_SUCCESS) |
2550 | Info.add("Queue Min Size" , TmpUInt); |
2551 | |
2552 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUE_MAX_SIZE, TmpUInt); |
2553 | if (Status == HSA_STATUS_SUCCESS) |
2554 | Info.add("Queue Max Size" , TmpUInt); |
2555 | |
2556 | // FIXME: This is deprecated according to HSA documentation. But using |
2557 | // hsa_agent_iterate_caches and hsa_cache_get_info breaks execution during |
2558 | // runtime. |
2559 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_CACHE_SIZE, CacheSize); |
2560 | if (Status == HSA_STATUS_SUCCESS) { |
2561 | Info.add("Cache" ); |
2562 | |
2563 | for (int I = 0; I < 4; I++) |
2564 | if (CacheSize[I]) |
2565 | Info.add<InfoLevel2>("L" + std::to_string(I), CacheSize[I]); |
2566 | } |
2567 | |
2568 | Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_CACHELINE_SIZE, TmpUInt); |
2569 | if (Status == HSA_STATUS_SUCCESS) |
2570 | Info.add("Cacheline Size" , TmpUInt); |
2571 | |
2572 | Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, TmpUInt); |
2573 | if (Status == HSA_STATUS_SUCCESS) |
2574 | Info.add("Max Clock Freq" , TmpUInt, "MHz" ); |
2575 | |
2576 | Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, TmpUInt); |
2577 | if (Status == HSA_STATUS_SUCCESS) |
2578 | Info.add("Compute Units" , TmpUInt); |
2579 | |
2580 | Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU, TmpUInt); |
2581 | if (Status == HSA_STATUS_SUCCESS) |
2582 | Info.add("SIMD per CU" , TmpUInt); |
2583 | |
2584 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_FAST_F16_OPERATION, TmpBool); |
2585 | if (Status == HSA_STATUS_SUCCESS) |
2586 | Info.add("Fast F16 Operation" , TmpBool); |
2587 | |
2588 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_WAVEFRONT_SIZE, TmpUInt2); |
2589 | if (Status == HSA_STATUS_SUCCESS) |
2590 | Info.add("Wavefront Size" , TmpUInt2); |
2591 | |
2592 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, TmpUInt); |
2593 | if (Status == HSA_STATUS_SUCCESS) |
2594 | Info.add("Workgroup Max Size" , TmpUInt); |
2595 | |
2596 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgrpMaxDim); |
2597 | if (Status == HSA_STATUS_SUCCESS) { |
2598 | Info.add("Workgroup Max Size per Dimension" ); |
2599 | Info.add<InfoLevel2>("x" , WorkgrpMaxDim[0]); |
2600 | Info.add<InfoLevel2>("y" , WorkgrpMaxDim[1]); |
2601 | Info.add<InfoLevel2>("z" , WorkgrpMaxDim[2]); |
2602 | } |
2603 | |
2604 | Status = getDeviceAttrRaw( |
2605 | (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, TmpUInt); |
2606 | if (Status == HSA_STATUS_SUCCESS) { |
2607 | Info.add("Max Waves Per CU" , TmpUInt); |
2608 | Info.add("Max Work-item Per CU" , TmpUInt * TmpUInt2); |
2609 | } |
2610 | |
2611 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_SIZE, TmpUInt); |
2612 | if (Status == HSA_STATUS_SUCCESS) |
2613 | Info.add("Grid Max Size" , TmpUInt); |
2614 | |
2615 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim); |
2616 | if (Status == HSA_STATUS_SUCCESS) { |
2617 | Info.add("Grid Max Size per Dimension" ); |
2618 | Info.add<InfoLevel2>("x" , GridMaxDim.x); |
2619 | Info.add<InfoLevel2>("y" , GridMaxDim.y); |
2620 | Info.add<InfoLevel2>("z" , GridMaxDim.z); |
2621 | } |
2622 | |
2623 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_FBARRIER_MAX_SIZE, TmpUInt); |
2624 | if (Status == HSA_STATUS_SUCCESS) |
2625 | Info.add("Max fbarriers/Workgrp" , TmpUInt); |
2626 | |
2627 | Info.add("Memory Pools" ); |
2628 | for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) { |
2629 | std::string TmpStr, TmpStr2; |
2630 | |
2631 | if (Pool->isGlobal()) |
2632 | TmpStr = "Global" ; |
2633 | else if (Pool->isReadOnly()) |
2634 | TmpStr = "ReadOnly" ; |
2635 | else if (Pool->isPrivate()) |
2636 | TmpStr = "Private" ; |
2637 | else if (Pool->isGroup()) |
2638 | TmpStr = "Group" ; |
2639 | else |
2640 | TmpStr = "Unknown" ; |
2641 | |
2642 | Info.add<InfoLevel2>(std::string("Pool " ) + TmpStr); |
2643 | |
2644 | if (Pool->isGlobal()) { |
2645 | if (Pool->isFineGrained()) |
2646 | TmpStr2 += "Fine Grained " ; |
2647 | if (Pool->isCoarseGrained()) |
2648 | TmpStr2 += "Coarse Grained " ; |
2649 | if (Pool->supportsKernelArgs()) |
2650 | TmpStr2 += "Kernarg " ; |
2651 | |
2652 | Info.add<InfoLevel3>("Flags" , TmpStr2); |
2653 | } |
2654 | |
2655 | Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, TmpSt); |
2656 | if (Status == HSA_STATUS_SUCCESS) |
2657 | Info.add<InfoLevel3>("Size" , TmpSt, "bytes" ); |
2658 | |
2659 | Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, |
2660 | TmpBool); |
2661 | if (Status == HSA_STATUS_SUCCESS) |
2662 | Info.add<InfoLevel3>("Allocatable" , TmpBool); |
2663 | |
2664 | Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, |
2665 | TmpSt); |
2666 | if (Status == HSA_STATUS_SUCCESS) |
2667 | Info.add<InfoLevel3>("Runtime Alloc Granule" , TmpSt, "bytes" ); |
2668 | |
2669 | Status = Pool->getAttrRaw( |
2670 | HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, TmpSt); |
2671 | if (Status == HSA_STATUS_SUCCESS) |
2672 | Info.add<InfoLevel3>("Runtime Alloc Alignment" , TmpSt, "bytes" ); |
2673 | |
2674 | Status = |
2675 | Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, TmpBool); |
2676 | if (Status == HSA_STATUS_SUCCESS) |
2677 | Info.add<InfoLevel3>("Accessable by all" , TmpBool); |
2678 | } |
2679 | |
2680 | Info.add("ISAs" ); |
2681 | auto Err = utils::iterateAgentISAs(getAgent(), [&](hsa_isa_t ISA) { |
2682 | Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, TmpChar); |
2683 | if (Status == HSA_STATUS_SUCCESS) |
2684 | Info.add<InfoLevel2>("Name" , TmpChar); |
2685 | |
2686 | return Status; |
2687 | }); |
2688 | |
2689 | // Silently consume the error. |
2690 | if (Err) |
2691 | consumeError(std::move(Err)); |
2692 | |
2693 | return Plugin::success(); |
2694 | } |
2695 | |
2696 | /// Returns true if auto zero-copy the best configuration for the current |
2697 | /// arch. |
2698 | /// On AMDGPUs, automatic zero-copy is turned on |
2699 | /// when running on an APU with XNACK (unified memory) support |
2700 | /// enabled. On discrete GPUs, automatic zero-copy is triggered |
2701 | /// if the user sets the environment variable OMPX_APU_MAPS=1 |
2702 | /// and if XNACK is enabled. The rationale is that zero-copy |
2703 | /// is the best configuration (performance, memory footprint) on APUs, |
2704 | /// while it is often not the best on discrete GPUs. |
2705 | /// XNACK can be enabled with a kernel boot parameter or with |
2706 | /// the HSA_XNACK environment variable. |
2707 | bool useAutoZeroCopyImpl() override { |
2708 | return ((IsAPU || OMPX_ApuMaps) && IsXnackEnabled); |
2709 | } |
2710 | |
  /// Getters and setters for stack and heap sizes.
  /// Report the per-thread device stack size currently configured.
  Error getDeviceStackSize(uint64_t &Value) override {
    Value = StackSize;
    return Plugin::success();
  }
  /// Set the per-thread device stack size to \p Value bytes.
  Error setDeviceStackSize(uint64_t Value) override {
    StackSize = Value;
    return Plugin::success();
  }
  /// Report the size of the plugin-managed device memory pool.
  Error getDeviceHeapSize(uint64_t &Value) override {
    Value = DeviceMemoryPoolSize;
    return Plugin::success();
  }
  /// Set the device heap size to \p Value bytes, reconfiguring the memory
  /// pool of every already-loaded image before recording the new size.
  Error setDeviceHeapSize(uint64_t Value) override {
    for (DeviceImageTy *Image : LoadedImages)
      if (auto Err = setupDeviceMemoryPool(Plugin, *Image, Value))
        return Err;
    DeviceMemoryPoolSize = Value;
    return Plugin::success();
  }
2731 | Error getDeviceMemorySize(uint64_t &Value) override { |
2732 | for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) { |
2733 | if (Pool->isGlobal()) { |
2734 | hsa_status_t Status = |
2735 | Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, Value); |
2736 | return Plugin::check(Status, "Error in getting device memory size: %s" ); |
2737 | } |
2738 | } |
2739 | return Plugin::error("getDeviceMemorySize:: no global pool" ); |
2740 | } |
2741 | |
2742 | /// AMDGPU-specific function to get device attributes. |
2743 | template <typename Ty> Error getDeviceAttr(uint32_t Kind, Ty &Value) { |
2744 | hsa_status_t Status = |
2745 | hsa_agent_get_info(Agent, (hsa_agent_info_t)Kind, &Value); |
2746 | return Plugin::check(Status, "Error in hsa_agent_get_info: %s" ); |
2747 | } |
2748 | |
  /// Raw variant of getDeviceAttr: queries an agent attribute and returns the
  /// HSA status directly, for callers that want to inspect it themselves.
  template <typename Ty>
  hsa_status_t getDeviceAttrRaw(uint32_t Kind, Ty &Value) {
    return hsa_agent_get_info(Agent, (hsa_agent_info_t)Kind, &Value);
  }
2753 | |
  /// Get the HSA agent handle corresponding to this device.
  hsa_agent_t getAgent() const override { return Agent; }

  /// Get the manager of pre-allocated HSA signals for this device.
  AMDGPUSignalManagerTy &getSignalManager() { return AMDGPUSignalManager; }
2759 | |
2760 | /// Retrieve and construct all memory pools of the device agent. |
2761 | Error retrieveAllMemoryPools() override { |
2762 | // Iterate through the available pools of the device agent. |
2763 | return utils::iterateAgentMemoryPools( |
2764 | Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) { |
2765 | AMDGPUMemoryPoolTy *MemoryPool = |
2766 | Plugin.allocate<AMDGPUMemoryPoolTy>(); |
2767 | new (MemoryPool) AMDGPUMemoryPoolTy(HSAMemoryPool); |
2768 | AllMemoryPools.push_back(MemoryPool); |
2769 | return HSA_STATUS_SUCCESS; |
2770 | }); |
2771 | } |
2772 | |
  /// Whether copies should use the ROCm 5.7 multiple-SDMA-engine interface,
  /// as selected by the OMPX_UseMultipleSdmaEngines envar.
  bool useMultipleSdmaEngines() const { return OMPX_UseMultipleSdmaEngines; }
2774 | |
2775 | private: |
2776 | using AMDGPUEventRef = AMDGPUResourceRef<AMDGPUEventTy>; |
2777 | using AMDGPUEventManagerTy = GenericDeviceResourceManagerTy<AMDGPUEventRef>; |
2778 | |
2779 | /// Common method to invoke a single threaded constructor or destructor |
2780 | /// kernel by name. |
2781 | Error callGlobalCtorDtorCommon(GenericPluginTy &Plugin, DeviceImageTy &Image, |
2782 | bool IsCtor) { |
2783 | const char *KernelName = |
2784 | IsCtor ? "amdgcn.device.init" : "amdgcn.device.fini" ; |
2785 | // Perform a quick check for the named kernel in the image. The kernel |
2786 | // should be created by the 'amdgpu-lower-ctor-dtor' pass. |
2787 | GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler(); |
2788 | if (IsCtor && !Handler.isSymbolInImage(*this, Image, KernelName)) |
2789 | return Plugin::success(); |
2790 | |
2791 | // Allocate and construct the AMDGPU kernel. |
2792 | AMDGPUKernelTy AMDGPUKernel(KernelName); |
2793 | if (auto Err = AMDGPUKernel.init(*this, Image)) |
2794 | return Err; |
2795 | |
2796 | AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr); |
2797 | |
2798 | KernelArgsTy KernelArgs = {}; |
2799 | if (auto Err = AMDGPUKernel.launchImpl(*this, /*NumThread=*/1u, |
2800 | /*NumBlocks=*/1ul, KernelArgs, |
2801 | /*Args=*/nullptr, AsyncInfoWrapper)) |
2802 | return Err; |
2803 | |
2804 | Error Err = Plugin::success(); |
2805 | AsyncInfoWrapper.finalize(Err); |
2806 | |
2807 | return Err; |
2808 | } |
2809 | |
2810 | /// Detect if current architecture is an APU. |
2811 | Error checkIfAPU() { |
2812 | // TODO: replace with ROCr API once it becomes available. |
2813 | llvm::StringRef StrGfxName(ComputeUnitKind); |
2814 | IsAPU = llvm::StringSwitch<bool>(StrGfxName) |
2815 | .Case(S: "gfx940" , Value: true) |
2816 | .Default(Value: false); |
2817 | if (IsAPU) |
2818 | return Plugin::success(); |
2819 | |
2820 | bool MayBeAPU = llvm::StringSwitch<bool>(StrGfxName) |
2821 | .Case(S: "gfx942" , Value: true) |
2822 | .Default(Value: false); |
2823 | if (!MayBeAPU) |
2824 | return Plugin::success(); |
2825 | |
2826 | // can be MI300A or MI300X |
2827 | uint32_t ChipID = 0; |
2828 | if (auto Err = getDeviceAttr(HSA_AMD_AGENT_INFO_CHIP_ID, ChipID)) |
2829 | return Err; |
2830 | |
2831 | if (!(ChipID & 0x1)) { |
2832 | IsAPU = true; |
2833 | return Plugin::success(); |
2834 | } |
2835 | return Plugin::success(); |
2836 | } |
2837 | |
2838 | /// Envar for controlling the number of HSA queues per device. High number of |
2839 | /// queues may degrade performance. |
2840 | UInt32Envar OMPX_NumQueues; |
2841 | |
2842 | /// Envar for controlling the size of each HSA queue. The size is the number |
2843 | /// of HSA packets a queue is expected to hold. It is also the number of HSA |
2844 | /// packets that can be pushed into each queue without waiting the driver to |
2845 | /// process them. |
2846 | UInt32Envar OMPX_QueueSize; |
2847 | |
2848 | /// Envar for controlling the default number of teams relative to the number |
2849 | /// of compute units (CUs) the device has: |
2850 | /// #default_teams = OMPX_DefaultTeamsPerCU * #CUs. |
2851 | UInt32Envar OMPX_DefaultTeamsPerCU; |
2852 | |
2853 | /// Envar specifying the maximum size in bytes where the memory copies are |
2854 | /// asynchronous operations. Up to this transfer size, the memory copies are |
2855 | /// asychronous operations pushed to the corresponding stream. For larger |
2856 | /// transfers, they are synchronous transfers. |
2857 | UInt32Envar OMPX_MaxAsyncCopyBytes; |
2858 | |
2859 | /// Envar controlling the initial number of HSA signals per device. There is |
2860 | /// one manager of signals per device managing several pre-allocated signals. |
2861 | /// These signals are mainly used by AMDGPU streams. If needed, more signals |
2862 | /// will be created. |
2863 | UInt32Envar OMPX_InitialNumSignals; |
2864 | |
2865 | /// Environment variables to set the time to wait in active state before |
2866 | /// switching to blocked state. The default 2000000 busywaits for 2 seconds |
2867 | /// before going into a blocking HSA wait state. The unit for these variables |
2868 | /// are microseconds. |
2869 | UInt32Envar OMPX_StreamBusyWait; |
2870 | |
2871 | /// Use ROCm 5.7 interface for multiple SDMA engines |
2872 | BoolEnvar OMPX_UseMultipleSdmaEngines; |
2873 | |
2874 | /// Value of OMPX_APU_MAPS env var used to force |
2875 | /// automatic zero-copy behavior on non-APU GPUs. |
2876 | BoolEnvar OMPX_ApuMaps; |
2877 | |
2878 | /// Stream manager for AMDGPU streams. |
2879 | AMDGPUStreamManagerTy AMDGPUStreamManager; |
2880 | |
2881 | /// Event manager for AMDGPU events. |
2882 | AMDGPUEventManagerTy AMDGPUEventManager; |
2883 | |
2884 | /// Signal manager for AMDGPU signals. |
2885 | AMDGPUSignalManagerTy AMDGPUSignalManager; |
2886 | |
2887 | /// The agent handler corresponding to the device. |
2888 | hsa_agent_t Agent; |
2889 | |
2890 | /// The GPU architecture. |
2891 | std::string ComputeUnitKind; |
2892 | |
2893 | /// The frequency of the steady clock inside the device. |
2894 | uint64_t ClockFrequency; |
2895 | |
2896 | /// The total number of concurrent work items that can be running on the GPU. |
2897 | uint64_t HardwareParallelism; |
2898 | |
2899 | /// Reference to the host device. |
2900 | AMDHostDeviceTy &HostDevice; |
2901 | |
2902 | /// The current size of the global device memory pool (managed by us). |
2903 | uint64_t DeviceMemoryPoolSize = 1L << 29L /*512MB=*/; |
2904 | |
2905 | /// The current size of the stack that will be used in cases where it could |
2906 | /// not be statically determined. |
2907 | uint64_t StackSize = 16 * 1024 /* 16 KB */; |
2908 | |
2909 | /// Is the plugin associated with an APU? |
2910 | bool IsAPU = false; |
2911 | |
  /// True if the system is configured with XNACK-Enabled.
2913 | /// False otherwise. |
2914 | bool IsXnackEnabled = false; |
2915 | }; |
2916 | |
2917 | Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) { |
2918 | hsa_status_t Status; |
2919 | Status = hsa_code_object_deserialize(getStart(), getSize(), "" , &CodeObject); |
2920 | if (auto Err = |
2921 | Plugin::check(Status, "Error in hsa_code_object_deserialize: %s" )) |
2922 | return Err; |
2923 | |
2924 | Status = hsa_executable_create_alt( |
2925 | HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, "" , &Executable); |
2926 | if (auto Err = |
2927 | Plugin::check(Status, "Error in hsa_executable_create_alt: %s" )) |
2928 | return Err; |
2929 | |
2930 | Status = hsa_executable_load_code_object(Executable, Device.getAgent(), |
2931 | CodeObject, "" ); |
2932 | if (auto Err = |
2933 | Plugin::check(Status, "Error in hsa_executable_load_code_object: %s" )) |
2934 | return Err; |
2935 | |
2936 | Status = hsa_executable_freeze(Executable, "" ); |
2937 | if (auto Err = Plugin::check(Status, "Error in hsa_executable_freeze: %s" )) |
2938 | return Err; |
2939 | |
2940 | uint32_t Result; |
2941 | Status = hsa_executable_validate(Executable, &Result); |
2942 | if (auto Err = Plugin::check(Status, "Error in hsa_executable_validate: %s" )) |
2943 | return Err; |
2944 | |
2945 | if (Result) |
2946 | return Plugin::error("Loaded HSA executable does not validate" ); |
2947 | |
2948 | if (auto Err = utils::readAMDGPUMetaDataFromImage( |
2949 | getMemoryBuffer(), KernelInfoMap, ELFABIVersion)) |
2950 | return Err; |
2951 | |
2952 | return Plugin::success(); |
2953 | } |
2954 | |
2955 | Expected<hsa_executable_symbol_t> |
2956 | AMDGPUDeviceImageTy::findDeviceSymbol(GenericDeviceTy &Device, |
2957 | StringRef SymbolName) const { |
2958 | |
2959 | AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(Device); |
2960 | hsa_agent_t Agent = AMDGPUDevice.getAgent(); |
2961 | |
2962 | hsa_executable_symbol_t Symbol; |
2963 | hsa_status_t Status = hsa_executable_get_symbol_by_name( |
2964 | Executable, SymbolName.data(), &Agent, &Symbol); |
2965 | if (auto Err = Plugin::check( |
2966 | Status, "Error in hsa_executable_get_symbol_by_name(%s): %s" , |
2967 | SymbolName.data())) |
2968 | return std::move(Err); |
2969 | |
2970 | return Symbol; |
2971 | } |
2972 | |
2973 | template <typename ResourceTy> |
2974 | Error AMDGPUResourceRef<ResourceTy>::create(GenericDeviceTy &Device) { |
2975 | if (Resource) |
2976 | return Plugin::error("Creating an existing resource" ); |
2977 | |
2978 | AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(Device); |
2979 | |
2980 | Resource = new ResourceTy(AMDGPUDevice); |
2981 | |
2982 | return Resource->init(); |
2983 | } |
2984 | |
/// Construct a stream bound to \p Device. The HSA queue is assigned lazily
/// (Queue starts null) and signals come from the device's signal manager.
AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
    : Agent(Device.getAgent()), Queue(nullptr),
      SignalManager(Device.getSignalManager()), Device(Device),
      // Initialize the std::deque with some empty positions.
      Slots(32), NextSlot(0), SyncCycle(0), RPCServer(nullptr),
      StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
      UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()) {}
2992 | |
2993 | /// Class implementing the AMDGPU-specific functionalities of the global |
2994 | /// handler. |
2995 | struct AMDGPUGlobalHandlerTy final : public GenericGlobalHandlerTy { |
2996 | /// Get the metadata of a global from the device. The name and size of the |
2997 | /// global is read from DeviceGlobal and the address of the global is written |
2998 | /// to DeviceGlobal. |
2999 | Error getGlobalMetadataFromDevice(GenericDeviceTy &Device, |
3000 | DeviceImageTy &Image, |
3001 | GlobalTy &DeviceGlobal) override { |
3002 | AMDGPUDeviceImageTy &AMDImage = static_cast<AMDGPUDeviceImageTy &>(Image); |
3003 | |
3004 | // Find the symbol on the device executable. |
3005 | auto SymbolOrErr = |
3006 | AMDImage.findDeviceSymbol(Device, DeviceGlobal.getName()); |
3007 | if (!SymbolOrErr) |
3008 | return SymbolOrErr.takeError(); |
3009 | |
3010 | hsa_executable_symbol_t Symbol = *SymbolOrErr; |
3011 | hsa_symbol_kind_t SymbolType; |
3012 | hsa_status_t Status; |
3013 | uint64_t SymbolAddr; |
3014 | uint32_t SymbolSize; |
3015 | |
3016 | // Retrieve the type, address and size of the symbol. |
3017 | std::pair<hsa_executable_symbol_info_t, void *> RequiredInfos[] = { |
3018 | {HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &SymbolType}, |
3019 | {HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &SymbolAddr}, |
3020 | {HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &SymbolSize}}; |
3021 | |
3022 | for (auto &Info : RequiredInfos) { |
3023 | Status = hsa_executable_symbol_get_info(Symbol, Info.first, Info.second); |
3024 | if (auto Err = Plugin::check( |
3025 | Status, "Error in hsa_executable_symbol_get_info: %s" )) |
3026 | return Err; |
3027 | } |
3028 | |
3029 | // Check the size of the symbol. |
3030 | if (SymbolSize != DeviceGlobal.getSize()) |
3031 | return Plugin::error( |
3032 | "Failed to load global '%s' due to size mismatch (%zu != %zu)" , |
3033 | DeviceGlobal.getName().data(), SymbolSize, |
3034 | (size_t)DeviceGlobal.getSize()); |
3035 | |
3036 | // Store the symbol address on the device global metadata. |
3037 | DeviceGlobal.setPtr(reinterpret_cast<void *>(SymbolAddr)); |
3038 | |
3039 | return Plugin::success(); |
3040 | } |
3041 | }; |
3042 | |
3043 | /// Class implementing the AMDGPU-specific functionalities of the plugin. |
3044 | struct AMDGPUPluginTy final : public GenericPluginTy { |
3045 | /// Create an AMDGPU plugin and initialize the AMDGPU driver. |
3046 | AMDGPUPluginTy() |
3047 | : GenericPluginTy(getTripleArch()), Initialized(false), |
3048 | HostDevice(nullptr) {} |
3049 | |
3050 | /// This class should not be copied. |
3051 | AMDGPUPluginTy(const AMDGPUPluginTy &) = delete; |
3052 | AMDGPUPluginTy(AMDGPUPluginTy &&) = delete; |
3053 | |
3054 | /// Initialize the plugin and return the number of devices. |
3055 | Expected<int32_t> initImpl() override { |
3056 | hsa_status_t Status = hsa_init(); |
3057 | if (Status != HSA_STATUS_SUCCESS) { |
3058 | // Cannot call hsa_success_string. |
3059 | DP("Failed to initialize AMDGPU's HSA library\n" ); |
3060 | return 0; |
3061 | } |
3062 | |
3063 | // The initialization of HSA was successful. It should be safe to call |
3064 | // HSA functions from now on, e.g., hsa_shut_down. |
3065 | Initialized = true; |
3066 | |
3067 | #ifdef OMPT_SUPPORT |
3068 | ompt::connectLibrary(); |
3069 | #endif |
3070 | |
3071 | // Register event handler to detect memory errors on the devices. |
3072 | Status = hsa_amd_register_system_event_handler(eventHandler, nullptr); |
3073 | if (auto Err = Plugin::check( |
3074 | Status, "Error in hsa_amd_register_system_event_handler: %s" )) |
3075 | return std::move(Err); |
3076 | |
3077 | // List of host (CPU) agents. |
3078 | llvm::SmallVector<hsa_agent_t> HostAgents; |
3079 | |
3080 | // Count the number of available agents. |
3081 | auto Err = utils::iterateAgents(Callback: [&](hsa_agent_t Agent) { |
3082 | // Get the device type of the agent. |
3083 | hsa_device_type_t DeviceType; |
3084 | hsa_status_t Status = |
3085 | hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType); |
3086 | if (Status != HSA_STATUS_SUCCESS) |
3087 | return Status; |
3088 | |
3089 | // Classify the agents into kernel (GPU) and host (CPU) kernels. |
3090 | if (DeviceType == HSA_DEVICE_TYPE_GPU) { |
3091 | // Ensure that the GPU agent supports kernel dispatch packets. |
3092 | hsa_agent_feature_t Features; |
3093 | Status = hsa_agent_get_info(Agent, HSA_AGENT_INFO_FEATURE, &Features); |
3094 | if (Features & HSA_AGENT_FEATURE_KERNEL_DISPATCH) |
3095 | KernelAgents.push_back(Agent); |
3096 | } else if (DeviceType == HSA_DEVICE_TYPE_CPU) { |
3097 | HostAgents.push_back(Agent); |
3098 | } |
3099 | return HSA_STATUS_SUCCESS; |
3100 | }); |
3101 | |
3102 | if (Err) |
3103 | return std::move(Err); |
3104 | |
3105 | int32_t NumDevices = KernelAgents.size(); |
3106 | if (NumDevices == 0) { |
3107 | // Do not initialize if there are no devices. |
3108 | DP("There are no devices supporting AMDGPU.\n" ); |
3109 | return 0; |
3110 | } |
3111 | |
3112 | // There are kernel agents but there is no host agent. That should be |
3113 | // treated as an error. |
3114 | if (HostAgents.empty()) |
3115 | return Plugin::error("No AMDGPU host agents" ); |
3116 | |
3117 | // Initialize the host device using host agents. |
3118 | HostDevice = allocate<AMDHostDeviceTy>(); |
3119 | new (HostDevice) AMDHostDeviceTy(*this, HostAgents); |
3120 | |
3121 | // Setup the memory pools of available for the host. |
3122 | if (auto Err = HostDevice->init()) |
3123 | return std::move(Err); |
3124 | |
3125 | return NumDevices; |
3126 | } |
3127 | |
3128 | /// Deinitialize the plugin. |
3129 | Error deinitImpl() override { |
3130 | // The HSA runtime was not initialized, so nothing from the plugin was |
3131 | // actually initialized. |
3132 | if (!Initialized) |
3133 | return Plugin::success(); |
3134 | |
3135 | if (HostDevice) |
3136 | if (auto Err = HostDevice->deinit()) |
3137 | return Err; |
3138 | |
3139 | // Finalize the HSA runtime. |
3140 | hsa_status_t Status = hsa_shut_down(); |
3141 | return Plugin::check(Status, "Error in hsa_shut_down: %s" ); |
3142 | } |
3143 | |
3144 | /// Creates an AMDGPU device. |
3145 | GenericDeviceTy *createDevice(GenericPluginTy &Plugin, int32_t DeviceId, |
3146 | int32_t NumDevices) override { |
3147 | return new AMDGPUDeviceTy(Plugin, DeviceId, NumDevices, getHostDevice(), |
3148 | getKernelAgent(DeviceId)); |
3149 | } |
3150 | |
3151 | /// Creates an AMDGPU global handler. |
3152 | GenericGlobalHandlerTy *createGlobalHandler() override { |
3153 | return new AMDGPUGlobalHandlerTy(); |
3154 | } |
3155 | |
3156 | Triple::ArchType getTripleArch() const override { return Triple::amdgcn; } |
3157 | |
3158 | /// Get the ELF code for recognizing the compatible image binary. |
3159 | uint16_t getMagicElfBits() const override { return ELF::EM_AMDGPU; } |
3160 | |
3161 | /// Check whether the image is compatible with an AMDGPU device. |
3162 | Expected<bool> isELFCompatible(StringRef Image) const override { |
3163 | // Get the associated architecture and flags from the ELF. |
3164 | auto ElfOrErr = ELF64LEObjectFile::create( |
3165 | MemoryBufferRef(Image, /*Identifier=*/"" ), /*InitContent=*/false); |
3166 | if (!ElfOrErr) |
3167 | return ElfOrErr.takeError(); |
3168 | std::optional<StringRef> Processor = ElfOrErr->tryGetCPUName(); |
3169 | |
3170 | for (hsa_agent_t Agent : KernelAgents) { |
3171 | auto TargeTripleAndFeaturesOrError = |
3172 | utils::getTargetTripleAndFeatures(Agent); |
3173 | if (!TargeTripleAndFeaturesOrError) |
3174 | return TargeTripleAndFeaturesOrError.takeError(); |
3175 | if (!utils::isImageCompatibleWithEnv(Processor ? *Processor : "" , |
3176 | ElfOrErr->getPlatformFlags(), |
3177 | *TargeTripleAndFeaturesOrError)) |
3178 | return false; |
3179 | } |
3180 | return true; |
3181 | } |
3182 | |
3183 | bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) override { |
3184 | return true; |
3185 | } |
3186 | |
3187 | /// Get the host device instance. |
3188 | AMDHostDeviceTy &getHostDevice() { |
3189 | assert(HostDevice && "Host device not initialized" ); |
3190 | return *HostDevice; |
3191 | } |
3192 | |
3193 | /// Get the kernel agent with the corresponding agent id. |
3194 | hsa_agent_t getKernelAgent(int32_t AgentId) const { |
3195 | assert((uint32_t)AgentId < KernelAgents.size() && "Invalid agent id" ); |
3196 | return KernelAgents[AgentId]; |
3197 | } |
3198 | |
3199 | /// Get the list of the available kernel agents. |
3200 | const llvm::SmallVector<hsa_agent_t> &getKernelAgents() const { |
3201 | return KernelAgents; |
3202 | } |
3203 | |
3204 | private: |
3205 | /// Event handler that will be called by ROCr if an event is detected. |
3206 | static hsa_status_t eventHandler(const hsa_amd_event_t *Event, void *) { |
3207 | if (Event->event_type != HSA_AMD_GPU_MEMORY_FAULT_EVENT) |
3208 | return HSA_STATUS_SUCCESS; |
3209 | |
3210 | SmallVector<std::string> Reasons; |
3211 | uint32_t ReasonsMask = Event->memory_fault.fault_reason_mask; |
3212 | if (ReasonsMask & HSA_AMD_MEMORY_FAULT_PAGE_NOT_PRESENT) |
3213 | Reasons.emplace_back(Args: "Page not present or supervisor privilege" ); |
3214 | if (ReasonsMask & HSA_AMD_MEMORY_FAULT_READ_ONLY) |
3215 | Reasons.emplace_back(Args: "Write access to a read-only page" ); |
3216 | if (ReasonsMask & HSA_AMD_MEMORY_FAULT_NX) |
3217 | Reasons.emplace_back(Args: "Execute access to a page marked NX" ); |
3218 | if (ReasonsMask & HSA_AMD_MEMORY_FAULT_HOST_ONLY) |
3219 | Reasons.emplace_back(Args: "GPU attempted access to a host only page" ); |
3220 | if (ReasonsMask & HSA_AMD_MEMORY_FAULT_DRAMECC) |
3221 | Reasons.emplace_back(Args: "DRAM ECC failure" ); |
3222 | if (ReasonsMask & HSA_AMD_MEMORY_FAULT_IMPRECISE) |
3223 | Reasons.emplace_back(Args: "Can't determine the exact fault address" ); |
3224 | if (ReasonsMask & HSA_AMD_MEMORY_FAULT_SRAMECC) |
3225 | Reasons.emplace_back(Args: "SRAM ECC failure (ie registers, no fault address)" ); |
3226 | if (ReasonsMask & HSA_AMD_MEMORY_FAULT_HANG) |
3227 | Reasons.emplace_back(Args: "GPU reset following unspecified hang" ); |
3228 | |
3229 | // If we do not know the reason, say so, otherwise remove the trailing comma |
3230 | // and space. |
3231 | if (Reasons.empty()) |
3232 | Reasons.emplace_back(Args: "Unknown (" + std::to_string(val: ReasonsMask) + ")" ); |
3233 | |
3234 | uint32_t Node = -1; |
3235 | hsa_agent_get_info(Event->memory_fault.agent, HSA_AGENT_INFO_NODE, &Node); |
3236 | |
3237 | // Abort the execution since we do not recover from this error. |
3238 | FATAL_MESSAGE(1, |
3239 | "Memory access fault by GPU %" PRIu32 " (agent 0x%" PRIx64 |
3240 | ") at virtual address %p. Reasons: %s" , |
3241 | Node, Event->memory_fault.agent.handle, |
3242 | (void *)Event->memory_fault.virtual_address, |
3243 | llvm::join(Reasons, ", " ).c_str()); |
3244 | |
3245 | return HSA_STATUS_ERROR; |
3246 | } |
3247 | |
3248 | /// Indicate whether the HSA runtime was correctly initialized. Even if there |
3249 | /// is no available devices this boolean will be true. It indicates whether |
3250 | /// we can safely call HSA functions (e.g., hsa_shut_down). |
3251 | bool Initialized; |
3252 | |
3253 | /// Arrays of the available GPU and CPU agents. These arrays of handles should |
3254 | /// not be here but in the AMDGPUDeviceTy structures directly. However, the |
3255 | /// HSA standard does not provide API functions to retirve agents directly, |
3256 | /// only iterating functions. We cache the agents here for convenience. |
3257 | llvm::SmallVector<hsa_agent_t> KernelAgents; |
3258 | |
3259 | /// The device representing all HSA host agents. |
3260 | AMDHostDeviceTy *HostDevice; |
3261 | }; |
3262 | |
3263 | Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, |
3264 | uint32_t NumThreads, uint64_t NumBlocks, |
3265 | KernelArgsTy &KernelArgs, void *Args, |
3266 | AsyncInfoWrapperTy &AsyncInfoWrapper) const { |
3267 | const uint32_t KernelArgsSize = KernelArgs.NumArgs * sizeof(void *); |
3268 | |
3269 | if (ArgsSize < KernelArgsSize) |
3270 | return Plugin::error("Mismatch of kernel arguments size" ); |
3271 | |
3272 | // The args size reported by HSA may or may not contain the implicit args. |
3273 | // For now, assume that HSA does not consider the implicit arguments when |
3274 | // reporting the arguments of a kernel. In the worst case, we can waste |
3275 | // 56 bytes per allocation. |
3276 | uint32_t AllArgsSize = KernelArgsSize + ImplicitArgsSize; |
3277 | |
3278 | AMDGPUPluginTy &AMDGPUPlugin = |
3279 | static_cast<AMDGPUPluginTy &>(GenericDevice.Plugin); |
3280 | AMDHostDeviceTy &HostDevice = AMDGPUPlugin.getHostDevice(); |
3281 | AMDGPUMemoryManagerTy &ArgsMemoryManager = HostDevice.getArgsMemoryManager(); |
3282 | |
3283 | void *AllArgs = nullptr; |
3284 | if (auto Err = ArgsMemoryManager.allocate(AllArgsSize, &AllArgs)) |
3285 | return Err; |
3286 | |
3287 | // Account for user requested dynamic shared memory. |
3288 | uint32_t GroupSize = getGroupSize(); |
3289 | if (uint32_t MaxDynCGroupMem = std::max( |
3290 | KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize())) { |
3291 | GroupSize += MaxDynCGroupMem; |
3292 | } |
3293 | |
3294 | uint64_t StackSize; |
3295 | if (auto Err = GenericDevice.getDeviceStackSize(StackSize)) |
3296 | return Err; |
3297 | |
3298 | // Initialize implicit arguments. |
3299 | utils::AMDGPUImplicitArgsTy *ImplArgs = |
3300 | reinterpret_cast<utils::AMDGPUImplicitArgsTy *>( |
3301 | advanceVoidPtr(AllArgs, KernelArgsSize)); |
3302 | |
3303 | // Initialize the implicit arguments to zero. |
3304 | std::memset(s: ImplArgs, c: 0, n: ImplicitArgsSize); |
3305 | |
3306 | // Copy the explicit arguments. |
3307 | // TODO: We should expose the args memory manager alloc to the common part as |
3308 | // alternative to copying them twice. |
3309 | if (KernelArgs.NumArgs) |
3310 | std::memcpy(dest: AllArgs, src: *static_cast<void **>(Args), |
3311 | n: sizeof(void *) * KernelArgs.NumArgs); |
3312 | |
3313 | AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(GenericDevice); |
3314 | |
3315 | AMDGPUStreamTy *Stream = nullptr; |
3316 | if (auto Err = AMDGPUDevice.getStream(AsyncInfoWrapper, Stream)) |
3317 | return Err; |
3318 | |
3319 | // If this kernel requires an RPC server we attach its pointer to the stream. |
3320 | if (GenericDevice.getRPCServer()) |
3321 | Stream->setRPCServer(GenericDevice.getRPCServer()); |
3322 | |
3323 | // Only COV5 implicitargs needs to be set. COV4 implicitargs are not used. |
3324 | if (getImplicitArgsSize() == sizeof(utils::AMDGPUImplicitArgsTy)) { |
3325 | ImplArgs->BlockCountX = NumBlocks; |
3326 | ImplArgs->BlockCountY = 1; |
3327 | ImplArgs->BlockCountZ = 1; |
3328 | ImplArgs->GroupSizeX = NumThreads; |
3329 | ImplArgs->GroupSizeY = 1; |
3330 | ImplArgs->GroupSizeZ = 1; |
3331 | ImplArgs->GridDims = 1; |
3332 | ImplArgs->DynamicLdsSize = KernelArgs.DynCGroupMem; |
3333 | } |
3334 | |
3335 | // Push the kernel launch into the stream. |
3336 | return Stream->pushKernelLaunch(Kernel: *this, KernelArgs: AllArgs, NumThreads, NumBlocks, |
3337 | GroupSize, StackSize, MemoryManager&: ArgsMemoryManager); |
3338 | } |
3339 | |
3340 | Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice, |
3341 | KernelArgsTy &KernelArgs, |
3342 | uint32_t NumThreads, |
3343 | uint64_t NumBlocks) const { |
3344 | // Only do all this when the output is requested |
3345 | if (!(getInfoLevel() & OMP_INFOTYPE_PLUGIN_KERNEL)) |
3346 | return Plugin::success(); |
3347 | |
3348 | // We don't have data to print additional info, but no hard error |
3349 | if (!KernelInfo.has_value()) |
3350 | return Plugin::success(); |
3351 | |
3352 | // General Info |
3353 | auto NumGroups = NumBlocks; |
3354 | auto ThreadsPerGroup = NumThreads; |
3355 | |
3356 | // Kernel Arguments Info |
3357 | auto ArgNum = KernelArgs.NumArgs; |
3358 | auto LoopTripCount = KernelArgs.Tripcount; |
3359 | |
3360 | // Details for AMDGPU kernels (read from image) |
3361 | // https://www.llvm.org/docs/AMDGPUUsage.html#code-object-v4-metadata |
3362 | auto GroupSegmentSize = (*KernelInfo).GroupSegmentList; |
3363 | auto SGPRCount = (*KernelInfo).SGPRCount; |
3364 | auto VGPRCount = (*KernelInfo).VGPRCount; |
3365 | auto SGPRSpillCount = (*KernelInfo).SGPRSpillCount; |
3366 | auto VGPRSpillCount = (*KernelInfo).VGPRSpillCount; |
3367 | auto MaxFlatWorkgroupSize = (*KernelInfo).MaxFlatWorkgroupSize; |
3368 | |
3369 | // Prints additional launch info that contains the following. |
3370 | // Num Args: The number of kernel arguments |
3371 | // Teams x Thrds: The number of teams and the number of threads actually |
3372 | // running. |
3373 | // MaxFlatWorkgroupSize: Maximum flat work-group size supported by the |
3374 | // kernel in work-items |
3375 | // LDS Usage: Amount of bytes used in LDS storage |
3376 | // S/VGPR Count: the number of S/V GPRs occupied by the kernel |
3377 | // S/VGPR Spill Count: how many S/VGPRs are spilled by the kernel |
3378 | // Tripcount: loop tripcount for the kernel |
3379 | INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(), |
3380 | "#Args: %d Teams x Thrds: %4lux%4u (MaxFlatWorkGroupSize: %u) LDS " |
3381 | "Usage: %uB #SGPRs/VGPRs: %u/%u #SGPR/VGPR Spills: %u/%u Tripcount: " |
3382 | "%lu\n" , |
3383 | ArgNum, NumGroups, ThreadsPerGroup, MaxFlatWorkgroupSize, |
3384 | GroupSegmentSize, SGPRCount, VGPRCount, SGPRSpillCount, VGPRSpillCount, |
3385 | LoopTripCount); |
3386 | |
3387 | return Plugin::success(); |
3388 | } |
3389 | |
/// Factory hook used by the generic plugin interface to instantiate the
/// AMDGPU plugin; the caller takes ownership of the returned object.
GenericPluginTy *PluginTy::createPlugin() { return new AMDGPUPluginTy(); }
3391 | |
/// Convert an HSA status code into an llvm::Error. Success and INFO_BREAK
/// (the sentinel that stops HSA iteration callbacks) both map to
/// Error::success(). On failure, the HSA description string is appended as
/// the final argument of the caller-provided format string (its "%s").
template <typename... ArgsTy>
static Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) {
  hsa_status_t ResultCode = static_cast<hsa_status_t>(Code);
  if (ResultCode == HSA_STATUS_SUCCESS || ResultCode == HSA_STATUS_INFO_BREAK)
    return Error::success();

  // Ask HSA for a human-readable description; keep the fallback text if the
  // code is unknown to the runtime.
  const char *Desc = "Unknown error" ;
  hsa_status_t Ret = hsa_status_string(ResultCode, &Desc);
  if (Ret != HSA_STATUS_SUCCESS)
    REPORT("Unrecognized " GETNAME(TARGET_NAME) " error code %d\n" , Code);

  // Explicit template arguments append Desc after the caller's args.
  return createStringError<ArgsTy..., const char *>(inconvertibleErrorCode(),
                                                    ErrFmt, Args..., Desc);
}
3406 | |
3407 | void *AMDGPUMemoryManagerTy::allocate(size_t Size, void *HstPtr, |
3408 | TargetAllocTy Kind) { |
3409 | // Allocate memory from the pool. |
3410 | void *Ptr = nullptr; |
3411 | if (auto Err = MemoryPool->allocate(Size, PtrStorage: &Ptr)) { |
3412 | consumeError(Err: std::move(Err)); |
3413 | return nullptr; |
3414 | } |
3415 | assert(Ptr && "Invalid pointer" ); |
3416 | |
3417 | auto &KernelAgents = Plugin.getKernelAgents(); |
3418 | |
3419 | // Allow all kernel agents to access the allocation. |
3420 | if (auto Err = MemoryPool->enableAccess(Ptr, Size, KernelAgents)) { |
3421 | REPORT("%s\n" , toString(std::move(Err)).data()); |
3422 | return nullptr; |
3423 | } |
3424 | return Ptr; |
3425 | } |
3426 | |
3427 | void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) { |
3428 | if (Size == 0) |
3429 | return nullptr; |
3430 | |
3431 | // Find the correct memory pool. |
3432 | AMDGPUMemoryPoolTy *MemoryPool = nullptr; |
3433 | switch (Kind) { |
3434 | case TARGET_ALLOC_DEFAULT: |
3435 | case TARGET_ALLOC_DEVICE: |
3436 | case TARGET_ALLOC_DEVICE_NON_BLOCKING: |
3437 | MemoryPool = CoarseGrainedMemoryPools[0]; |
3438 | break; |
3439 | case TARGET_ALLOC_HOST: |
3440 | MemoryPool = &HostDevice.getFineGrainedMemoryPool(); |
3441 | break; |
3442 | case TARGET_ALLOC_SHARED: |
3443 | MemoryPool = &HostDevice.getFineGrainedMemoryPool(); |
3444 | break; |
3445 | } |
3446 | |
3447 | if (!MemoryPool) { |
3448 | REPORT("No memory pool for the specified allocation kind\n" ); |
3449 | return nullptr; |
3450 | } |
3451 | |
3452 | // Allocate from the corresponding memory pool. |
3453 | void *Alloc = nullptr; |
3454 | if (Error Err = MemoryPool->allocate(Size, PtrStorage: &Alloc)) { |
3455 | REPORT("%s\n" , toString(E: std::move(Err)).data()); |
3456 | return nullptr; |
3457 | } |
3458 | |
3459 | if (Alloc) { |
3460 | auto &KernelAgents = |
3461 | static_cast<AMDGPUPluginTy &>(Plugin).getKernelAgents(); |
3462 | // Inherently necessary for host or shared allocations |
3463 | // Also enabled for device memory to allow device to device memcpy |
3464 | |
3465 | // Enable all kernel agents to access the buffer. |
3466 | if (auto Err = MemoryPool->enableAccess(Alloc, Size, KernelAgents)) { |
3467 | REPORT("%s\n" , toString(std::move(Err)).data()); |
3468 | return nullptr; |
3469 | } |
3470 | } |
3471 | |
3472 | return Alloc; |
3473 | } |
3474 | |
3475 | } // namespace plugin |
3476 | } // namespace target |
3477 | } // namespace omp |
3478 | } // namespace llvm |
3479 | |