1 | //===----RTLs/amdgpu/src/rtl.cpp - Target RTLs Implementation ----- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // RTL NextGen for AMDGPU machine |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #include <atomic> |
14 | #include <cassert> |
15 | #include <cstddef> |
16 | #include <cstdint> |
17 | #include <deque> |
18 | #include <functional> |
19 | #include <mutex> |
20 | #include <string> |
21 | #include <system_error> |
22 | #include <unistd.h> |
23 | #include <unordered_map> |
24 | |
25 | #include "ErrorReporting.h" |
26 | #include "Shared/APITypes.h" |
27 | #include "Shared/Debug.h" |
28 | #include "Shared/Environment.h" |
29 | #include "Shared/RefCnt.h" |
30 | #include "Shared/Utils.h" |
31 | #include "Utils/ELF.h" |
32 | |
33 | #include "GlobalHandler.h" |
34 | #include "OpenMP/OMPT/Callback.h" |
35 | #include "PluginInterface.h" |
36 | #include "UtilitiesRTL.h" |
37 | #include "omptarget.h" |
38 | |
39 | #include "llvm/ADT/SmallString.h" |
40 | #include "llvm/ADT/SmallVector.h" |
41 | #include "llvm/ADT/StringRef.h" |
42 | #include "llvm/BinaryFormat/ELF.h" |
43 | #include "llvm/Frontend/OpenMP/OMPConstants.h" |
44 | #include "llvm/Frontend/OpenMP/OMPGridValues.h" |
45 | #include "llvm/Support/Error.h" |
46 | #include "llvm/Support/FileOutputBuffer.h" |
47 | #include "llvm/Support/FileSystem.h" |
48 | #include "llvm/Support/MemoryBuffer.h" |
49 | #include "llvm/Support/Program.h" |
50 | #include "llvm/Support/Signals.h" |
51 | #include "llvm/Support/raw_ostream.h" |
52 | |
53 | #if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || \ |
54 | !defined(__ORDER_BIG_ENDIAN__) |
55 | #error "Missing preprocessor definitions for endianness detection." |
56 | #endif |
57 | |
58 | // The HSA headers require these definitions. |
59 | #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) |
60 | #define LITTLEENDIAN_CPU |
61 | #elif defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) |
62 | #define BIGENDIAN_CPU |
63 | #endif |
64 | |
65 | #if defined(__has_include) |
66 | #if __has_include("hsa.h") |
67 | #include "hsa.h" |
68 | #include "hsa_ext_amd.h" |
69 | #elif __has_include("hsa/hsa.h") |
70 | #include "hsa/hsa.h" |
71 | #include "hsa/hsa_ext_amd.h" |
72 | #endif |
73 | #else |
74 | #include "hsa/hsa.h" |
75 | #include "hsa/hsa_ext_amd.h" |
76 | #endif |
77 | |
78 | using namespace error; |
79 | |
80 | namespace llvm { |
81 | namespace omp { |
82 | namespace target { |
83 | namespace plugin { |
84 | |
85 | /// Forward declarations for all specialized data structures. |
86 | struct AMDGPUKernelTy; |
87 | struct AMDGPUDeviceTy; |
88 | struct AMDGPUPluginTy; |
89 | struct AMDGPUStreamTy; |
90 | struct AMDGPUEventTy; |
91 | struct AMDGPUStreamManagerTy; |
92 | struct AMDGPUEventManagerTy; |
93 | struct AMDGPUDeviceImageTy; |
94 | struct AMDGPUMemoryManagerTy; |
95 | struct AMDGPUMemoryPoolTy; |
96 | |
97 | namespace hsa_utils { |
98 | |
99 | /// Iterate elements using an HSA iterate function. Do not use this function |
100 | /// directly but the specialized ones below instead. |
101 | template <typename ElemTy, typename IterFuncTy, typename CallbackTy> |
102 | hsa_status_t iterate(IterFuncTy Func, CallbackTy Cb) { |
103 | auto L = [](ElemTy Elem, void *Data) -> hsa_status_t { |
104 | CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data); |
105 | return (*Unwrapped)(Elem); |
106 | }; |
107 | return Func(L, static_cast<void *>(&Cb)); |
108 | } |
109 | |
110 | /// Iterate elements using an HSA iterate function passing a parameter. Do not |
111 | /// use this function directly but the specialized ones below instead. |
112 | template <typename ElemTy, typename IterFuncTy, typename IterFuncArgTy, |
113 | typename CallbackTy> |
114 | hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) { |
115 | auto L = [](ElemTy Elem, void *Data) -> hsa_status_t { |
116 | CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data); |
117 | return (*Unwrapped)(Elem); |
118 | }; |
119 | return Func(FuncArg, L, static_cast<void *>(&Cb)); |
120 | } |
121 | |
122 | /// Iterate elements using an HSA iterate function passing a parameter. Do not |
123 | /// use this function directly but the specialized ones below instead. |
124 | template <typename Elem1Ty, typename Elem2Ty, typename IterFuncTy, |
125 | typename IterFuncArgTy, typename CallbackTy> |
126 | hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) { |
127 | auto L = [](Elem1Ty Elem1, Elem2Ty Elem2, void *Data) -> hsa_status_t { |
128 | CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data); |
129 | return (*Unwrapped)(Elem1, Elem2); |
130 | }; |
131 | return Func(FuncArg, L, static_cast<void *>(&Cb)); |
132 | } |
133 | |
134 | /// Iterate agents. |
135 | template <typename CallbackTy> Error iterateAgents(CallbackTy Callback) { |
136 | hsa_status_t Status = iterate<hsa_agent_t>(hsa_iterate_agents, Callback); |
137 | return Plugin::check(Status, "error in hsa_iterate_agents: %s" ); |
138 | } |
139 | |
140 | /// Iterate ISAs of an agent. |
141 | template <typename CallbackTy> |
142 | Error iterateAgentISAs(hsa_agent_t Agent, CallbackTy Cb) { |
143 | hsa_status_t Status = iterate<hsa_isa_t>(hsa_agent_iterate_isas, Agent, Cb); |
144 | return Plugin::check(Status, "error in hsa_agent_iterate_isas: %s" ); |
145 | } |
146 | |
147 | /// Iterate memory pools of an agent. |
148 | template <typename CallbackTy> |
149 | Error iterateAgentMemoryPools(hsa_agent_t Agent, CallbackTy Cb) { |
150 | hsa_status_t Status = iterate<hsa_amd_memory_pool_t>( |
151 | hsa_amd_agent_iterate_memory_pools, Agent, Cb); |
152 | return Plugin::check(Status, |
153 | "error in hsa_amd_agent_iterate_memory_pools: %s" ); |
154 | } |
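
// Illustrative sketch (not part of this plugin's API): the iterate helpers
// above accept any callable taking the element type and returning an
// hsa_status_t. For example, collecting all agents inside a function that
// returns llvm::Error could look like:
//
//   llvm::SmallVector<hsa_agent_t> AllAgents;
//   if (auto Err = hsa_utils::iterateAgents([&](hsa_agent_t Agent) {
//         AllAgents.push_back(Agent);
//         return HSA_STATUS_SUCCESS;
//       }))
//     return Err;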
155 | |
156 | /// Dispatches an asynchronous memory copy. |
157 | /// Enables different SDMA engines for the dispatch in a round-robin fashion. |
158 | Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent, |
159 | const void *Src, hsa_agent_t SrcAgent, size_t Size, |
160 | uint32_t NumDepSignals, const hsa_signal_t *DepSignals, |
161 | hsa_signal_t CompletionSignal) { |
162 | if (!UseMultipleSdmaEngines) { |
163 | hsa_status_t S = |
164 | hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, Size, |
165 | NumDepSignals, DepSignals, CompletionSignal); |
166 | return Plugin::check(S, "error in hsa_amd_memory_async_copy: %s" ); |
167 | } |
168 | |
169 | // This solution is probably not the best |
170 | #if !(HSA_AMD_INTERFACE_VERSION_MAJOR >= 1 && \ |
171 | HSA_AMD_INTERFACE_VERSION_MINOR >= 2) |
172 | return Plugin::error(ErrorCode::UNSUPPORTED, |
173 | "async copy on selected SDMA requires ROCm 5.7" ); |
174 | #else |
175 | static std::atomic<int> SdmaEngine{1}; |
176 | |
177 | // This atomics solution is probably not the best, but should be sufficient |
178 | // for now. |
  // In a worst-case scenario, in which threads read the same value, they will
  // dispatch to the same SDMA engine. This may result in sub-optimal
  // performance, but the probability of that happening is fairly low.
182 | int LocalSdmaEngine = SdmaEngine.load(std::memory_order_acquire); |
  // This call is only available in ROCm >= 5.7.
184 | hsa_status_t S = hsa_amd_memory_async_copy_on_engine( |
185 | Dst, DstAgent, Src, SrcAgent, Size, NumDepSignals, DepSignals, |
186 | CompletionSignal, (hsa_amd_sdma_engine_id_t)LocalSdmaEngine, |
187 | /*force_copy_on_sdma=*/true); |
188 | // Increment to use one of two SDMA engines: 0x1, 0x2 |
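  // The update below cycles the engine id between 1 and 2, since
  // (1 << 1) % 3 == 2 and (2 << 1) % 3 == 1.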
189 | LocalSdmaEngine = (LocalSdmaEngine << 1) % 3; |
190 | SdmaEngine.store(LocalSdmaEngine, std::memory_order_relaxed); |
191 | |
192 | return Plugin::check(S, "error in hsa_amd_memory_async_copy_on_engine: %s" ); |
193 | #endif |
194 | } |
195 | |
196 | Error getTargetTripleAndFeatures(hsa_agent_t Agent, |
197 | SmallVector<SmallString<32>> &Targets) { |
198 | auto Err = hsa_utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) { |
199 | uint32_t Length; |
200 | hsa_status_t Status; |
201 | Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME_LENGTH, &Length); |
202 | if (Status != HSA_STATUS_SUCCESS) |
203 | return Status; |
204 | |
205 | llvm::SmallVector<char> ISAName(Length); |
206 | Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, ISAName.begin()); |
207 | if (Status != HSA_STATUS_SUCCESS) |
208 | return Status; |
209 | |
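    // ISA names are expected to look like "amdgcn-amd-amdhsa--gfx90a",
    // possibly followed by target feature suffixes; keep only the part after
    // the triple.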
210 | llvm::StringRef TripleTarget(ISAName.begin(), Length); |
    if (TripleTarget.consume_front("amdgcn-amd-amdhsa")) {
      auto Target = TripleTarget.ltrim('-').rtrim('\0');
213 | Targets.push_back(Target); |
214 | } |
215 | return HSA_STATUS_SUCCESS; |
216 | }); |
217 | return Err; |
218 | } |
219 | } // namespace hsa_utils |
220 | |
221 | /// Utility class representing generic resource references to AMDGPU resources. |
222 | template <typename ResourceTy> |
223 | struct AMDGPUResourceRef : public GenericDeviceResourceRef { |
224 | /// The underlying handle type for resources. |
225 | using HandleTy = ResourceTy *; |
226 | |
227 | /// Create an empty reference to an invalid resource. |
228 | AMDGPUResourceRef() : Resource(nullptr) {} |
229 | |
230 | /// Create a reference to an existing resource. |
231 | AMDGPUResourceRef(HandleTy Resource) : Resource(Resource) {} |
232 | |
233 | virtual ~AMDGPUResourceRef() {} |
234 | |
  /// Create a new resource and save the reference. The reference must be empty
  /// before calling this function.
237 | Error create(GenericDeviceTy &Device) override; |
238 | |
  /// Destroy the referenced resource and invalidate the reference. The
  /// reference must be to a valid resource before calling this function.
241 | Error destroy(GenericDeviceTy &Device) override { |
242 | if (!Resource) |
243 | return Plugin::error(ErrorCode::INVALID_ARGUMENT, |
244 | "destroying an invalid resource" ); |
245 | |
246 | if (auto Err = Resource->deinit()) |
247 | return Err; |
248 | |
249 | delete Resource; |
250 | |
251 | Resource = nullptr; |
252 | return Plugin::success(); |
253 | } |
254 | |
255 | /// Get the underlying resource handle. |
256 | operator HandleTy() const { return Resource; } |
257 | |
258 | private: |
259 | /// The handle to the actual resource. |
260 | HandleTy Resource; |
261 | }; |
262 | |
263 | /// Class holding an HSA memory pool. |
264 | struct AMDGPUMemoryPoolTy { |
265 | /// Create a memory pool from an HSA memory pool. |
266 | AMDGPUMemoryPoolTy(hsa_amd_memory_pool_t MemoryPool) |
267 | : MemoryPool(MemoryPool), GlobalFlags(0) {} |
268 | |
269 | /// Initialize the memory pool retrieving its properties. |
270 | Error init() { |
271 | if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_SEGMENT, Segment)) |
272 | return Err; |
273 | |
274 | if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, GlobalFlags)) |
275 | return Err; |
276 | |
277 | return Plugin::success(); |
278 | } |
279 | |
280 | /// Getter of the HSA memory pool. |
281 | hsa_amd_memory_pool_t get() const { return MemoryPool; } |
282 | |
  /// Indicate the segment to which the memory pool belongs.
284 | bool isGlobal() const { return (Segment == HSA_AMD_SEGMENT_GLOBAL); } |
285 | bool isReadOnly() const { return (Segment == HSA_AMD_SEGMENT_READONLY); } |
286 | bool isPrivate() const { return (Segment == HSA_AMD_SEGMENT_PRIVATE); } |
287 | bool isGroup() const { return (Segment == HSA_AMD_SEGMENT_GROUP); } |
288 | |
289 | /// Indicate if it is fine-grained memory. Valid only for global. |
290 | bool isFineGrained() const { |
291 | assert(isGlobal() && "Not global memory" ); |
292 | return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED); |
293 | } |
294 | |
295 | /// Indicate if it is coarse-grained memory. Valid only for global. |
296 | bool isCoarseGrained() const { |
297 | assert(isGlobal() && "Not global memory" ); |
298 | return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED); |
299 | } |
300 | |
301 | /// Indicate if it supports storing kernel arguments. Valid only for global. |
302 | bool supportsKernelArgs() const { |
303 | assert(isGlobal() && "Not global memory" ); |
304 | return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT); |
305 | } |
306 | |
307 | /// Allocate memory on the memory pool. |
308 | Error allocate(size_t Size, void **PtrStorage) { |
309 | hsa_status_t Status = |
310 | hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, PtrStorage); |
311 | return Plugin::check(Status, "error in hsa_amd_memory_pool_allocate: %s" ); |
312 | } |
313 | |
314 | /// Return memory to the memory pool. |
315 | Error deallocate(void *Ptr) { |
316 | hsa_status_t Status = hsa_amd_memory_pool_free(Ptr); |
317 | return Plugin::check(Status, "error in hsa_amd_memory_pool_free: %s" ); |
318 | } |
319 | |
320 | /// Returns if the \p Agent can access the memory pool. |
321 | bool canAccess(hsa_agent_t Agent) { |
322 | hsa_amd_memory_pool_access_t Access; |
323 | if (hsa_amd_agent_memory_pool_get_info( |
324 | Agent, MemoryPool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &Access)) |
325 | return false; |
326 | return Access != HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED; |
327 | } |
328 | |
329 | /// Allow the device to access a specific allocation. |
330 | Error enableAccess(void *Ptr, int64_t Size, |
331 | const llvm::SmallVector<hsa_agent_t> &Agents) const { |
332 | #ifdef OMPTARGET_DEBUG |
333 | for (hsa_agent_t Agent : Agents) { |
334 | hsa_amd_memory_pool_access_t Access; |
335 | if (auto Err = |
336 | getAttr(Agent, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, Access)) |
337 | return Err; |
338 | |
      // The agent is not allowed to access the memory pool in any case. Do not
      // continue because otherwise it results in undefined behavior.
341 | if (Access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) |
342 | return Plugin::error(ErrorCode::INVALID_VALUE, |
343 | "an agent is not allowed to access a memory pool" ); |
344 | } |
345 | #endif |
346 | |
    // The agents can access the memory pool, but the access is disabled by
    // default. Enable it now.
348 | hsa_status_t Status = |
349 | hsa_amd_agents_allow_access(Agents.size(), Agents.data(), nullptr, Ptr); |
350 | return Plugin::check(Status, "error in hsa_amd_agents_allow_access: %s" ); |
351 | } |
352 | |
353 | /// Get attribute from the memory pool. |
354 | template <typename Ty> |
355 | Error getAttr(hsa_amd_memory_pool_info_t Kind, Ty &Value) const { |
356 | hsa_status_t Status; |
357 | Status = hsa_amd_memory_pool_get_info(MemoryPool, Kind, &Value); |
358 | return Plugin::check(Status, "error in hsa_amd_memory_pool_get_info: %s" ); |
359 | } |
360 | |
361 | template <typename Ty> |
362 | hsa_status_t getAttrRaw(hsa_amd_memory_pool_info_t Kind, Ty &Value) const { |
363 | return hsa_amd_memory_pool_get_info(MemoryPool, Kind, &Value); |
364 | } |
365 | |
366 | /// Get attribute from the memory pool relating to an agent. |
367 | template <typename Ty> |
368 | Error getAttr(hsa_agent_t Agent, hsa_amd_agent_memory_pool_info_t Kind, |
369 | Ty &Value) const { |
370 | hsa_status_t Status; |
371 | Status = |
372 | hsa_amd_agent_memory_pool_get_info(Agent, MemoryPool, Kind, &Value); |
373 | return Plugin::check(Status, |
374 | "error in hsa_amd_agent_memory_pool_get_info: %s" ); |
375 | } |
376 | |
377 | private: |
378 | /// The HSA memory pool. |
379 | hsa_amd_memory_pool_t MemoryPool; |
380 | |
381 | /// The segment where the memory pool belongs to. |
382 | hsa_amd_segment_t Segment; |
383 | |
384 | /// The global flags of memory pool. Only valid if the memory pool belongs to |
385 | /// the global segment. |
386 | uint32_t GlobalFlags; |
387 | }; |
388 | |
389 | /// Class that implements a memory manager that gets memory from a specific |
390 | /// memory pool. |
391 | struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy { |
392 | |
393 | /// Create an empty memory manager. |
394 | AMDGPUMemoryManagerTy(AMDGPUPluginTy &Plugin) |
395 | : Plugin(Plugin), MemoryPool(nullptr), MemoryManager(nullptr) {} |
396 | |
397 | /// Initialize the memory manager from a memory pool. |
398 | Error init(AMDGPUMemoryPoolTy &MemoryPool) { |
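    // The underlying memory manager is created with a 1 GiB (1 << 30 bytes)
    // size threshold.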
399 | const uint32_t Threshold = 1 << 30; |
400 | this->MemoryManager = new MemoryManagerTy(*this, Threshold); |
401 | this->MemoryPool = &MemoryPool; |
402 | return Plugin::success(); |
403 | } |
404 | |
405 | /// Deinitialize the memory manager and free its allocations. |
406 | Error deinit() { |
407 | assert(MemoryManager && "Invalid memory manager" ); |
408 | |
409 | // Delete and invalidate the memory manager. At this point, the memory |
410 | // manager will deallocate all its allocations. |
411 | delete MemoryManager; |
412 | MemoryManager = nullptr; |
413 | |
414 | return Plugin::success(); |
415 | } |
416 | |
417 | /// Reuse or allocate memory through the memory manager. |
418 | Error allocate(size_t Size, void **PtrStorage) { |
419 | assert(MemoryManager && "Invalid memory manager" ); |
420 | assert(PtrStorage && "Invalid pointer storage" ); |
421 | |
422 | *PtrStorage = MemoryManager->allocate(Size, nullptr); |
423 | if (Size && *PtrStorage == nullptr) |
424 | return Plugin::error(ErrorCode::OUT_OF_RESOURCES, |
425 | "failure to allocate from AMDGPU memory manager" ); |
426 | |
427 | return Plugin::success(); |
428 | } |
429 | |
430 | /// Release an allocation to be reused. |
431 | Error deallocate(void *Ptr) { |
432 | if (MemoryManager->free(Ptr)) |
433 | return Plugin::error(ErrorCode::UNKNOWN, |
434 | "failure to deallocate from AMDGPU memory manager" ); |
435 | |
436 | return Plugin::success(); |
437 | } |
438 | |
439 | private: |
440 | /// Allocation callback that will be called once the memory manager does not |
441 | /// have more previously allocated buffers. |
442 | void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) override; |
443 | |
444 | /// Deallocation callback that will be called by the memory manager. |
445 | int free(void *TgtPtr, TargetAllocTy Kind) override { |
    if (auto Err = MemoryPool->deallocate(TgtPtr)) {
      consumeError(std::move(Err));
448 | return OFFLOAD_FAIL; |
449 | } |
450 | return OFFLOAD_SUCCESS; |
451 | } |
452 | |
453 | /// The underlying plugin that owns this memory manager. |
454 | AMDGPUPluginTy &Plugin; |
455 | |
456 | /// The memory pool used to allocate memory. |
457 | AMDGPUMemoryPoolTy *MemoryPool; |
458 | |
459 | /// Reference to the actual memory manager. |
460 | MemoryManagerTy *MemoryManager; |
461 | }; |
462 | |
463 | /// Class implementing the AMDGPU device images' properties. |
464 | struct AMDGPUDeviceImageTy : public DeviceImageTy { |
465 | /// Create the AMDGPU image with the id and the target image pointer. |
466 | AMDGPUDeviceImageTy(int32_t ImageId, GenericDeviceTy &Device, |
467 | const __tgt_device_image *TgtImage) |
468 | : DeviceImageTy(ImageId, Device, TgtImage) {} |
469 | |
470 | /// Prepare and load the executable corresponding to the image. |
471 | Error loadExecutable(const AMDGPUDeviceTy &Device); |
472 | |
473 | /// Unload the executable. |
474 | Error unloadExecutable() { |
475 | hsa_status_t Status = hsa_executable_destroy(Executable); |
476 | return Plugin::check(Status, "error in hsa_executable_destroy: %s" ); |
477 | } |
478 | |
479 | /// Get the executable. |
480 | hsa_executable_t getExecutable() const { return Executable; } |
481 | |
  /// Get the code object version of the ELF.
483 | uint16_t getELFABIVersion() const { return ELFABIVersion; } |
484 | |
485 | /// Find an HSA device symbol by its name on the executable. |
486 | Expected<hsa_executable_symbol_t> |
487 | findDeviceSymbol(GenericDeviceTy &Device, StringRef SymbolName) const; |
488 | |
489 | /// Get additional info for kernel, e.g., register spill counts |
490 | std::optional<offloading::amdgpu::AMDGPUKernelMetaData> |
491 | getKernelInfo(StringRef Identifier) const { |
492 | auto It = KernelInfoMap.find(Identifier); |
493 | |
494 | if (It == KernelInfoMap.end()) |
495 | return {}; |
496 | |
497 | return It->second; |
498 | } |
499 | |
500 | private: |
501 | /// The executable loaded on the agent. |
502 | hsa_executable_t Executable; |
503 | StringMap<offloading::amdgpu::AMDGPUKernelMetaData> KernelInfoMap; |
504 | uint16_t ELFABIVersion; |
505 | }; |
506 | |
507 | /// Class implementing the AMDGPU kernel functionalities which derives from the |
508 | /// generic kernel class. |
509 | struct AMDGPUKernelTy : public GenericKernelTy { |
510 | /// Create an AMDGPU kernel with a name and an execution mode. |
511 | AMDGPUKernelTy(const char *Name) : GenericKernelTy(Name) {} |
512 | |
513 | /// Initialize the AMDGPU kernel. |
514 | Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override { |
515 | AMDGPUDeviceImageTy &AMDImage = static_cast<AMDGPUDeviceImageTy &>(Image); |
516 | |
517 | // Kernel symbols have a ".kd" suffix. |
518 | std::string KernelName(getName()); |
    KernelName += ".kd";
520 | |
521 | // Find the symbol on the device executable. |
522 | auto SymbolOrErr = AMDImage.findDeviceSymbol(Device, KernelName); |
523 | if (!SymbolOrErr) |
524 | return SymbolOrErr.takeError(); |
525 | |
526 | hsa_executable_symbol_t Symbol = *SymbolOrErr; |
527 | hsa_symbol_kind_t SymbolType; |
528 | hsa_status_t Status; |
529 | |
530 | // Retrieve different properties of the kernel symbol. |
531 | std::pair<hsa_executable_symbol_info_t, void *> RequiredInfos[] = { |
532 | {HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &SymbolType}, |
533 | {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &KernelObject}, |
534 | {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &ArgsSize}, |
535 | {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &GroupSize}, |
536 | {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &DynamicStack}, |
537 | {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &PrivateSize}}; |
538 | |
539 | for (auto &Info : RequiredInfos) { |
540 | Status = hsa_executable_symbol_get_info(Symbol, Info.first, Info.second); |
541 | if (auto Err = Plugin::check( |
542 | Status, "error in hsa_executable_symbol_get_info: %s" )) |
543 | return Err; |
544 | } |
545 | |
546 | // Make sure it is a kernel symbol. |
547 | if (SymbolType != HSA_SYMBOL_KIND_KERNEL) |
548 | return Plugin::error(ErrorCode::INVALID_BINARY, |
549 | "symbol %s is not a kernel function" ); |
550 | |
551 | // TODO: Read the kernel descriptor for the max threads per block. May be |
552 | // read from the image. |
553 | |
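    // Implicit (hidden) kernel arguments are appended after the explicit ones;
    // their size depends on the code object ABI version of the image.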
554 | ImplicitArgsSize = |
555 | hsa_utils::getImplicitArgsSize(AMDImage.getELFABIVersion()); |
556 | DP("ELFABIVersion: %d\n" , AMDImage.getELFABIVersion()); |
557 | |
558 | // Get additional kernel info read from image |
559 | KernelInfo = AMDImage.getKernelInfo(getName()); |
560 | if (!KernelInfo.has_value()) |
561 | INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device.getDeviceId(), |
562 | "Could not read extra information for kernel %s." , getName()); |
563 | |
564 | return Plugin::success(); |
565 | } |
566 | |
567 | /// Launch the AMDGPU kernel function. |
568 | Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3], |
569 | uint32_t NumBlocks[3], KernelArgsTy &KernelArgs, |
570 | KernelLaunchParamsTy LaunchParams, |
571 | AsyncInfoWrapperTy &AsyncInfoWrapper) const override; |
572 | |
573 | /// Print more elaborate kernel launch info for AMDGPU |
574 | Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice, |
575 | KernelArgsTy &KernelArgs, uint32_t NumThreads[3], |
576 | uint32_t NumBlocks[3]) const override; |
577 | |
578 | /// Get group and private segment kernel size. |
579 | uint32_t getGroupSize() const { return GroupSize; } |
580 | uint32_t getPrivateSize() const { return PrivateSize; } |
581 | |
582 | /// Get the HSA kernel object representing the kernel function. |
583 | uint64_t getKernelObject() const { return KernelObject; } |
584 | |
  /// Get the size of the implicit arguments based on the code object version.
586 | uint32_t getImplicitArgsSize() const { return ImplicitArgsSize; } |
587 | |
588 | /// Indicates whether or not we need to set up our own private segment size. |
589 | bool usesDynamicStack() const { return DynamicStack; } |
590 | |
591 | private: |
592 | /// The kernel object to execute. |
593 | uint64_t KernelObject; |
594 | |
595 | /// The args, group and private segments sizes required by a kernel instance. |
596 | uint32_t ArgsSize; |
597 | uint32_t GroupSize; |
598 | uint32_t PrivateSize; |
599 | bool DynamicStack; |
600 | |
601 | /// The size of implicit kernel arguments. |
602 | uint32_t ImplicitArgsSize; |
603 | |
604 | /// Additional Info for the AMD GPU Kernel |
605 | std::optional<offloading::amdgpu::AMDGPUKernelMetaData> KernelInfo; |
606 | }; |
607 | |
608 | /// Class representing an HSA signal. Signals are used to define dependencies |
609 | /// between asynchronous operations: kernel launches and memory transfers. |
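/// By convention here, a signal starts at 1 and is decremented to 0 once the
/// associated operation completes; wait() returns when the value reaches 0.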
610 | struct AMDGPUSignalTy { |
611 | /// Create an empty signal. |
612 | AMDGPUSignalTy() : HSASignal({0}), UseCount() {} |
613 | AMDGPUSignalTy(AMDGPUDeviceTy &Device) : HSASignal({0}), UseCount() {} |
614 | |
615 | /// Initialize the signal with an initial value. |
616 | Error init(uint32_t InitialValue = 1) { |
617 | hsa_status_t Status = |
618 | hsa_amd_signal_create(InitialValue, 0, nullptr, 0, &HSASignal); |
619 | return Plugin::check(Status, "error in hsa_signal_create: %s" ); |
620 | } |
621 | |
622 | /// Deinitialize the signal. |
623 | Error deinit() { |
624 | hsa_status_t Status = hsa_signal_destroy(HSASignal); |
625 | return Plugin::check(Status, "error in hsa_signal_destroy: %s" ); |
626 | } |
627 | |
628 | /// Wait until the signal gets a zero value. |
629 | Error wait(const uint64_t ActiveTimeout = 0, |
630 | GenericDeviceTy *Device = nullptr) const { |
631 | if (ActiveTimeout) { |
632 | hsa_signal_value_t Got = 1; |
633 | Got = hsa_signal_wait_scacquire(HSASignal, HSA_SIGNAL_CONDITION_EQ, 0, |
634 | ActiveTimeout, HSA_WAIT_STATE_ACTIVE); |
635 | if (Got == 0) |
636 | return Plugin::success(); |
637 | } |
638 | |
639 | // If there is an RPC device attached to this stream we run it as a server. |
640 | uint64_t Timeout = UINT64_MAX; |
641 | auto WaitState = HSA_WAIT_STATE_BLOCKED; |
642 | while (hsa_signal_wait_scacquire(HSASignal, HSA_SIGNAL_CONDITION_EQ, 0, |
643 | Timeout, WaitState) != 0) |
644 | ; |
645 | return Plugin::success(); |
646 | } |
647 | |
648 | /// Load the value on the signal. |
649 | hsa_signal_value_t load() const { |
650 | return hsa_signal_load_scacquire(HSASignal); |
651 | } |
652 | |
  /// Decrement the signal by one.
654 | void signal() { |
655 | assert(load() > 0 && "Invalid signal value" ); |
656 | hsa_signal_subtract_screlease(HSASignal, 1); |
657 | } |
658 | |
659 | /// Reset the signal value before reusing the signal. Do not call this |
660 | /// function if the signal is being currently used by any watcher, such as a |
661 | /// plugin thread or the HSA runtime. |
662 | void reset() { hsa_signal_store_screlease(HSASignal, 1); } |
663 | |
664 | /// Increase the number of concurrent uses. |
665 | void increaseUseCount() { UseCount.increase(); } |
666 | |
  /// Decrease the number of concurrent uses and return whether it was the
  /// last use.
668 | bool decreaseUseCount() { return UseCount.decrease(); } |
669 | |
670 | hsa_signal_t get() const { return HSASignal; } |
671 | |
672 | private: |
673 | /// The underlying HSA signal. |
674 | hsa_signal_t HSASignal; |
675 | |
676 | /// Reference counter for tracking the concurrent use count. This is mainly |
677 | /// used for knowing how many streams are using the signal. |
678 | RefCountTy<> UseCount; |
679 | }; |
680 | |
681 | /// Classes for holding AMDGPU signals and managing signals. |
682 | using AMDGPUSignalRef = AMDGPUResourceRef<AMDGPUSignalTy>; |
683 | using AMDGPUSignalManagerTy = GenericDeviceResourceManagerTy<AMDGPUSignalRef>; |
684 | |
685 | /// Class holding an HSA queue to submit kernel and barrier packets. |
686 | struct AMDGPUQueueTy { |
687 | /// Create an empty queue. |
688 | AMDGPUQueueTy() : Queue(nullptr), Mutex(), NumUsers(0) {} |
689 | |
690 | /// Lazily initialize a new queue belonging to a specific agent. |
691 | Error init(GenericDeviceTy &Device, hsa_agent_t Agent, int32_t QueueSize) { |
692 | if (Queue) |
693 | return Plugin::success(); |
694 | hsa_status_t Status = |
695 | hsa_queue_create(Agent, QueueSize, HSA_QUEUE_TYPE_MULTI, callbackError, |
696 | &Device, UINT32_MAX, UINT32_MAX, &Queue); |
697 | return Plugin::check(Status, "error in hsa_queue_create: %s" ); |
698 | } |
699 | |
700 | /// Deinitialize the queue and destroy its resources. |
701 | Error deinit() { |
702 | std::lock_guard<std::mutex> Lock(Mutex); |
703 | if (!Queue) |
704 | return Plugin::success(); |
705 | hsa_status_t Status = hsa_queue_destroy(Queue); |
706 | return Plugin::check(Status, "error in hsa_queue_destroy: %s" ); |
707 | } |
708 | |
  /// Returns the number of streams this queue is currently assigned to.
710 | bool getUserCount() const { return NumUsers; } |
711 | |
712 | /// Returns if the underlying HSA queue is initialized. |
713 | bool isInitialized() { return Queue != nullptr; } |
714 | |
715 | /// Decrement user count of the queue object. |
716 | void removeUser() { --NumUsers; } |
717 | |
718 | /// Increase user count of the queue object. |
719 | void addUser() { ++NumUsers; } |
720 | |
721 | /// Push a kernel launch to the queue. The kernel launch requires an output |
722 | /// signal and can define an optional input signal (nullptr if none). |
723 | Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs, |
724 | uint32_t NumThreads[3], uint32_t NumBlocks[3], |
725 | uint32_t GroupSize, uint64_t StackSize, |
726 | AMDGPUSignalTy *OutputSignal, |
727 | AMDGPUSignalTy *InputSignal) { |
728 | assert(OutputSignal && "Invalid kernel output signal" ); |
729 | |
730 | // Lock the queue during the packet publishing process. Notice this blocks |
731 | // the addition of other packets to the queue. The following piece of code |
732 | // should be lightweight; do not block the thread, allocate memory, etc. |
733 | std::lock_guard<std::mutex> Lock(Mutex); |
734 | assert(Queue && "Interacted with a non-initialized queue!" ); |
735 | |
736 | // Add a barrier packet before the kernel packet in case there is a pending |
737 | // preceding operation. The barrier packet will delay the processing of |
    // subsequent queue packets until the barrier input signals are satisfied.
    // No output signal is needed because the dependency is already guaranteed
    // by the queue barrier itself.
741 | if (InputSignal && InputSignal->load()) |
      if (auto Err = pushBarrierImpl(nullptr, InputSignal))
743 | return Err; |
744 | |
745 | // Now prepare the kernel packet. |
746 | uint64_t PacketId; |
747 | hsa_kernel_dispatch_packet_t *Packet = acquirePacket(PacketId); |
748 | assert(Packet && "Invalid packet" ); |
749 | |
750 | // The first 32 bits of the packet are written after the other fields |
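    // Compute the number of dimensions: 3 if the Z grid extent is larger than
    // one, otherwise 2 if the Y grid extent differs from one, otherwise 1.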
751 | uint16_t Dims = NumBlocks[2] * NumThreads[2] > 1 |
752 | ? 3 |
753 | : 1 + (NumBlocks[1] * NumThreads[1] != 1); |
754 | uint16_t Setup = UINT16_C(Dims) |
755 | << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; |
756 | Packet->workgroup_size_x = NumThreads[0]; |
757 | Packet->workgroup_size_y = NumThreads[1]; |
758 | Packet->workgroup_size_z = NumThreads[2]; |
759 | Packet->reserved0 = 0; |
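    // HSA expects the grid size in work-items, so multiply the number of
    // blocks by the number of threads in each dimension.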
760 | Packet->grid_size_x = NumBlocks[0] * NumThreads[0]; |
761 | Packet->grid_size_y = NumBlocks[1] * NumThreads[1]; |
762 | Packet->grid_size_z = NumBlocks[2] * NumThreads[2]; |
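    // When the kernel uses a dynamic call stack, honor the launch-time stack
    // size instead of the statically known private segment size.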
763 | Packet->private_segment_size = |
764 | Kernel.usesDynamicStack() ? StackSize : Kernel.getPrivateSize(); |
765 | Packet->group_segment_size = GroupSize; |
766 | Packet->kernel_object = Kernel.getKernelObject(); |
767 | Packet->kernarg_address = KernelArgs; |
768 | Packet->reserved2 = 0; |
769 | Packet->completion_signal = OutputSignal->get(); |
770 | |
771 | // Publish the packet. Do not modify the packet after this point. |
772 | publishKernelPacket(PacketId, Setup, Packet); |
773 | |
774 | return Plugin::success(); |
775 | } |
776 | |
  /// Push a barrier packet that will wait on up to two input signals. All
  /// signals are optional (nullptr if none).
779 | Error pushBarrier(AMDGPUSignalTy *OutputSignal, |
780 | const AMDGPUSignalTy *InputSignal1, |
781 | const AMDGPUSignalTy *InputSignal2) { |
782 | // Lock the queue during the packet publishing process. |
783 | std::lock_guard<std::mutex> Lock(Mutex); |
784 | assert(Queue && "Interacted with a non-initialized queue!" ); |
785 | |
786 | // Push the barrier with the lock acquired. |
787 | return pushBarrierImpl(OutputSignal, InputSignal1, InputSignal2); |
788 | } |
789 | |
790 | private: |
  /// Push a barrier packet that will wait on up to two input signals. Assumes
  /// the queue lock is acquired.
793 | Error pushBarrierImpl(AMDGPUSignalTy *OutputSignal, |
794 | const AMDGPUSignalTy *InputSignal1, |
795 | const AMDGPUSignalTy *InputSignal2 = nullptr) { |
796 | // Add a queue barrier waiting on both the other stream's operation and the |
797 | // last operation on the current stream (if any). |
798 | uint64_t PacketId; |
799 | hsa_barrier_and_packet_t *Packet = |
800 | (hsa_barrier_and_packet_t *)acquirePacket(PacketId); |
801 | assert(Packet && "Invalid packet" ); |
802 | |
803 | Packet->reserved0 = 0; |
804 | Packet->reserved1 = 0; |
805 | Packet->dep_signal[0] = {0}; |
806 | Packet->dep_signal[1] = {0}; |
807 | Packet->dep_signal[2] = {0}; |
808 | Packet->dep_signal[3] = {0}; |
809 | Packet->dep_signal[4] = {0}; |
810 | Packet->reserved2 = 0; |
811 | Packet->completion_signal = {0}; |
812 | |
813 | // Set input and output dependencies if needed. |
814 | if (OutputSignal) |
815 | Packet->completion_signal = OutputSignal->get(); |
816 | if (InputSignal1) |
817 | Packet->dep_signal[0] = InputSignal1->get(); |
818 | if (InputSignal2) |
819 | Packet->dep_signal[1] = InputSignal2->get(); |
820 | |
821 | // Publish the packet. Do not modify the packet after this point. |
822 | publishBarrierPacket(PacketId, Packet); |
823 | |
824 | return Plugin::success(); |
825 | } |
826 | |
827 | /// Acquire a packet from the queue. This call may block the thread if there |
828 | /// is no space in the underlying HSA queue. It may need to wait until the HSA |
829 | /// runtime processes some packets. Assumes the queue lock is acquired. |
830 | hsa_kernel_dispatch_packet_t *acquirePacket(uint64_t &PacketId) { |
831 | // Increase the queue index with relaxed memory order. Notice this will need |
832 | // another subsequent atomic operation with acquire order. |
833 | PacketId = hsa_queue_add_write_index_relaxed(Queue, 1); |
834 | |
    // Wait for the packet to be available. Notice the atomic operation uses
836 | // the acquire memory order. |
837 | while (PacketId - hsa_queue_load_read_index_scacquire(Queue) >= Queue->size) |
838 | ; |
839 | |
840 | // Return the packet reference. |
841 | const uint32_t Mask = Queue->size - 1; // The size is a power of 2. |
842 | return (hsa_kernel_dispatch_packet_t *)Queue->base_address + |
843 | (PacketId & Mask); |
844 | } |
845 | |
846 | /// Publish the kernel packet so that the HSA runtime can start processing |
847 | /// the kernel launch. Do not modify the packet once this function is called. |
848 | /// Assumes the queue lock is acquired. |
849 | void publishKernelPacket(uint64_t PacketId, uint16_t Setup, |
850 | hsa_kernel_dispatch_packet_t *Packet) { |
851 | uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet); |
852 | |
    uint16_t Header =
        HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
854 | Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; |
855 | Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; |
856 | |
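    // The first 32-bit word of the packet combines the 16-bit header (low
    // half) and the 16-bit setup field (high half).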
    // Publish the packet. Do not modify the packet after this point.
    uint32_t HeaderWord = Header | (Setup << 16u);
859 | __atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE); |
860 | |
861 | // Signal the doorbell about the published packet. |
862 | hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId); |
863 | } |
864 | |
865 | /// Publish the barrier packet so that the HSA runtime can start processing |
866 | /// the barrier. Next packets in the queue will not be processed until all |
867 | /// barrier dependencies (signals) are satisfied. Assumes the queue is locked |
868 | void publishBarrierPacket(uint64_t PacketId, |
869 | hsa_barrier_and_packet_t *Packet) { |
870 | uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet); |
871 | uint16_t Setup = 0; |
    uint16_t Header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
873 | Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; |
874 | Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; |
875 | |
    // Publish the packet. Do not modify the packet after this point.
    uint32_t HeaderWord = Header | (Setup << 16u);
878 | __atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE); |
879 | |
880 | // Signal the doorbell about the published packet. |
881 | hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId); |
882 | } |
883 | |
884 | /// Callback that will be called when an error is detected on the HSA queue. |
885 | static void callbackError(hsa_status_t Status, hsa_queue_t *Source, |
886 | void *Data); |
887 | |
888 | /// The HSA queue. |
889 | hsa_queue_t *Queue; |
890 | |
891 | /// Mutex to protect the acquiring and publishing of packets. For the moment, |
892 | /// we need this mutex to prevent publishing packets that are not ready to be |
893 | /// published in a multi-thread scenario. Without a queue lock, a thread T1 |
894 | /// could acquire packet P and thread T2 acquire packet P+1. Thread T2 could |
895 | /// publish its packet P+1 (signaling the queue's doorbell) before packet P |
896 | /// from T1 is ready to be processed. That scenario should be invalid. Thus, |
897 | /// we use the following mutex to make packet acquiring and publishing atomic. |
898 | /// TODO: There are other more advanced approaches to avoid this mutex using |
899 | /// atomic operations. We can further investigate it if this is a bottleneck. |
900 | std::mutex Mutex; |
901 | |
  /// The number of streams this queue is currently assigned to. A queue is
  /// considered idle when this is zero, otherwise busy.
904 | uint32_t NumUsers; |
905 | }; |
906 | |
907 | /// Struct that implements a stream of asynchronous operations for AMDGPU |
908 | /// devices. This class relies on signals to implement streams and define the |
909 | /// dependencies between asynchronous operations. |
910 | struct AMDGPUStreamTy { |
911 | private: |
912 | /// Utility struct holding arguments for async H2H memory copies. |
913 | struct MemcpyArgsTy { |
914 | void *Dst; |
915 | const void *Src; |
916 | size_t Size; |
917 | }; |
918 | |
919 | /// Utility struct holding arguments for freeing buffers to memory managers. |
920 | struct ReleaseBufferArgsTy { |
921 | void *Buffer; |
922 | AMDGPUMemoryManagerTy *MemoryManager; |
923 | }; |
924 | |
925 | /// Utility struct holding arguments for releasing signals to signal managers. |
926 | struct ReleaseSignalArgsTy { |
927 | AMDGPUSignalTy *Signal; |
928 | AMDGPUSignalManagerTy *SignalManager; |
929 | }; |
930 | |
931 | using AMDGPUStreamCallbackTy = Error(void *Data); |
932 | |
  /// The stream is composed of N stream slots. The struct below represents
934 | /// the fields of each slot. Each slot has a signal and an optional action |
935 | /// function. When appending an HSA asynchronous operation to the stream, one |
936 | /// slot is consumed and used to store the operation's information. The |
937 | /// operation's output signal is set to the consumed slot's signal. If there |
938 | /// is a previous asynchronous operation on the previous slot, the HSA async |
939 | /// operation's input signal is set to the signal of the previous slot. This |
940 | /// way, we obtain a chain of dependent async operations. The action is a |
941 | /// function that will be executed eventually after the operation is |
942 | /// completed, e.g., for releasing a buffer. |
943 | struct StreamSlotTy { |
944 | /// The output signal of the stream operation. May be used by the subsequent |
945 | /// operation as input signal. |
946 | AMDGPUSignalTy *Signal; |
947 | |
    /// The actions that must be performed after the operation's completion.
    /// Empty when there is no action to perform.
950 | llvm::SmallVector<AMDGPUStreamCallbackTy *> Callbacks; |
951 | |
952 | /// Space for the action's arguments. A pointer to these arguments is passed |
953 | /// to the action function. Notice the space of arguments is limited. |
954 | union ActionArgsTy { |
955 | MemcpyArgsTy MemcpyArgs; |
956 | ReleaseBufferArgsTy ReleaseBufferArgs; |
957 | ReleaseSignalArgsTy ReleaseSignalArgs; |
958 | void *CallbackArgs; |
959 | }; |
960 | |
961 | llvm::SmallVector<ActionArgsTy> ActionArgs; |
962 | |
963 | /// Create an empty slot. |
964 | StreamSlotTy() : Signal(nullptr), Callbacks({}), ActionArgs({}) {} |
965 | |
966 | /// Schedule a host memory copy action on the slot. |
967 | Error schedHostMemoryCopy(void *Dst, const void *Src, size_t Size) { |
968 | Callbacks.emplace_back(memcpyAction); |
969 | ActionArgs.emplace_back().MemcpyArgs = MemcpyArgsTy{Dst, Src, Size}; |
970 | return Plugin::success(); |
971 | } |
972 | |
973 | /// Schedule a release buffer action on the slot. |
974 | Error schedReleaseBuffer(void *Buffer, AMDGPUMemoryManagerTy &Manager) { |
975 | Callbacks.emplace_back(releaseBufferAction); |
976 | ActionArgs.emplace_back().ReleaseBufferArgs = |
977 | ReleaseBufferArgsTy{Buffer, &Manager}; |
978 | return Plugin::success(); |
979 | } |
980 | |
981 | /// Schedule a signal release action on the slot. |
982 | Error schedReleaseSignal(AMDGPUSignalTy *SignalToRelease, |
983 | AMDGPUSignalManagerTy *SignalManager) { |
984 | Callbacks.emplace_back(releaseSignalAction); |
985 | ActionArgs.emplace_back().ReleaseSignalArgs = |
986 | ReleaseSignalArgsTy{SignalToRelease, SignalManager}; |
987 | return Plugin::success(); |
988 | } |
989 | |
    /// Register a callback to be called on completion.
991 | Error schedCallback(AMDGPUStreamCallbackTy *Func, void *Data) { |
992 | Callbacks.emplace_back(Func); |
993 | ActionArgs.emplace_back().CallbackArgs = Data; |
994 | |
995 | return Plugin::success(); |
996 | } |
997 | |
998 | // Perform the action if needed. |
999 | Error performAction() { |
1000 | if (Callbacks.empty()) |
1001 | return Plugin::success(); |
1002 | |
1003 | assert(Callbacks.size() == ActionArgs.size() && "Size mismatch" ); |
1004 | for (auto [Callback, ActionArg] : llvm::zip(Callbacks, ActionArgs)) { |
1005 | // Perform the action. |
1006 | if (Callback == memcpyAction) { |
1007 | if (auto Err = memcpyAction(&ActionArg)) |
1008 | return Err; |
1009 | } else if (Callback == releaseBufferAction) { |
1010 | if (auto Err = releaseBufferAction(&ActionArg)) |
1011 | return Err; |
1012 | } else if (Callback == releaseSignalAction) { |
1013 | if (auto Err = releaseSignalAction(&ActionArg)) |
1014 | return Err; |
1015 | } else if (Callback) { |
1016 | if (auto Err = Callback(ActionArg.CallbackArgs)) |
1017 | return Err; |
1018 | } |
1019 | } |
1020 | |
1021 | // Invalidate the action. |
1022 | Callbacks.clear(); |
1023 | ActionArgs.clear(); |
1024 | |
1025 | return Plugin::success(); |
1026 | } |
1027 | }; |
1028 | |
1029 | /// The device agent where the stream was created. |
1030 | hsa_agent_t Agent; |
1031 | |
1032 | /// The queue that the stream uses to launch kernels. |
1033 | AMDGPUQueueTy *Queue; |
1034 | |
1035 | /// The manager of signals to reuse signals. |
1036 | AMDGPUSignalManagerTy &SignalManager; |
1037 | |
1038 | /// A reference to the associated device. |
1039 | GenericDeviceTy &Device; |
1040 | |
1041 | /// Array of stream slots. Use std::deque because it can dynamically grow |
1042 | /// without invalidating the already inserted elements. For instance, the |
1043 | /// std::vector may invalidate the elements by reallocating the internal |
1044 | /// array if there is not enough space on new insertions. |
1045 | std::deque<StreamSlotTy> Slots; |
1046 | |
1047 | /// The next available slot on the queue. This is reset to zero each time the |
1048 | /// stream is synchronized. It also indicates the current number of consumed |
1049 | /// slots at a given time. |
1050 | uint32_t NextSlot; |
1051 | |
1052 | /// The synchronization id. This number is increased each time the stream is |
1053 | /// synchronized. It is useful to detect if an AMDGPUEventTy points to an |
  /// operation that was already finalized in a previous stream synchronize.
1055 | uint32_t SyncCycle; |
1056 | |
1057 | /// Mutex to protect stream's management. |
1058 | mutable std::mutex Mutex; |
1059 | |
1060 | /// Timeout hint for HSA actively waiting for signal value to change |
1061 | const uint64_t StreamBusyWaitMicroseconds; |
1062 | |
1063 | /// Indicate to spread data transfers across all available SDMAs |
1064 | bool UseMultipleSdmaEngines; |
1065 | |
1066 | /// Return the current number of asynchronous operations on the stream. |
1067 | uint32_t size() const { return NextSlot; } |
1068 | |
1069 | /// Return the last valid slot on the stream. |
1070 | uint32_t last() const { return size() - 1; } |
1071 | |
1072 | /// Consume one slot from the stream. Since the stream uses signals on demand |
1073 | /// and releases them once the slot is no longer used, the function requires |
1074 | /// an idle signal for the new consumed slot. |
1075 | std::pair<uint32_t, AMDGPUSignalTy *> consume(AMDGPUSignalTy *OutputSignal) { |
1076 | // Double the stream size if needed. Since we use std::deque, this operation |
1077 | // does not invalidate the already added slots. |
1078 | if (Slots.size() == NextSlot) |
      Slots.resize(Slots.size() * 2);
1080 | |
1081 | // Update the next available slot and the stream size. |
1082 | uint32_t Curr = NextSlot++; |
1083 | |
1084 | // Retrieve the input signal, if any, of the current operation. |
1085 | AMDGPUSignalTy *InputSignal = (Curr > 0) ? Slots[Curr - 1].Signal : nullptr; |
1086 | |
1087 | // Set the output signal of the current slot. |
1088 | Slots[Curr].Signal = OutputSignal; |
1089 | |
    return std::make_pair(Curr, InputSignal);
1091 | } |
1092 | |
1093 | /// Complete all pending post actions and reset the stream after synchronizing |
1094 | /// or positively querying the stream. |
1095 | Error complete() { |
1096 | for (uint32_t Slot = 0; Slot < NextSlot; ++Slot) { |
1097 | // Take the post action of the operation if any. |
1098 | if (auto Err = Slots[Slot].performAction()) |
1099 | return Err; |
1100 | |
1101 | // Release the slot's signal if possible. Otherwise, another user will. |
1102 | if (Slots[Slot].Signal->decreaseUseCount()) |
1103 | if (auto Err = SignalManager.returnResource(Slots[Slot].Signal)) |
1104 | return Err; |
1105 | |
1106 | Slots[Slot].Signal = nullptr; |
1107 | } |
1108 | |
1109 | // Reset the stream slots to zero. |
1110 | NextSlot = 0; |
1111 | |
1112 | // Increase the synchronization id since the stream completed a sync cycle. |
1113 | SyncCycle += 1; |
1114 | |
1115 | return Plugin::success(); |
1116 | } |
1117 | |
1118 | /// Make the current stream wait on a specific operation of another stream. |
  /// The idea is to make the current stream wait on two signals: 1) the last
1120 | /// signal of the current stream, and 2) the last signal of the other stream. |
1121 | /// Use a barrier packet with two input signals. |
1122 | Error waitOnStreamOperation(AMDGPUStreamTy &OtherStream, uint32_t Slot) { |
1123 | if (Queue == nullptr) |
1124 | return Plugin::error(ErrorCode::INVALID_NULL_POINTER, |
1125 | "target queue was nullptr" ); |
1126 | |
    /// The signal from the other stream that we must wait on.
1128 | AMDGPUSignalTy *OtherSignal = OtherStream.Slots[Slot].Signal; |
1129 | |
1130 | // Prevent the release of the other stream's signal. |
1131 | OtherSignal->increaseUseCount(); |
1132 | |
1133 | // Retrieve an available signal for the operation's output. |
1134 | AMDGPUSignalTy *OutputSignal = nullptr; |
1135 | if (auto Err = SignalManager.getResource(OutputSignal)) |
1136 | return Err; |
1137 | OutputSignal->reset(); |
1138 | OutputSignal->increaseUseCount(); |
1139 | |
1140 | // Consume stream slot and compute dependencies. |
1141 | auto [Curr, InputSignal] = consume(OutputSignal); |
1142 | |
1143 | // Setup the post action to release the signal. |
1144 | if (auto Err = Slots[Curr].schedReleaseSignal(OtherSignal, &SignalManager)) |
1145 | return Err; |
1146 | |
1147 | // Push a barrier into the queue with both input signals. |
    return Queue->pushBarrier(OutputSignal, InputSignal, OtherSignal);
1149 | } |
1150 | |
1151 | /// Callback for running a specific asynchronous operation. This callback is |
1152 | /// used for hsa_amd_signal_async_handler. The argument is the operation that |
1153 | /// should be executed. Notice we use the post action mechanism to codify the |
1154 | /// asynchronous operation. |
1155 | static bool asyncActionCallback(hsa_signal_value_t Value, void *Args) { |
1156 | StreamSlotTy *Slot = reinterpret_cast<StreamSlotTy *>(Args); |
1157 | assert(Slot && "Invalid slot" ); |
1158 | assert(Slot->Signal && "Invalid signal" ); |
1159 | |
1160 | // This thread is outside the stream mutex. Make sure the thread sees the |
1161 | // changes on the slot. |
    std::atomic_thread_fence(std::memory_order_acquire);
1163 | |
1164 | // Perform the operation. |
1165 | if (auto Err = Slot->performAction()) |
      FATAL_MESSAGE(1, "Error performing post action: %s",
                    toString(std::move(Err)).data());
1168 | |
1169 | // Signal the output signal to notify the asynchronous operation finalized. |
1170 | Slot->Signal->signal(); |
1171 | |
1172 | // Unregister callback. |
1173 | return false; |
1174 | } |
1175 | |
1176 | // Callback for host-to-host memory copies. This is an asynchronous action. |
1177 | static Error memcpyAction(void *Data) { |
1178 | MemcpyArgsTy *Args = reinterpret_cast<MemcpyArgsTy *>(Data); |
1179 | assert(Args && "Invalid arguments" ); |
1180 | assert(Args->Dst && "Invalid destination buffer" ); |
1181 | assert(Args->Src && "Invalid source buffer" ); |
1182 | |
    std::memcpy(Args->Dst, Args->Src, Args->Size);
1184 | |
1185 | return Plugin::success(); |
1186 | } |
1187 | |
1188 | /// Releasing a memory buffer to a memory manager. This is a post completion |
  /// action. There are three kinds of memory buffers:
1190 | /// 1. For kernel arguments. This buffer can be freed after receiving the |
1191 | /// kernel completion signal. |
1192 | /// 2. For H2D transfers that need pinned memory space for staging. This |
1193 | /// buffer can be freed after receiving the transfer completion signal. |
1194 | /// 3. For D2H transfers that need pinned memory space for staging. This |
1195 | /// buffer cannot be freed after receiving the transfer completion signal |
  ///   because of the following asynchronous H2H callback.
  ///      For this reason, this action can only be taken at
  ///      AMDGPUStreamTy::complete().
  ///   Because of case 3, all releaseBufferActions are taken at
  ///   AMDGPUStreamTy::complete() in the current implementation.
1201 | static Error releaseBufferAction(void *Data) { |
1202 | ReleaseBufferArgsTy *Args = reinterpret_cast<ReleaseBufferArgsTy *>(Data); |
1203 | assert(Args && "Invalid arguments" ); |
1204 | assert(Args->MemoryManager && "Invalid memory manager" ); |
1205 | |
1206 | // Release the allocation to the memory manager. |
    return Args->MemoryManager->deallocate(Args->Buffer);
1208 | } |
1209 | |
1210 | /// Releasing a signal object back to SignalManager. This is a post completion |
1211 | /// action. This action can only be taken at AMDGPUStreamTy::complete() |
1212 | static Error releaseSignalAction(void *Data) { |
1213 | ReleaseSignalArgsTy *Args = reinterpret_cast<ReleaseSignalArgsTy *>(Data); |
1214 | assert(Args && "Invalid arguments" ); |
1215 | assert(Args->Signal && "Invalid signal" ); |
1216 | assert(Args->SignalManager && "Invalid signal manager" ); |
1217 | |
1218 | // Release the signal if needed. |
1219 | if (Args->Signal->decreaseUseCount()) |
1220 | if (auto Err = Args->SignalManager->returnResource(Args->Signal)) |
1221 | return Err; |
1222 | |
1223 | return Plugin::success(); |
1224 | } |
1225 | |
1226 | public: |
1227 | /// Create an empty stream associated with a specific device. |
1228 | AMDGPUStreamTy(AMDGPUDeviceTy &Device); |
1229 | |
1230 | /// Initialize the stream's signals. |
1231 | Error init() { return Plugin::success(); } |
1232 | |
1233 | /// Deinitialize the stream's signals. |
1234 | Error deinit() { return Plugin::success(); } |
1235 | |
  /// Push an asynchronous kernel to the stream. The kernel arguments must be
  /// placed in a special allocation for kernel args and must be kept alive
  /// until the kernel finishes. Once the kernel is finished, the stream will
  /// release the kernel args buffer to the specified memory manager.
1240 | Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs, |
1241 | uint32_t NumThreads[3], uint32_t NumBlocks[3], |
1242 | uint32_t GroupSize, uint64_t StackSize, |
1243 | AMDGPUMemoryManagerTy &MemoryManager) { |
1244 | if (Queue == nullptr) |
1245 | return Plugin::error(ErrorCode::INVALID_NULL_POINTER, |
1246 | "target queue was nullptr" ); |
1247 | |
1248 | // Retrieve an available signal for the operation's output. |
1249 | AMDGPUSignalTy *OutputSignal = nullptr; |
1250 | if (auto Err = SignalManager.getResource(OutputSignal)) |
1251 | return Err; |
1252 | OutputSignal->reset(); |
1253 | OutputSignal->increaseUseCount(); |
1254 | |
1255 | std::lock_guard<std::mutex> StreamLock(Mutex); |
1256 | |
1257 | // Consume stream slot and compute dependencies. |
1258 | auto [Curr, InputSignal] = consume(OutputSignal); |
1259 | |
1260 | // Setup the post action to release the kernel args buffer. |
    if (auto Err = Slots[Curr].schedReleaseBuffer(KernelArgs, MemoryManager))
1262 | return Err; |
1263 | |
1264 | // If we are running an RPC server we want to wake up the server thread |
1265 | // whenever there is a kernel running and let it sleep otherwise. |
1266 | if (Device.getRPCServer()) |
1267 | Device.Plugin.getRPCServer().Thread->notify(); |
1268 | |
1269 | // Push the kernel with the output signal and an input signal (optional) |
1270 | if (auto Err = Queue->pushKernelLaunch(Kernel, KernelArgs, NumThreads, |
1271 | NumBlocks, GroupSize, StackSize, |
1272 | OutputSignal, InputSignal)) |
1273 | return Err; |
1274 | |
1275 | // Register a callback to indicate when the kernel is complete. |
1276 | if (Device.getRPCServer()) { |
1277 | if (auto Err = Slots[Curr].schedCallback( |
1278 | [](void *Data) -> llvm::Error { |
1279 | GenericPluginTy &Plugin = |
1280 | *reinterpret_cast<GenericPluginTy *>(Data); |
1281 | Plugin.getRPCServer().Thread->finish(); |
1282 | return Error::success(); |
1283 | }, |
1284 | &Device.Plugin)) |
1285 | return Err; |
1286 | } |
1287 | return Plugin::success(); |
1288 | } |
1289 | |
1290 | /// Push an asynchronous memory copy between pinned memory buffers. |
1291 | Error pushPinnedMemoryCopyAsync(void *Dst, const void *Src, |
1292 | uint64_t CopySize) { |
1293 | // Retrieve an available signal for the operation's output. |
1294 | AMDGPUSignalTy *OutputSignal = nullptr; |
1295 | if (auto Err = SignalManager.getResource(OutputSignal)) |
1296 | return Err; |
1297 | OutputSignal->reset(); |
1298 | OutputSignal->increaseUseCount(); |
1299 | |
1300 | std::lock_guard<std::mutex> Lock(Mutex); |
1301 | |
1302 | // Consume stream slot and compute dependencies. |
1303 | auto [Curr, InputSignal] = consume(OutputSignal); |
1304 | |
1305 | // Issue the async memory copy. |
1306 | if (InputSignal && InputSignal->load()) { |
1307 | hsa_signal_t InputSignalRaw = InputSignal->get(); |
1308 | return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, |
1309 | Agent, CopySize, 1, &InputSignalRaw, |
1310 | OutputSignal->get()); |
1311 | } |
1312 | |
1313 | return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, |
1314 | Agent, CopySize, 0, nullptr, |
1315 | OutputSignal->get()); |
1316 | } |
1317 | |
1318 | /// Push an asynchronous memory copy device-to-host involving an unpinned |
1319 | /// memory buffer. The operation consists of a two-step copy from the |
  /// device buffer to an intermediate pinned host buffer, and then, to an
1321 | /// unpinned host buffer. Both operations are asynchronous and dependent. |
1322 | /// The intermediate pinned buffer will be released to the specified memory |
1323 | /// manager once the operation completes. |
1324 | Error pushMemoryCopyD2HAsync(void *Dst, const void *Src, void *Inter, |
1325 | uint64_t CopySize, |
1326 | AMDGPUMemoryManagerTy &MemoryManager) { |
1327 | // Retrieve available signals for the operation's outputs. |
1328 | AMDGPUSignalTy *OutputSignals[2] = {}; |
1329 | if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals)) |
1330 | return Err; |
1331 | for (auto *Signal : OutputSignals) { |
1332 | Signal->reset(); |
1333 | Signal->increaseUseCount(); |
1334 | } |
1335 | |
1336 | std::lock_guard<std::mutex> Lock(Mutex); |
1337 | |
1338 | // Consume stream slot and compute dependencies. |
    auto [Curr, InputSignal] = consume(OutputSignals[0]);
1340 | |
1341 | // Setup the post action for releasing the intermediate buffer. |
    if (auto Err = Slots[Curr].schedReleaseBuffer(Inter, MemoryManager))
1343 | return Err; |
1344 | |
1345 | // Issue the first step: device to host transfer. Avoid defining the input |
1346 | // dependency if already satisfied. |
1347 | if (InputSignal && InputSignal->load()) { |
1348 | hsa_signal_t InputSignalRaw = InputSignal->get(); |
1349 | if (auto Err = hsa_utils::asyncMemCopy( |
1350 | UseMultipleSdmaEngines, Inter, Agent, Src, Agent, CopySize, 1, |
1351 | &InputSignalRaw, OutputSignals[0]->get())) |
1352 | return Err; |
1353 | } else { |
1354 | if (auto Err = hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Inter, |
1355 | Agent, Src, Agent, CopySize, 0, |
1356 | nullptr, OutputSignals[0]->get())) |
1357 | return Err; |
1358 | } |
1359 | |
1360 | // Consume another stream slot and compute dependencies. |
    std::tie(Curr, InputSignal) = consume(OutputSignals[1]);
    assert(InputSignal && "Invalid input signal");
1363 | |
    // The std::memcpy is done asynchronously using an async handler. We store
    // the function's information in the action, but it is not actually a
    // post action.
    if (auto Err = Slots[Curr].schedHostMemoryCopy(Dst, Inter, CopySize))
1367 | return Err; |
1368 | |
1369 | // Make changes on this slot visible to the async handler's thread. |
    std::atomic_thread_fence(std::memory_order_release);
1371 | |
1372 | // Issue the second step: host to host transfer. |
1373 | hsa_status_t Status = hsa_amd_signal_async_handler( |
1374 | InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback, |
1375 | (void *)&Slots[Curr]); |
1376 | |
1377 | return Plugin::check(Status, "error in hsa_amd_signal_async_handler: %s" ); |
1378 | } |
1379 | |
  /// Push an asynchronous memory copy host-to-device involving an unpinned
  /// memory buffer. The operation consists of a two-step copy from the
  /// unpinned host buffer to an intermediate pinned host buffer, and then, to
  /// the device buffer. Both operations are asynchronous and dependent.
  /// The intermediate pinned buffer will be released to the specified memory
  /// manager once the operation completes.
1386 | Error pushMemoryCopyH2DAsync(void *Dst, const void *Src, void *Inter, |
1387 | uint64_t CopySize, |
1388 | AMDGPUMemoryManagerTy &MemoryManager) { |
1389 | // Retrieve available signals for the operation's outputs. |
1390 | AMDGPUSignalTy *OutputSignals[2] = {}; |
1391 | if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals)) |
1392 | return Err; |
1393 | for (auto *Signal : OutputSignals) { |
1394 | Signal->reset(); |
1395 | Signal->increaseUseCount(); |
1396 | } |
1397 | |
1398 | AMDGPUSignalTy *OutputSignal = OutputSignals[0]; |
1399 | |
1400 | std::lock_guard<std::mutex> Lock(Mutex); |
1401 | |
1402 | // Consume stream slot and compute dependencies. |
1403 | auto [Curr, InputSignal] = consume(OutputSignal); |
1404 | |
1405 | // Issue the first step: host to host transfer. |
1406 | if (InputSignal && InputSignal->load()) { |
1407 | // The std::memcpy is done asynchronously using an async handler. We store |
1408 | // the function's information in the action but it is not actually a |
1409 | // post action. |
      if (auto Err = Slots[Curr].schedHostMemoryCopy(Inter, Src, CopySize))
1411 | return Err; |
1412 | |
1413 | // Make changes on this slot visible to the async handler's thread. |
      std::atomic_thread_fence(std::memory_order_release);
1415 | |
1416 | hsa_status_t Status = hsa_amd_signal_async_handler( |
1417 | InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback, |
1418 | (void *)&Slots[Curr]); |
1419 | |
1420 | if (auto Err = Plugin::check(Status, |
1421 | "error in hsa_amd_signal_async_handler: %s" )) |
1422 | return Err; |
1423 | |
      // Now use the second output signal.
1425 | OutputSignal = OutputSignals[1]; |
1426 | |
1427 | // Consume another stream slot and compute dependencies. |
      std::tie(Curr, InputSignal) = consume(OutputSignal);
1429 | } else { |
1430 | // All preceding operations completed, copy the memory synchronously. |
      std::memcpy(Inter, Src, CopySize);
1432 | |
1433 | // Return the second signal because it will not be used. |
1434 | OutputSignals[1]->decreaseUseCount(); |
1435 | if (auto Err = SignalManager.returnResource(OutputSignals[1])) |
1436 | return Err; |
1437 | } |
1438 | |
1439 | // Setup the post action to release the intermediate pinned buffer. |
    if (auto Err = Slots[Curr].schedReleaseBuffer(Inter, MemoryManager))
1441 | return Err; |
1442 | |
1443 | // Issue the second step: host to device transfer. Avoid defining the input |
1444 | // dependency if already satisfied. |
1445 | if (InputSignal && InputSignal->load()) { |
1446 | hsa_signal_t InputSignalRaw = InputSignal->get(); |
1447 | return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, |
1448 | Agent, CopySize, 1, &InputSignalRaw, |
1449 | OutputSignal->get()); |
1450 | } |
1451 | return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, |
1452 | Agent, CopySize, 0, nullptr, |
1453 | OutputSignal->get()); |
1454 | } |
1455 | |
1456 | // AMDGPUDeviceTy is incomplete here, passing the underlying agent instead |
1457 | Error pushMemoryCopyD2DAsync(void *Dst, hsa_agent_t DstAgent, const void *Src, |
1458 | hsa_agent_t SrcAgent, uint64_t CopySize) { |
1459 | AMDGPUSignalTy *OutputSignal; |
1460 | if (auto Err = SignalManager.getResources(/*Num=*/1, &OutputSignal)) |
1461 | return Err; |
1462 | OutputSignal->reset(); |
1463 | OutputSignal->increaseUseCount(); |
1464 | |
1465 | std::lock_guard<std::mutex> Lock(Mutex); |
1466 | |
1467 | // Consume stream slot and compute dependencies. |
1468 | auto [Curr, InputSignal] = consume(OutputSignal); |
1469 | |
    // The agents need to have access to the corresponding memory. This is
    // presently only true if the pointers were originally allocated by this
    // runtime or the caller made the appropriate access calls.
1474 | |
1475 | if (InputSignal && InputSignal->load()) { |
1476 | hsa_signal_t InputSignalRaw = InputSignal->get(); |
1477 | return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src, |
1478 | SrcAgent, CopySize, 1, &InputSignalRaw, |
1479 | OutputSignal->get()); |
1480 | } |
1481 | return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src, |
1482 | SrcAgent, CopySize, 0, nullptr, |
1483 | OutputSignal->get()); |
1484 | } |
1485 | |
1486 | /// Synchronize with the stream. The current thread waits until all operations |
1487 | /// are finalized and it performs the pending post actions (i.e., releasing |
1488 | /// intermediate buffers). |
1489 | Error synchronize() { |
1490 | std::lock_guard<std::mutex> Lock(Mutex); |
1491 | |
1492 | // No need to synchronize anything. |
1493 | if (size() == 0) |
1494 | return Plugin::success(); |
1495 | |
1496 | // Wait until all previous operations on the stream have completed. |
1497 | if (auto Err = |
1498 | Slots[last()].Signal->wait(StreamBusyWaitMicroseconds, &Device)) |
1499 | return Err; |
1500 | |
1501 | // Reset the stream and perform all pending post actions. |
1502 | return complete(); |
1503 | } |
1504 | |
1505 | /// Query the stream and complete pending post actions if operations finished. |
1506 | /// Return whether all the operations completed. This operation does not block |
1507 | /// the calling thread. |
1508 | Expected<bool> query() { |
1509 | std::lock_guard<std::mutex> Lock(Mutex); |
1510 | |
1511 | // No need to query anything. |
1512 | if (size() == 0) |
1513 | return true; |
1514 | |
1515 | // The last operation did not complete yet. Return directly. |
1516 | if (Slots[last()].Signal->load()) |
1517 | return false; |
1518 | |
1519 | // Reset the stream and perform all pending post actions. |
1520 | if (auto Err = complete()) |
1521 | return std::move(Err); |
1522 | |
1523 | return true; |
1524 | } |
1525 | |
1526 | const AMDGPUQueueTy *getQueue() const { return Queue; } |
1527 | |
1528 | /// Record the state of the stream on an event. |
1529 | Error recordEvent(AMDGPUEventTy &Event) const; |
1530 | |
1531 | /// Make the stream wait on an event. |
1532 | Error waitEvent(const AMDGPUEventTy &Event); |
1533 | |
1534 | friend struct AMDGPUStreamManagerTy; |
1535 | }; |
1536 | |
/// Class representing an event on AMDGPU. The event stores information
/// regarding the state of the recorded stream.
1539 | struct AMDGPUEventTy { |
1540 | /// Create an empty event. |
1541 | AMDGPUEventTy(AMDGPUDeviceTy &Device) |
1542 | : RecordedStream(nullptr), RecordedSlot(-1), RecordedSyncCycle(-1) {} |
1543 | |
1544 | /// Initialize and deinitialize. |
1545 | Error init() { return Plugin::success(); } |
1546 | Error deinit() { return Plugin::success(); } |
1547 | |
1548 | /// Record the state of a stream on the event. |
1549 | Error record(AMDGPUStreamTy &Stream) { |
1550 | std::lock_guard<std::mutex> Lock(Mutex); |
1551 | |
    // Overwrite the previously recorded stream, if any.
1553 | RecordedStream = &Stream; |
1554 | |
1555 | return Stream.recordEvent(Event&: *this); |
1556 | } |
1557 | |
1558 | /// Make a stream wait on the current event. |
1559 | Error wait(AMDGPUStreamTy &Stream) { |
1560 | std::lock_guard<std::mutex> Lock(Mutex); |
1561 | |
1562 | if (!RecordedStream) |
1563 | return Plugin::error(ErrorCode::INVALID_ARGUMENT, |
1564 | "event does not have any recorded stream" ); |
1565 | |
1566 | // Synchronizing the same stream. Do nothing. |
1567 | if (RecordedStream == &Stream) |
1568 | return Plugin::success(); |
1569 | |
    // No need to wait for anything; the recorded stream already finished the
    // corresponding operation.
1572 | if (RecordedSlot < 0) |
1573 | return Plugin::success(); |
1574 | |
1575 | return Stream.waitEvent(Event: *this); |
1576 | } |
1577 | |
1578 | protected: |
1579 | /// The stream registered in this event. |
1580 | AMDGPUStreamTy *RecordedStream; |
1581 | |
  /// The slot of the recorded operation on the recorded stream.
1583 | int64_t RecordedSlot; |
1584 | |
1585 | /// The sync cycle when the stream was recorded. Used to detect stale events. |
1586 | int64_t RecordedSyncCycle; |
1587 | |
1588 | /// Mutex to safely access event fields. |
1589 | mutable std::mutex Mutex; |
1590 | |
1591 | friend struct AMDGPUStreamTy; |
1592 | }; |
1593 | |
1594 | Error AMDGPUStreamTy::recordEvent(AMDGPUEventTy &Event) const { |
1595 | std::lock_guard<std::mutex> Lock(Mutex); |
1596 | |
1597 | if (size() > 0) { |
1598 | // Record the synchronize identifier (to detect stale recordings) and |
1599 | // the last valid stream's operation. |
1600 | Event.RecordedSyncCycle = SyncCycle; |
1601 | Event.RecordedSlot = last(); |
1602 | |
1603 | assert(Event.RecordedSyncCycle >= 0 && "Invalid recorded sync cycle" ); |
1604 | assert(Event.RecordedSlot >= 0 && "Invalid recorded slot" ); |
1605 | } else { |
1606 | // The stream is empty, everything already completed, record nothing. |
1607 | Event.RecordedSyncCycle = -1; |
1608 | Event.RecordedSlot = -1; |
1609 | } |
1610 | return Plugin::success(); |
1611 | } |
1612 | |
1613 | Error AMDGPUStreamTy::waitEvent(const AMDGPUEventTy &Event) { |
1614 | // Retrieve the recorded stream on the event. |
1615 | AMDGPUStreamTy &RecordedStream = *Event.RecordedStream; |
1616 | |
1617 | std::scoped_lock<std::mutex, std::mutex> Lock(Mutex, RecordedStream.Mutex); |
1618 | |
1619 | // The recorded stream already completed the operation because the synchronize |
1620 | // identifier is already outdated. |
1621 | if (RecordedStream.SyncCycle != (uint32_t)Event.RecordedSyncCycle) |
1622 | return Plugin::success(); |
1623 | |
1624 | // Again, the recorded stream already completed the operation, the last |
1625 | // operation's output signal is satisfied. |
1626 | if (!RecordedStream.Slots[Event.RecordedSlot].Signal->load()) |
1627 | return Plugin::success(); |
1628 | |
1629 | // Otherwise, make the current stream wait on the other stream's operation. |
  return waitOnStreamOperation(RecordedStream, Event.RecordedSlot);
1631 | } |
1632 | |
1633 | struct AMDGPUStreamManagerTy final |
1634 | : GenericDeviceResourceManagerTy<AMDGPUResourceRef<AMDGPUStreamTy>> { |
1635 | using ResourceRef = AMDGPUResourceRef<AMDGPUStreamTy>; |
1636 | using ResourcePoolTy = GenericDeviceResourceManagerTy<ResourceRef>; |
1637 | |
1638 | AMDGPUStreamManagerTy(GenericDeviceTy &Device, hsa_agent_t HSAAgent) |
1639 | : GenericDeviceResourceManagerTy(Device), Device(Device), |
1640 | OMPX_QueueTracking("LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING" , true), |
1641 | NextQueue(0), Agent(HSAAgent) {} |
1642 | |
1643 | Error init(uint32_t InitialSize, int NumHSAQueues, int HSAQueueSize) { |
1644 | Queues = std::vector<AMDGPUQueueTy>(NumHSAQueues); |
1645 | QueueSize = HSAQueueSize; |
1646 | MaxNumQueues = NumHSAQueues; |
1647 | // Initialize one queue eagerly |
1648 | if (auto Err = Queues.front().init(Device, Agent, QueueSize)) |
1649 | return Err; |
1650 | |
1651 | return GenericDeviceResourceManagerTy::init(InitialSize); |
1652 | } |
1653 | |
1654 | /// Deinitialize the resource pool and delete all resources. This function |
1655 | /// must be called before the destructor. |
1656 | Error deinit() override { |
1657 | // De-init all queues |
1658 | for (AMDGPUQueueTy &Queue : Queues) { |
1659 | if (auto Err = Queue.deinit()) |
1660 | return Err; |
1661 | } |
1662 | |
1663 | return GenericDeviceResourceManagerTy::deinit(); |
1664 | } |
1665 | |
1666 | /// Get a single stream from the pool or create new resources. |
1667 | virtual Error getResource(AMDGPUStreamTy *&StreamHandle) override { |
1668 | return getResourcesImpl(1, &StreamHandle, [this](AMDGPUStreamTy *&Handle) { |
      return assignNextQueue(Handle);
1670 | }); |
1671 | } |
1672 | |
1673 | /// Return stream to the pool. |
1674 | virtual Error returnResource(AMDGPUStreamTy *StreamHandle) override { |
1675 | return returnResourceImpl(StreamHandle, [](AMDGPUStreamTy *Handle) { |
1676 | Handle->Queue->removeUser(); |
1677 | return Plugin::success(); |
1678 | }); |
1679 | } |
1680 | |
1681 | private: |
  /// Search for and assign a preferably idle queue to the given Stream. If
  /// there is no queue without current users, choose the queue with the lowest
  /// user count. If utilization is ignored, use round-robin selection.
1685 | inline Error assignNextQueue(AMDGPUStreamTy *Stream) { |
1686 | // Start from zero when tracking utilization, otherwise: round robin policy. |
1687 | uint32_t Index = OMPX_QueueTracking ? 0 : NextQueue++ % MaxNumQueues; |
1688 | |
1689 | if (OMPX_QueueTracking) { |
1690 | // Find the least used queue. |
1691 | for (uint32_t I = 0; I < MaxNumQueues; ++I) { |
1692 | // Early exit when an initialized queue is idle. |
1693 | if (Queues[I].isInitialized() && Queues[I].getUserCount() == 0) { |
1694 | Index = I; |
1695 | break; |
1696 | } |
1697 | |
1698 | // Update the least used queue. |
1699 | if (Queues[Index].getUserCount() > Queues[I].getUserCount()) |
1700 | Index = I; |
1701 | } |
1702 | } |
1703 | |
1704 | // Make sure the queue is initialized, then add user & assign. |
1705 | if (auto Err = Queues[Index].init(Device, Agent, QueueSize)) |
1706 | return Err; |
1707 | Queues[Index].addUser(); |
1708 | Stream->Queue = &Queues[Index]; |
1709 | |
1710 | return Plugin::success(); |
1711 | } |
1712 | |
  /// The device associated with this stream manager.
1714 | GenericDeviceTy &Device; |
1715 | |
1716 | /// Envar for controlling the tracking of busy HSA queues. |
1717 | BoolEnvar OMPX_QueueTracking; |
1718 | |
1719 | /// The next queue index to use for round robin selection. |
1720 | uint32_t NextQueue; |
1721 | |
1722 | /// The queues which are assigned to requested streams. |
1723 | std::vector<AMDGPUQueueTy> Queues; |
1724 | |
1725 | /// The corresponding device as HSA agent. |
1726 | hsa_agent_t Agent; |
1727 | |
1728 | /// The maximum number of queues. |
1729 | uint32_t MaxNumQueues; |
1730 | |
1731 | /// The size of created queues. |
1732 | uint32_t QueueSize; |
1733 | }; |
1734 | |
1735 | /// Abstract class that holds the common members of the actual kernel devices |
1736 | /// and the host device. Both types should inherit from this class. |
1737 | struct AMDGenericDeviceTy { |
1738 | AMDGenericDeviceTy() {} |
1739 | |
1740 | virtual ~AMDGenericDeviceTy() {} |
1741 | |
1742 | /// Create all memory pools which the device has access to and classify them. |
1743 | Error initMemoryPools() { |
1744 | // Retrieve all memory pools from the device agent(s). |
1745 | Error Err = retrieveAllMemoryPools(); |
1746 | if (Err) |
1747 | return Err; |
1748 | |
1749 | for (AMDGPUMemoryPoolTy *MemoryPool : AllMemoryPools) { |
1750 | // Initialize the memory pool and retrieve some basic info. |
1751 | Error Err = MemoryPool->init(); |
1752 | if (Err) |
1753 | return Err; |
1754 | |
1755 | if (!MemoryPool->isGlobal()) |
1756 | continue; |
1757 | |
1758 | // Classify the memory pools depending on their properties. |
1759 | if (MemoryPool->isFineGrained()) { |
1760 | FineGrainedMemoryPools.push_back(MemoryPool); |
1761 | if (MemoryPool->supportsKernelArgs()) |
1762 | ArgsMemoryPools.push_back(MemoryPool); |
1763 | } else if (MemoryPool->isCoarseGrained()) { |
1764 | CoarseGrainedMemoryPools.push_back(MemoryPool); |
1765 | } |
1766 | } |
1767 | return Plugin::success(); |
1768 | } |
1769 | |
1770 | /// Destroy all memory pools. |
1771 | Error deinitMemoryPools() { |
1772 | for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) |
1773 | delete Pool; |
1774 | |
1775 | AllMemoryPools.clear(); |
1776 | FineGrainedMemoryPools.clear(); |
1777 | CoarseGrainedMemoryPools.clear(); |
1778 | ArgsMemoryPools.clear(); |
1779 | |
1780 | return Plugin::success(); |
1781 | } |
1782 | |
1783 | /// Retrieve and construct all memory pools from the device agent(s). |
1784 | virtual Error retrieveAllMemoryPools() = 0; |
1785 | |
1786 | /// Get the device agent. |
1787 | virtual hsa_agent_t getAgent() const = 0; |
1788 | |
1789 | protected: |
1790 | /// Array of all memory pools available to the host agents. |
1791 | llvm::SmallVector<AMDGPUMemoryPoolTy *> AllMemoryPools; |
1792 | |
1793 | /// Array of fine-grained memory pools available to the host agents. |
1794 | llvm::SmallVector<AMDGPUMemoryPoolTy *> FineGrainedMemoryPools; |
1795 | |
1796 | /// Array of coarse-grained memory pools available to the host agents. |
1797 | llvm::SmallVector<AMDGPUMemoryPoolTy *> CoarseGrainedMemoryPools; |
1798 | |
1799 | /// Array of kernel args memory pools available to the host agents. |
1800 | llvm::SmallVector<AMDGPUMemoryPoolTy *> ArgsMemoryPools; |
1801 | }; |
1802 | |
1803 | /// Class representing the host device. This host device may have more than one |
1804 | /// HSA host agent. We aggregate all its resources into the same instance. |
1805 | struct AMDHostDeviceTy : public AMDGenericDeviceTy { |
1806 | /// Create a host device from an array of host agents. |
1807 | AMDHostDeviceTy(AMDGPUPluginTy &Plugin, |
1808 | const llvm::SmallVector<hsa_agent_t> &HostAgents) |
1809 | : AMDGenericDeviceTy(), Agents(HostAgents), ArgsMemoryManager(Plugin), |
1810 | PinnedMemoryManager(Plugin) { |
1811 | assert(HostAgents.size() && "No host agent found" ); |
1812 | } |
1813 | |
1814 | /// Initialize the host device memory pools and the memory managers for |
1815 | /// kernel args and host pinned memory allocations. |
1816 | Error init() { |
1817 | if (auto Err = initMemoryPools()) |
1818 | return Err; |
1819 | |
1820 | if (auto Err = ArgsMemoryManager.init(getArgsMemoryPool())) |
1821 | return Err; |
1822 | |
1823 | if (auto Err = PinnedMemoryManager.init(getFineGrainedMemoryPool())) |
1824 | return Err; |
1825 | |
1826 | return Plugin::success(); |
1827 | } |
1828 | |
1829 | /// Deinitialize memory pools and managers. |
1830 | Error deinit() { |
1831 | if (auto Err = deinitMemoryPools()) |
1832 | return Err; |
1833 | |
1834 | if (auto Err = ArgsMemoryManager.deinit()) |
1835 | return Err; |
1836 | |
1837 | if (auto Err = PinnedMemoryManager.deinit()) |
1838 | return Err; |
1839 | |
1840 | return Plugin::success(); |
1841 | } |
1842 | |
1843 | /// Retrieve and construct all memory pools from the host agents. |
1844 | Error retrieveAllMemoryPools() override { |
1845 | // Iterate through the available pools across the host agents. |
1846 | for (hsa_agent_t Agent : Agents) { |
1847 | Error Err = hsa_utils::iterateAgentMemoryPools( |
1848 | Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) { |
1849 | AMDGPUMemoryPoolTy *MemoryPool = |
1850 | new AMDGPUMemoryPoolTy(HSAMemoryPool); |
1851 | AllMemoryPools.push_back(MemoryPool); |
1852 | return HSA_STATUS_SUCCESS; |
1853 | }); |
1854 | if (Err) |
1855 | return Err; |
1856 | } |
1857 | return Plugin::success(); |
1858 | } |
1859 | |
  /// Get one of the host agents. Always return the first agent.
1861 | hsa_agent_t getAgent() const override { return Agents[0]; } |
1862 | |
1863 | /// Get a memory pool for fine-grained allocations. |
1864 | AMDGPUMemoryPoolTy &getFineGrainedMemoryPool() { |
1865 | assert(!FineGrainedMemoryPools.empty() && "No fine-grained mempool" ); |
1866 | // Retrieve any memory pool. |
1867 | return *FineGrainedMemoryPools[0]; |
1868 | } |
1869 | |
  /// Get a memory pool for coarse-grained allocations.
  AMDGPUMemoryPoolTy &getCoarseGrainedMemoryPool() {
1871 | assert(!CoarseGrainedMemoryPools.empty() && "No coarse-grained mempool" ); |
1872 | // Retrieve any memory pool. |
1873 | return *CoarseGrainedMemoryPools[0]; |
1874 | } |
1875 | |
1876 | /// Get a memory pool for kernel args allocations. |
1877 | AMDGPUMemoryPoolTy &getArgsMemoryPool() { |
1878 | assert(!ArgsMemoryPools.empty() && "No kernelargs mempool" ); |
1879 | // Retrieve any memory pool. |
1880 | return *ArgsMemoryPools[0]; |
1881 | } |
1882 | |
1883 | /// Getters for kernel args and host pinned memory managers. |
1884 | AMDGPUMemoryManagerTy &getArgsMemoryManager() { return ArgsMemoryManager; } |
1885 | AMDGPUMemoryManagerTy &getPinnedMemoryManager() { |
1886 | return PinnedMemoryManager; |
1887 | } |
1888 | |
1889 | private: |
1890 | /// Array of agents on the host side. |
1891 | const llvm::SmallVector<hsa_agent_t> Agents; |
1892 | |
1893 | // Memory manager for kernel arguments. |
1894 | AMDGPUMemoryManagerTy ArgsMemoryManager; |
1895 | |
1896 | // Memory manager for pinned memory. |
1897 | AMDGPUMemoryManagerTy PinnedMemoryManager; |
1898 | }; |
1899 | |
1900 | /// Class implementing the AMDGPU device functionalities which derives from the |
1901 | /// generic device class. |
1902 | struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { |
1903 | // Create an AMDGPU device with a device id and default AMDGPU grid values. |
1904 | AMDGPUDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices, |
1905 | AMDHostDeviceTy &HostDevice, hsa_agent_t Agent) |
1906 | : GenericDeviceTy(Plugin, DeviceId, NumDevices, {}), AMDGenericDeviceTy(), |
1907 | OMPX_NumQueues("LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES" , 4), |
1908 | OMPX_QueueSize("LIBOMPTARGET_AMDGPU_HSA_QUEUE_SIZE" , 512), |
1909 | OMPX_DefaultTeamsPerCU("LIBOMPTARGET_AMDGPU_TEAMS_PER_CU" , 4), |
1910 | OMPX_MaxAsyncCopyBytes("LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES" , |
1911 | 1 * 1024 * 1024), // 1MB |
1912 | OMPX_InitialNumSignals("LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS" , |
1913 | 64), |
1914 | OMPX_StreamBusyWait("LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT" , 2000000), |
1915 | OMPX_UseMultipleSdmaEngines( |
1916 | "LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES" , false), |
1917 | OMPX_ApuMaps("OMPX_APU_MAPS" , false), AMDGPUStreamManager(*this, Agent), |
1918 | AMDGPUEventManager(*this), AMDGPUSignalManager(*this), Agent(Agent), |
1919 | HostDevice(HostDevice) {} |
1920 | |
1921 | ~AMDGPUDeviceTy() {} |
1922 | |
1923 | /// Initialize the device, its resources and get its properties. |
1924 | Error initImpl(GenericPluginTy &Plugin) override { |
1925 | // First setup all the memory pools. |
1926 | if (auto Err = initMemoryPools()) |
1927 | return Err; |
1928 | |
1929 | char GPUName[64]; |
1930 | if (auto Err = getDeviceAttr(HSA_AGENT_INFO_NAME, GPUName)) |
1931 | return Err; |
1932 | ComputeUnitKind = GPUName; |
1933 | |
1934 | // Get the wavefront size. |
1935 | uint32_t WavefrontSize = 0; |
1936 | if (auto Err = getDeviceAttr(HSA_AGENT_INFO_WAVEFRONT_SIZE, WavefrontSize)) |
1937 | return Err; |
1938 | GridValues.GV_Warp_Size = WavefrontSize; |
1939 | |
    // Get the frequency of the steady clock. If the attribute is missing,
    // assume an older libhsa and default to 0; omp_get_wtime will be
    // inaccurate, but programs can otherwise still run.
1943 | if (getDeviceAttrRaw(HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY, |
1944 | ClockFrequency) != HSA_STATUS_SUCCESS) |
1945 | ClockFrequency = 0; |
1946 | |
1947 | // Load the grid values depending on the wavefront. |
1948 | if (WavefrontSize == 32) |
1949 | GridValues = getAMDGPUGridValues<32>(); |
1950 | else if (WavefrontSize == 64) |
1951 | GridValues = getAMDGPUGridValues<64>(); |
1952 | else |
1953 | return Plugin::error(ErrorCode::UNSUPPORTED, |
1954 | "unexpected AMDGPU wavefront %d" , WavefrontSize); |
1955 | |
1956 | // Get maximum number of workitems per workgroup. |
1957 | uint16_t WorkgroupMaxDim[3]; |
1958 | if (auto Err = |
1959 | getDeviceAttr(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgroupMaxDim)) |
1960 | return Err; |
1961 | GridValues.GV_Max_WG_Size = WorkgroupMaxDim[0]; |
1962 | |
1963 | // Get maximum number of workgroups. |
1964 | hsa_dim3_t GridMaxDim; |
1965 | if (auto Err = getDeviceAttr(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim)) |
1966 | return Err; |
1967 | |
1968 | GridValues.GV_Max_Teams = GridMaxDim.x / GridValues.GV_Max_WG_Size; |
1969 | if (GridValues.GV_Max_Teams == 0) |
1970 | return Plugin::error(ErrorCode::INVALID_ARGUMENT, |
1971 | "maximum number of teams cannot be zero" ); |
1972 | |
1973 | // Compute the default number of teams. |
1974 | uint32_t ComputeUnits = 0; |
1975 | if (auto Err = |
1976 | getDeviceAttr(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, ComputeUnits)) |
1977 | return Err; |
1978 | GridValues.GV_Default_Num_Teams = ComputeUnits * OMPX_DefaultTeamsPerCU; |
1979 | |
1980 | uint32_t WavesPerCU = 0; |
1981 | if (auto Err = |
1982 | getDeviceAttr(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, WavesPerCU)) |
1983 | return Err; |
1984 | HardwareParallelism = ComputeUnits * WavesPerCU; |
1985 | |
1986 | // Get maximum size of any device queues and maximum number of queues. |
1987 | uint32_t MaxQueueSize; |
1988 | if (auto Err = getDeviceAttr(HSA_AGENT_INFO_QUEUE_MAX_SIZE, MaxQueueSize)) |
1989 | return Err; |
1990 | |
1991 | uint32_t MaxQueues; |
1992 | if (auto Err = getDeviceAttr(HSA_AGENT_INFO_QUEUES_MAX, MaxQueues)) |
1993 | return Err; |
1994 | |
1995 | // Compute the number of queues and their size. |
1996 | OMPX_NumQueues = std::max(1U, std::min(OMPX_NumQueues.get(), MaxQueues)); |
1997 | OMPX_QueueSize = std::min(OMPX_QueueSize.get(), MaxQueueSize); |
1998 | |
1999 | // Initialize stream pool. |
2000 | if (auto Err = AMDGPUStreamManager.init(OMPX_InitialNumStreams, |
2001 | OMPX_NumQueues, OMPX_QueueSize)) |
2002 | return Err; |
2003 | |
2004 | // Initialize event pool. |
2005 | if (auto Err = AMDGPUEventManager.init(OMPX_InitialNumEvents)) |
2006 | return Err; |
2007 | |
2008 | // Initialize signal pool. |
2009 | if (auto Err = AMDGPUSignalManager.init(OMPX_InitialNumSignals)) |
2010 | return Err; |
2011 | |
2012 | // Detect if XNACK is enabled |
2013 | SmallVector<SmallString<32>> Targets; |
2014 | if (auto Err = hsa_utils::getTargetTripleAndFeatures(Agent, Targets)) |
2015 | return Err; |
2016 | if (!Targets.empty() && Targets[0].str().contains("xnack+" )) |
2017 | IsXnackEnabled = true; |
2018 | |
    // Detect whether the device is an APU.
2020 | if (auto Err = checkIfAPU()) |
2021 | return Err; |
2022 | |
2023 | return Plugin::success(); |
2024 | } |
2025 | |
2026 | /// Deinitialize the device and release its resources. |
2027 | Error deinitImpl() override { |
2028 | // Deinitialize the stream and event pools. |
2029 | if (auto Err = AMDGPUStreamManager.deinit()) |
2030 | return Err; |
2031 | |
2032 | if (auto Err = AMDGPUEventManager.deinit()) |
2033 | return Err; |
2034 | |
2035 | if (auto Err = AMDGPUSignalManager.deinit()) |
2036 | return Err; |
2037 | |
2038 | // Close modules if necessary. |
2039 | if (!LoadedImages.empty()) { |
2040 | // Each image has its own module. |
2041 | for (DeviceImageTy *Image : LoadedImages) { |
2042 | AMDGPUDeviceImageTy &AMDImage = |
2043 | static_cast<AMDGPUDeviceImageTy &>(*Image); |
2044 | |
2045 | // Unload the executable of the image. |
2046 | if (auto Err = AMDImage.unloadExecutable()) |
2047 | return Err; |
2048 | } |
2049 | } |
2050 | |
2051 | // Invalidate agent reference. |
2052 | Agent = {0}; |
2053 | |
2054 | return Plugin::success(); |
2055 | } |
2056 | |
2057 | virtual Error callGlobalConstructors(GenericPluginTy &Plugin, |
2058 | DeviceImageTy &Image) override { |
2059 | GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler(); |
2060 | if (Handler.isSymbolInImage(*this, Image, "amdgcn.device.fini" )) |
2061 | Image.setPendingGlobalDtors(); |
2062 | |
2063 | return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/true); |
2064 | } |
2065 | |
2066 | virtual Error callGlobalDestructors(GenericPluginTy &Plugin, |
2067 | DeviceImageTy &Image) override { |
2068 | if (Image.hasPendingGlobalDtors()) |
2069 | return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/false); |
2070 | return Plugin::success(); |
2071 | } |
2072 | |
2073 | uint64_t getStreamBusyWaitMicroseconds() const { return OMPX_StreamBusyWait; } |
2074 | |
2075 | Expected<std::unique_ptr<MemoryBuffer>> |
2076 | doJITPostProcessing(std::unique_ptr<MemoryBuffer> MB) const override { |
2077 | |
2078 | // TODO: We should try to avoid materialization but there seems to be no |
2079 | // good linker interface w/o file i/o. |
2080 | SmallString<128> LinkerInputFilePath; |
2081 | std::error_code EC = sys::fs::createTemporaryFile("amdgpu-pre-link-jit" , |
2082 | "o" , LinkerInputFilePath); |
2083 | if (EC) |
2084 | return Plugin::error(ErrorCode::HOST_IO, |
2085 | "failed to create temporary file for linker" ); |
2086 | |
2087 | // Write the file's contents to the output file. |
2088 | Expected<std::unique_ptr<FileOutputBuffer>> OutputOrErr = |
2089 | FileOutputBuffer::create(LinkerInputFilePath, MB->getBuffer().size()); |
2090 | if (!OutputOrErr) |
2091 | return OutputOrErr.takeError(); |
2092 | std::unique_ptr<FileOutputBuffer> Output = std::move(*OutputOrErr); |
    llvm::copy(MB->getBuffer(), Output->getBufferStart());
2094 | if (Error E = Output->commit()) |
2095 | return std::move(E); |
2096 | |
2097 | SmallString<128> LinkerOutputFilePath; |
2098 | EC = sys::fs::createTemporaryFile("amdgpu-pre-link-jit" , "so" , |
2099 | LinkerOutputFilePath); |
2100 | if (EC) |
2101 | return Plugin::error(ErrorCode::HOST_IO, |
2102 | "failed to create temporary file for linker" ); |
2103 | |
2104 | const auto &ErrorOrPath = sys::findProgramByName("lld" ); |
2105 | if (!ErrorOrPath) |
2106 | return createStringError(ErrorCode::HOST_TOOL_NOT_FOUND, |
2107 | "failed to find `lld` on the PATH." ); |
2108 | |
2109 | std::string LLDPath = ErrorOrPath.get(); |
2110 | INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(), |
2111 | "Using `%s` to link JITed amdgcn output." , LLDPath.c_str()); |
2112 | |
2113 | std::string MCPU = "-plugin-opt=mcpu=" + getComputeUnitKind(); |
2114 | StringRef Args[] = {LLDPath, |
2115 | "-flavor" , |
2116 | "gnu" , |
2117 | "--no-undefined" , |
2118 | "-shared" , |
2119 | MCPU, |
2120 | "-o" , |
2121 | LinkerOutputFilePath.data(), |
2122 | LinkerInputFilePath.data()}; |
2123 | |
2124 | std::string Error; |
2125 | int RC = sys::ExecuteAndWait(LLDPath, Args, std::nullopt, {}, 0, 0, &Error); |
2126 | if (RC) |
2127 | return Plugin::error(ErrorCode::LINK_FAILURE, |
2128 | "linking optimized bitcode failed: %s" , |
2129 | Error.c_str()); |
2130 | |
2131 | auto BufferOrErr = MemoryBuffer::getFileOrSTDIN(LinkerOutputFilePath); |
2132 | if (!BufferOrErr) |
2133 | return Plugin::error(ErrorCode::HOST_IO, |
2134 | "failed to open temporary file for lld" ); |
2135 | |
2136 | // Clean up the temporary files afterwards. |
2137 | if (sys::fs::remove(LinkerOutputFilePath)) |
2138 | return Plugin::error(ErrorCode::HOST_IO, |
2139 | "failed to remove temporary output file for lld" ); |
2140 | if (sys::fs::remove(LinkerInputFilePath)) |
2141 | return Plugin::error(ErrorCode::HOST_IO, |
2142 | "failed to remove temporary input file for lld" ); |
2143 | |
2144 | return std::move(*BufferOrErr); |
2145 | } |
2146 | |
2147 | /// See GenericDeviceTy::getComputeUnitKind(). |
2148 | std::string getComputeUnitKind() const override { return ComputeUnitKind; } |
2149 | |
2150 | /// Returns the clock frequency for the given AMDGPU device. |
2151 | uint64_t getClockFrequency() const override { return ClockFrequency; } |
2152 | |
2153 | /// Allocate and construct an AMDGPU kernel. |
2154 | Expected<GenericKernelTy &> constructKernel(const char *Name) override { |
2155 | // Allocate and construct the AMDGPU kernel. |
2156 | AMDGPUKernelTy *AMDGPUKernel = Plugin.allocate<AMDGPUKernelTy>(); |
2157 | if (!AMDGPUKernel) |
2158 | return Plugin::error(ErrorCode::OUT_OF_RESOURCES, |
2159 | "failed to allocate memory for AMDGPU kernel" ); |
2160 | |
2161 | new (AMDGPUKernel) AMDGPUKernelTy(Name); |
2162 | |
2163 | return *AMDGPUKernel; |
2164 | } |
2165 | |
2166 | /// Set the current context to this device's context. Do nothing since the |
2167 | /// AMDGPU devices do not have the concept of contexts. |
2168 | Error setContext() override { return Plugin::success(); } |
2169 | |
2170 | /// AMDGPU returns the product of the number of compute units and the waves |
2171 | /// per compute unit. |
2172 | uint64_t getHardwareParallelism() const override { |
2173 | return HardwareParallelism; |
2174 | } |
2175 | |
2176 | /// We want to set up the RPC server for host services to the GPU if it is |
2177 | /// available. |
2178 | bool shouldSetupRPCServer() const override { return true; } |
2179 | |
2180 | /// The RPC interface should have enough space for all available parallelism. |
2181 | uint64_t requestedRPCPortCount() const override { |
2182 | return getHardwareParallelism(); |
2183 | } |
2184 | |
2185 | /// Get the stream of the asynchronous info structure or get a new one. |
2186 | Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper, |
2187 | AMDGPUStreamTy *&Stream) { |
2188 | // Get the stream (if any) from the async info. |
2189 | Stream = AsyncInfoWrapper.getQueueAs<AMDGPUStreamTy *>(); |
2190 | if (!Stream) { |
2191 | // There was no stream; get an idle one. |
2192 | if (auto Err = AMDGPUStreamManager.getResource(Stream)) |
2193 | return Err; |
2194 | |
2195 | // Modify the async info's stream. |
2196 | AsyncInfoWrapper.setQueueAs<AMDGPUStreamTy *>(Stream); |
2197 | } |
2198 | return Plugin::success(); |
2199 | } |
2200 | |
2201 | /// Load the binary image into the device and allocate an image object. |
2202 | Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage, |
2203 | int32_t ImageId) override { |
2204 | // Allocate and initialize the image object. |
2205 | AMDGPUDeviceImageTy *AMDImage = Plugin.allocate<AMDGPUDeviceImageTy>(); |
2206 | new (AMDImage) AMDGPUDeviceImageTy(ImageId, *this, TgtImage); |
2207 | |
2208 | // Load the HSA executable. |
    if (Error Err = AMDImage->loadExecutable(*this))
2210 | return std::move(Err); |
2211 | |
2212 | return AMDImage; |
2213 | } |
2214 | |
2215 | /// Allocate memory on the device or related to the device. |
2216 | void *allocate(size_t Size, void *, TargetAllocTy Kind) override; |
2217 | |
2218 | /// Deallocate memory on the device or related to the device. |
2219 | int free(void *TgtPtr, TargetAllocTy Kind) override { |
2220 | if (TgtPtr == nullptr) |
2221 | return OFFLOAD_SUCCESS; |
2222 | |
2223 | AMDGPUMemoryPoolTy *MemoryPool = nullptr; |
2224 | switch (Kind) { |
2225 | case TARGET_ALLOC_DEFAULT: |
2226 | case TARGET_ALLOC_DEVICE: |
2227 | case TARGET_ALLOC_DEVICE_NON_BLOCKING: |
2228 | MemoryPool = CoarseGrainedMemoryPools[0]; |
2229 | break; |
2230 | case TARGET_ALLOC_HOST: |
2231 | MemoryPool = &HostDevice.getFineGrainedMemoryPool(); |
2232 | break; |
2233 | case TARGET_ALLOC_SHARED: |
2234 | MemoryPool = &HostDevice.getFineGrainedMemoryPool(); |
2235 | break; |
2236 | } |
2237 | |
2238 | if (!MemoryPool) { |
2239 | REPORT("No memory pool for the specified allocation kind\n" ); |
2240 | return OFFLOAD_FAIL; |
2241 | } |
2242 | |
    if (Error Err = MemoryPool->deallocate(TgtPtr)) {
      REPORT("%s\n", toString(std::move(Err)).data());
2245 | return OFFLOAD_FAIL; |
2246 | } |
2247 | |
2248 | return OFFLOAD_SUCCESS; |
2249 | } |
2250 | |
2251 | /// Synchronize current thread with the pending operations on the async info. |
2252 | Error synchronizeImpl(__tgt_async_info &AsyncInfo) override { |
2253 | AMDGPUStreamTy *Stream = |
2254 | reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue); |
2255 | assert(Stream && "Invalid stream" ); |
2256 | |
2257 | if (auto Err = Stream->synchronize()) |
2258 | return Err; |
2259 | |
2260 | // Once the stream is synchronized, return it to stream pool and reset |
2261 | // AsyncInfo. This is to make sure the synchronization only works for its |
2262 | // own tasks. |
2263 | AsyncInfo.Queue = nullptr; |
2264 | return AMDGPUStreamManager.returnResource(Stream); |
2265 | } |
2266 | |
2267 | /// Query for the completion of the pending operations on the async info. |
2268 | Error queryAsyncImpl(__tgt_async_info &AsyncInfo) override { |
2269 | AMDGPUStreamTy *Stream = |
2270 | reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue); |
2271 | assert(Stream && "Invalid stream" ); |
2272 | |
2273 | auto CompletedOrErr = Stream->query(); |
2274 | if (!CompletedOrErr) |
2275 | return CompletedOrErr.takeError(); |
2276 | |
    // Return if the stream did not complete yet.
2278 | if (!(*CompletedOrErr)) |
2279 | return Plugin::success(); |
2280 | |
2281 | // Once the stream is completed, return it to stream pool and reset |
2282 | // AsyncInfo. This is to make sure the synchronization only works for its |
2283 | // own tasks. |
2284 | AsyncInfo.Queue = nullptr; |
2285 | return AMDGPUStreamManager.returnResource(Stream); |
2286 | } |
2287 | |
2288 | /// Pin the host buffer and return the device pointer that should be used for |
2289 | /// device transfers. |
2290 | Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override { |
2291 | void *PinnedPtr = nullptr; |
2292 | |
2293 | hsa_status_t Status = |
2294 | hsa_amd_memory_lock(HstPtr, Size, nullptr, 0, &PinnedPtr); |
2295 | if (auto Err = Plugin::check(Status, "error in hsa_amd_memory_lock: %s\n" )) |
2296 | return std::move(Err); |
2297 | |
2298 | return PinnedPtr; |
2299 | } |
2300 | |
2301 | /// Unpin the host buffer. |
2302 | Error dataUnlockImpl(void *HstPtr) override { |
2303 | hsa_status_t Status = hsa_amd_memory_unlock(HstPtr); |
2304 | return Plugin::check(Status, "error in hsa_amd_memory_unlock: %s\n" ); |
2305 | } |
2306 | |
2307 | /// Check through the HSA runtime whether the \p HstPtr buffer is pinned. |
2308 | Expected<bool> isPinnedPtrImpl(void *HstPtr, void *&BaseHstPtr, |
2309 | void *&BaseDevAccessiblePtr, |
2310 | size_t &BaseSize) const override { |
2311 | hsa_amd_pointer_info_t Info; |
2312 | Info.size = sizeof(hsa_amd_pointer_info_t); |
2313 | |
2314 | hsa_status_t Status = hsa_amd_pointer_info( |
2315 | HstPtr, &Info, /*Allocator=*/nullptr, /*num_agents_accessible=*/nullptr, |
2316 | /*accessible=*/nullptr); |
2317 | if (auto Err = Plugin::check(Status, "error in hsa_amd_pointer_info: %s" )) |
2318 | return std::move(Err); |
2319 | |
    // The buffer may be locked or allocated through HSA allocators. Assume
    // that the buffer is host pinned if the runtime reports an HSA type.
2322 | if (Info.type != HSA_EXT_POINTER_TYPE_LOCKED && |
2323 | Info.type != HSA_EXT_POINTER_TYPE_HSA) |
2324 | return false; |
2325 | |
2326 | assert(Info.hostBaseAddress && "Invalid host pinned address" ); |
2327 | assert(Info.agentBaseAddress && "Invalid agent pinned address" ); |
2328 | assert(Info.sizeInBytes > 0 && "Invalid pinned allocation size" ); |
2329 | |
2330 | // Save the allocation info in the output parameters. |
2331 | BaseHstPtr = Info.hostBaseAddress; |
2332 | BaseDevAccessiblePtr = Info.agentBaseAddress; |
2333 | BaseSize = Info.sizeInBytes; |
2334 | |
2335 | return true; |
2336 | } |
2337 | |
2338 | /// Submit data to the device (host to device transfer). |
2339 | Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size, |
2340 | AsyncInfoWrapperTy &AsyncInfoWrapper) override { |
2341 | AMDGPUStreamTy *Stream = nullptr; |
2342 | void *PinnedPtr = nullptr; |
2343 | |
2344 | // Use one-step asynchronous operation when host memory is already pinned. |
2345 | if (void *PinnedPtr = |
2346 | PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) { |
2347 | if (auto Err = getStream(AsyncInfoWrapper, Stream)) |
2348 | return Err; |
      return Stream->pushPinnedMemoryCopyAsync(TgtPtr, PinnedPtr, Size);
2350 | } |
2351 | |
2352 | // For large transfers use synchronous behavior. |
2353 | if (Size >= OMPX_MaxAsyncCopyBytes) { |
2354 | if (AsyncInfoWrapper.hasQueue()) |
2355 | if (auto Err = synchronize(AsyncInfoWrapper)) |
2356 | return Err; |
2357 | |
2358 | hsa_status_t Status; |
2359 | Status = hsa_amd_memory_lock(const_cast<void *>(HstPtr), Size, nullptr, 0, |
2360 | &PinnedPtr); |
2361 | if (auto Err = |
2362 | Plugin::check(Status, "error in hsa_amd_memory_lock: %s\n" )) |
2363 | return Err; |
2364 | |
2365 | AMDGPUSignalTy Signal; |
2366 | if (auto Err = Signal.init()) |
2367 | return Err; |
2368 | |
2369 | if (auto Err = hsa_utils::asyncMemCopy(useMultipleSdmaEngines(), TgtPtr, |
2370 | Agent, PinnedPtr, Agent, Size, 0, |
2371 | nullptr, Signal.get())) |
2372 | return Err; |
2373 | |
2374 | if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds())) |
2375 | return Err; |
2376 | |
2377 | if (auto Err = Signal.deinit()) |
2378 | return Err; |
2379 | |
2380 | Status = hsa_amd_memory_unlock(const_cast<void *>(HstPtr)); |
2381 | return Plugin::check(Status, "error in hsa_amd_memory_unlock: %s\n" ); |
2382 | } |
2383 | |
2384 | // Otherwise, use two-step copy with an intermediate pinned host buffer. |
2385 | AMDGPUMemoryManagerTy &PinnedMemoryManager = |
2386 | HostDevice.getPinnedMemoryManager(); |
    if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedPtr))
2388 | return Err; |
2389 | |
2390 | if (auto Err = getStream(AsyncInfoWrapper, Stream)) |
2391 | return Err; |
2392 | |
    return Stream->pushMemoryCopyH2DAsync(TgtPtr, HstPtr, PinnedPtr, Size,
                                           PinnedMemoryManager);
2395 | } |
2396 | |
2397 | /// Retrieve data from the device (device to host transfer). |
2398 | Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size, |
2399 | AsyncInfoWrapperTy &AsyncInfoWrapper) override { |
2400 | AMDGPUStreamTy *Stream = nullptr; |
2401 | void *PinnedPtr = nullptr; |
2402 | |
2403 | // Use one-step asynchronous operation when host memory is already pinned. |
2404 | if (void *PinnedPtr = |
2405 | PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) { |
2406 | if (auto Err = getStream(AsyncInfoWrapper, Stream)) |
2407 | return Err; |
2408 | |
      return Stream->pushPinnedMemoryCopyAsync(PinnedPtr, TgtPtr, Size);
2410 | } |
2411 | |
2412 | // For large transfers use synchronous behavior. |
2413 | if (Size >= OMPX_MaxAsyncCopyBytes) { |
2414 | if (AsyncInfoWrapper.hasQueue()) |
2415 | if (auto Err = synchronize(AsyncInfoWrapper)) |
2416 | return Err; |
2417 | |
2418 | hsa_status_t Status; |
2419 | Status = hsa_amd_memory_lock(const_cast<void *>(HstPtr), Size, nullptr, 0, |
2420 | &PinnedPtr); |
2421 | if (auto Err = |
2422 | Plugin::check(Status, "error in hsa_amd_memory_lock: %s\n" )) |
2423 | return Err; |
2424 | |
2425 | AMDGPUSignalTy Signal; |
2426 | if (auto Err = Signal.init()) |
2427 | return Err; |
2428 | |
2429 | if (auto Err = hsa_utils::asyncMemCopy(useMultipleSdmaEngines(), |
2430 | PinnedPtr, Agent, TgtPtr, Agent, |
2431 | Size, 0, nullptr, Signal.get())) |
2432 | return Err; |
2433 | |
2434 | if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds())) |
2435 | return Err; |
2436 | |
2437 | if (auto Err = Signal.deinit()) |
2438 | return Err; |
2439 | |
2440 | Status = hsa_amd_memory_unlock(const_cast<void *>(HstPtr)); |
2441 | return Plugin::check(Status, "error in hsa_amd_memory_unlock: %s\n" ); |
2442 | } |
2443 | |
2444 | // Otherwise, use two-step copy with an intermediate pinned host buffer. |
2445 | AMDGPUMemoryManagerTy &PinnedMemoryManager = |
2446 | HostDevice.getPinnedMemoryManager(); |
    if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedPtr))
2448 | return Err; |
2449 | |
2450 | if (auto Err = getStream(AsyncInfoWrapper, Stream)) |
2451 | return Err; |
2452 | |
    return Stream->pushMemoryCopyD2HAsync(HstPtr, TgtPtr, PinnedPtr, Size,
                                           PinnedMemoryManager);
2455 | } |
2456 | |
2457 | /// Exchange data between two devices within the plugin. |
2458 | Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice, |
2459 | void *DstPtr, int64_t Size, |
2460 | AsyncInfoWrapperTy &AsyncInfoWrapper) override { |
2461 | AMDGPUDeviceTy &DstDevice = static_cast<AMDGPUDeviceTy &>(DstGenericDevice); |
2462 | |
2463 | // For large transfers use synchronous behavior. |
2464 | if (Size >= OMPX_MaxAsyncCopyBytes) { |
2465 | if (AsyncInfoWrapper.hasQueue()) |
2466 | if (auto Err = synchronize(AsyncInfoWrapper)) |
2467 | return Err; |
2468 | |
2469 | AMDGPUSignalTy Signal; |
2470 | if (auto Err = Signal.init()) |
2471 | return Err; |
2472 | |
2473 | if (auto Err = hsa_utils::asyncMemCopy( |
2474 | useMultipleSdmaEngines(), DstPtr, DstDevice.getAgent(), SrcPtr, |
2475 | getAgent(), (uint64_t)Size, 0, nullptr, Signal.get())) |
2476 | return Err; |
2477 | |
2478 | if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds())) |
2479 | return Err; |
2480 | |
2481 | return Signal.deinit(); |
2482 | } |
2483 | |
2484 | AMDGPUStreamTy *Stream = nullptr; |
2485 | if (auto Err = getStream(AsyncInfoWrapper, Stream)) |
2486 | return Err; |
2487 | if (Size <= 0) |
2488 | return Plugin::success(); |
2489 | |
2490 | return Stream->pushMemoryCopyD2DAsync(DstPtr, DstDevice.getAgent(), SrcPtr, |
2491 | getAgent(), (uint64_t)Size); |
2492 | } |
2493 | |
2494 | /// Initialize the async info for interoperability purposes. |
2495 | Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override { |
2496 | // TODO: Implement this function. |
2497 | return Plugin::success(); |
2498 | } |
2499 | |
2500 | /// Initialize the device info for interoperability purposes. |
2501 | Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override { |
2502 | DeviceInfo->Context = nullptr; |
2503 | |
2504 | if (!DeviceInfo->Device) |
2505 | DeviceInfo->Device = reinterpret_cast<void *>(Agent.handle); |
2506 | |
2507 | return Plugin::success(); |
2508 | } |
2509 | |
2510 | /// Create an event. |
2511 | Error createEventImpl(void **EventPtrStorage) override { |
2512 | AMDGPUEventTy **Event = reinterpret_cast<AMDGPUEventTy **>(EventPtrStorage); |
2513 | return AMDGPUEventManager.getResource(*Event); |
2514 | } |
2515 | |
2516 | /// Destroy a previously created event. |
2517 | Error destroyEventImpl(void *EventPtr) override { |
2518 | AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr); |
2519 | return AMDGPUEventManager.returnResource(Event); |
2520 | } |
2521 | |
2522 | /// Record the event. |
2523 | Error recordEventImpl(void *EventPtr, |
2524 | AsyncInfoWrapperTy &AsyncInfoWrapper) override { |
2525 | AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr); |
2526 | assert(Event && "Invalid event" ); |
2527 | |
2528 | AMDGPUStreamTy *Stream = nullptr; |
2529 | if (auto Err = getStream(AsyncInfoWrapper, Stream)) |
2530 | return Err; |
2531 | |
    return Event->record(*Stream);
2533 | } |
2534 | |
2535 | /// Make the stream wait on the event. |
2536 | Error waitEventImpl(void *EventPtr, |
2537 | AsyncInfoWrapperTy &AsyncInfoWrapper) override { |
2538 | AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr); |
2539 | |
2540 | AMDGPUStreamTy *Stream = nullptr; |
2541 | if (auto Err = getStream(AsyncInfoWrapper, Stream)) |
2542 | return Err; |
2543 | |
    return Event->wait(*Stream);
2545 | } |
2546 | |
2547 | /// Synchronize the current thread with the event. |
2548 | Error syncEventImpl(void *EventPtr) override { |
2549 | return Plugin::error(ErrorCode::UNIMPLEMENTED, |
2550 | "synchronize event not implemented" ); |
2551 | } |
2552 | |
2553 | /// Print information about the device. |
2554 | Error obtainInfoImpl(InfoQueueTy &Info) override { |
2555 | char TmpChar[1000]; |
2556 | const char *TmpCharPtr = "Unknown" ; |
2557 | uint16_t Major, Minor; |
2558 | uint32_t TmpUInt, TmpUInt2; |
2559 | uint32_t CacheSize[4]; |
2560 | size_t TmpSt; |
2561 | bool TmpBool; |
2562 | uint16_t WorkgrpMaxDim[3]; |
2563 | hsa_dim3_t GridMaxDim; |
2564 | hsa_status_t Status, Status2; |
2565 | |
2566 | Status = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major); |
2567 | Status2 = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor); |
2568 | if (Status == HSA_STATUS_SUCCESS && Status2 == HSA_STATUS_SUCCESS) |
      Info.add("HSA Runtime Version",
               std::to_string(Major) + "." + std::to_string(Minor));
2571 | |
2572 | Info.add("HSA OpenMP Device Number" , DeviceId); |
2573 | |
2574 | Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar); |
2575 | if (Status == HSA_STATUS_SUCCESS) |
2576 | Info.add("Product Name" , TmpChar); |
2577 | |
2578 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_NAME, TmpChar); |
2579 | if (Status == HSA_STATUS_SUCCESS) |
2580 | Info.add("Device Name" , TmpChar); |
2581 | |
2582 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_VENDOR_NAME, TmpChar); |
2583 | if (Status == HSA_STATUS_SUCCESS) |
2584 | Info.add("Vendor Name" , TmpChar); |
2585 | |
2586 | hsa_device_type_t DevType; |
2587 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_DEVICE, DevType); |
2588 | if (Status == HSA_STATUS_SUCCESS) { |
2589 | switch (DevType) { |
2590 | case HSA_DEVICE_TYPE_CPU: |
2591 | TmpCharPtr = "CPU" ; |
2592 | break; |
2593 | case HSA_DEVICE_TYPE_GPU: |
2594 | TmpCharPtr = "GPU" ; |
2595 | break; |
2596 | case HSA_DEVICE_TYPE_DSP: |
2597 | TmpCharPtr = "DSP" ; |
2598 | break; |
2599 | } |
2600 | Info.add("Device Type" , TmpCharPtr); |
2601 | } |
2602 | |
2603 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUES_MAX, TmpUInt); |
2604 | if (Status == HSA_STATUS_SUCCESS) |
2605 | Info.add("Max Queues" , TmpUInt); |
2606 | |
2607 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUE_MIN_SIZE, TmpUInt); |
2608 | if (Status == HSA_STATUS_SUCCESS) |
2609 | Info.add("Queue Min Size" , TmpUInt); |
2610 | |
2611 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUE_MAX_SIZE, TmpUInt); |
2612 | if (Status == HSA_STATUS_SUCCESS) |
2613 | Info.add("Queue Max Size" , TmpUInt); |
2614 | |
2615 | // FIXME: This is deprecated according to HSA documentation. But using |
2616 | // hsa_agent_iterate_caches and hsa_cache_get_info breaks execution during |
2617 | // runtime. |
2618 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_CACHE_SIZE, CacheSize); |
2619 | if (Status == HSA_STATUS_SUCCESS) { |
2620 | Info.add("Cache" ); |
2621 | |
2622 | for (int I = 0; I < 4; I++) |
2623 | if (CacheSize[I]) |
2624 | Info.add<InfoLevel2>("L" + std::to_string(I), CacheSize[I]); |
2625 | } |
2626 | |
2627 | Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_CACHELINE_SIZE, TmpUInt); |
2628 | if (Status == HSA_STATUS_SUCCESS) |
2629 | Info.add("Cacheline Size" , TmpUInt); |
2630 | |
2631 | Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, TmpUInt); |
2632 | if (Status == HSA_STATUS_SUCCESS) |
2633 | Info.add("Max Clock Freq" , TmpUInt, "MHz" ); |
2634 | |
2635 | Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, TmpUInt); |
2636 | if (Status == HSA_STATUS_SUCCESS) |
2637 | Info.add("Compute Units" , TmpUInt); |
2638 | |
2639 | Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU, TmpUInt); |
2640 | if (Status == HSA_STATUS_SUCCESS) |
2641 | Info.add("SIMD per CU" , TmpUInt); |
2642 | |
2643 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_FAST_F16_OPERATION, TmpBool); |
2644 | if (Status == HSA_STATUS_SUCCESS) |
2645 | Info.add("Fast F16 Operation" , TmpBool); |
2646 | |
2647 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_WAVEFRONT_SIZE, TmpUInt2); |
2648 | if (Status == HSA_STATUS_SUCCESS) |
2649 | Info.add("Wavefront Size" , TmpUInt2); |
2650 | |
2651 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, TmpUInt); |
2652 | if (Status == HSA_STATUS_SUCCESS) |
2653 | Info.add("Workgroup Max Size" , TmpUInt); |
2654 | |
2655 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgrpMaxDim); |
2656 | if (Status == HSA_STATUS_SUCCESS) { |
2657 | Info.add("Workgroup Max Size per Dimension" ); |
2658 | Info.add<InfoLevel2>("x" , WorkgrpMaxDim[0]); |
2659 | Info.add<InfoLevel2>("y" , WorkgrpMaxDim[1]); |
2660 | Info.add<InfoLevel2>("z" , WorkgrpMaxDim[2]); |
2661 | } |
2662 | |
2663 | Status = getDeviceAttrRaw( |
2664 | (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, TmpUInt); |
2665 | if (Status == HSA_STATUS_SUCCESS) { |
2666 | Info.add("Max Waves Per CU" , TmpUInt); |
2667 | Info.add("Max Work-item Per CU" , TmpUInt * TmpUInt2); |
2668 | } |
2669 | |
2670 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_SIZE, TmpUInt); |
2671 | if (Status == HSA_STATUS_SUCCESS) |
2672 | Info.add("Grid Max Size" , TmpUInt); |
2673 | |
2674 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim); |
2675 | if (Status == HSA_STATUS_SUCCESS) { |
2676 | Info.add("Grid Max Size per Dimension" ); |
2677 | Info.add<InfoLevel2>("x" , GridMaxDim.x); |
2678 | Info.add<InfoLevel2>("y" , GridMaxDim.y); |
2679 | Info.add<InfoLevel2>("z" , GridMaxDim.z); |
2680 | } |
2681 | |
2682 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_FBARRIER_MAX_SIZE, TmpUInt); |
2683 | if (Status == HSA_STATUS_SUCCESS) |
2684 | Info.add("Max fbarriers/Workgrp" , TmpUInt); |
2685 | |
2686 | Info.add("Memory Pools" ); |
2687 | for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) { |
2688 | std::string TmpStr, TmpStr2; |
2689 | |
2690 | if (Pool->isGlobal()) |
2691 | TmpStr = "Global" ; |
2692 | else if (Pool->isReadOnly()) |
2693 | TmpStr = "ReadOnly" ; |
2694 | else if (Pool->isPrivate()) |
2695 | TmpStr = "Private" ; |
2696 | else if (Pool->isGroup()) |
2697 | TmpStr = "Group" ; |
2698 | else |
2699 | TmpStr = "Unknown" ; |
2700 | |
2701 | Info.add<InfoLevel2>(std::string("Pool " ) + TmpStr); |
2702 | |
2703 | if (Pool->isGlobal()) { |
2704 | if (Pool->isFineGrained()) |
2705 | TmpStr2 += "Fine Grained " ; |
2706 | if (Pool->isCoarseGrained()) |
2707 | TmpStr2 += "Coarse Grained " ; |
2708 | if (Pool->supportsKernelArgs()) |
2709 | TmpStr2 += "Kernarg " ; |
2710 | |
2711 | Info.add<InfoLevel3>("Flags" , TmpStr2); |
2712 | } |
2713 | |
2714 | Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, TmpSt); |
2715 | if (Status == HSA_STATUS_SUCCESS) |
2716 | Info.add<InfoLevel3>("Size" , TmpSt, "bytes" ); |
2717 | |
2718 | Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, |
2719 | TmpBool); |
2720 | if (Status == HSA_STATUS_SUCCESS) |
2721 | Info.add<InfoLevel3>("Allocatable" , TmpBool); |
2722 | |
2723 | Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, |
2724 | TmpSt); |
2725 | if (Status == HSA_STATUS_SUCCESS) |
2726 | Info.add<InfoLevel3>("Runtime Alloc Granule" , TmpSt, "bytes" ); |
2727 | |
2728 | Status = Pool->getAttrRaw( |
2729 | HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, TmpSt); |
2730 | if (Status == HSA_STATUS_SUCCESS) |
2731 | Info.add<InfoLevel3>("Runtime Alloc Alignment" , TmpSt, "bytes" ); |
2732 | |
2733 | Status = |
2734 | Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, TmpBool); |
2735 | if (Status == HSA_STATUS_SUCCESS) |
2736 | Info.add<InfoLevel3>("Accessible by all" , TmpBool); |
2737 | } |
2738 | |
2739 | Info.add("ISAs" ); |
2740 | auto Err = hsa_utils::iterateAgentISAs(getAgent(), [&](hsa_isa_t ISA) { |
2741 | Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, TmpChar); |
2742 | if (Status == HSA_STATUS_SUCCESS) |
2743 | Info.add<InfoLevel2>("Name" , TmpChar); |
2744 | |
2745 | return Status; |
2746 | }); |
2747 | |
2748 | // Silently consume the error. |
2749 | if (Err) |
2750 | consumeError(std::move(Err)); |
2751 | |
2752 | return Plugin::success(); |
2753 | } |
2754 | |
  /// Returns true if auto zero-copy is the best configuration for the current
  /// architecture.
2757 | /// On AMDGPUs, automatic zero-copy is turned on |
2758 | /// when running on an APU with XNACK (unified memory) support |
2759 | /// enabled. On discrete GPUs, automatic zero-copy is triggered |
2760 | /// if the user sets the environment variable OMPX_APU_MAPS=1 |
2761 | /// and if XNACK is enabled. The rationale is that zero-copy |
2762 | /// is the best configuration (performance, memory footprint) on APUs, |
2763 | /// while it is often not the best on discrete GPUs. |
2764 | /// XNACK can be enabled with a kernel boot parameter or with |
2765 | /// the HSA_XNACK environment variable. |
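  /// Illustration (derived from the expression below; not an exhaustive
  /// specification):
  ///   APU with XNACK enabled                         -> auto zero-copy
  ///   discrete GPU, OMPX_APU_MAPS set, XNACK enabled -> auto zero-copy
  ///   XNACK disabled                                 -> no auto zero-copy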
2766 | bool useAutoZeroCopyImpl() override { |
2767 | return ((IsAPU || OMPX_ApuMaps) && IsXnackEnabled); |
2768 | } |
2769 | |
2770 | /// Getters and setters for stack and heap sizes. |
2771 | Error getDeviceStackSize(uint64_t &Value) override { |
2772 | Value = StackSize; |
2773 | return Plugin::success(); |
2774 | } |
2775 | Error setDeviceStackSize(uint64_t Value) override { |
2776 | StackSize = Value; |
2777 | return Plugin::success(); |
2778 | } |
2779 | Error getDeviceHeapSize(uint64_t &Value) override { |
2780 | Value = DeviceMemoryPoolSize; |
2781 | return Plugin::success(); |
2782 | } |
2783 | Error setDeviceHeapSize(uint64_t Value) override { |
2784 | for (DeviceImageTy *Image : LoadedImages) |
2785 | if (auto Err = setupDeviceMemoryPool(Plugin, *Image, Value)) |
2786 | return Err; |
2787 | DeviceMemoryPoolSize = Value; |
2788 | return Plugin::success(); |
2789 | } |
2790 | Error getDeviceMemorySize(uint64_t &Value) override { |
2791 | for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) { |
2792 | if (Pool->isGlobal()) { |
2793 | hsa_status_t Status = |
2794 | Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, Value); |
2795 | return Plugin::check(Status, "error in getting device memory size: %s" ); |
2796 | } |
2797 | } |
2798 | return Plugin::error(ErrorCode::UNSUPPORTED, |
2799 | "getDeviceMemorySize:: no global pool" ); |
2800 | } |
2801 | |
2802 | /// AMDGPU-specific function to get device attributes. |
2803 | template <typename Ty> Error getDeviceAttr(uint32_t Kind, Ty &Value) { |
2804 | hsa_status_t Status = |
2805 | hsa_agent_get_info(Agent, (hsa_agent_info_t)Kind, &Value); |
2806 | return Plugin::check(Status, "Error in hsa_agent_get_info: %s" ); |
2807 | } |
2808 | |
2809 | template <typename Ty> |
2810 | hsa_status_t getDeviceAttrRaw(uint32_t Kind, Ty &Value) { |
2811 | return hsa_agent_get_info(Agent, (hsa_agent_info_t)Kind, &Value); |
2812 | } |
2813 | |
2814 | /// Get the device agent. |
2815 | hsa_agent_t getAgent() const override { return Agent; } |
2816 | |
2817 | /// Get the signal manager. |
2818 | AMDGPUSignalManagerTy &getSignalManager() { return AMDGPUSignalManager; } |
2819 | |
2820 | /// Retrieve and construct all memory pools of the device agent. |
2821 | Error retrieveAllMemoryPools() override { |
2822 | // Iterate through the available pools of the device agent. |
2823 | return hsa_utils::iterateAgentMemoryPools( |
2824 | Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) { |
2825 | AMDGPUMemoryPoolTy *MemoryPool = |
2826 | Plugin.allocate<AMDGPUMemoryPoolTy>(); |
2827 | new (MemoryPool) AMDGPUMemoryPoolTy(HSAMemoryPool); |
2828 | AllMemoryPools.push_back(MemoryPool); |
2829 | return HSA_STATUS_SUCCESS; |
2830 | }); |
2831 | } |
2832 | |
2833 | bool useMultipleSdmaEngines() const { return OMPX_UseMultipleSdmaEngines; } |
2834 | |
2835 | private: |
2836 | using AMDGPUEventRef = AMDGPUResourceRef<AMDGPUEventTy>; |
2837 | using AMDGPUEventManagerTy = GenericDeviceResourceManagerTy<AMDGPUEventRef>; |
2838 | |
  /// Common method to invoke a single-threaded constructor or destructor
  /// kernel by name.
2841 | Error callGlobalCtorDtorCommon(GenericPluginTy &Plugin, DeviceImageTy &Image, |
2842 | bool IsCtor) { |
2843 | const char *KernelName = |
2844 | IsCtor ? "amdgcn.device.init" : "amdgcn.device.fini" ; |
2845 | // Perform a quick check for the named kernel in the image. The kernel |
2846 | // should be created by the 'amdgpu-lower-ctor-dtor' pass. |
2847 | GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler(); |
2848 | if (IsCtor && !Handler.isSymbolInImage(*this, Image, KernelName)) |
2849 | return Plugin::success(); |
2850 | |
2851 | // Allocate and construct the AMDGPU kernel. |
2852 | AMDGPUKernelTy AMDGPUKernel(KernelName); |
2853 | if (auto Err = AMDGPUKernel.init(*this, Image)) |
2854 | return Err; |
2855 | |
2856 | AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr); |
2857 | |
2858 | KernelArgsTy KernelArgs = {}; |
2859 | uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u}; |
2860 | if (auto Err = AMDGPUKernel.launchImpl( |
2861 | *this, NumBlocksAndThreads, NumBlocksAndThreads, KernelArgs, |
2862 | KernelLaunchParamsTy{}, AsyncInfoWrapper)) |
2863 | return Err; |
2864 | |
2865 | Error Err = Plugin::success(); |
2866 | AsyncInfoWrapper.finalize(Err); |
2867 | |
2868 | return Err; |
2869 | } |
2870 | |
  /// Detect if the current architecture is an APU.
2872 | Error checkIfAPU() { |
2873 | // TODO: replace with ROCr API once it becomes available. |
2874 | llvm::StringRef StrGfxName(ComputeUnitKind); |
2875 | bool MayBeAPU = llvm::StringSwitch<bool>(StrGfxName) |
2876 | .Case(S: "gfx942" , Value: true) |
2877 | .Default(Value: false); |
2878 | if (!MayBeAPU) |
2879 | return Plugin::success(); |
2880 | |
    // The chip can be either an MI300A or an MI300X.
2882 | uint32_t ChipID = 0; |
2883 | if (auto Err = getDeviceAttr(HSA_AMD_AGENT_INFO_CHIP_ID, ChipID)) |
2884 | return Err; |
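
    // Heuristic (assumption): an even chip ID is taken to indicate the APU
    // variant (MI300A), while an odd one indicates the discrete MI300X.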
2885 | |
2886 | if (!(ChipID & 0x1)) { |
2887 | IsAPU = true; |
2888 | return Plugin::success(); |
2889 | } |
2890 | return Plugin::success(); |
2891 | } |
2892 | |
  /// Envar for controlling the number of HSA queues per device. A high number
  /// of queues may degrade performance.
2895 | UInt32Envar OMPX_NumQueues; |
2896 | |
  /// Envar for controlling the size of each HSA queue. The size is the number
  /// of HSA packets a queue is expected to hold. It is also the number of HSA
  /// packets that can be pushed into each queue without waiting for the driver
  /// to process them.
2901 | UInt32Envar OMPX_QueueSize; |
2902 | |
2903 | /// Envar for controlling the default number of teams relative to the number |
2904 | /// of compute units (CUs) the device has: |
2905 | /// #default_teams = OMPX_DefaultTeamsPerCU * #CUs. |
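  /// For example, a value of 4 on a device with 104 compute units yields 416
  /// default teams.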
2906 | UInt32Envar OMPX_DefaultTeamsPerCU; |
2907 | |
  /// Envar specifying the maximum transfer size in bytes for which memory
  /// copies are issued as asynchronous operations pushed to the corresponding
  /// stream. Larger transfers are performed synchronously.
2912 | UInt32Envar OMPX_MaxAsyncCopyBytes; |
2913 | |
2914 | /// Envar controlling the initial number of HSA signals per device. There is |
2915 | /// one manager of signals per device managing several pre-allocated signals. |
2916 | /// These signals are mainly used by AMDGPU streams. If needed, more signals |
2917 | /// will be created. |
2918 | UInt32Envar OMPX_InitialNumSignals; |
2919 | |
  /// Envar to set the time to wait in active state before switching to a
  /// blocked state. The default of 2000000 busy-waits for 2 seconds before
  /// going into a blocking HSA wait state. The unit for this variable is
  /// microseconds.
2924 | UInt32Envar OMPX_StreamBusyWait; |
2925 | |
  /// Use the ROCm 5.7 interface for multiple SDMA engines.
2927 | BoolEnvar OMPX_UseMultipleSdmaEngines; |
2928 | |
2929 | /// Value of OMPX_APU_MAPS env var used to force |
2930 | /// automatic zero-copy behavior on non-APU GPUs. |
2931 | BoolEnvar OMPX_ApuMaps; |
2932 | |
2933 | /// Stream manager for AMDGPU streams. |
2934 | AMDGPUStreamManagerTy AMDGPUStreamManager; |
2935 | |
2936 | /// Event manager for AMDGPU events. |
2937 | AMDGPUEventManagerTy AMDGPUEventManager; |
2938 | |
2939 | /// Signal manager for AMDGPU signals. |
2940 | AMDGPUSignalManagerTy AMDGPUSignalManager; |
2941 | |
  /// The agent handle corresponding to the device.
2943 | hsa_agent_t Agent; |
2944 | |
2945 | /// The GPU architecture. |
2946 | std::string ComputeUnitKind; |
2947 | |
2948 | /// The frequency of the steady clock inside the device. |
2949 | uint64_t ClockFrequency; |
2950 | |
2951 | /// The total number of concurrent work items that can be running on the GPU. |
2952 | uint64_t HardwareParallelism; |
2953 | |
2954 | /// Reference to the host device. |
2955 | AMDHostDeviceTy &HostDevice; |
2956 | |
2957 | /// The current size of the global device memory pool (managed by us). |
  uint64_t DeviceMemoryPoolSize = 1L << 29L /* 512 MB */;
2959 | |
2960 | /// The current size of the stack that will be used in cases where it could |
2961 | /// not be statically determined. |
2962 | uint64_t StackSize = 16 * 1024 /* 16 KB */; |
2963 | |
2964 | /// Is the plugin associated with an APU? |
2965 | bool IsAPU = false; |
2966 | |
  /// True if the system is configured with XNACK enabled, false otherwise.
2969 | bool IsXnackEnabled = false; |
2970 | }; |
2971 | |
2972 | Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) { |
2973 | hsa_code_object_reader_t Reader; |
2974 | hsa_status_t Status = |
2975 | hsa_code_object_reader_create_from_memory(getStart(), getSize(), &Reader); |
2976 | if (auto Err = Plugin::check( |
2977 | Status, "error in hsa_code_object_reader_create_from_memory: %s" )) |
2978 | return Err; |
2979 | |
2980 | Status = hsa_executable_create_alt( |
2981 | HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, "" , &Executable); |
2982 | if (auto Err = |
2983 | Plugin::check(Status, "error in hsa_executable_create_alt: %s" )) |
2984 | return Err; |
2985 | |
2986 | hsa_loaded_code_object_t Object; |
2987 | Status = hsa_executable_load_agent_code_object(Executable, Device.getAgent(), |
2988 | Reader, "" , &Object); |
2989 | if (auto Err = Plugin::check( |
2990 | Status, "error in hsa_executable_load_agent_code_object: %s" )) |
2991 | return Err; |
2992 | |
2993 | Status = hsa_executable_freeze(Executable, "" ); |
2994 | if (auto Err = Plugin::check(Status, "error in hsa_executable_freeze: %s" )) |
2995 | return Err; |
2996 | |
2997 | uint32_t Result; |
2998 | Status = hsa_executable_validate(Executable, &Result); |
2999 | if (auto Err = Plugin::check(Status, "error in hsa_executable_validate: %s" )) |
3000 | return Err; |
3001 | |
3002 | if (Result) |
3003 | return Plugin::error(ErrorCode::INVALID_BINARY, |
3004 | "loaded HSA executable does not validate" ); |
3005 | |
3006 | Status = hsa_code_object_reader_destroy(Reader); |
3007 | if (auto Err = |
3008 | Plugin::check(Status, "error in hsa_code_object_reader_destroy: %s" )) |
3009 | return Err; |
3010 | |
3011 | if (auto Err = hsa_utils::readAMDGPUMetaDataFromImage( |
3012 | getMemoryBuffer(), KernelInfoMap, ELFABIVersion)) |
3013 | return Err; |
3014 | |
3015 | return Plugin::success(); |
3016 | } |
3017 | |
3018 | Expected<hsa_executable_symbol_t> |
3019 | AMDGPUDeviceImageTy::findDeviceSymbol(GenericDeviceTy &Device, |
3020 | StringRef SymbolName) const { |
3021 | |
3022 | AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(Device); |
3023 | hsa_agent_t Agent = AMDGPUDevice.getAgent(); |
3024 | |
3025 | hsa_executable_symbol_t Symbol; |
3026 | hsa_status_t Status = hsa_executable_get_symbol_by_name( |
3027 | Executable, SymbolName.data(), &Agent, &Symbol); |
3028 | if (auto Err = Plugin::check( |
3029 | Status, "error in hsa_executable_get_symbol_by_name(%s): %s" , |
3030 | SymbolName.data())) |
3031 | return std::move(Err); |
3032 | |
3033 | return Symbol; |
3034 | } |
3035 | |
3036 | template <typename ResourceTy> |
3037 | Error AMDGPUResourceRef<ResourceTy>::create(GenericDeviceTy &Device) { |
3038 | if (Resource) |
3039 | return Plugin::error(ErrorCode::INVALID_ARGUMENT, |
3040 | "creating an existing resource" ); |
3041 | |
3042 | AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(Device); |
3043 | |
3044 | Resource = new ResourceTy(AMDGPUDevice); |
3045 | |
3046 | return Resource->init(); |
3047 | } |
3048 | |
3049 | AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device) |
3050 | : Agent(Device.getAgent()), Queue(nullptr), |
3051 | SignalManager(Device.getSignalManager()), Device(Device), |
3052 | // Initialize the std::deque with some empty positions. |
3053 | Slots(32), NextSlot(0), SyncCycle(0), |
3054 | StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()), |
3055 | UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()) {} |
3056 | |
3057 | /// Class implementing the AMDGPU-specific functionalities of the global |
3058 | /// handler. |
3059 | struct AMDGPUGlobalHandlerTy final : public GenericGlobalHandlerTy { |
  /// Get the metadata of a global from the device. The name and size of the
  /// global are read from DeviceGlobal, and the address of the global is
  /// written back to DeviceGlobal.
3063 | Error getGlobalMetadataFromDevice(GenericDeviceTy &Device, |
3064 | DeviceImageTy &Image, |
3065 | GlobalTy &DeviceGlobal) override { |
3066 | AMDGPUDeviceImageTy &AMDImage = static_cast<AMDGPUDeviceImageTy &>(Image); |
3067 | |
3068 | // Find the symbol on the device executable. |
3069 | auto SymbolOrErr = |
3070 | AMDImage.findDeviceSymbol(Device, DeviceGlobal.getName()); |
3071 | if (!SymbolOrErr) |
3072 | return SymbolOrErr.takeError(); |
3073 | |
3074 | hsa_executable_symbol_t Symbol = *SymbolOrErr; |
3075 | hsa_symbol_kind_t SymbolType; |
3076 | hsa_status_t Status; |
3077 | uint64_t SymbolAddr; |
3078 | uint32_t SymbolSize; |
3079 | |
3080 | // Retrieve the type, address and size of the symbol. |
3081 | std::pair<hsa_executable_symbol_info_t, void *> RequiredInfos[] = { |
3082 | {HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &SymbolType}, |
3083 | {HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &SymbolAddr}, |
3084 | {HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &SymbolSize}}; |
3085 | |
3086 | for (auto &Info : RequiredInfos) { |
3087 | Status = hsa_executable_symbol_get_info(Symbol, Info.first, Info.second); |
3088 | if (auto Err = Plugin::check( |
3089 | Status, "error in hsa_executable_symbol_get_info: %s" )) |
3090 | return Err; |
3091 | } |
3092 | |
3093 | // Check the size of the symbol. |
3094 | if (SymbolSize != DeviceGlobal.getSize()) |
3095 | return Plugin::error( |
3096 | ErrorCode::INVALID_BINARY, |
3097 | "failed to load global '%s' due to size mismatch (%zu != %zu)" , |
3098 | DeviceGlobal.getName().data(), SymbolSize, |
3099 | (size_t)DeviceGlobal.getSize()); |
3100 | |
3101 | // Store the symbol address on the device global metadata. |
3102 | DeviceGlobal.setPtr(reinterpret_cast<void *>(SymbolAddr)); |
3103 | |
3104 | return Plugin::success(); |
3105 | } |
3106 | }; |
3107 | |
3108 | /// Class implementing the AMDGPU-specific functionalities of the plugin. |
3109 | struct AMDGPUPluginTy final : public GenericPluginTy { |
3110 | /// Create an AMDGPU plugin and initialize the AMDGPU driver. |
3111 | AMDGPUPluginTy() |
3112 | : GenericPluginTy(getTripleArch()), Initialized(false), |
3113 | HostDevice(nullptr) {} |
3114 | |
3115 | /// This class should not be copied. |
3116 | AMDGPUPluginTy(const AMDGPUPluginTy &) = delete; |
3117 | AMDGPUPluginTy(AMDGPUPluginTy &&) = delete; |
3118 | |
3119 | /// Initialize the plugin and return the number of devices. |
3120 | Expected<int32_t> initImpl() override { |
3121 | hsa_status_t Status = hsa_init(); |
3122 | if (Status != HSA_STATUS_SUCCESS) { |
      // Cannot call hsa_status_string since the HSA runtime failed to
      // initialize.
3124 | DP("Failed to initialize AMDGPU's HSA library\n" ); |
3125 | return 0; |
3126 | } |
3127 | |
3128 | // The initialization of HSA was successful. It should be safe to call |
3129 | // HSA functions from now on, e.g., hsa_shut_down. |
3130 | Initialized = true; |
3131 | |
3132 | // Register event handler to detect memory errors on the devices. |
3133 | Status = hsa_amd_register_system_event_handler(eventHandler, this); |
3134 | if (auto Err = Plugin::check( |
3135 | Status, "error in hsa_amd_register_system_event_handler: %s" )) |
3136 | return std::move(Err); |
3137 | |
3138 | // List of host (CPU) agents. |
3139 | llvm::SmallVector<hsa_agent_t> HostAgents; |
3140 | |
3141 | // Count the number of available agents. |
    auto Err = hsa_utils::iterateAgents([&](hsa_agent_t Agent) {
3143 | // Get the device type of the agent. |
3144 | hsa_device_type_t DeviceType; |
3145 | hsa_status_t Status = |
3146 | hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType); |
3147 | if (Status != HSA_STATUS_SUCCESS) |
3148 | return Status; |
3149 | |
      // Classify the agents into kernel (GPU) and host (CPU) agents.
3151 | if (DeviceType == HSA_DEVICE_TYPE_GPU) { |
3152 | // Ensure that the GPU agent supports kernel dispatch packets. |
3153 | hsa_agent_feature_t Features; |
3154 | Status = hsa_agent_get_info(Agent, HSA_AGENT_INFO_FEATURE, &Features); |
3155 | if (Features & HSA_AGENT_FEATURE_KERNEL_DISPATCH) |
3156 | KernelAgents.push_back(Agent); |
3157 | } else if (DeviceType == HSA_DEVICE_TYPE_CPU) { |
3158 | HostAgents.push_back(Agent); |
3159 | } |
3160 | return HSA_STATUS_SUCCESS; |
3161 | }); |
3162 | |
3163 | if (Err) |
3164 | return std::move(Err); |
3165 | |
3166 | int32_t NumDevices = KernelAgents.size(); |
3167 | if (NumDevices == 0) { |
3168 | // Do not initialize if there are no devices. |
3169 | DP("There are no devices supporting AMDGPU.\n" ); |
3170 | return 0; |
3171 | } |
3172 | |
3173 | // There are kernel agents but there is no host agent. That should be |
3174 | // treated as an error. |
3175 | if (HostAgents.empty()) |
3176 | return Plugin::error(ErrorCode::BACKEND_FAILURE, "no AMDGPU host agents" ); |
3177 | |
3178 | // Initialize the host device using host agents. |
3179 | HostDevice = allocate<AMDHostDeviceTy>(); |
3180 | new (HostDevice) AMDHostDeviceTy(*this, HostAgents); |
3181 | |
    // Set up the memory pools available to the host.
3183 | if (auto Err = HostDevice->init()) |
3184 | return std::move(Err); |
3185 | |
3186 | return NumDevices; |
3187 | } |
3188 | |
3189 | /// Deinitialize the plugin. |
3190 | Error deinitImpl() override { |
3191 | // The HSA runtime was not initialized, so nothing from the plugin was |
3192 | // actually initialized. |
3193 | if (!Initialized) |
3194 | return Plugin::success(); |
3195 | |
3196 | if (HostDevice) |
3197 | if (auto Err = HostDevice->deinit()) |
3198 | return Err; |
3199 | |
3200 | // Finalize the HSA runtime. |
3201 | hsa_status_t Status = hsa_shut_down(); |
3202 | return Plugin::check(Status, "error in hsa_shut_down: %s" ); |
3203 | } |
3204 | |
3205 | /// Creates an AMDGPU device. |
3206 | GenericDeviceTy *createDevice(GenericPluginTy &Plugin, int32_t DeviceId, |
3207 | int32_t NumDevices) override { |
3208 | return new AMDGPUDeviceTy(Plugin, DeviceId, NumDevices, getHostDevice(), |
3209 | getKernelAgent(DeviceId)); |
3210 | } |
3211 | |
3212 | /// Creates an AMDGPU global handler. |
3213 | GenericGlobalHandlerTy *createGlobalHandler() override { |
3214 | return new AMDGPUGlobalHandlerTy(); |
3215 | } |
3216 | |
3217 | Triple::ArchType getTripleArch() const override { return Triple::amdgcn; } |
3218 | |
3219 | const char *getName() const override { return GETNAME(TARGET_NAME); } |
3220 | |
3221 | /// Get the ELF code for recognizing the compatible image binary. |
3222 | uint16_t getMagicElfBits() const override { return ELF::EM_AMDGPU; } |
3223 | |
3224 | /// Check whether the image is compatible with an AMDGPU device. |
3225 | Expected<bool> isELFCompatible(uint32_t DeviceId, |
3226 | StringRef Image) const override { |
3227 | // Get the associated architecture and flags from the ELF. |
3228 | auto ElfOrErr = ELF64LEObjectFile::create( |
3229 | MemoryBufferRef(Image, /*Identifier=*/"" ), /*InitContent=*/false); |
3230 | if (!ElfOrErr) |
3231 | return ElfOrErr.takeError(); |
3232 | std::optional<StringRef> Processor = ElfOrErr->tryGetCPUName(); |
3233 | if (!Processor) |
3234 | return false; |
3235 | |
3236 | SmallVector<SmallString<32>> Targets; |
3237 | if (auto Err = hsa_utils::getTargetTripleAndFeatures( |
3238 | getKernelAgent(DeviceId), Targets)) |
3239 | return Err; |
3240 | for (auto &Target : Targets) |
3241 | if (offloading::amdgpu::isImageCompatibleWithEnv( |
3242 | Processor ? *Processor : "" , ElfOrErr->getPlatformFlags(), |
3243 | Target.str())) |
3244 | return true; |
3245 | return false; |
3246 | } |
3247 | |
3248 | bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) override { |
3249 | return true; |
3250 | } |
3251 | |
3252 | /// Get the host device instance. |
3253 | AMDHostDeviceTy &getHostDevice() { |
3254 | assert(HostDevice && "Host device not initialized" ); |
3255 | return *HostDevice; |
3256 | } |
3257 | |
3258 | /// Get the kernel agent with the corresponding agent id. |
3259 | hsa_agent_t getKernelAgent(int32_t AgentId) const { |
3260 | assert((uint32_t)AgentId < KernelAgents.size() && "Invalid agent id" ); |
3261 | return KernelAgents[AgentId]; |
3262 | } |
3263 | |
3264 | /// Get the list of the available kernel agents. |
3265 | const llvm::SmallVector<hsa_agent_t> &getKernelAgents() const { |
3266 | return KernelAgents; |
3267 | } |
3268 | |
3269 | private: |
3270 | /// Event handler that will be called by ROCr if an event is detected. |
3271 | static hsa_status_t eventHandler(const hsa_amd_event_t *Event, |
3272 | void *PluginPtr) { |
3273 | if (Event->event_type != HSA_AMD_GPU_MEMORY_FAULT_EVENT) |
3274 | return HSA_STATUS_SUCCESS; |
3275 | |
3276 | SmallVector<std::string> Reasons; |
3277 | uint32_t ReasonsMask = Event->memory_fault.fault_reason_mask; |
3278 | if (ReasonsMask & HSA_AMD_MEMORY_FAULT_PAGE_NOT_PRESENT) |
3279 | Reasons.emplace_back(Args: "Page not present or supervisor privilege" ); |
3280 | if (ReasonsMask & HSA_AMD_MEMORY_FAULT_READ_ONLY) |
3281 | Reasons.emplace_back(Args: "Write access to a read-only page" ); |
3282 | if (ReasonsMask & HSA_AMD_MEMORY_FAULT_NX) |
3283 | Reasons.emplace_back(Args: "Execute access to a page marked NX" ); |
3284 | if (ReasonsMask & HSA_AMD_MEMORY_FAULT_HOST_ONLY) |
3285 | Reasons.emplace_back(Args: "GPU attempted access to a host only page" ); |
3286 | if (ReasonsMask & HSA_AMD_MEMORY_FAULT_DRAMECC) |
3287 | Reasons.emplace_back(Args: "DRAM ECC failure" ); |
3288 | if (ReasonsMask & HSA_AMD_MEMORY_FAULT_IMPRECISE) |
3289 | Reasons.emplace_back(Args: "Can't determine the exact fault address" ); |
3290 | if (ReasonsMask & HSA_AMD_MEMORY_FAULT_SRAMECC) |
3291 | Reasons.emplace_back(Args: "SRAM ECC failure (ie registers, no fault address)" ); |
3292 | if (ReasonsMask & HSA_AMD_MEMORY_FAULT_HANG) |
3293 | Reasons.emplace_back(Args: "GPU reset following unspecified hang" ); |
3294 | |
3295 | // If we do not know the reason, say so, otherwise remove the trailing comma |
3296 | // and space. |
3297 | if (Reasons.empty()) |
3298 | Reasons.emplace_back(Args: "Unknown (" + std::to_string(val: ReasonsMask) + ")" ); |
3299 | |
3300 | uint32_t Node = -1; |
3301 | hsa_agent_get_info(Event->memory_fault.agent, HSA_AGENT_INFO_NODE, &Node); |
3302 | |
3303 | AMDGPUPluginTy &Plugin = *reinterpret_cast<AMDGPUPluginTy *>(PluginPtr); |
3304 | for (uint32_t I = 0, E = Plugin.getNumDevices(); |
3305 | Node != uint32_t(-1) && I < E; ++I) { |
3306 | AMDGPUDeviceTy &AMDGPUDevice = |
3307 | reinterpret_cast<AMDGPUDeviceTy &>(Plugin.getDevice(I)); |
3308 | auto KernelTraceInfoRecord = |
3309 | AMDGPUDevice.KernelLaunchTraces.getExclusiveAccessor(); |
3310 | |
3311 | uint32_t DeviceNode = -1; |
3312 | if (auto Err = |
3313 | AMDGPUDevice.getDeviceAttr(HSA_AGENT_INFO_NODE, DeviceNode)) { |
3314 | consumeError(std::move(Err)); |
3315 | continue; |
3316 | } |
3317 | if (DeviceNode != Node) |
3318 | continue; |
3319 | void *DevicePtr = (void *)Event->memory_fault.virtual_address; |
3320 | std::string S; |
3321 | llvm::raw_string_ostream OS(S); |
3322 | OS << llvm::format("memory access fault by GPU %" PRIu32 |
3323 | " (agent 0x%" PRIx64 |
3324 | ") at virtual address %p. Reasons: %s" , |
3325 | Node, Event->memory_fault.agent.handle, |
3326 | (void *)Event->memory_fault.virtual_address, |
3327 | llvm::join(Reasons, ", " ).c_str()); |
3328 | ErrorReporter::reportKernelTraces(AMDGPUDevice, *KernelTraceInfoRecord); |
3329 | ErrorReporter::reportMemoryAccessError(AMDGPUDevice, DevicePtr, S, |
3330 | /*Abort*/ true); |
3331 | } |
3332 | |
3333 | // Abort the execution since we do not recover from this error. |
3334 | FATAL_MESSAGE(1, |
3335 | "memory access fault by GPU %" PRIu32 " (agent 0x%" PRIx64 |
3336 | ") at virtual address %p. Reasons: %s" , |
3337 | Node, Event->memory_fault.agent.handle, |
3338 | (void *)Event->memory_fault.virtual_address, |
3339 | llvm::join(Reasons, ", " ).c_str()); |
3340 | |
3341 | return HSA_STATUS_ERROR; |
3342 | } |
3343 | |
  /// Indicate whether the HSA runtime was correctly initialized. Even if there
  /// are no available devices this boolean will be true. It indicates whether
  /// we can safely call HSA functions (e.g., hsa_shut_down).
3347 | bool Initialized; |
3348 | |
  /// Array of the available kernel (GPU) agents. These handles should not be
  /// stored here but in the AMDGPUDeviceTy structures directly. However, the
  /// HSA standard does not provide API functions to retrieve agents directly,
  /// only iterating functions. We cache the agents here for convenience.
3353 | llvm::SmallVector<hsa_agent_t> KernelAgents; |
3354 | |
3355 | /// The device representing all HSA host agents. |
3356 | AMDHostDeviceTy *HostDevice; |
3357 | }; |
3358 | |
3359 | Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, |
3360 | uint32_t NumThreads[3], uint32_t NumBlocks[3], |
3361 | KernelArgsTy &KernelArgs, |
3362 | KernelLaunchParamsTy LaunchParams, |
3363 | AsyncInfoWrapperTy &AsyncInfoWrapper) const { |
3364 | if (ArgsSize != LaunchParams.Size && |
3365 | ArgsSize > LaunchParams.Size + getImplicitArgsSize()) |
3366 | return Plugin::error(ErrorCode::INVALID_ARGUMENT, |
3367 | "invalid kernel arguments size" ); |
3368 | |
3369 | AMDGPUPluginTy &AMDGPUPlugin = |
3370 | static_cast<AMDGPUPluginTy &>(GenericDevice.Plugin); |
3371 | AMDHostDeviceTy &HostDevice = AMDGPUPlugin.getHostDevice(); |
3372 | AMDGPUMemoryManagerTy &ArgsMemoryManager = HostDevice.getArgsMemoryManager(); |
3373 | |
3374 | void *AllArgs = nullptr; |
3375 | if (auto Err = ArgsMemoryManager.allocate(ArgsSize, &AllArgs)) |
3376 | return Err; |
3377 | |
3378 | // Account for user requested dynamic shared memory. |
3379 | uint32_t GroupSize = getGroupSize(); |
3380 | if (uint32_t MaxDynCGroupMem = std::max( |
3381 | KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize())) { |
3382 | GroupSize += MaxDynCGroupMem; |
3383 | } |
3384 | |
3385 | uint64_t StackSize; |
3386 | if (auto Err = GenericDevice.getDeviceStackSize(StackSize)) |
3387 | return Err; |
3388 | |
  // Copy the explicit arguments.
  // TODO: We should expose the args memory manager alloc to the common part as
  // an alternative to copying them twice.
3392 | if (LaunchParams.Size) |
    std::memcpy(AllArgs, LaunchParams.Data, LaunchParams.Size);
3394 | |
3395 | AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(GenericDevice); |
3396 | |
3397 | AMDGPUStreamTy *Stream = nullptr; |
3398 | if (auto Err = AMDGPUDevice.getStream(AsyncInfoWrapper, Stream)) |
3399 | return Err; |
3400 | |
3401 | uint64_t ImplArgsOffset = utils::roundUp( |
3402 | LaunchParams.Size, alignof(hsa_utils::AMDGPUImplicitArgsTy)); |
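  // Note: the implicit (hidden) arguments start right after the explicit ones,
  // rounded up to the alignment of the implicit-args struct; they are only
  // initialized below if the kernel actually reserves space for them.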
3403 | if (ArgsSize > ImplArgsOffset) { |
3404 | hsa_utils::AMDGPUImplicitArgsTy *ImplArgs = |
3405 | reinterpret_cast<hsa_utils::AMDGPUImplicitArgsTy *>( |
3406 | utils::advancePtr(AllArgs, ImplArgsOffset)); |
3407 | |
3408 | // Set the COV5+ implicit arguments to the appropriate values if present. |
3409 | uint64_t ImplArgsSize = ArgsSize - ImplArgsOffset; |
    std::memset(ImplArgs, 0, ImplArgsSize);
3411 | |
3412 | using ImplArgsTy = hsa_utils::AMDGPUImplicitArgsTy; |
3413 | hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::BlockCountX, ImplArgsSize, |
3414 | NumBlocks[0]); |
3415 | hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::BlockCountY, ImplArgsSize, |
3416 | NumBlocks[1]); |
3417 | hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::BlockCountZ, ImplArgsSize, |
3418 | NumBlocks[2]); |
3419 | |
3420 | hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::GroupSizeX, ImplArgsSize, |
3421 | NumThreads[0]); |
3422 | hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::GroupSizeY, ImplArgsSize, |
3423 | NumThreads[1]); |
3424 | hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::GroupSizeZ, ImplArgsSize, |
3425 | NumThreads[2]); |
3426 | |
3427 | hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::GridDims, ImplArgsSize, |
3428 | NumBlocks[2] * NumThreads[2] > 1 |
3429 | ? 3 |
3430 | : 1 + (NumBlocks[1] * NumThreads[1] != 1)); |
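    // The grid dimensionality set above is 3 if the Z extent is non-trivial,
    // otherwise 2 if the Y extent is non-trivial, otherwise 1.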
3431 | |
3432 | hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::DynamicLdsSize, ImplArgsSize, |
3433 | KernelArgs.DynCGroupMem); |
3434 | } |
3435 | |
3436 | // Push the kernel launch into the stream. |
  return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
                                  GroupSize, StackSize, ArgsMemoryManager);
3439 | } |
3440 | |
3441 | Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice, |
3442 | KernelArgsTy &KernelArgs, |
3443 | uint32_t NumThreads[3], |
3444 | uint32_t NumBlocks[3]) const { |
3445 | // Only do all this when the output is requested |
3446 | if (!(getInfoLevel() & OMP_INFOTYPE_PLUGIN_KERNEL)) |
3447 | return Plugin::success(); |
3448 | |
  // We don't have the data to print additional info; this is not a hard
  // error.
3450 | if (!KernelInfo.has_value()) |
3451 | return Plugin::success(); |
3452 | |
3453 | // General Info |
3454 | auto NumGroups = NumBlocks; |
3455 | auto ThreadsPerGroup = NumThreads; |
3456 | |
3457 | // Kernel Arguments Info |
3458 | auto ArgNum = KernelArgs.NumArgs; |
3459 | auto LoopTripCount = KernelArgs.Tripcount; |
3460 | |
3461 | // Details for AMDGPU kernels (read from image) |
3462 | // https://www.llvm.org/docs/AMDGPUUsage.html#code-object-v4-metadata |
3463 | auto GroupSegmentSize = (*KernelInfo).GroupSegmentList; |
3464 | auto SGPRCount = (*KernelInfo).SGPRCount; |
3465 | auto VGPRCount = (*KernelInfo).VGPRCount; |
3466 | auto SGPRSpillCount = (*KernelInfo).SGPRSpillCount; |
3467 | auto VGPRSpillCount = (*KernelInfo).VGPRSpillCount; |
3468 | auto MaxFlatWorkgroupSize = (*KernelInfo).MaxFlatWorkgroupSize; |
3469 | |
3470 | // Prints additional launch info that contains the following. |
3471 | // Num Args: The number of kernel arguments |
3472 | // Teams x Thrds: The number of teams and the number of threads actually |
3473 | // running. |
3474 | // MaxFlatWorkgroupSize: Maximum flat work-group size supported by the |
3475 | // kernel in work-items |
3476 | // LDS Usage: Amount of bytes used in LDS storage |
3477 | // S/VGPR Count: the number of S/V GPRs occupied by the kernel |
3478 | // S/VGPR Spill Count: how many S/VGPRs are spilled by the kernel |
3479 | // Tripcount: loop tripcount for the kernel |
3480 | INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(), |
3481 | "#Args: %d Teams x Thrds: %4ux%4u (MaxFlatWorkGroupSize: %u) LDS " |
3482 | "Usage: %uB #SGPRs/VGPRs: %u/%u #SGPR/VGPR Spills: %u/%u Tripcount: " |
3483 | "%lu\n" , |
3484 | ArgNum, NumGroups[0] * NumGroups[1] * NumGroups[2], |
3485 | ThreadsPerGroup[0] * ThreadsPerGroup[1] * ThreadsPerGroup[2], |
3486 | MaxFlatWorkgroupSize, GroupSegmentSize, SGPRCount, VGPRCount, |
3487 | SGPRSpillCount, VGPRSpillCount, LoopTripCount); |
3488 | |
3489 | return Plugin::success(); |
3490 | } |
3491 | |
3492 | template <typename... ArgsTy> |
3493 | static Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) { |
3494 | hsa_status_t ResultCode = static_cast<hsa_status_t>(Code); |
3495 | if (ResultCode == HSA_STATUS_SUCCESS || ResultCode == HSA_STATUS_INFO_BREAK) |
3496 | return Plugin::success(); |
3497 | |
3498 | const char *Desc = "unknown error" ; |
3499 | hsa_status_t Ret = hsa_status_string(ResultCode, &Desc); |
3500 | if (Ret != HSA_STATUS_SUCCESS) |
3501 | REPORT("Unrecognized " GETNAME(TARGET_NAME) " error code %d\n" , Code); |
3502 | |
3503 | // TODO: Add more entries to this switch |
3504 | ErrorCode OffloadErrCode; |
3505 | switch (ResultCode) { |
3506 | case HSA_STATUS_ERROR_INVALID_SYMBOL_NAME: |
3507 | OffloadErrCode = ErrorCode::NOT_FOUND; |
3508 | break; |
3509 | default: |
3510 | OffloadErrCode = ErrorCode::UNKNOWN; |
3511 | } |
3512 | |
3513 | return Plugin::error(OffloadErrCode, ErrFmt, Args..., Desc); |
3514 | } |
3515 | |
3516 | void *AMDGPUMemoryManagerTy::allocate(size_t Size, void *HstPtr, |
3517 | TargetAllocTy Kind) { |
3518 | // Allocate memory from the pool. |
3519 | void *Ptr = nullptr; |
  if (auto Err = MemoryPool->allocate(Size, &Ptr)) {
    consumeError(std::move(Err));
3522 | return nullptr; |
3523 | } |
3524 | assert(Ptr && "Invalid pointer" ); |
3525 | |
3526 | // Get a list of agents that can access this memory pool. |
3527 | llvm::SmallVector<hsa_agent_t> Agents; |
3528 | llvm::copy_if( |
3529 | Plugin.getKernelAgents(), std::back_inserter(Agents), |
3530 | [&](hsa_agent_t Agent) { return MemoryPool->canAccess(Agent); }); |
3531 | |
3532 | // Allow all valid kernel agents to access the allocation. |
3533 | if (auto Err = MemoryPool->enableAccess(Ptr, Size, Agents)) { |
3534 | REPORT("%s\n" , toString(std::move(Err)).data()); |
3535 | return nullptr; |
3536 | } |
3537 | return Ptr; |
3538 | } |
3539 | |
3540 | void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) { |
3541 | if (Size == 0) |
3542 | return nullptr; |
3543 | |
3544 | // Find the correct memory pool. |
3545 | AMDGPUMemoryPoolTy *MemoryPool = nullptr; |
3546 | switch (Kind) { |
3547 | case TARGET_ALLOC_DEFAULT: |
3548 | case TARGET_ALLOC_DEVICE: |
3549 | case TARGET_ALLOC_DEVICE_NON_BLOCKING: |
3550 | MemoryPool = CoarseGrainedMemoryPools[0]; |
3551 | break; |
3552 | case TARGET_ALLOC_HOST: |
3553 | MemoryPool = &HostDevice.getFineGrainedMemoryPool(); |
3554 | break; |
3555 | case TARGET_ALLOC_SHARED: |
3556 | MemoryPool = &HostDevice.getFineGrainedMemoryPool(); |
3557 | break; |
3558 | } |
3559 | |
3560 | if (!MemoryPool) { |
3561 | REPORT("No memory pool for the specified allocation kind\n" ); |
3562 | return nullptr; |
3563 | } |
3564 | |
3565 | // Allocate from the corresponding memory pool. |
3566 | void *Alloc = nullptr; |
  if (Error Err = MemoryPool->allocate(Size, &Alloc)) {
    REPORT("%s\n", toString(std::move(Err)).data());
3569 | return nullptr; |
3570 | } |
3571 | |
3572 | if (Alloc) { |
    // Get a list of agents that can access this memory pool. This is
    // inherently necessary for host or shared allocations. It is also enabled
    // for device memory to allow device-to-device memcpy.
3576 | llvm::SmallVector<hsa_agent_t> Agents; |
3577 | llvm::copy_if(static_cast<AMDGPUPluginTy &>(Plugin).getKernelAgents(), |
3578 | std::back_inserter(Agents), [&](hsa_agent_t Agent) { |
3579 | return MemoryPool->canAccess(Agent); |
3580 | }); |
3581 | |
3582 | // Enable all valid kernel agents to access the buffer. |
3583 | if (auto Err = MemoryPool->enableAccess(Alloc, Size, Agents)) { |
3584 | REPORT("%s\n" , toString(std::move(Err)).data()); |
3585 | return nullptr; |
3586 | } |
3587 | } |
3588 | |
3589 | return Alloc; |
3590 | } |
3591 | |
3592 | void AMDGPUQueueTy::callbackError(hsa_status_t Status, hsa_queue_t *Source, |
3593 | void *Data) { |
3594 | auto &AMDGPUDevice = *reinterpret_cast<AMDGPUDeviceTy *>(Data); |
3595 | |
3596 | if (Status == HSA_STATUS_ERROR_EXCEPTION) { |
3597 | auto KernelTraceInfoRecord = |
3598 | AMDGPUDevice.KernelLaunchTraces.getExclusiveAccessor(); |
3599 | std::function<bool(__tgt_async_info &)> AsyncInfoWrapperMatcher = |
3600 | [=](__tgt_async_info &AsyncInfo) { |
3601 | auto *Stream = reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue); |
3602 | if (!Stream || !Stream->getQueue()) |
3603 | return false; |
3604 | return Stream->getQueue()->Queue == Source; |
3605 | }; |
3606 | ErrorReporter::reportTrapInKernel(AMDGPUDevice, *KernelTraceInfoRecord, |
3607 | AsyncInfoWrapperMatcher); |
3608 | } |
3609 | |
3610 | auto Err = Plugin::check(Status, "received error in queue %p: %s" , Source); |
3611 | FATAL_MESSAGE(1, "%s" , toString(std::move(Err)).data()); |
3612 | } |
3613 | |
3614 | } // namespace plugin |
3615 | } // namespace target |
3616 | } // namespace omp |
3617 | } // namespace llvm |
3618 | |
3619 | extern "C" { |
3620 | llvm::omp::target::plugin::GenericPluginTy *createPlugin_amdgpu() { |
3621 | return new llvm::omp::target::plugin::AMDGPUPluginTy(); |
3622 | } |
3623 | } |
3624 | |