| 1 | //===----RTLs/amdgpu/src/rtl.cpp - Target RTLs Implementation ----- C++ -*-===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // RTL NextGen for AMDGPU machine |
| 10 | // |
| 11 | //===----------------------------------------------------------------------===// |
| 12 | |
| 13 | #include <atomic> |
| 14 | #include <cassert> |
| 15 | #include <cstddef> |
| 16 | #include <cstdint> |
| 17 | #include <deque> |
| 18 | #include <functional> |
| 19 | #include <mutex> |
| 20 | #include <string> |
| 21 | #include <system_error> |
| 22 | #include <unistd.h> |
| 23 | #include <unordered_map> |
| 24 | |
| 25 | #include "ErrorReporting.h" |
| 26 | #include "Shared/APITypes.h" |
| 27 | #include "Shared/Debug.h" |
| 28 | #include "Shared/Environment.h" |
| 29 | #include "Shared/RefCnt.h" |
| 30 | #include "Shared/Utils.h" |
| 31 | #include "Utils/ELF.h" |
| 32 | |
| 33 | #include "GlobalHandler.h" |
| 34 | #include "OpenMP/OMPT/Callback.h" |
| 35 | #include "PluginInterface.h" |
| 36 | #include "UtilitiesRTL.h" |
| 37 | #include "omptarget.h" |
| 38 | |
| 39 | #include "llvm/ADT/SmallString.h" |
| 40 | #include "llvm/ADT/SmallVector.h" |
| 41 | #include "llvm/ADT/StringRef.h" |
| 42 | #include "llvm/BinaryFormat/ELF.h" |
| 43 | #include "llvm/Frontend/OpenMP/OMPConstants.h" |
| 44 | #include "llvm/Frontend/OpenMP/OMPGridValues.h" |
| 45 | #include "llvm/Support/Error.h" |
| 46 | #include "llvm/Support/FileOutputBuffer.h" |
| 47 | #include "llvm/Support/FileSystem.h" |
| 48 | #include "llvm/Support/MemoryBuffer.h" |
| 49 | #include "llvm/Support/Program.h" |
| 50 | #include "llvm/Support/Signals.h" |
| 51 | #include "llvm/Support/raw_ostream.h" |
| 52 | |
| 53 | #if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || \ |
| 54 | !defined(__ORDER_BIG_ENDIAN__) |
| 55 | #error "Missing preprocessor definitions for endianness detection." |
| 56 | #endif |
| 57 | |
| 58 | // The HSA headers require these definitions. |
| 59 | #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) |
| 60 | #define LITTLEENDIAN_CPU |
| 61 | #elif defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) |
| 62 | #define BIGENDIAN_CPU |
| 63 | #endif |
| 64 | |
| 65 | #if defined(__has_include) |
| 66 | #if __has_include("hsa.h") |
| 67 | #include "hsa.h" |
| 68 | #include "hsa_ext_amd.h" |
| 69 | #elif __has_include("hsa/hsa.h") |
| 70 | #include "hsa/hsa.h" |
| 71 | #include "hsa/hsa_ext_amd.h" |
| 72 | #endif |
| 73 | #else |
| 74 | #include "hsa/hsa.h" |
| 75 | #include "hsa/hsa_ext_amd.h" |
| 76 | #endif |
| 77 | |
| 78 | using namespace error; |
| 79 | |
| 80 | namespace llvm { |
| 81 | namespace omp { |
| 82 | namespace target { |
| 83 | namespace plugin { |
| 84 | |
| 85 | /// Forward declarations for all specialized data structures. |
| 86 | struct AMDGPUKernelTy; |
| 87 | struct AMDGPUDeviceTy; |
| 88 | struct AMDGPUPluginTy; |
| 89 | struct AMDGPUStreamTy; |
| 90 | struct AMDGPUEventTy; |
| 91 | struct AMDGPUStreamManagerTy; |
| 92 | struct AMDGPUEventManagerTy; |
| 93 | struct AMDGPUDeviceImageTy; |
| 94 | struct AMDGPUMemoryManagerTy; |
| 95 | struct AMDGPUMemoryPoolTy; |
| 96 | |
| 97 | namespace hsa_utils { |
| 98 | |
| 99 | /// Iterate elements using an HSA iterate function. Do not use this function |
| 100 | /// directly but the specialized ones below instead. |
| 101 | template <typename ElemTy, typename IterFuncTy, typename CallbackTy> |
| 102 | hsa_status_t iterate(IterFuncTy Func, CallbackTy Cb) { |
| 103 | auto L = [](ElemTy Elem, void *Data) -> hsa_status_t { |
| 104 | CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data); |
| 105 | return (*Unwrapped)(Elem); |
| 106 | }; |
| 107 | return Func(L, static_cast<void *>(&Cb)); |
| 108 | } |
| 109 | |
| 110 | /// Iterate elements using an HSA iterate function passing a parameter. Do not |
| 111 | /// use this function directly but the specialized ones below instead. |
| 112 | template <typename ElemTy, typename IterFuncTy, typename IterFuncArgTy, |
| 113 | typename CallbackTy> |
| 114 | hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) { |
| 115 | auto L = [](ElemTy Elem, void *Data) -> hsa_status_t { |
| 116 | CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data); |
| 117 | return (*Unwrapped)(Elem); |
| 118 | }; |
| 119 | return Func(FuncArg, L, static_cast<void *>(&Cb)); |
| 120 | } |
| 121 | |
| 122 | /// Iterate elements using an HSA iterate function passing a parameter. Do not |
| 123 | /// use this function directly but the specialized ones below instead. |
| 124 | template <typename Elem1Ty, typename Elem2Ty, typename IterFuncTy, |
| 125 | typename IterFuncArgTy, typename CallbackTy> |
| 126 | hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) { |
| 127 | auto L = [](Elem1Ty Elem1, Elem2Ty Elem2, void *Data) -> hsa_status_t { |
| 128 | CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data); |
| 129 | return (*Unwrapped)(Elem1, Elem2); |
| 130 | }; |
| 131 | return Func(FuncArg, L, static_cast<void *>(&Cb)); |
| 132 | } |
| 133 | |
| 134 | /// Iterate agents. |
| 135 | template <typename CallbackTy> Error iterateAgents(CallbackTy Callback) { |
| 136 | hsa_status_t Status = iterate<hsa_agent_t>(hsa_iterate_agents, Callback); |
| 137 | return Plugin::check(Status, "error in hsa_iterate_agents: %s" ); |
| 138 | } |
| 139 | |
| 140 | /// Iterate ISAs of an agent. |
| 141 | template <typename CallbackTy> |
| 142 | Error iterateAgentISAs(hsa_agent_t Agent, CallbackTy Cb) { |
| 143 | hsa_status_t Status = iterate<hsa_isa_t>(hsa_agent_iterate_isas, Agent, Cb); |
| 144 | return Plugin::check(Status, "error in hsa_agent_iterate_isas: %s" ); |
| 145 | } |
| 146 | |
| 147 | /// Iterate memory pools of an agent. |
| 148 | template <typename CallbackTy> |
| 149 | Error iterateAgentMemoryPools(hsa_agent_t Agent, CallbackTy Cb) { |
| 150 | hsa_status_t Status = iterate<hsa_amd_memory_pool_t>( |
| 151 | hsa_amd_agent_iterate_memory_pools, Agent, Cb); |
| 152 | return Plugin::check(Status, |
| 153 | "error in hsa_amd_agent_iterate_memory_pools: %s" ); |
| 154 | } |
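// Illustrative sketch (hypothetical usage, not part of the plugin): the
// wrappers above take any callable that receives the element and returns an
// hsa_status_t. `GPUAgents` is a placeholder name for this example only.
//
//   llvm::SmallVector<hsa_agent_t> GPUAgents;
//   if (auto Err = hsa_utils::iterateAgents([&](hsa_agent_t Agent) {
//         hsa_device_type_t Type;
//         if (hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &Type) !=
//             HSA_STATUS_SUCCESS)
//           return HSA_STATUS_ERROR;
//         if (Type == HSA_DEVICE_TYPE_GPU)
//           GPUAgents.push_back(Agent);
//         return HSA_STATUS_SUCCESS;
//       }))
//     return Err;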
| 155 | |
| 156 | /// Dispatches an asynchronous memory copy. |
| 157 | /// Enables different SDMA engines for the dispatch in a round-robin fashion. |
| 158 | Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent, |
| 159 | const void *Src, hsa_agent_t SrcAgent, size_t Size, |
| 160 | uint32_t NumDepSignals, const hsa_signal_t *DepSignals, |
| 161 | hsa_signal_t CompletionSignal) { |
| 162 | if (!UseMultipleSdmaEngines) { |
| 163 | hsa_status_t S = |
| 164 | hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, Size, |
| 165 | NumDepSignals, DepSignals, CompletionSignal); |
| 166 | return Plugin::check(S, "error in hsa_amd_memory_async_copy: %s" ); |
| 167 | } |
| 168 | |
| 169 | // This solution is probably not the best |
| 170 | #if !(HSA_AMD_INTERFACE_VERSION_MAJOR >= 1 && \ |
| 171 | HSA_AMD_INTERFACE_VERSION_MINOR >= 2) |
| 172 | return Plugin::error(ErrorCode::UNSUPPORTED, |
| 173 | "async copy on selected SDMA requires ROCm 5.7" ); |
| 174 | #else |
| 175 | static std::atomic<int> SdmaEngine{1}; |
| 176 | |
| 177 | // This atomics solution is probably not the best, but should be sufficient |
| 178 | // for now. |
| 179 | // In the worst case, threads read the same value and dispatch to the same |
| 180 | // SDMA engine. This may result in sub-optimal performance. However, the |
| 181 | // probability of that happening should be fairly low. |
| 182 | int LocalSdmaEngine = SdmaEngine.load(std::memory_order_acquire); |
| 183 | // This call is only available in ROCm >= 5.7. |
| 184 | hsa_status_t S = hsa_amd_memory_async_copy_on_engine( |
| 185 | Dst, DstAgent, Src, SrcAgent, Size, NumDepSignals, DepSignals, |
| 186 | CompletionSignal, (hsa_amd_sdma_engine_id_t)LocalSdmaEngine, |
| 187 | /*force_copy_on_sdma=*/true); |
| 188 | // Increment to use one of two SDMA engines: 0x1, 0x2 |
| 189 | LocalSdmaEngine = (LocalSdmaEngine << 1) % 3; |
| 190 | SdmaEngine.store(LocalSdmaEngine, std::memory_order_relaxed); |
| 191 | |
| 192 | return Plugin::check(S, "error in hsa_amd_memory_async_copy_on_engine: %s" ); |
| 193 | #endif |
| 194 | } |
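// Illustrative sketch (hypothetical usage): issuing one async copy and blocking
// on its completion signal. `Dst`, `Src`, `Agent`, and `Size` are placeholders.
//
//   hsa_signal_t Done;
//   if (auto Err = Plugin::check(hsa_signal_create(1, 0, nullptr, &Done),
//                                "error in hsa_signal_create: %s"))
//     return Err;
//   if (auto Err = hsa_utils::asyncMemCopy(/*UseMultipleSdmaEngines=*/false,
//                                          Dst, Agent, Src, Agent, Size,
//                                          /*NumDepSignals=*/0, nullptr, Done))
//     return Err;
//   // The runtime decrements the signal to zero when the copy finishes.
//   hsa_signal_wait_scacquire(Done, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
//                             HSA_WAIT_STATE_BLOCKED);
//   hsa_signal_destroy(Done);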
| 195 | |
| 196 | Error getTargetTripleAndFeatures(hsa_agent_t Agent, |
| 197 | SmallVector<SmallString<32>> &Targets) { |
| 198 | auto Err = hsa_utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) { |
| 199 | uint32_t Length; |
| 200 | hsa_status_t Status; |
| 201 | Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME_LENGTH, &Length); |
| 202 | if (Status != HSA_STATUS_SUCCESS) |
| 203 | return Status; |
| 204 | |
| 205 | llvm::SmallVector<char> ISAName(Length); |
| 206 | Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, ISAName.begin()); |
| 207 | if (Status != HSA_STATUS_SUCCESS) |
| 208 | return Status; |
| 209 | |
| 210 | llvm::StringRef TripleTarget(ISAName.begin(), Length); |
| 211 | if (TripleTarget.consume_front("amdgcn-amd-amdhsa")) { |
| 212 | auto Target = TripleTarget.ltrim('-').rtrim('\0'); |
| 213 | Targets.push_back(Target); |
| 214 | } |
| 215 | return HSA_STATUS_SUCCESS; |
| 216 | }); |
| 217 | return Err; |
| 218 | } |
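// Illustrative sketch (hypothetical usage): collecting the targets of an agent.
// An ISA name such as "amdgcn-amd-amdhsa--gfx90a:sramecc+:xnack-" (example
// value) yields the entry "gfx90a:sramecc+:xnack-" once the triple prefix and
// the leading '-' are stripped.
//
//   llvm::SmallVector<llvm::SmallString<32>> Targets;
//   if (auto Err = hsa_utils::getTargetTripleAndFeatures(Agent, Targets))
//     return Err;
//   for (const auto &Target : Targets)
//     DP("Agent supports target %s\n", Target.c_str());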
| 219 | } // namespace hsa_utils |
| 220 | |
| 221 | /// Utility class representing generic resource references to AMDGPU resources. |
| 222 | template <typename ResourceTy> |
| 223 | struct AMDGPUResourceRef : public GenericDeviceResourceRef { |
| 224 | /// The underlying handle type for resources. |
| 225 | using HandleTy = ResourceTy *; |
| 226 | |
| 227 | /// Create an empty reference to an invalid resource. |
| 228 | AMDGPUResourceRef() : Resource(nullptr) {} |
| 229 | |
| 230 | /// Create a reference to an existing resource. |
| 231 | AMDGPUResourceRef(HandleTy Resource) : Resource(Resource) {} |
| 232 | |
| 233 | virtual ~AMDGPUResourceRef() {} |
| 234 | |
| 235 | /// Create a new resource and save the reference. The reference must be empty |
| 236 | /// before calling to this function. |
| 237 | Error create(GenericDeviceTy &Device) override; |
| 238 | |
| 239 | /// Destroy the referenced resource and invalidate the reference. The |
| 240 | /// reference must be to a valid resource before calling to this function. |
| 241 | Error destroy(GenericDeviceTy &Device) override { |
| 242 | if (!Resource) |
| 243 | return Plugin::error(ErrorCode::INVALID_ARGUMENT, |
| 244 | "destroying an invalid resource" ); |
| 245 | |
| 246 | if (auto Err = Resource->deinit()) |
| 247 | return Err; |
| 248 | |
| 249 | delete Resource; |
| 250 | |
| 251 | Resource = nullptr; |
| 252 | return Plugin::success(); |
| 253 | } |
| 254 | |
| 255 | /// Get the underlying resource handle. |
| 256 | operator HandleTy() const { return Resource; } |
| 257 | |
| 258 | private: |
| 259 | /// The handle to the actual resource. |
| 260 | HandleTy Resource; |
| 261 | }; |
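// Illustrative sketch (hypothetical usage) of a resource reference lifecycle;
// `Device` stands for any initialized GenericDeviceTy.
//
//   AMDGPUResourceRef<AMDGPUSignalTy> Ref;
//   if (auto Err = Ref.create(Device))    // Allocates and initializes the resource.
//     return Err;
//   AMDGPUSignalTy *Signal = Ref;         // Implicit conversion to the handle.
//   // ... use the resource ...
//   if (auto Err = Ref.destroy(Device))   // Deinitializes and deletes it.
//     return Err;
//
// In practice these references are managed by GenericDeviceResourceManagerTy,
// which pools them so signals and streams can be reused across operations.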
| 262 | |
| 263 | /// Class holding an HSA memory pool. |
| 264 | struct AMDGPUMemoryPoolTy { |
| 265 | /// Create a memory pool from an HSA memory pool. |
| 266 | AMDGPUMemoryPoolTy(hsa_amd_memory_pool_t MemoryPool) |
| 267 | : MemoryPool(MemoryPool), GlobalFlags(0) {} |
| 268 | |
| 269 | /// Initialize the memory pool retrieving its properties. |
| 270 | Error init() { |
| 271 | if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_SEGMENT, Segment)) |
| 272 | return Err; |
| 273 | |
| 274 | if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, GlobalFlags)) |
| 275 | return Err; |
| 276 | |
| 277 | return Plugin::success(); |
| 278 | } |
| 279 | |
| 280 | /// Getter of the HSA memory pool. |
| 281 | hsa_amd_memory_pool_t get() const { return MemoryPool; } |
| 282 | |
| 283 | /// Indicate the segment to which the memory pool belongs. |
| 284 | bool isGlobal() const { return (Segment == HSA_AMD_SEGMENT_GLOBAL); } |
| 285 | bool isReadOnly() const { return (Segment == HSA_AMD_SEGMENT_READONLY); } |
| 286 | bool isPrivate() const { return (Segment == HSA_AMD_SEGMENT_PRIVATE); } |
| 287 | bool isGroup() const { return (Segment == HSA_AMD_SEGMENT_GROUP); } |
| 288 | |
| 289 | /// Indicate if it is fine-grained memory. Valid only for global. |
| 290 | bool isFineGrained() const { |
| 291 | assert(isGlobal() && "Not global memory" ); |
| 292 | return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED); |
| 293 | } |
| 294 | |
| 295 | /// Indicate if it is coarse-grained memory. Valid only for global. |
| 296 | bool isCoarseGrained() const { |
| 297 | assert(isGlobal() && "Not global memory" ); |
| 298 | return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED); |
| 299 | } |
| 300 | |
| 301 | /// Indicate if it supports storing kernel arguments. Valid only for global. |
| 302 | bool supportsKernelArgs() const { |
| 303 | assert(isGlobal() && "Not global memory" ); |
| 304 | return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT); |
| 305 | } |
| 306 | |
| 307 | /// Allocate memory on the memory pool. |
| 308 | Error allocate(size_t Size, void **PtrStorage) { |
| 309 | hsa_status_t Status = |
| 310 | hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, PtrStorage); |
| 311 | return Plugin::check(Status, "error in hsa_amd_memory_pool_allocate: %s" ); |
| 312 | } |
| 313 | |
| 314 | /// Return memory to the memory pool. |
| 315 | Error deallocate(void *Ptr) { |
| 316 | hsa_status_t Status = hsa_amd_memory_pool_free(Ptr); |
| 317 | return Plugin::check(Status, "error in hsa_amd_memory_pool_free: %s" ); |
| 318 | } |
| 319 | |
| 320 | /// Returns if the \p Agent can access the memory pool. |
| 321 | bool canAccess(hsa_agent_t Agent) { |
| 322 | hsa_amd_memory_pool_access_t Access; |
| 323 | if (hsa_amd_agent_memory_pool_get_info( |
| 324 | Agent, MemoryPool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &Access)) |
| 325 | return false; |
| 326 | return Access != HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED; |
| 327 | } |
| 328 | |
| 329 | /// Allow the device to access a specific allocation. |
| 330 | Error enableAccess(void *Ptr, int64_t Size, |
| 331 | const llvm::SmallVector<hsa_agent_t> &Agents) const { |
| 332 | #ifdef OMPTARGET_DEBUG |
| 333 | for (hsa_agent_t Agent : Agents) { |
| 334 | hsa_amd_memory_pool_access_t Access; |
| 335 | if (auto Err = |
| 336 | getAttr(Agent, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, Access)) |
| 337 | return Err; |
| 338 | |
| 339 | // The agent is not allowed to access the memory pool in any case. Do not |
| 340 | // continue because it would otherwise result in undefined behavior. |
| 341 | if (Access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) |
| 342 | return Plugin::error(ErrorCode::INVALID_VALUE, |
| 343 | "an agent is not allowed to access a memory pool" ); |
| 344 | } |
| 345 | #endif |
| 346 | |
| 347 | // Access is allowed but disabled by default, so enable it explicitly. |
| 348 | hsa_status_t Status = |
| 349 | hsa_amd_agents_allow_access(Agents.size(), Agents.data(), nullptr, Ptr); |
| 350 | return Plugin::check(Status, "error in hsa_amd_agents_allow_access: %s" ); |
| 351 | } |
| 352 | |
| 353 | /// Get attribute from the memory pool. |
| 354 | template <typename Ty> |
| 355 | Error getAttr(hsa_amd_memory_pool_info_t Kind, Ty &Value) const { |
| 356 | hsa_status_t Status; |
| 357 | Status = hsa_amd_memory_pool_get_info(MemoryPool, Kind, &Value); |
| 358 | return Plugin::check(Status, "error in hsa_amd_memory_pool_get_info: %s" ); |
| 359 | } |
| 360 | |
| 361 | template <typename Ty> |
| 362 | hsa_status_t getAttrRaw(hsa_amd_memory_pool_info_t Kind, Ty &Value) const { |
| 363 | return hsa_amd_memory_pool_get_info(MemoryPool, Kind, &Value); |
| 364 | } |
| 365 | |
| 366 | /// Get attribute from the memory pool relating to an agent. |
| 367 | template <typename Ty> |
| 368 | Error getAttr(hsa_agent_t Agent, hsa_amd_agent_memory_pool_info_t Kind, |
| 369 | Ty &Value) const { |
| 370 | hsa_status_t Status; |
| 371 | Status = |
| 372 | hsa_amd_agent_memory_pool_get_info(Agent, MemoryPool, Kind, &Value); |
| 373 | return Plugin::check(Status, |
| 374 | "error in hsa_amd_agent_memory_pool_get_info: %s" ); |
| 375 | } |
| 376 | |
| 377 | private: |
| 378 | /// The HSA memory pool. |
| 379 | hsa_amd_memory_pool_t MemoryPool; |
| 380 | |
| 381 | /// The segment where the memory pool belongs to. |
| 382 | hsa_amd_segment_t Segment; |
| 383 | |
| 384 | /// The global flags of memory pool. Only valid if the memory pool belongs to |
| 385 | /// the global segment. |
| 386 | uint32_t GlobalFlags; |
| 387 | }; |
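// Illustrative sketch (hypothetical usage): allocating from a global memory pool
// and making the buffer visible to the relevant agents. `Pool`, `Size`,
// `DeviceAgent`, and `HostAgent` are placeholders.
//
//   void *Ptr = nullptr;
//   if (auto Err = Pool.allocate(Size, &Ptr))
//     return Err;
//   // Allocations are not visible to other agents until access is enabled.
//   if (auto Err = Pool.enableAccess(Ptr, Size, {DeviceAgent, HostAgent}))
//     return Err;
//   // ... use the buffer ...
//   if (auto Err = Pool.deallocate(Ptr))
//     return Err;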
| 388 | |
| 389 | /// Class that implements a memory manager that gets memory from a specific |
| 390 | /// memory pool. |
| 391 | struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy { |
| 392 | |
| 393 | /// Create an empty memory manager. |
| 394 | AMDGPUMemoryManagerTy(AMDGPUPluginTy &Plugin) |
| 395 | : Plugin(Plugin), MemoryPool(nullptr), MemoryManager(nullptr) {} |
| 396 | |
| 397 | /// Initialize the memory manager from a memory pool. |
| 398 | Error init(AMDGPUMemoryPoolTy &MemoryPool) { |
| 399 | const uint32_t Threshold = 1 << 30; |
| 400 | this->MemoryManager = new MemoryManagerTy(*this, Threshold); |
| 401 | this->MemoryPool = &MemoryPool; |
| 402 | return Plugin::success(); |
| 403 | } |
| 404 | |
| 405 | /// Deinitialize the memory manager and free its allocations. |
| 406 | Error deinit() { |
| 407 | assert(MemoryManager && "Invalid memory manager" ); |
| 408 | |
| 409 | // Delete and invalidate the memory manager. At this point, the memory |
| 410 | // manager will deallocate all its allocations. |
| 411 | delete MemoryManager; |
| 412 | MemoryManager = nullptr; |
| 413 | |
| 414 | return Plugin::success(); |
| 415 | } |
| 416 | |
| 417 | /// Reuse or allocate memory through the memory manager. |
| 418 | Error allocate(size_t Size, void **PtrStorage) { |
| 419 | assert(MemoryManager && "Invalid memory manager" ); |
| 420 | assert(PtrStorage && "Invalid pointer storage" ); |
| 421 | |
| 422 | *PtrStorage = MemoryManager->allocate(Size, nullptr); |
| 423 | if (Size && *PtrStorage == nullptr) |
| 424 | return Plugin::error(ErrorCode::OUT_OF_RESOURCES, |
| 425 | "failure to allocate from AMDGPU memory manager" ); |
| 426 | |
| 427 | return Plugin::success(); |
| 428 | } |
| 429 | |
| 430 | /// Release an allocation to be reused. |
| 431 | Error deallocate(void *Ptr) { |
| 432 | if (MemoryManager->free(Ptr)) |
| 433 | return Plugin::error(ErrorCode::UNKNOWN, |
| 434 | "failure to deallocate from AMDGPU memory manager" ); |
| 435 | |
| 436 | return Plugin::success(); |
| 437 | } |
| 438 | |
| 439 | private: |
| 440 | /// Allocation callback that will be called once the memory manager does not |
| 441 | /// have more previously allocated buffers. |
| 442 | void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) override; |
| 443 | |
| 444 | /// Deallocation callback that will be called by the memory manager. |
| 445 | int free(void *TgtPtr, TargetAllocTy Kind) override { |
| 446 | if (auto Err = MemoryPool->deallocate(TgtPtr)) { |
| 447 | consumeError(std::move(Err)); |
| 448 | return OFFLOAD_FAIL; |
| 449 | } |
| 450 | return OFFLOAD_SUCCESS; |
| 451 | } |
| 452 | |
| 453 | /// The underlying plugin that owns this memory manager. |
| 454 | AMDGPUPluginTy &Plugin; |
| 455 | |
| 456 | /// The memory pool used to allocate memory. |
| 457 | AMDGPUMemoryPoolTy *MemoryPool; |
| 458 | |
| 459 | /// Reference to the actual memory manager. |
| 460 | MemoryManagerTy *MemoryManager; |
| 461 | }; |
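// Illustrative sketch (hypothetical usage): the manager caches freed buffers, so
// frequent allocations of similar sizes (e.g., kernel argument buffers) avoid
// repeated HSA calls. `Plugin`, `KernArgsPool`, and `ArgsSize` are placeholders.
//
//   AMDGPUMemoryManagerTy ArgsManager(Plugin);
//   if (auto Err = ArgsManager.init(KernArgsPool))
//     return Err;
//   void *Buffer = nullptr;
//   if (auto Err = ArgsManager.allocate(ArgsSize, &Buffer))
//     return Err;
//   // ... fill Buffer and launch ...
//   if (auto Err = ArgsManager.deallocate(Buffer))   // Returned to the cache.
//     return Err;
//   if (auto Err = ArgsManager.deinit())             // Frees all cached buffers.
//     return Err;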
| 462 | |
| 463 | /// Class implementing the AMDGPU device images' properties. |
| 464 | struct AMDGPUDeviceImageTy : public DeviceImageTy { |
| 465 | /// Create the AMDGPU image with the id and the target image pointer. |
| 466 | AMDGPUDeviceImageTy(int32_t ImageId, GenericDeviceTy &Device, |
| 467 | const __tgt_device_image *TgtImage) |
| 468 | : DeviceImageTy(ImageId, Device, TgtImage) {} |
| 469 | |
| 470 | /// Prepare and load the executable corresponding to the image. |
| 471 | Error loadExecutable(const AMDGPUDeviceTy &Device); |
| 472 | |
| 473 | /// Unload the executable. |
| 474 | Error unloadExecutable() { |
| 475 | hsa_status_t Status = hsa_executable_destroy(Executable); |
| 476 | return Plugin::check(Status, "error in hsa_executable_destroy: %s" ); |
| 477 | } |
| 478 | |
| 479 | /// Get the executable. |
| 480 | hsa_executable_t getExecutable() const { return Executable; } |
| 481 | |
| 482 | /// Get the Code Object Version of the ELF. |
| 483 | uint16_t getELFABIVersion() const { return ELFABIVersion; } |
| 484 | |
| 485 | /// Find an HSA device symbol by its name on the executable. |
| 486 | Expected<hsa_executable_symbol_t> |
| 487 | findDeviceSymbol(GenericDeviceTy &Device, StringRef SymbolName) const; |
| 488 | |
| 489 | /// Get additional info for kernel, e.g., register spill counts |
| 490 | std::optional<offloading::amdgpu::AMDGPUKernelMetaData> |
| 491 | getKernelInfo(StringRef Identifier) const { |
| 492 | auto It = KernelInfoMap.find(Identifier); |
| 493 | |
| 494 | if (It == KernelInfoMap.end()) |
| 495 | return {}; |
| 496 | |
| 497 | return It->second; |
| 498 | } |
| 499 | |
| 500 | private: |
| 501 | /// The executable loaded on the agent. |
| 502 | hsa_executable_t Executable; |
| 503 | StringMap<offloading::amdgpu::AMDGPUKernelMetaData> KernelInfoMap; |
| 504 | uint16_t ELFABIVersion; |
| 505 | }; |
| 506 | |
| 507 | /// Class implementing the AMDGPU kernel functionalities which derives from the |
| 508 | /// generic kernel class. |
| 509 | struct AMDGPUKernelTy : public GenericKernelTy { |
| 510 | /// Create an AMDGPU kernel with a name and an execution mode. |
| 511 | AMDGPUKernelTy(const char *Name) : GenericKernelTy(Name) {} |
| 512 | |
| 513 | /// Initialize the AMDGPU kernel. |
| 514 | Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override { |
| 515 | AMDGPUDeviceImageTy &AMDImage = static_cast<AMDGPUDeviceImageTy &>(Image); |
| 516 | |
| 517 | // Kernel symbols have a ".kd" suffix. |
| 518 | std::string KernelName(getName()); |
| 519 | KernelName += ".kd" ; |
| 520 | |
| 521 | // Find the symbol on the device executable. |
| 522 | auto SymbolOrErr = AMDImage.findDeviceSymbol(Device, KernelName); |
| 523 | if (!SymbolOrErr) |
| 524 | return SymbolOrErr.takeError(); |
| 525 | |
| 526 | hsa_executable_symbol_t Symbol = *SymbolOrErr; |
| 527 | hsa_symbol_kind_t SymbolType; |
| 528 | hsa_status_t Status; |
| 529 | |
| 530 | // Retrieve different properties of the kernel symbol. |
| 531 | std::pair<hsa_executable_symbol_info_t, void *> RequiredInfos[] = { |
| 532 | {HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &SymbolType}, |
| 533 | {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &KernelObject}, |
| 534 | {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &ArgsSize}, |
| 535 | {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &GroupSize}, |
| 536 | {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &DynamicStack}, |
| 537 | {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &PrivateSize}}; |
| 538 | |
| 539 | for (auto &Info : RequiredInfos) { |
| 540 | Status = hsa_executable_symbol_get_info(Symbol, Info.first, Info.second); |
| 541 | if (auto Err = Plugin::check( |
| 542 | Status, "error in hsa_executable_symbol_get_info: %s" )) |
| 543 | return Err; |
| 544 | } |
| 545 | |
| 546 | // Make sure it is a kernel symbol. |
| 547 | if (SymbolType != HSA_SYMBOL_KIND_KERNEL) |
| 548 | return Plugin::error(ErrorCode::INVALID_BINARY, |
| 549 | "symbol %s is not a kernel function" ); |
| 550 | |
| 551 | // TODO: Read the kernel descriptor for the max threads per block. May be |
| 552 | // read from the image. |
| 553 | |
| 554 | ImplicitArgsSize = |
| 555 | hsa_utils::getImplicitArgsSize(AMDImage.getELFABIVersion()); |
| 556 | DP("ELFABIVersion: %d\n" , AMDImage.getELFABIVersion()); |
| 557 | |
| 558 | // Get additional kernel info read from image |
| 559 | KernelInfo = AMDImage.getKernelInfo(getName()); |
| 560 | if (!KernelInfo.has_value()) |
| 561 | INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device.getDeviceId(), |
| 562 | "Could not read extra information for kernel %s." , getName()); |
| 563 | |
| 564 | return Plugin::success(); |
| 565 | } |
| 566 | |
| 567 | /// Launch the AMDGPU kernel function. |
| 568 | Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3], |
| 569 | uint32_t NumBlocks[3], KernelArgsTy &KernelArgs, |
| 570 | KernelLaunchParamsTy LaunchParams, |
| 571 | AsyncInfoWrapperTy &AsyncInfoWrapper) const override; |
| 572 | |
| 573 | /// Print more elaborate kernel launch info for AMDGPU |
| 574 | Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice, |
| 575 | KernelArgsTy &KernelArgs, uint32_t NumThreads[3], |
| 576 | uint32_t NumBlocks[3]) const override; |
| 577 | |
| 578 | /// Get group and private segment kernel size. |
| 579 | uint32_t getGroupSize() const { return GroupSize; } |
| 580 | uint32_t getPrivateSize() const { return PrivateSize; } |
| 581 | |
| 582 | /// Get the HSA kernel object representing the kernel function. |
| 583 | uint64_t getKernelObject() const { return KernelObject; } |
| 584 | |
| 585 | /// Get the size of implicitargs based on the code object version. |
| 586 | uint32_t getImplicitArgsSize() const { return ImplicitArgsSize; } |
| 587 | |
| 588 | /// Indicates whether or not we need to set up our own private segment size. |
| 589 | bool usesDynamicStack() const { return DynamicStack; } |
| 590 | |
| 591 | private: |
| 592 | /// The kernel object to execute. |
| 593 | uint64_t KernelObject; |
| 594 | |
| 595 | /// The args, group and private segments sizes required by a kernel instance. |
| 596 | uint32_t ArgsSize; |
| 597 | uint32_t GroupSize; |
| 598 | uint32_t PrivateSize; |
| 599 | bool DynamicStack; |
| 600 | |
| 601 | /// The size of implicit kernel arguments. |
| 602 | uint32_t ImplicitArgsSize; |
| 603 | |
| 604 | /// Additional Info for the AMD GPU Kernel |
| 605 | std::optional<offloading::amdgpu::AMDGPUKernelMetaData> KernelInfo; |
| 606 | }; |
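// Illustrative sketch: resolving and initializing a kernel named "foo" (example
// name). The device executable exposes its descriptor under the symbol "foo.kd",
// which initImpl() queries for the kernel object, kernarg size, and group/private
// segment sizes used later when building the dispatch packet.
//
//   AMDGPUKernelTy Kernel("foo");
//   if (auto Err = Kernel.init(Device, Image))     // Looks up "foo.kd".
//     return Err;
//   uint64_t Object = Kernel.getKernelObject();    // Code handle to dispatch.
//   uint32_t LDS = Kernel.getGroupSize();          // Static group segment size.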
| 607 | |
| 608 | /// Class representing an HSA signal. Signals are used to define dependencies |
| 609 | /// between asynchronous operations: kernel launches and memory transfers. |
| 610 | struct AMDGPUSignalTy { |
| 611 | /// Create an empty signal. |
| 612 | AMDGPUSignalTy() : HSASignal({0}), UseCount() {} |
| 613 | AMDGPUSignalTy(AMDGPUDeviceTy &Device) : HSASignal({0}), UseCount() {} |
| 614 | |
| 615 | /// Initialize the signal with an initial value. |
| 616 | Error init(uint32_t InitialValue = 1) { |
| 617 | hsa_status_t Status = |
| 618 | hsa_amd_signal_create(InitialValue, 0, nullptr, 0, &HSASignal); |
| 619 | return Plugin::check(Status, "error in hsa_signal_create: %s" ); |
| 620 | } |
| 621 | |
| 622 | /// Deinitialize the signal. |
| 623 | Error deinit() { |
| 624 | hsa_status_t Status = hsa_signal_destroy(HSASignal); |
| 625 | return Plugin::check(Status, "error in hsa_signal_destroy: %s" ); |
| 626 | } |
| 627 | |
| 628 | /// Wait until the signal gets a zero value. |
| 629 | Error wait(const uint64_t ActiveTimeout = 0, |
| 630 | GenericDeviceTy *Device = nullptr) const { |
| 631 | if (ActiveTimeout) { |
| 632 | hsa_signal_value_t Got = 1; |
| 633 | Got = hsa_signal_wait_scacquire(HSASignal, HSA_SIGNAL_CONDITION_EQ, 0, |
| 634 | ActiveTimeout, HSA_WAIT_STATE_ACTIVE); |
| 635 | if (Got == 0) |
| 636 | return Plugin::success(); |
| 637 | } |
| 638 | |
| 639 | // If there is an RPC device attached to this stream we run it as a server. |
| 640 | uint64_t Timeout = UINT64_MAX; |
| 641 | auto WaitState = HSA_WAIT_STATE_BLOCKED; |
| 642 | while (hsa_signal_wait_scacquire(HSASignal, HSA_SIGNAL_CONDITION_EQ, 0, |
| 643 | Timeout, WaitState) != 0) |
| 644 | ; |
| 645 | return Plugin::success(); |
| 646 | } |
| 647 | |
| 648 | /// Load the value on the signal. |
| 649 | hsa_signal_value_t load() const { |
| 650 | return hsa_signal_load_scacquire(HSASignal); |
| 651 | } |
| 652 | |
| 653 | /// Decrement the signal value by one. |
| 654 | void signal() { |
| 655 | assert(load() > 0 && "Invalid signal value" ); |
| 656 | hsa_signal_subtract_screlease(HSASignal, 1); |
| 657 | } |
| 658 | |
| 659 | /// Reset the signal value before reusing the signal. Do not call this |
| 660 | /// function if the signal is being currently used by any watcher, such as a |
| 661 | /// plugin thread or the HSA runtime. |
| 662 | void reset() { hsa_signal_store_screlease(HSASignal, 1); } |
| 663 | |
| 664 | /// Increase the number of concurrent uses. |
| 665 | void increaseUseCount() { UseCount.increase(); } |
| 666 | |
| 667 | /// Decrease the number of concurrent uses and return whether it was the last. |
| 668 | bool decreaseUseCount() { return UseCount.decrease(); } |
| 669 | |
| 670 | hsa_signal_t get() const { return HSASignal; } |
| 671 | |
| 672 | private: |
| 673 | /// The underlying HSA signal. |
| 674 | hsa_signal_t HSASignal; |
| 675 | |
| 676 | /// Reference counter for tracking the concurrent use count. This is mainly |
| 677 | /// used for knowing how many streams are using the signal. |
| 678 | RefCountTy<> UseCount; |
| 679 | }; |
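// Illustrative sketch (hypothetical usage) of the signal lifecycle when chaining
// operations; `BusyWaitMicroseconds` is a placeholder for the stream's timeout hint.
//
//   AMDGPUSignalTy Signal;
//   if (auto Err = Signal.init())          // Counter starts at 1.
//     return Err;
//   // ... submit an operation with Signal.get() as its completion signal ...
//   if (auto Err = Signal.wait(BusyWaitMicroseconds))  // Blocks until it hits 0.
//     return Err;
//   Signal.reset();                        // Restore to 1 before reusing it.
//   if (auto Err = Signal.deinit())
//     return Err;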
| 680 | |
| 681 | /// Classes for holding AMDGPU signals and managing signals. |
| 682 | using AMDGPUSignalRef = AMDGPUResourceRef<AMDGPUSignalTy>; |
| 683 | using AMDGPUSignalManagerTy = GenericDeviceResourceManagerTy<AMDGPUSignalRef>; |
| 684 | |
| 685 | /// Class holding an HSA queue to submit kernel and barrier packets. |
| 686 | struct AMDGPUQueueTy { |
| 687 | /// Create an empty queue. |
| 688 | AMDGPUQueueTy() : Queue(nullptr), Mutex(), NumUsers(0) {} |
| 689 | |
| 690 | /// Lazily initialize a new queue belonging to a specific agent. |
| 691 | Error init(GenericDeviceTy &Device, hsa_agent_t Agent, int32_t QueueSize) { |
| 692 | if (Queue) |
| 693 | return Plugin::success(); |
| 694 | hsa_status_t Status = |
| 695 | hsa_queue_create(Agent, QueueSize, HSA_QUEUE_TYPE_MULTI, callbackError, |
| 696 | &Device, UINT32_MAX, UINT32_MAX, &Queue); |
| 697 | return Plugin::check(Status, "error in hsa_queue_create: %s" ); |
| 698 | } |
| 699 | |
| 700 | /// Deinitialize the queue and destroy its resources. |
| 701 | Error deinit() { |
| 702 | std::lock_guard<std::mutex> Lock(Mutex); |
| 703 | if (!Queue) |
| 704 | return Plugin::success(); |
| 705 | hsa_status_t Status = hsa_queue_destroy(Queue); |
| 706 | return Plugin::check(Status, "error in hsa_queue_destroy: %s" ); |
| 707 | } |
| 708 | |
| 709 | /// Returns the number of streams this queue is currently assigned to. |
| 710 | uint32_t getUserCount() const { return NumUsers; } |
| 711 | |
| 712 | /// Returns if the underlying HSA queue is initialized. |
| 713 | bool isInitialized() { return Queue != nullptr; } |
| 714 | |
| 715 | /// Decrement user count of the queue object. |
| 716 | void removeUser() { --NumUsers; } |
| 717 | |
| 718 | /// Increase user count of the queue object. |
| 719 | void addUser() { ++NumUsers; } |
| 720 | |
| 721 | /// Push a kernel launch to the queue. The kernel launch requires an output |
| 722 | /// signal and can define an optional input signal (nullptr if none). |
| 723 | Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs, |
| 724 | uint32_t NumThreads[3], uint32_t NumBlocks[3], |
| 725 | uint32_t GroupSize, uint64_t StackSize, |
| 726 | AMDGPUSignalTy *OutputSignal, |
| 727 | AMDGPUSignalTy *InputSignal) { |
| 728 | assert(OutputSignal && "Invalid kernel output signal" ); |
| 729 | |
| 730 | // Lock the queue during the packet publishing process. Notice this blocks |
| 731 | // the addition of other packets to the queue. The following piece of code |
| 732 | // should be lightweight; do not block the thread, allocate memory, etc. |
| 733 | std::lock_guard<std::mutex> Lock(Mutex); |
| 734 | assert(Queue && "Interacted with a non-initialized queue!" ); |
| 735 | |
| 736 | // Add a barrier packet before the kernel packet in case there is a pending |
| 737 | // preceding operation. The barrier packet will delay the processing of |
| 738 | // subsequent packets in the queue until the barrier input signals are |
| 739 | // satisfied. No output signal is needed because the dependency is already |
| 740 | // guaranteed by the queue barrier itself. |
| 741 | if (InputSignal && InputSignal->load()) |
| 742 | if (auto Err = pushBarrierImpl(nullptr, InputSignal)) |
| 743 | return Err; |
| 744 | |
| 745 | // Now prepare the kernel packet. |
| 746 | uint64_t PacketId; |
| 747 | hsa_kernel_dispatch_packet_t *Packet = acquirePacket(PacketId); |
| 748 | assert(Packet && "Invalid packet" ); |
| 749 | |
| 750 | // The first 32 bits of the packet are written after the other fields |
| 751 | uint16_t Dims = NumBlocks[2] * NumThreads[2] > 1 |
| 752 | ? 3 |
| 753 | : 1 + (NumBlocks[1] * NumThreads[1] != 1); |
| 754 | uint16_t Setup = UINT16_C(Dims) |
| 755 | << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; |
| 756 | Packet->workgroup_size_x = NumThreads[0]; |
| 757 | Packet->workgroup_size_y = NumThreads[1]; |
| 758 | Packet->workgroup_size_z = NumThreads[2]; |
| 759 | Packet->reserved0 = 0; |
| 760 | Packet->grid_size_x = NumBlocks[0] * NumThreads[0]; |
| 761 | Packet->grid_size_y = NumBlocks[1] * NumThreads[1]; |
| 762 | Packet->grid_size_z = NumBlocks[2] * NumThreads[2]; |
| 763 | Packet->private_segment_size = |
| 764 | Kernel.usesDynamicStack() ? StackSize : Kernel.getPrivateSize(); |
| 765 | Packet->group_segment_size = GroupSize; |
| 766 | Packet->kernel_object = Kernel.getKernelObject(); |
| 767 | Packet->kernarg_address = KernelArgs; |
| 768 | Packet->reserved2 = 0; |
| 769 | Packet->completion_signal = OutputSignal->get(); |
| 770 | |
| 771 | // Publish the packet. Do not modify the packet after this point. |
| 772 | publishKernelPacket(PacketId, Setup, Packet); |
| 773 | |
| 774 | return Plugin::success(); |
| 775 | } |
| 776 | |
| 777 | /// Push a barrier packet that will wait up to two input signals. All signals |
| 778 | /// are optional (nullptr if none). |
| 779 | Error pushBarrier(AMDGPUSignalTy *OutputSignal, |
| 780 | const AMDGPUSignalTy *InputSignal1, |
| 781 | const AMDGPUSignalTy *InputSignal2) { |
| 782 | // Lock the queue during the packet publishing process. |
| 783 | std::lock_guard<std::mutex> Lock(Mutex); |
| 784 | assert(Queue && "Interacted with a non-initialized queue!" ); |
| 785 | |
| 786 | // Push the barrier with the lock acquired. |
| 787 | return pushBarrierImpl(OutputSignal, InputSignal1, InputSignal2); |
| 788 | } |
| 789 | |
| 790 | private: |
| 791 | /// Push a barrier packet that will wait up to two input signals. Assumes |
| 792 | /// the queue lock is acquired. |
| 793 | Error pushBarrierImpl(AMDGPUSignalTy *OutputSignal, |
| 794 | const AMDGPUSignalTy *InputSignal1, |
| 795 | const AMDGPUSignalTy *InputSignal2 = nullptr) { |
| 796 | // Add a queue barrier waiting on both the other stream's operation and the |
| 797 | // last operation on the current stream (if any). |
| 798 | uint64_t PacketId; |
| 799 | hsa_barrier_and_packet_t *Packet = |
| 800 | (hsa_barrier_and_packet_t *)acquirePacket(PacketId); |
| 801 | assert(Packet && "Invalid packet" ); |
| 802 | |
| 803 | Packet->reserved0 = 0; |
| 804 | Packet->reserved1 = 0; |
| 805 | Packet->dep_signal[0] = {0}; |
| 806 | Packet->dep_signal[1] = {0}; |
| 807 | Packet->dep_signal[2] = {0}; |
| 808 | Packet->dep_signal[3] = {0}; |
| 809 | Packet->dep_signal[4] = {0}; |
| 810 | Packet->reserved2 = 0; |
| 811 | Packet->completion_signal = {0}; |
| 812 | |
| 813 | // Set input and output dependencies if needed. |
| 814 | if (OutputSignal) |
| 815 | Packet->completion_signal = OutputSignal->get(); |
| 816 | if (InputSignal1) |
| 817 | Packet->dep_signal[0] = InputSignal1->get(); |
| 818 | if (InputSignal2) |
| 819 | Packet->dep_signal[1] = InputSignal2->get(); |
| 820 | |
| 821 | // Publish the packet. Do not modify the packet after this point. |
| 822 | publishBarrierPacket(PacketId, Packet); |
| 823 | |
| 824 | return Plugin::success(); |
| 825 | } |
| 826 | |
| 827 | /// Acquire a packet from the queue. This call may block the thread if there |
| 828 | /// is no space in the underlying HSA queue. It may need to wait until the HSA |
| 829 | /// runtime processes some packets. Assumes the queue lock is acquired. |
| 830 | hsa_kernel_dispatch_packet_t *acquirePacket(uint64_t &PacketId) { |
| 831 | // Increase the queue index with relaxed memory order. Notice this will need |
| 832 | // another subsequent atomic operation with acquire order. |
| 833 | PacketId = hsa_queue_add_write_index_relaxed(Queue, 1); |
| 834 | |
| 835 | // Wait for the packet to be available. Notice the atomic operation uses |
| 836 | // the acquire memory order. |
| 837 | while (PacketId - hsa_queue_load_read_index_scacquire(Queue) >= Queue->size) |
| 838 | ; |
| 839 | |
| 840 | // Return the packet reference. |
| 841 | const uint32_t Mask = Queue->size - 1; // The size is a power of 2. |
| 842 | return (hsa_kernel_dispatch_packet_t *)Queue->base_address + |
| 843 | (PacketId & Mask); |
| 844 | } |
| 845 | |
| 846 | /// Publish the kernel packet so that the HSA runtime can start processing |
| 847 | /// the kernel launch. Do not modify the packet once this function is called. |
| 848 | /// Assumes the queue lock is acquired. |
| 849 | void publishKernelPacket(uint64_t PacketId, uint16_t Setup, |
| 850 | hsa_kernel_dispatch_packet_t *Packet) { |
| 851 | uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet); |
| 852 | |
| 853 | uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE; |
| 854 | Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; |
| 855 | Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; |
| 856 | |
| 857 | // Publish the packet. Do not modify the packet after this point. |
| 858 | uint32_t HeaderWord = Header | (Setup << 16u); |
| 859 | __atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE); |
| 860 | |
| 861 | // Signal the doorbell about the published packet. |
| 862 | hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId); |
| 863 | } |
| 864 | |
| 865 | /// Publish the barrier packet so that the HSA runtime can start processing |
| 866 | /// the barrier. Next packets in the queue will not be processed until all |
| 867 | /// barrier dependencies (signals) are satisfied. Assumes the queue is locked |
| 868 | void publishBarrierPacket(uint64_t PacketId, |
| 869 | hsa_barrier_and_packet_t *Packet) { |
| 870 | uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet); |
| 871 | uint16_t Setup = 0; |
| 872 | uint16_t Header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE; |
| 873 | Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; |
| 874 | Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; |
| 875 | |
| 876 | // Publish the packet. Do not modify the packet after this point. |
| 877 | uint32_t HeaderWord = Header | (Setup << 16u); |
| 878 | __atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE); |
| 879 | |
| 880 | // Signal the doorbell about the published packet. |
| 881 | hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId); |
| 882 | } |
| 883 | |
| 884 | /// Callback that will be called when an error is detected on the HSA queue. |
| 885 | static void callbackError(hsa_status_t Status, hsa_queue_t *Source, |
| 886 | void *Data); |
| 887 | |
| 888 | /// The HSA queue. |
| 889 | hsa_queue_t *Queue; |
| 890 | |
| 891 | /// Mutex to protect the acquiring and publishing of packets. For the moment, |
| 892 | /// we need this mutex to prevent publishing packets that are not ready to be |
| 893 | /// published in a multi-thread scenario. Without a queue lock, a thread T1 |
| 894 | /// could acquire packet P and thread T2 acquire packet P+1. Thread T2 could |
| 895 | /// publish its packet P+1 (signaling the queue's doorbell) before packet P |
| 896 | /// from T1 is ready to be processed. That scenario should be invalid. Thus, |
| 897 | /// we use the following mutex to make packet acquiring and publishing atomic. |
| 898 | /// TODO: There are other more advanced approaches to avoid this mutex using |
| 899 | /// atomic operations. We can further investigate it if this is a bottleneck. |
| 900 | std::mutex Mutex; |
| 901 | |
| 902 | /// The number of streams this queue is currently assigned to. A queue is |
| 903 | /// considered idle when this is zero, otherwise busy. |
| 904 | uint32_t NumUsers; |
| 905 | }; |
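// Worked example (assuming the standard HSA enum values: KERNEL_DISPATCH = 2,
// FENCE_SCOPE_SYSTEM = 2, acquire/release fence bit offsets 9 and 11): the
// header composed by publishKernelPacket() is
//   Header     = 2 | (2 << 9) | (2 << 11)               = 0x1402
// and, for a three-dimensional launch (Dims == 3, dimensions bit offset 0),
//   HeaderWord = Header | (Setup << 16) = 0x1402 | (3 << 16) = 0x00031402,
// which is stored with a single atomic release into the packet's first 32 bits
// to publish it to the packet processor.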
| 906 | |
| 907 | /// Struct that implements a stream of asynchronous operations for AMDGPU |
| 908 | /// devices. This class relies on signals to implement streams and define the |
| 909 | /// dependencies between asynchronous operations. |
| 910 | struct AMDGPUStreamTy { |
| 911 | private: |
| 912 | /// Utility struct holding arguments for async H2H memory copies. |
| 913 | struct MemcpyArgsTy { |
| 914 | void *Dst; |
| 915 | const void *Src; |
| 916 | size_t Size; |
| 917 | }; |
| 918 | |
| 919 | /// Utility struct holding arguments for freeing buffers to memory managers. |
| 920 | struct ReleaseBufferArgsTy { |
| 921 | void *Buffer; |
| 922 | AMDGPUMemoryManagerTy *MemoryManager; |
| 923 | }; |
| 924 | |
| 925 | /// Utility struct holding arguments for releasing signals to signal managers. |
| 926 | struct ReleaseSignalArgsTy { |
| 927 | AMDGPUSignalTy *Signal; |
| 928 | AMDGPUSignalManagerTy *SignalManager; |
| 929 | }; |
| 930 | |
| 931 | using AMDGPUStreamCallbackTy = Error(void *Data); |
| 932 | |
| 933 | /// The stream is composed of N stream's slots. The struct below represents |
| 934 | /// the fields of each slot. Each slot has a signal and an optional action |
| 935 | /// function. When appending an HSA asynchronous operation to the stream, one |
| 936 | /// slot is consumed and used to store the operation's information. The |
| 937 | /// operation's output signal is set to the consumed slot's signal. If there |
| 938 | /// is a previous asynchronous operation on the previous slot, the HSA async |
| 939 | /// operation's input signal is set to the signal of the previous slot. This |
| 940 | /// way, we obtain a chain of dependent async operations. The action is a |
| 941 | /// function that will be executed eventually after the operation is |
| 942 | /// completed, e.g., for releasing a buffer. |
| 943 | struct StreamSlotTy { |
| 944 | /// The output signal of the stream operation. May be used by the subsequent |
| 945 | /// operation as input signal. |
| 946 | AMDGPUSignalTy *Signal; |
| 947 | |
| 948 | /// The actions that must be performed after the operation's completion. |
| 949 | /// Empty when there is no action to perform. |
| 950 | llvm::SmallVector<AMDGPUStreamCallbackTy *> Callbacks; |
| 951 | |
| 952 | /// Space for the action's arguments. A pointer to these arguments is passed |
| 953 | /// to the action function. Notice the space of arguments is limited. |
| 954 | union ActionArgsTy { |
| 955 | MemcpyArgsTy MemcpyArgs; |
| 956 | ReleaseBufferArgsTy ReleaseBufferArgs; |
| 957 | ReleaseSignalArgsTy ReleaseSignalArgs; |
| 958 | void *CallbackArgs; |
| 959 | }; |
| 960 | |
| 961 | llvm::SmallVector<ActionArgsTy> ActionArgs; |
| 962 | |
| 963 | /// Create an empty slot. |
| 964 | StreamSlotTy() : Signal(nullptr), Callbacks({}), ActionArgs({}) {} |
| 965 | |
| 966 | /// Schedule a host memory copy action on the slot. |
| 967 | Error schedHostMemoryCopy(void *Dst, const void *Src, size_t Size) { |
| 968 | Callbacks.emplace_back(memcpyAction); |
| 969 | ActionArgs.emplace_back().MemcpyArgs = MemcpyArgsTy{Dst, Src, Size}; |
| 970 | return Plugin::success(); |
| 971 | } |
| 972 | |
| 973 | /// Schedule a release buffer action on the slot. |
| 974 | Error schedReleaseBuffer(void *Buffer, AMDGPUMemoryManagerTy &Manager) { |
| 975 | Callbacks.emplace_back(releaseBufferAction); |
| 976 | ActionArgs.emplace_back().ReleaseBufferArgs = |
| 977 | ReleaseBufferArgsTy{Buffer, &Manager}; |
| 978 | return Plugin::success(); |
| 979 | } |
| 980 | |
| 981 | /// Schedule a signal release action on the slot. |
| 982 | Error schedReleaseSignal(AMDGPUSignalTy *SignalToRelease, |
| 983 | AMDGPUSignalManagerTy *SignalManager) { |
| 984 | Callbacks.emplace_back(releaseSignalAction); |
| 985 | ActionArgs.emplace_back().ReleaseSignalArgs = |
| 986 | ReleaseSignalArgsTy{SignalToRelease, SignalManager}; |
| 987 | return Plugin::success(); |
| 988 | } |
| 989 | |
| 990 | /// Register a callback to be called on completion |
| 991 | Error schedCallback(AMDGPUStreamCallbackTy *Func, void *Data) { |
| 992 | Callbacks.emplace_back(Func); |
| 993 | ActionArgs.emplace_back().CallbackArgs = Data; |
| 994 | |
| 995 | return Plugin::success(); |
| 996 | } |
| 997 | |
| 998 | // Perform the action if needed. |
| 999 | Error performAction() { |
| 1000 | if (Callbacks.empty()) |
| 1001 | return Plugin::success(); |
| 1002 | |
| 1003 | assert(Callbacks.size() == ActionArgs.size() && "Size mismatch" ); |
| 1004 | for (auto [Callback, ActionArg] : llvm::zip(Callbacks, ActionArgs)) { |
| 1005 | // Perform the action. |
| 1006 | if (Callback == memcpyAction) { |
| 1007 | if (auto Err = memcpyAction(&ActionArg)) |
| 1008 | return Err; |
| 1009 | } else if (Callback == releaseBufferAction) { |
| 1010 | if (auto Err = releaseBufferAction(&ActionArg)) |
| 1011 | return Err; |
| 1012 | } else if (Callback == releaseSignalAction) { |
| 1013 | if (auto Err = releaseSignalAction(&ActionArg)) |
| 1014 | return Err; |
| 1015 | } else if (Callback) { |
| 1016 | if (auto Err = Callback(ActionArg.CallbackArgs)) |
| 1017 | return Err; |
| 1018 | } |
| 1019 | } |
| 1020 | |
| 1021 | // Invalidate the action. |
| 1022 | Callbacks.clear(); |
| 1023 | ActionArgs.clear(); |
| 1024 | |
| 1025 | return Plugin::success(); |
| 1026 | } |
| 1027 | }; |
| 1028 | |
| 1029 | /// The device agent where the stream was created. |
| 1030 | hsa_agent_t Agent; |
| 1031 | |
| 1032 | /// The queue that the stream uses to launch kernels. |
| 1033 | AMDGPUQueueTy *Queue; |
| 1034 | |
| 1035 | /// The manager of signals to reuse signals. |
| 1036 | AMDGPUSignalManagerTy &SignalManager; |
| 1037 | |
| 1038 | /// A reference to the associated device. |
| 1039 | GenericDeviceTy &Device; |
| 1040 | |
| 1041 | /// Array of stream slots. Use std::deque because it can dynamically grow |
| 1042 | /// without invalidating the already inserted elements. For instance, the |
| 1043 | /// std::vector may invalidate the elements by reallocating the internal |
| 1044 | /// array if there is not enough space on new insertions. |
| 1045 | std::deque<StreamSlotTy> Slots; |
| 1046 | |
| 1047 | /// The next available slot on the queue. This is reset to zero each time the |
| 1048 | /// stream is synchronized. It also indicates the current number of consumed |
| 1049 | /// slots at a given time. |
| 1050 | uint32_t NextSlot; |
| 1051 | |
| 1052 | /// The synchronization id. This number is increased each time the stream is |
| 1053 | /// synchronized. It is useful to detect if an AMDGPUEventTy points to an |
| 1054 | /// operation that was already finalized in a previous stream synchronization. |
| 1055 | uint32_t SyncCycle; |
| 1056 | |
| 1057 | /// Mutex to protect stream's management. |
| 1058 | mutable std::mutex Mutex; |
| 1059 | |
| 1060 | /// Timeout hint for HSA actively waiting for signal value to change |
| 1061 | const uint64_t StreamBusyWaitMicroseconds; |
| 1062 | |
| 1063 | /// Indicates whether to spread data transfers across all available SDMA engines |
| 1064 | bool UseMultipleSdmaEngines; |
| 1065 | |
| 1066 | /// Return the current number of asynchronous operations on the stream. |
| 1067 | uint32_t size() const { return NextSlot; } |
| 1068 | |
| 1069 | /// Return the last valid slot on the stream. |
| 1070 | uint32_t last() const { return size() - 1; } |
| 1071 | |
| 1072 | /// Consume one slot from the stream. Since the stream uses signals on demand |
| 1073 | /// and releases them once the slot is no longer used, the function requires |
| 1074 | /// an idle signal for the new consumed slot. |
| 1075 | std::pair<uint32_t, AMDGPUSignalTy *> consume(AMDGPUSignalTy *OutputSignal) { |
| 1076 | // Double the stream size if needed. Since we use std::deque, this operation |
| 1077 | // does not invalidate the already added slots. |
| 1078 | if (Slots.size() == NextSlot) |
| 1079 | Slots.resize(Slots.size() * 2); |
| 1080 | |
| 1081 | // Update the next available slot and the stream size. |
| 1082 | uint32_t Curr = NextSlot++; |
| 1083 | |
| 1084 | // Retrieve the input signal, if any, of the current operation. |
| 1085 | AMDGPUSignalTy *InputSignal = (Curr > 0) ? Slots[Curr - 1].Signal : nullptr; |
| 1086 | |
| 1087 | // Set the output signal of the current slot. |
| 1088 | Slots[Curr].Signal = OutputSignal; |
| 1089 | |
| 1090 | return std::make_pair(Curr, InputSignal); |
| 1091 | } |
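// Illustrative sketch of the resulting dependency chain: after three calls to
// consume(), each slot's input signal is the previous slot's output signal, so
// the operations execute in order even though they are submitted asynchronously.
//
//   Slot 0: input = none      output = Signal0
//   Slot 1: input = Signal0   output = Signal1
//   Slot 2: input = Signal1   output = Signal2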
| 1092 | |
| 1093 | /// Complete all pending post actions and reset the stream after synchronizing |
| 1094 | /// or positively querying the stream. |
| 1095 | Error complete() { |
| 1096 | for (uint32_t Slot = 0; Slot < NextSlot; ++Slot) { |
| 1097 | // Take the post action of the operation if any. |
| 1098 | if (auto Err = Slots[Slot].performAction()) |
| 1099 | return Err; |
| 1100 | |
| 1101 | // Release the slot's signal if possible. Otherwise, another user will. |
| 1102 | if (Slots[Slot].Signal->decreaseUseCount()) |
| 1103 | if (auto Err = SignalManager.returnResource(Slots[Slot].Signal)) |
| 1104 | return Err; |
| 1105 | |
| 1106 | Slots[Slot].Signal = nullptr; |
| 1107 | } |
| 1108 | |
| 1109 | // Reset the stream slots to zero. |
| 1110 | NextSlot = 0; |
| 1111 | |
| 1112 | // Increase the synchronization id since the stream completed a sync cycle. |
| 1113 | SyncCycle += 1; |
| 1114 | |
| 1115 | return Plugin::success(); |
| 1116 | } |
| 1117 | |
| 1118 | /// Make the current stream wait on a specific operation of another stream. |
| 1119 | /// The idea is to make the current stream wait on two signals: 1) the last |
| 1120 | /// signal of the current stream, and 2) the last signal of the other stream. |
| 1121 | /// Use a barrier packet with two input signals. |
| 1122 | Error waitOnStreamOperation(AMDGPUStreamTy &OtherStream, uint32_t Slot) { |
| 1123 | if (Queue == nullptr) |
| 1124 | return Plugin::error(ErrorCode::INVALID_NULL_POINTER, |
| 1125 | "target queue was nullptr" ); |
| 1126 | |
| 1127 | /// The signal from the other stream that we must wait on. |
| 1128 | AMDGPUSignalTy *OtherSignal = OtherStream.Slots[Slot].Signal; |
| 1129 | |
| 1130 | // Prevent the release of the other stream's signal. |
| 1131 | OtherSignal->increaseUseCount(); |
| 1132 | |
| 1133 | // Retrieve an available signal for the operation's output. |
| 1134 | AMDGPUSignalTy *OutputSignal = nullptr; |
| 1135 | if (auto Err = SignalManager.getResource(OutputSignal)) |
| 1136 | return Err; |
| 1137 | OutputSignal->reset(); |
| 1138 | OutputSignal->increaseUseCount(); |
| 1139 | |
| 1140 | // Consume stream slot and compute dependencies. |
| 1141 | auto [Curr, InputSignal] = consume(OutputSignal); |
| 1142 | |
| 1143 | // Setup the post action to release the signal. |
| 1144 | if (auto Err = Slots[Curr].schedReleaseSignal(OtherSignal, &SignalManager)) |
| 1145 | return Err; |
| 1146 | |
| 1147 | // Push a barrier into the queue with both input signals. |
| 1148 | return Queue->pushBarrier(OutputSignal, InputSignal, OtherSignal); |
| 1149 | } |
| 1150 | |
| 1151 | /// Callback for running a specific asynchronous operation. This callback is |
| 1152 | /// used for hsa_amd_signal_async_handler. The argument is the operation that |
| 1153 | /// should be executed. Notice we use the post action mechanism to codify the |
| 1154 | /// asynchronous operation. |
| 1155 | static bool asyncActionCallback(hsa_signal_value_t Value, void *Args) { |
| 1156 | StreamSlotTy *Slot = reinterpret_cast<StreamSlotTy *>(Args); |
| 1157 | assert(Slot && "Invalid slot" ); |
| 1158 | assert(Slot->Signal && "Invalid signal" ); |
| 1159 | |
| 1160 | // This thread is outside the stream mutex. Make sure the thread sees the |
| 1161 | // changes on the slot. |
| 1162 | std::atomic_thread_fence(std::memory_order_acquire); |
| 1163 | |
| 1164 | // Perform the operation. |
| 1165 | if (auto Err = Slot->performAction()) |
| 1166 | FATAL_MESSAGE(1, "Error performing post action: %s", |
| 1167 | toString(std::move(Err)).data()); |
| 1168 | |
| 1169 | // Signal the output signal to notify the asynchronous operation finalized. |
| 1170 | Slot->Signal->signal(); |
| 1171 | |
| 1172 | // Unregister callback. |
| 1173 | return false; |
| 1174 | } |
| 1175 | |
| 1176 | // Callback for host-to-host memory copies. This is an asynchronous action. |
| 1177 | static Error memcpyAction(void *Data) { |
| 1178 | MemcpyArgsTy *Args = reinterpret_cast<MemcpyArgsTy *>(Data); |
| 1179 | assert(Args && "Invalid arguments" ); |
| 1180 | assert(Args->Dst && "Invalid destination buffer" ); |
| 1181 | assert(Args->Src && "Invalid source buffer" ); |
| 1182 | |
| 1183 | std::memcpy(Args->Dst, Args->Src, Args->Size); |
| 1184 | |
| 1185 | return Plugin::success(); |
| 1186 | } |
| 1187 | |
| 1188 | /// Releasing a memory buffer to a memory manager. This is a post completion |
| 1189 | /// action. There are three kinds of memory buffers: |
| 1190 | /// 1. For kernel arguments. This buffer can be freed after receiving the |
| 1191 | /// kernel completion signal. |
| 1192 | /// 2. For H2D transfers that need pinned memory space for staging. This |
| 1193 | /// buffer can be freed after receiving the transfer completion signal. |
| 1194 | /// 3. For D2H transfers that need pinned memory space for staging. This |
| 1195 | /// buffer cannot be freed after receiving the transfer completion signal |
| 1196 | /// because of the following asynchronous H2H callback. |
| 1197 | /// For this reason, this action can only be taken at |
| 1198 | /// AMDGPUStreamTy::complete(). |
| 1199 | /// Because of case 3, all releaseBufferActions are taken at |
| 1200 | /// AMDGPUStreamTy::complete() in the current implementation. |
| 1201 | static Error releaseBufferAction(void *Data) { |
| 1202 | ReleaseBufferArgsTy *Args = reinterpret_cast<ReleaseBufferArgsTy *>(Data); |
| 1203 | assert(Args && "Invalid arguments" ); |
| 1204 | assert(Args->MemoryManager && "Invalid memory manager" ); |
| 1205 | |
| 1206 | // Release the allocation to the memory manager. |
| 1207 | return Args->MemoryManager->deallocate(Args->Buffer); |
| 1208 | } |
| 1209 | |
| 1210 | /// Releasing a signal object back to SignalManager. This is a post completion |
| 1211 | /// action. This action can only be taken at AMDGPUStreamTy::complete() |
| 1212 | static Error releaseSignalAction(void *Data) { |
| 1213 | ReleaseSignalArgsTy *Args = reinterpret_cast<ReleaseSignalArgsTy *>(Data); |
| 1214 | assert(Args && "Invalid arguments" ); |
| 1215 | assert(Args->Signal && "Invalid signal" ); |
| 1216 | assert(Args->SignalManager && "Invalid signal manager" ); |
| 1217 | |
| 1218 | // Release the signal if needed. |
| 1219 | if (Args->Signal->decreaseUseCount()) |
| 1220 | if (auto Err = Args->SignalManager->returnResource(Args->Signal)) |
| 1221 | return Err; |
| 1222 | |
| 1223 | return Plugin::success(); |
| 1224 | } |
| 1225 | |
| 1226 | public: |
| 1227 | /// Create an empty stream associated with a specific device. |
| 1228 | AMDGPUStreamTy(AMDGPUDeviceTy &Device); |
| 1229 | |
| 1230 | /// Initialize the stream's signals. |
| 1231 | Error init() { return Plugin::success(); } |
| 1232 | |
| 1233 | /// Deinitialize the stream's signals. |
| 1234 | Error deinit() { return Plugin::success(); } |
| 1235 | |
| 1236 | /// Push an asynchronous kernel to the stream. The kernel arguments must be |
| 1237 | /// placed in a special allocation for kernel args and must be kept alive until |
| 1238 | /// the kernel finishes. Once the kernel is finished, the stream will release |
| 1239 | /// the kernel args buffer to the specified memory manager. |
| 1240 | Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs, |
| 1241 | uint32_t NumThreads[3], uint32_t NumBlocks[3], |
| 1242 | uint32_t GroupSize, uint64_t StackSize, |
| 1243 | AMDGPUMemoryManagerTy &MemoryManager) { |
| 1244 | if (Queue == nullptr) |
| 1245 | return Plugin::error(ErrorCode::INVALID_NULL_POINTER, |
| 1246 | "target queue was nullptr" ); |
| 1247 | |
| 1248 | // Retrieve an available signal for the operation's output. |
| 1249 | AMDGPUSignalTy *OutputSignal = nullptr; |
| 1250 | if (auto Err = SignalManager.getResource(OutputSignal)) |
| 1251 | return Err; |
| 1252 | OutputSignal->reset(); |
| 1253 | OutputSignal->increaseUseCount(); |
| 1254 | |
| 1255 | std::lock_guard<std::mutex> StreamLock(Mutex); |
| 1256 | |
| 1257 | // Consume stream slot and compute dependencies. |
| 1258 | auto [Curr, InputSignal] = consume(OutputSignal); |
| 1259 | |
| 1260 | // Setup the post action to release the kernel args buffer. |
| 1261 | if (auto Err = Slots[Curr].schedReleaseBuffer(KernelArgs, MemoryManager))
| 1262 | return Err; |
| 1263 | |
| 1264 | // If we are running an RPC server we want to wake up the server thread |
| 1265 | // whenever there is a kernel running and let it sleep otherwise. |
| 1266 | if (Device.getRPCServer()) |
| 1267 | Device.Plugin.getRPCServer().Thread->notify(); |
| 1268 | |
| 1269 | // Push the kernel with the output signal and an input signal (optional) |
| 1270 | if (auto Err = Queue->pushKernelLaunch(Kernel, KernelArgs, NumThreads, |
| 1271 | NumBlocks, GroupSize, StackSize, |
| 1272 | OutputSignal, InputSignal)) |
| 1273 | return Err; |
| 1274 | |
| 1275 | // Register a callback to indicate when the kernel is complete. |
| 1276 | if (Device.getRPCServer()) { |
| 1277 | if (auto Err = Slots[Curr].schedCallback( |
| 1278 | [](void *Data) -> llvm::Error { |
| 1279 | GenericPluginTy &Plugin = |
| 1280 | *reinterpret_cast<GenericPluginTy *>(Data); |
| 1281 | Plugin.getRPCServer().Thread->finish(); |
| 1282 | return Error::success(); |
| 1283 | }, |
| 1284 | &Device.Plugin)) |
| 1285 | return Err; |
| 1286 | } |
| 1287 | return Plugin::success(); |
| 1288 | } |
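// Illustrative usage sketch (assumptions noted inline; the real call site is
// the kernel launch path elsewhere in this plugin). ArgsSize, NumTeams,
// GroupSize and StackSize are hypothetical values for the example:
//
//   AMDGPUMemoryManagerTy &ArgsMM = HostDevice.getArgsMemoryManager();
//   void *KernelArgs = nullptr;
//   if (auto Err = ArgsMM.allocate(ArgsSize, &KernelArgs))
//     return Err;
//   // ...fill KernelArgs with the kernel argument payload...
//   uint32_t NumThreads[3] = {256, 1, 1}, NumBlocks[3] = {NumTeams, 1, 1};
//   if (auto Err = Stream->pushKernelLaunch(Kernel, KernelArgs, NumThreads,
//                                           NumBlocks, GroupSize, StackSize,
//                                           ArgsMM))
//     return Err; // The stream frees KernelArgs back to ArgsMM on completion.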
| 1289 | |
| 1290 | /// Push an asynchronous memory copy between pinned memory buffers. |
| 1291 | Error pushPinnedMemoryCopyAsync(void *Dst, const void *Src, |
| 1292 | uint64_t CopySize) { |
| 1293 | // Retrieve an available signal for the operation's output. |
| 1294 | AMDGPUSignalTy *OutputSignal = nullptr; |
| 1295 | if (auto Err = SignalManager.getResource(OutputSignal)) |
| 1296 | return Err; |
| 1297 | OutputSignal->reset(); |
| 1298 | OutputSignal->increaseUseCount(); |
| 1299 | |
| 1300 | std::lock_guard<std::mutex> Lock(Mutex); |
| 1301 | |
| 1302 | // Consume stream slot and compute dependencies. |
| 1303 | auto [Curr, InputSignal] = consume(OutputSignal); |
| 1304 | |
| 1305 | // Issue the async memory copy. |
| 1306 | if (InputSignal && InputSignal->load()) { |
| 1307 | hsa_signal_t InputSignalRaw = InputSignal->get(); |
| 1308 | return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, |
| 1309 | Agent, CopySize, 1, &InputSignalRaw, |
| 1310 | OutputSignal->get()); |
| 1311 | } |
| 1312 | |
| 1313 | return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, |
| 1314 | Agent, CopySize, 0, nullptr, |
| 1315 | OutputSignal->get()); |
| 1316 | } |
| 1317 | |
| 1318 | /// Push an asynchronous memory copy device-to-host involving an unpinned
| 1319 | /// memory buffer. The operation consists of a two-step copy: first from the
| 1320 | /// device buffer to an intermediate pinned host buffer, and then from the
| 1321 | /// pinned buffer to the unpinned host buffer. Both operations are asynchronous
| 1322 | /// and dependent. The intermediate pinned buffer is released to the specified
| 1323 | /// memory manager once the operation completes (see the sketch after this function).
| 1324 | Error pushMemoryCopyD2HAsync(void *Dst, const void *Src, void *Inter, |
| 1325 | uint64_t CopySize, |
| 1326 | AMDGPUMemoryManagerTy &MemoryManager) { |
| 1327 | // Retrieve available signals for the operation's outputs. |
| 1328 | AMDGPUSignalTy *OutputSignals[2] = {}; |
| 1329 | if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals)) |
| 1330 | return Err; |
| 1331 | for (auto *Signal : OutputSignals) { |
| 1332 | Signal->reset(); |
| 1333 | Signal->increaseUseCount(); |
| 1334 | } |
| 1335 | |
| 1336 | std::lock_guard<std::mutex> Lock(Mutex); |
| 1337 | |
| 1338 | // Consume stream slot and compute dependencies. |
| 1339 | auto [Curr, InputSignal] = consume(OutputSignals[0]);
| 1340 | |
| 1341 | // Setup the post action for releasing the intermediate buffer. |
| 1342 | if (auto Err = Slots[Curr].schedReleaseBuffer(Inter, MemoryManager))
| 1343 | return Err; |
| 1344 | |
| 1345 | // Issue the first step: device to host transfer. Avoid defining the input |
| 1346 | // dependency if already satisfied. |
| 1347 | if (InputSignal && InputSignal->load()) { |
| 1348 | hsa_signal_t InputSignalRaw = InputSignal->get(); |
| 1349 | if (auto Err = hsa_utils::asyncMemCopy( |
| 1350 | UseMultipleSdmaEngines, Inter, Agent, Src, Agent, CopySize, 1, |
| 1351 | &InputSignalRaw, OutputSignals[0]->get())) |
| 1352 | return Err; |
| 1353 | } else { |
| 1354 | if (auto Err = hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Inter, |
| 1355 | Agent, Src, Agent, CopySize, 0, |
| 1356 | nullptr, OutputSignals[0]->get())) |
| 1357 | return Err; |
| 1358 | } |
| 1359 | |
| 1360 | // Consume another stream slot and compute dependencies. |
| 1361 | std::tie(Curr, InputSignal) = consume(OutputSignals[1]);
| 1362 | assert(InputSignal && "Invalid input signal" ); |
| 1363 | |
| 1364 | // The std::memcpy is done asynchronously using an async handler. We store
| 1365 | // the function's information in the action but it is not actually a post action.
| 1366 | if (auto Err = Slots[Curr].schedHostMemoryCopy(Dst, Inter, CopySize))
| 1367 | return Err; |
| 1368 | |
| 1369 | // Make changes on this slot visible to the async handler's thread. |
| 1370 | std::atomic_thread_fence(std::memory_order_release);
| 1371 | |
| 1372 | // Issue the second step: host to host transfer. |
| 1373 | hsa_status_t Status = hsa_amd_signal_async_handler( |
| 1374 | InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback, |
| 1375 | (void *)&Slots[Curr]); |
| 1376 | |
| 1377 | return Plugin::check(Status, "error in hsa_amd_signal_async_handler: %s" ); |
| 1378 | } |
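// Minimal timeline sketch of the two-step D2H pipeline built above
// (illustrative, not executable):
//
//   slot N  : SDMA copy Src (device) -> Inter (pinned), sets OutputSignals[0].
//   slot N+1: hsa_amd_signal_async_handler fires when OutputSignals[0] reaches
//             zero, runs std::memcpy(Dst, Inter, CopySize) on the handler
//             thread and then sets OutputSignals[1]. The scheduled post action
//             returns Inter to MemoryManager when the stream completes.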
| 1379 | |
| 1380 | /// Push an asynchronous memory copy host-to-device involving an unpinned
| 1381 | /// memory buffer. The operation consists of a two-step copy: first from the
| 1382 | /// unpinned host buffer to an intermediate pinned host buffer, and then from
| 1383 | /// the pinned buffer to the device buffer. Both operations are asynchronous
| 1384 | /// and dependent. The intermediate pinned buffer is released to the specified
| 1385 | /// memory manager once the operation completes (see the sketch after this function).
| 1386 | Error pushMemoryCopyH2DAsync(void *Dst, const void *Src, void *Inter, |
| 1387 | uint64_t CopySize, |
| 1388 | AMDGPUMemoryManagerTy &MemoryManager) { |
| 1389 | // Retrieve available signals for the operation's outputs. |
| 1390 | AMDGPUSignalTy *OutputSignals[2] = {}; |
| 1391 | if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals)) |
| 1392 | return Err; |
| 1393 | for (auto *Signal : OutputSignals) { |
| 1394 | Signal->reset(); |
| 1395 | Signal->increaseUseCount(); |
| 1396 | } |
| 1397 | |
| 1398 | AMDGPUSignalTy *OutputSignal = OutputSignals[0]; |
| 1399 | |
| 1400 | std::lock_guard<std::mutex> Lock(Mutex); |
| 1401 | |
| 1402 | // Consume stream slot and compute dependencies. |
| 1403 | auto [Curr, InputSignal] = consume(OutputSignal); |
| 1404 | |
| 1405 | // Issue the first step: host to host transfer. |
| 1406 | if (InputSignal && InputSignal->load()) { |
| 1407 | // The std::memcpy is done asynchronously using an async handler. We store |
| 1408 | // the function's information in the action but it is not actually a |
| 1409 | // post action. |
| 1410 | if (auto Err = Slots[Curr].schedHostMemoryCopy(Inter, Src, CopySize))
| 1411 | return Err; |
| 1412 | |
| 1413 | // Make changes on this slot visible to the async handler's thread. |
| 1414 | std::atomic_thread_fence(std::memory_order_release);
| 1415 | |
| 1416 | hsa_status_t Status = hsa_amd_signal_async_handler( |
| 1417 | InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback, |
| 1418 | (void *)&Slots[Curr]); |
| 1419 | |
| 1420 | if (auto Err = Plugin::check(Status, |
| 1421 | "error in hsa_amd_signal_async_handler: %s" )) |
| 1422 | return Err; |
| 1423 | |
| 1424 | // Now use the second output signal.
| 1425 | OutputSignal = OutputSignals[1]; |
| 1426 | |
| 1427 | // Consume another stream slot and compute dependencies. |
| 1428 | std::tie(Curr, InputSignal) = consume(OutputSignal);
| 1429 | } else { |
| 1430 | // All preceding operations completed, copy the memory synchronously. |
| 1431 | std::memcpy(Inter, Src, CopySize);
| 1432 | |
| 1433 | // Return the second signal because it will not be used. |
| 1434 | OutputSignals[1]->decreaseUseCount(); |
| 1435 | if (auto Err = SignalManager.returnResource(OutputSignals[1])) |
| 1436 | return Err; |
| 1437 | } |
| 1438 | |
| 1439 | // Setup the post action to release the intermediate pinned buffer. |
| 1440 | if (auto Err = Slots[Curr].schedReleaseBuffer(Inter, MemoryManager))
| 1441 | return Err; |
| 1442 | |
| 1443 | // Issue the second step: host to device transfer. Avoid defining the input |
| 1444 | // dependency if already satisfied. |
| 1445 | if (InputSignal && InputSignal->load()) { |
| 1446 | hsa_signal_t InputSignalRaw = InputSignal->get(); |
| 1447 | return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, |
| 1448 | Agent, CopySize, 1, &InputSignalRaw, |
| 1449 | OutputSignal->get()); |
| 1450 | } |
| 1451 | return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, |
| 1452 | Agent, CopySize, 0, nullptr, |
| 1453 | OutputSignal->get()); |
| 1454 | } |
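// Minimal sketch of the two H2D paths above (illustrative, not executable):
//
//   Stream has pending work:
//     slot N  : async handler runs std::memcpy(Inter, Src, CopySize) once the
//               previous operation's signal is satisfied, then sets
//               OutputSignals[0].
//     slot N+1: SDMA copy Inter (pinned) -> Dst (device), gated on slot N.
//   Stream is idle:
//     std::memcpy(Inter, Src, CopySize) runs immediately on the calling thread
//     and only a single SDMA copy (and a single signal) is issued.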
| 1455 | |
| 1456 | // AMDGPUDeviceTy is incomplete here, passing the underlying agent instead |
| 1457 | Error pushMemoryCopyD2DAsync(void *Dst, hsa_agent_t DstAgent, const void *Src, |
| 1458 | hsa_agent_t SrcAgent, uint64_t CopySize) { |
| 1459 | AMDGPUSignalTy *OutputSignal; |
| 1460 | if (auto Err = SignalManager.getResources(/*Num=*/1, &OutputSignal)) |
| 1461 | return Err; |
| 1462 | OutputSignal->reset(); |
| 1463 | OutputSignal->increaseUseCount(); |
| 1464 | |
| 1465 | std::lock_guard<std::mutex> Lock(Mutex); |
| 1466 | |
| 1467 | // Consume stream slot and compute dependencies. |
| 1468 | auto [Curr, InputSignal] = consume(OutputSignal); |
| 1469 | |
| 1470 | // The agents need to have access to the corresponding memory |
| 1471 | // This is presently only true if the pointers were originally |
| 1472 | // allocated by this runtime or the caller made the appropriate |
| 1473 | // access calls. |
| 1474 | |
| 1475 | if (InputSignal && InputSignal->load()) { |
| 1476 | hsa_signal_t InputSignalRaw = InputSignal->get(); |
| 1477 | return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src, |
| 1478 | SrcAgent, CopySize, 1, &InputSignalRaw, |
| 1479 | OutputSignal->get()); |
| 1480 | } |
| 1481 | return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src, |
| 1482 | SrcAgent, CopySize, 0, nullptr, |
| 1483 | OutputSignal->get()); |
| 1484 | } |
| 1485 | |
| 1486 | /// Synchronize with the stream. The calling thread waits until all operations
| 1487 | /// have completed and then performs the pending post actions (e.g., releasing
| 1488 | /// intermediate buffers).
| 1489 | Error synchronize() { |
| 1490 | std::lock_guard<std::mutex> Lock(Mutex); |
| 1491 | |
| 1492 | // No need to synchronize anything. |
| 1493 | if (size() == 0) |
| 1494 | return Plugin::success(); |
| 1495 | |
| 1496 | // Wait until all previous operations on the stream have completed. |
| 1497 | if (auto Err = |
| 1498 | Slots[last()].Signal->wait(StreamBusyWaitMicroseconds, &Device)) |
| 1499 | return Err; |
| 1500 | |
| 1501 | // Reset the stream and perform all pending post actions. |
| 1502 | return complete(); |
| 1503 | } |
| 1504 | |
| 1505 | /// Query the stream and complete pending post actions if operations finished. |
| 1506 | /// Return whether all the operations completed. This operation does not block |
| 1507 | /// the calling thread. |
| 1508 | Expected<bool> query() { |
| 1509 | std::lock_guard<std::mutex> Lock(Mutex); |
| 1510 | |
| 1511 | // No need to query anything. |
| 1512 | if (size() == 0) |
| 1513 | return true; |
| 1514 | |
| 1515 | // The last operation did not complete yet. Return directly. |
| 1516 | if (Slots[last()].Signal->load()) |
| 1517 | return false; |
| 1518 | |
| 1519 | // Reset the stream and perform all pending post actions. |
| 1520 | if (auto Err = complete()) |
| 1521 | return std::move(Err); |
| 1522 | |
| 1523 | return true; |
| 1524 | } |
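// Illustrative polling sketch (not part of the runtime): a caller can block
// with synchronize() or poll with query() until the stream drains.
//
//   Expected<bool> DoneOrErr = Stream->query();
//   if (!DoneOrErr)
//     return DoneOrErr.takeError();
//   if (*DoneOrErr) {
//     // All operations finished; post actions already ran in complete().
//   } else {
//     // Still running; retry later or fall back to Stream->synchronize().
//   }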
| 1525 | |
| 1526 | const AMDGPUQueueTy *getQueue() const { return Queue; } |
| 1527 | |
| 1528 | /// Record the state of the stream on an event. |
| 1529 | Error recordEvent(AMDGPUEventTy &Event) const; |
| 1530 | |
| 1531 | /// Make the stream wait on an event. |
| 1532 | Error waitEvent(const AMDGPUEventTy &Event); |
| 1533 | |
| 1534 | friend struct AMDGPUStreamManagerTy; |
| 1535 | }; |
| 1536 | |
| 1537 | /// Class representing an event on AMDGPU. The event stores the state of the
| 1538 | /// recorded stream at the moment of recording so other streams can wait on it.
| 1539 | struct AMDGPUEventTy { |
| 1540 | /// Create an empty event. |
| 1541 | AMDGPUEventTy(AMDGPUDeviceTy &Device) |
| 1542 | : RecordedStream(nullptr), RecordedSlot(-1), RecordedSyncCycle(-1) {} |
| 1543 | |
| 1544 | /// Initialize and deinitialize. |
| 1545 | Error init() { return Plugin::success(); } |
| 1546 | Error deinit() { return Plugin::success(); } |
| 1547 | |
| 1548 | /// Record the state of a stream on the event. |
| 1549 | Error record(AMDGPUStreamTy &Stream) { |
| 1550 | std::lock_guard<std::mutex> Lock(Mutex); |
| 1551 | |
| 1552 | // Overwrite any previously recorded stream.
| 1553 | RecordedStream = &Stream; |
| 1554 | |
| 1555 | return Stream.recordEvent(*this);
| 1556 | } |
| 1557 | |
| 1558 | /// Make a stream wait on the current event. |
| 1559 | Error wait(AMDGPUStreamTy &Stream) { |
| 1560 | std::lock_guard<std::mutex> Lock(Mutex); |
| 1561 | |
| 1562 | if (!RecordedStream) |
| 1563 | return Plugin::error(ErrorCode::INVALID_ARGUMENT, |
| 1564 | "event does not have any recorded stream" ); |
| 1565 | |
| 1566 | // Synchronizing the same stream. Do nothing. |
| 1567 | if (RecordedStream == &Stream) |
| 1568 | return Plugin::success(); |
| 1569 | |
| 1570 | // No need to wait for anything; the recorded stream already finished the
| 1571 | // corresponding operation.
| 1572 | if (RecordedSlot < 0) |
| 1573 | return Plugin::success(); |
| 1574 | |
| 1575 | return Stream.waitEvent(*this);
| 1576 | } |
| 1577 | |
| 1578 | protected: |
| 1579 | /// The stream registered in this event. |
| 1580 | AMDGPUStreamTy *RecordedStream; |
| 1581 | |
| 1582 | /// The recorded operation (slot) on the recorded stream.
| 1583 | int64_t RecordedSlot; |
| 1584 | |
| 1585 | /// The sync cycle when the stream was recorded. Used to detect stale events. |
| 1586 | int64_t RecordedSyncCycle; |
| 1587 | |
| 1588 | /// Mutex to safely access event fields. |
| 1589 | mutable std::mutex Mutex; |
| 1590 | |
| 1591 | friend struct AMDGPUStreamTy; |
| 1592 | }; |
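// Illustrative sketch (not part of the runtime): expressing a cross-stream
// dependency with an event. StreamA and StreamB are hypothetical streams.
//
//   AMDGPUEventTy Event(Device);
//   if (auto Err = Event.init())
//     return Err;
//   if (auto Err = Event.record(*StreamA)) // Snapshot StreamA's last slot.
//     return Err;
//   if (auto Err = Event.wait(*StreamB))   // StreamB waits for that slot.
//     return Err;
//   if (auto Err = Event.deinit())
//     return Err;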
| 1593 | |
| 1594 | Error AMDGPUStreamTy::recordEvent(AMDGPUEventTy &Event) const { |
| 1595 | std::lock_guard<std::mutex> Lock(Mutex); |
| 1596 | |
| 1597 | if (size() > 0) { |
| 1598 | // Record the synchronize identifier (to detect stale recordings) and |
| 1599 | // the last valid stream's operation. |
| 1600 | Event.RecordedSyncCycle = SyncCycle; |
| 1601 | Event.RecordedSlot = last(); |
| 1602 | |
| 1603 | assert(Event.RecordedSyncCycle >= 0 && "Invalid recorded sync cycle" ); |
| 1604 | assert(Event.RecordedSlot >= 0 && "Invalid recorded slot" ); |
| 1605 | } else { |
| 1606 | // The stream is empty, everything already completed, record nothing. |
| 1607 | Event.RecordedSyncCycle = -1; |
| 1608 | Event.RecordedSlot = -1; |
| 1609 | } |
| 1610 | return Plugin::success(); |
| 1611 | } |
| 1612 | |
| 1613 | Error AMDGPUStreamTy::waitEvent(const AMDGPUEventTy &Event) { |
| 1614 | // Retrieve the recorded stream on the event. |
| 1615 | AMDGPUStreamTy &RecordedStream = *Event.RecordedStream; |
| 1616 | |
| 1617 | std::scoped_lock<std::mutex, std::mutex> Lock(Mutex, RecordedStream.Mutex); |
| 1618 | |
| 1619 | // The recorded stream already completed the operation because its
| 1620 | // synchronization cycle has advanced past the recorded one (stale recording).
| 1621 | if (RecordedStream.SyncCycle != (uint32_t)Event.RecordedSyncCycle) |
| 1622 | return Plugin::success(); |
| 1623 | |
| 1624 | // Again, the recorded stream already completed the operation, the last |
| 1625 | // operation's output signal is satisfied. |
| 1626 | if (!RecordedStream.Slots[Event.RecordedSlot].Signal->load()) |
| 1627 | return Plugin::success(); |
| 1628 | |
| 1629 | // Otherwise, make the current stream wait on the other stream's operation. |
| 1630 | return waitOnStreamOperation(RecordedStream, Event.RecordedSlot);
| 1631 | } |
| 1632 | |
| 1633 | struct AMDGPUStreamManagerTy final |
| 1634 | : GenericDeviceResourceManagerTy<AMDGPUResourceRef<AMDGPUStreamTy>> { |
| 1635 | using ResourceRef = AMDGPUResourceRef<AMDGPUStreamTy>; |
| 1636 | using ResourcePoolTy = GenericDeviceResourceManagerTy<ResourceRef>; |
| 1637 | |
| 1638 | AMDGPUStreamManagerTy(GenericDeviceTy &Device, hsa_agent_t HSAAgent) |
| 1639 | : GenericDeviceResourceManagerTy(Device), Device(Device), |
| 1640 | OMPX_QueueTracking("LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING" , true), |
| 1641 | NextQueue(0), Agent(HSAAgent) {} |
| 1642 | |
| 1643 | Error init(uint32_t InitialSize, int NumHSAQueues, int HSAQueueSize) { |
| 1644 | Queues = std::vector<AMDGPUQueueTy>(NumHSAQueues); |
| 1645 | QueueSize = HSAQueueSize; |
| 1646 | MaxNumQueues = NumHSAQueues; |
| 1647 | // Initialize one queue eagerly |
| 1648 | if (auto Err = Queues.front().init(Device, Agent, QueueSize)) |
| 1649 | return Err; |
| 1650 | |
| 1651 | return GenericDeviceResourceManagerTy::init(InitialSize); |
| 1652 | } |
| 1653 | |
| 1654 | /// Deinitialize the resource pool and delete all resources. This function |
| 1655 | /// must be called before the destructor. |
| 1656 | Error deinit() override { |
| 1657 | // De-init all queues |
| 1658 | for (AMDGPUQueueTy &Queue : Queues) { |
| 1659 | if (auto Err = Queue.deinit()) |
| 1660 | return Err; |
| 1661 | } |
| 1662 | |
| 1663 | return GenericDeviceResourceManagerTy::deinit(); |
| 1664 | } |
| 1665 | |
| 1666 | /// Get a single stream from the pool or create new resources. |
| 1667 | virtual Error getResource(AMDGPUStreamTy *&StreamHandle) override { |
| 1668 | return getResourcesImpl(1, &StreamHandle, [this](AMDGPUStreamTy *&Handle) { |
| 1669 | return assignNextQueue(Handle);
| 1670 | }); |
| 1671 | } |
| 1672 | |
| 1673 | /// Return stream to the pool. |
| 1674 | virtual Error returnResource(AMDGPUStreamTy *StreamHandle) override { |
| 1675 | return returnResourceImpl(StreamHandle, [](AMDGPUStreamTy *Handle) { |
| 1676 | Handle->Queue->removeUser(); |
| 1677 | return Plugin::success(); |
| 1678 | }); |
| 1679 | } |
| 1680 | |
| 1681 | private: |
| 1682 | /// Search for and assign a preferably idle queue to the given stream. If
| 1683 | /// there is no queue without current users, choose the queue with the lowest
| 1684 | /// user count. If utilization tracking is disabled, use round-robin selection.
| 1685 | inline Error assignNextQueue(AMDGPUStreamTy *Stream) { |
| 1686 | // Start from zero when tracking utilization, otherwise: round robin policy. |
| 1687 | uint32_t Index = OMPX_QueueTracking ? 0 : NextQueue++ % MaxNumQueues; |
| 1688 | |
| 1689 | if (OMPX_QueueTracking) { |
| 1690 | // Find the least used queue. |
| 1691 | for (uint32_t I = 0; I < MaxNumQueues; ++I) { |
| 1692 | // Early exit when an initialized queue is idle. |
| 1693 | if (Queues[I].isInitialized() && Queues[I].getUserCount() == 0) { |
| 1694 | Index = I; |
| 1695 | break; |
| 1696 | } |
| 1697 | |
| 1698 | // Update the least used queue. |
| 1699 | if (Queues[Index].getUserCount() > Queues[I].getUserCount()) |
| 1700 | Index = I; |
| 1701 | } |
| 1702 | } |
| 1703 | |
| 1704 | // Make sure the queue is initialized, then add user & assign. |
| 1705 | if (auto Err = Queues[Index].init(Device, Agent, QueueSize)) |
| 1706 | return Err; |
| 1707 | Queues[Index].addUser(); |
| 1708 | Stream->Queue = &Queues[Index]; |
| 1709 | |
| 1710 | return Plugin::success(); |
| 1711 | } |
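// Illustrative example of the selection policy above: with queue busy tracking
// enabled (LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING, default true) and user
// counts {2, 0, 1, uninitialized} across four queues, queue 1 is chosen
// because it is initialized and idle; if every queue is busy, the least used
// one wins. With tracking disabled, selection simply cycles 0, 1, 2, 3, 0, ...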
| 1712 | |
| 1713 | /// The device associated with this stream. |
| 1714 | GenericDeviceTy &Device; |
| 1715 | |
| 1716 | /// Envar for controlling the tracking of busy HSA queues. |
| 1717 | BoolEnvar OMPX_QueueTracking; |
| 1718 | |
| 1719 | /// The next queue index to use for round robin selection. |
| 1720 | uint32_t NextQueue; |
| 1721 | |
| 1722 | /// The queues which are assigned to requested streams. |
| 1723 | std::vector<AMDGPUQueueTy> Queues; |
| 1724 | |
| 1725 | /// The corresponding device as HSA agent. |
| 1726 | hsa_agent_t Agent; |
| 1727 | |
| 1728 | /// The maximum number of queues. |
| 1729 | uint32_t MaxNumQueues; |
| 1730 | |
| 1731 | /// The size of created queues. |
| 1732 | uint32_t QueueSize; |
| 1733 | }; |
| 1734 | |
| 1735 | /// Abstract class that holds the common members of the actual kernel devices |
| 1736 | /// and the host device. Both types should inherit from this class. |
| 1737 | struct AMDGenericDeviceTy { |
| 1738 | AMDGenericDeviceTy() {} |
| 1739 | |
| 1740 | virtual ~AMDGenericDeviceTy() {} |
| 1741 | |
| 1742 | /// Create all memory pools which the device has access to and classify them. |
| 1743 | Error initMemoryPools() { |
| 1744 | // Retrieve all memory pools from the device agent(s). |
| 1745 | Error Err = retrieveAllMemoryPools(); |
| 1746 | if (Err) |
| 1747 | return Err; |
| 1748 | |
| 1749 | for (AMDGPUMemoryPoolTy *MemoryPool : AllMemoryPools) { |
| 1750 | // Initialize the memory pool and retrieve some basic info. |
| 1751 | Error Err = MemoryPool->init(); |
| 1752 | if (Err) |
| 1753 | return Err; |
| 1754 | |
| 1755 | if (!MemoryPool->isGlobal()) |
| 1756 | continue; |
| 1757 | |
| 1758 | // Classify the memory pools depending on their properties. |
| 1759 | if (MemoryPool->isFineGrained()) { |
| 1760 | FineGrainedMemoryPools.push_back(MemoryPool); |
| 1761 | if (MemoryPool->supportsKernelArgs()) |
| 1762 | ArgsMemoryPools.push_back(MemoryPool); |
| 1763 | } else if (MemoryPool->isCoarseGrained()) { |
| 1764 | CoarseGrainedMemoryPools.push_back(MemoryPool); |
| 1765 | } |
| 1766 | } |
| 1767 | return Plugin::success(); |
| 1768 | } |
| 1769 | |
| 1770 | /// Destroy all memory pools. |
| 1771 | Error deinitMemoryPools() { |
| 1772 | for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) |
| 1773 | delete Pool; |
| 1774 | |
| 1775 | AllMemoryPools.clear(); |
| 1776 | FineGrainedMemoryPools.clear(); |
| 1777 | CoarseGrainedMemoryPools.clear(); |
| 1778 | ArgsMemoryPools.clear(); |
| 1779 | |
| 1780 | return Plugin::success(); |
| 1781 | } |
| 1782 | |
| 1783 | /// Retrieve and construct all memory pools from the device agent(s). |
| 1784 | virtual Error retrieveAllMemoryPools() = 0; |
| 1785 | |
| 1786 | /// Get the device agent. |
| 1787 | virtual hsa_agent_t getAgent() const = 0; |
| 1788 | |
| 1789 | protected: |
| 1790 | /// Array of all memory pools available to the host agents. |
| 1791 | llvm::SmallVector<AMDGPUMemoryPoolTy *> AllMemoryPools; |
| 1792 | |
| 1793 | /// Array of fine-grained memory pools available to the host agents. |
| 1794 | llvm::SmallVector<AMDGPUMemoryPoolTy *> FineGrainedMemoryPools; |
| 1795 | |
| 1796 | /// Array of coarse-grained memory pools available to the host agents. |
| 1797 | llvm::SmallVector<AMDGPUMemoryPoolTy *> CoarseGrainedMemoryPools; |
| 1798 | |
| 1799 | /// Array of kernel args memory pools available to the host agents. |
| 1800 | llvm::SmallVector<AMDGPUMemoryPoolTy *> ArgsMemoryPools; |
| 1801 | }; |
| 1802 | |
| 1803 | /// Class representing the host device. This host device may have more than one |
| 1804 | /// HSA host agent. We aggregate all its resources into the same instance. |
| 1805 | struct AMDHostDeviceTy : public AMDGenericDeviceTy { |
| 1806 | /// Create a host device from an array of host agents. |
| 1807 | AMDHostDeviceTy(AMDGPUPluginTy &Plugin, |
| 1808 | const llvm::SmallVector<hsa_agent_t> &HostAgents) |
| 1809 | : AMDGenericDeviceTy(), Agents(HostAgents), ArgsMemoryManager(Plugin), |
| 1810 | PinnedMemoryManager(Plugin) { |
| 1811 | assert(HostAgents.size() && "No host agent found" ); |
| 1812 | } |
| 1813 | |
| 1814 | /// Initialize the host device memory pools and the memory managers for |
| 1815 | /// kernel args and host pinned memory allocations. |
| 1816 | Error init() { |
| 1817 | if (auto Err = initMemoryPools()) |
| 1818 | return Err; |
| 1819 | |
| 1820 | if (auto Err = ArgsMemoryManager.init(getArgsMemoryPool())) |
| 1821 | return Err; |
| 1822 | |
| 1823 | if (auto Err = PinnedMemoryManager.init(getFineGrainedMemoryPool())) |
| 1824 | return Err; |
| 1825 | |
| 1826 | return Plugin::success(); |
| 1827 | } |
| 1828 | |
| 1829 | /// Deinitialize memory pools and managers. |
| 1830 | Error deinit() { |
| 1831 | if (auto Err = deinitMemoryPools()) |
| 1832 | return Err; |
| 1833 | |
| 1834 | if (auto Err = ArgsMemoryManager.deinit()) |
| 1835 | return Err; |
| 1836 | |
| 1837 | if (auto Err = PinnedMemoryManager.deinit()) |
| 1838 | return Err; |
| 1839 | |
| 1840 | return Plugin::success(); |
| 1841 | } |
| 1842 | |
| 1843 | /// Retrieve and construct all memory pools from the host agents. |
| 1844 | Error retrieveAllMemoryPools() override { |
| 1845 | // Iterate through the available pools across the host agents. |
| 1846 | for (hsa_agent_t Agent : Agents) { |
| 1847 | Error Err = hsa_utils::iterateAgentMemoryPools( |
| 1848 | Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) { |
| 1849 | AMDGPUMemoryPoolTy *MemoryPool = |
| 1850 | new AMDGPUMemoryPoolTy(HSAMemoryPool); |
| 1851 | AllMemoryPools.push_back(MemoryPool); |
| 1852 | return HSA_STATUS_SUCCESS; |
| 1853 | }); |
| 1854 | if (Err) |
| 1855 | return Err; |
| 1856 | } |
| 1857 | return Plugin::success(); |
| 1858 | } |
| 1859 | |
| 1860 | /// Get one of the host agents. Always return the first agent.
| 1861 | hsa_agent_t getAgent() const override { return Agents[0]; } |
| 1862 | |
| 1863 | /// Get a memory pool for fine-grained allocations. |
| 1864 | AMDGPUMemoryPoolTy &getFineGrainedMemoryPool() { |
| 1865 | assert(!FineGrainedMemoryPools.empty() && "No fine-grained mempool" ); |
| 1866 | // Retrieve any memory pool. |
| 1867 | return *FineGrainedMemoryPools[0]; |
| 1868 | } |
| 1869 | |
| 1870 | AMDGPUMemoryPoolTy &getCoarseGrainedMemoryPool() { |
| 1871 | assert(!CoarseGrainedMemoryPools.empty() && "No coarse-grained mempool" ); |
| 1872 | // Retrieve any memory pool. |
| 1873 | return *CoarseGrainedMemoryPools[0]; |
| 1874 | } |
| 1875 | |
| 1876 | /// Get a memory pool for kernel args allocations. |
| 1877 | AMDGPUMemoryPoolTy &getArgsMemoryPool() { |
| 1878 | assert(!ArgsMemoryPools.empty() && "No kernelargs mempool" ); |
| 1879 | // Retrieve any memory pool. |
| 1880 | return *ArgsMemoryPools[0]; |
| 1881 | } |
| 1882 | |
| 1883 | /// Getters for kernel args and host pinned memory managers. |
| 1884 | AMDGPUMemoryManagerTy &getArgsMemoryManager() { return ArgsMemoryManager; } |
| 1885 | AMDGPUMemoryManagerTy &getPinnedMemoryManager() { |
| 1886 | return PinnedMemoryManager; |
| 1887 | } |
| 1888 | |
| 1889 | private: |
| 1890 | /// Array of agents on the host side. |
| 1891 | const llvm::SmallVector<hsa_agent_t> Agents; |
| 1892 | |
| 1893 | // Memory manager for kernel arguments. |
| 1894 | AMDGPUMemoryManagerTy ArgsMemoryManager; |
| 1895 | |
| 1896 | // Memory manager for pinned memory. |
| 1897 | AMDGPUMemoryManagerTy PinnedMemoryManager; |
| 1898 | }; |
| 1899 | |
| 1900 | /// Class implementing the AMDGPU device functionalities which derives from the |
| 1901 | /// generic device class. |
| 1902 | struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { |
| 1903 | // Create an AMDGPU device with a device id and default AMDGPU grid values. |
| 1904 | AMDGPUDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices, |
| 1905 | AMDHostDeviceTy &HostDevice, hsa_agent_t Agent) |
| 1906 | : GenericDeviceTy(Plugin, DeviceId, NumDevices, {}), AMDGenericDeviceTy(), |
| 1907 | OMPX_NumQueues("LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES" , 4), |
| 1908 | OMPX_QueueSize("LIBOMPTARGET_AMDGPU_HSA_QUEUE_SIZE" , 512), |
| 1909 | OMPX_DefaultTeamsPerCU("LIBOMPTARGET_AMDGPU_TEAMS_PER_CU" , 4), |
| 1910 | OMPX_MaxAsyncCopyBytes("LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES" , |
| 1911 | 1 * 1024 * 1024), // 1MB |
| 1912 | OMPX_InitialNumSignals("LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS" , |
| 1913 | 64), |
| 1914 | OMPX_StreamBusyWait("LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT" , 2000000), |
| 1915 | OMPX_UseMultipleSdmaEngines( |
| 1916 | "LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES" , false), |
| 1917 | OMPX_ApuMaps("OMPX_APU_MAPS" , false), AMDGPUStreamManager(*this, Agent), |
| 1918 | AMDGPUEventManager(*this), AMDGPUSignalManager(*this), Agent(Agent), |
| 1919 | HostDevice(HostDevice) {} |
| 1920 | |
| 1921 | ~AMDGPUDeviceTy() {} |
| 1922 | |
| 1923 | /// Initialize the device, its resources and get its properties. |
| 1924 | Error initImpl(GenericPluginTy &Plugin) override { |
| 1925 | // First setup all the memory pools. |
| 1926 | if (auto Err = initMemoryPools()) |
| 1927 | return Err; |
| 1928 | |
| 1929 | char GPUName[64]; |
| 1930 | if (auto Err = getDeviceAttr(HSA_AGENT_INFO_NAME, GPUName)) |
| 1931 | return Err; |
| 1932 | ComputeUnitKind = GPUName; |
| 1933 | |
| 1934 | // Get the wavefront size. |
| 1935 | uint32_t WavefrontSize = 0; |
| 1936 | if (auto Err = getDeviceAttr(HSA_AGENT_INFO_WAVEFRONT_SIZE, WavefrontSize)) |
| 1937 | return Err; |
| 1938 | GridValues.GV_Warp_Size = WavefrontSize; |
| 1939 | |
| 1940 | // Get the frequency of the steady clock. If the attribute is missing |
| 1941 | // assume running on an older libhsa and default to 0, omp_get_wtime |
| 1942 | // will be inaccurate but otherwise programs can still run. |
| 1943 | if (getDeviceAttrRaw(HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY, |
| 1944 | ClockFrequency) != HSA_STATUS_SUCCESS) |
| 1945 | ClockFrequency = 0; |
| 1946 | |
| 1947 | // Load the grid values depending on the wavefront. |
| 1948 | if (WavefrontSize == 32) |
| 1949 | GridValues = getAMDGPUGridValues<32>(); |
| 1950 | else if (WavefrontSize == 64) |
| 1951 | GridValues = getAMDGPUGridValues<64>(); |
| 1952 | else |
| 1953 | return Plugin::error(ErrorCode::UNSUPPORTED, |
| 1954 | "unexpected AMDGPU wavefront %d" , WavefrontSize); |
| 1955 | |
| 1956 | // Get maximum number of workitems per workgroup. |
| 1957 | uint16_t WorkgroupMaxDim[3]; |
| 1958 | if (auto Err = |
| 1959 | getDeviceAttr(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgroupMaxDim)) |
| 1960 | return Err; |
| 1961 | GridValues.GV_Max_WG_Size = WorkgroupMaxDim[0]; |
| 1962 | |
| 1963 | // Get maximum number of workgroups. |
| 1964 | hsa_dim3_t GridMaxDim; |
| 1965 | if (auto Err = getDeviceAttr(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim)) |
| 1966 | return Err; |
| 1967 | |
| 1968 | GridValues.GV_Max_Teams = GridMaxDim.x / GridValues.GV_Max_WG_Size; |
| 1969 | if (GridValues.GV_Max_Teams == 0) |
| 1970 | return Plugin::error(ErrorCode::INVALID_ARGUMENT, |
| 1971 | "maximum number of teams cannot be zero" ); |
| 1972 | |
| 1973 | // Compute the default number of teams. |
| 1974 | uint32_t ComputeUnits = 0; |
| 1975 | if (auto Err = |
| 1976 | getDeviceAttr(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, ComputeUnits)) |
| 1977 | return Err; |
| 1978 | GridValues.GV_Default_Num_Teams = ComputeUnits * OMPX_DefaultTeamsPerCU; |
| 1979 | |
| 1980 | uint32_t WavesPerCU = 0; |
| 1981 | if (auto Err = |
| 1982 | getDeviceAttr(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, WavesPerCU)) |
| 1983 | return Err; |
| 1984 | HardwareParallelism = ComputeUnits * WavesPerCU; |
| 1985 | |
| 1986 | // Get maximum size of any device queues and maximum number of queues. |
| 1987 | uint32_t MaxQueueSize; |
| 1988 | if (auto Err = getDeviceAttr(HSA_AGENT_INFO_QUEUE_MAX_SIZE, MaxQueueSize)) |
| 1989 | return Err; |
| 1990 | |
| 1991 | uint32_t MaxQueues; |
| 1992 | if (auto Err = getDeviceAttr(HSA_AGENT_INFO_QUEUES_MAX, MaxQueues)) |
| 1993 | return Err; |
| 1994 | |
| 1995 | // Compute the number of queues and their size. |
| 1996 | OMPX_NumQueues = std::max(1U, std::min(OMPX_NumQueues.get(), MaxQueues)); |
| 1997 | OMPX_QueueSize = std::min(OMPX_QueueSize.get(), MaxQueueSize); |
| 1998 | |
| 1999 | // Initialize stream pool. |
| 2000 | if (auto Err = AMDGPUStreamManager.init(OMPX_InitialNumStreams, |
| 2001 | OMPX_NumQueues, OMPX_QueueSize)) |
| 2002 | return Err; |
| 2003 | |
| 2004 | // Initialize event pool. |
| 2005 | if (auto Err = AMDGPUEventManager.init(OMPX_InitialNumEvents)) |
| 2006 | return Err; |
| 2007 | |
| 2008 | // Initialize signal pool. |
| 2009 | if (auto Err = AMDGPUSignalManager.init(OMPX_InitialNumSignals)) |
| 2010 | return Err; |
| 2011 | |
| 2012 | // Detect if XNACK is enabled |
| 2013 | SmallVector<SmallString<32>> Targets; |
| 2014 | if (auto Err = hsa_utils::getTargetTripleAndFeatures(Agent, Targets)) |
| 2015 | return Err; |
| 2016 | if (!Targets.empty() && Targets[0].str().contains("xnack+" )) |
| 2017 | IsXnackEnabled = true; |
| 2018 | |
| 2019 | // Detect if the device is an APU.
| 2020 | if (auto Err = checkIfAPU()) |
| 2021 | return Err; |
| 2022 | |
| 2023 | return Plugin::success(); |
| 2024 | } |
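// Example (illustrative; "./offload_app" is a placeholder): the environment
// variables read by the constructor above can tune a run without rebuilding:
//
//   LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES=8 \
//   LIBOMPTARGET_AMDGPU_HSA_QUEUE_SIZE=1024 \
//   LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES=$((4*1024*1024)) \
//   ./offload_app
//
// The values are still clamped against the agent limits queried in initImpl()
// (HSA_AGENT_INFO_QUEUES_MAX and HSA_AGENT_INFO_QUEUE_MAX_SIZE).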
| 2025 | |
| 2026 | /// Deinitialize the device and release its resources. |
| 2027 | Error deinitImpl() override { |
| 2028 | // Deinitialize the stream and event pools. |
| 2029 | if (auto Err = AMDGPUStreamManager.deinit()) |
| 2030 | return Err; |
| 2031 | |
| 2032 | if (auto Err = AMDGPUEventManager.deinit()) |
| 2033 | return Err; |
| 2034 | |
| 2035 | if (auto Err = AMDGPUSignalManager.deinit()) |
| 2036 | return Err; |
| 2037 | |
| 2038 | // Close modules if necessary. |
| 2039 | if (!LoadedImages.empty()) { |
| 2040 | // Each image has its own module. |
| 2041 | for (DeviceImageTy *Image : LoadedImages) { |
| 2042 | AMDGPUDeviceImageTy &AMDImage = |
| 2043 | static_cast<AMDGPUDeviceImageTy &>(*Image); |
| 2044 | |
| 2045 | // Unload the executable of the image. |
| 2046 | if (auto Err = AMDImage.unloadExecutable()) |
| 2047 | return Err; |
| 2048 | } |
| 2049 | } |
| 2050 | |
| 2051 | // Invalidate agent reference. |
| 2052 | Agent = {0}; |
| 2053 | |
| 2054 | return Plugin::success(); |
| 2055 | } |
| 2056 | |
| 2057 | virtual Error callGlobalConstructors(GenericPluginTy &Plugin, |
| 2058 | DeviceImageTy &Image) override { |
| 2059 | GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler(); |
| 2060 | if (Handler.isSymbolInImage(*this, Image, "amdgcn.device.fini" )) |
| 2061 | Image.setPendingGlobalDtors(); |
| 2062 | |
| 2063 | return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/true); |
| 2064 | } |
| 2065 | |
| 2066 | virtual Error callGlobalDestructors(GenericPluginTy &Plugin, |
| 2067 | DeviceImageTy &Image) override { |
| 2068 | if (Image.hasPendingGlobalDtors()) |
| 2069 | return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/false); |
| 2070 | return Plugin::success(); |
| 2071 | } |
| 2072 | |
| 2073 | uint64_t getStreamBusyWaitMicroseconds() const { return OMPX_StreamBusyWait; } |
| 2074 | |
| 2075 | Expected<std::unique_ptr<MemoryBuffer>> |
| 2076 | doJITPostProcessing(std::unique_ptr<MemoryBuffer> MB) const override { |
| 2077 | |
| 2078 | // TODO: We should try to avoid materialization but there seems to be no |
| 2079 | // good linker interface w/o file i/o. |
| 2080 | SmallString<128> LinkerInputFilePath; |
| 2081 | std::error_code EC = sys::fs::createTemporaryFile("amdgpu-pre-link-jit" , |
| 2082 | "o" , LinkerInputFilePath); |
| 2083 | if (EC) |
| 2084 | return Plugin::error(ErrorCode::HOST_IO, |
| 2085 | "failed to create temporary file for linker" ); |
| 2086 | |
| 2087 | // Write the file's contents to the output file. |
| 2088 | Expected<std::unique_ptr<FileOutputBuffer>> OutputOrErr = |
| 2089 | FileOutputBuffer::create(LinkerInputFilePath, MB->getBuffer().size()); |
| 2090 | if (!OutputOrErr) |
| 2091 | return OutputOrErr.takeError(); |
| 2092 | std::unique_ptr<FileOutputBuffer> Output = std::move(*OutputOrErr); |
| 2093 | llvm::copy(MB->getBuffer(), Output->getBufferStart());
| 2094 | if (Error E = Output->commit()) |
| 2095 | return std::move(E); |
| 2096 | |
| 2097 | SmallString<128> LinkerOutputFilePath; |
| 2098 | EC = sys::fs::createTemporaryFile("amdgpu-pre-link-jit" , "so" , |
| 2099 | LinkerOutputFilePath); |
| 2100 | if (EC) |
| 2101 | return Plugin::error(ErrorCode::HOST_IO, |
| 2102 | "failed to create temporary file for linker" ); |
| 2103 | |
| 2104 | const auto &ErrorOrPath = sys::findProgramByName("lld" ); |
| 2105 | if (!ErrorOrPath) |
| 2106 | return createStringError(ErrorCode::HOST_TOOL_NOT_FOUND, |
| 2107 | "failed to find `lld` on the PATH." ); |
| 2108 | |
| 2109 | std::string LLDPath = ErrorOrPath.get(); |
| 2110 | INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(), |
| 2111 | "Using `%s` to link JITed amdgcn output." , LLDPath.c_str()); |
| 2112 | |
| 2113 | std::string MCPU = "-plugin-opt=mcpu=" + getComputeUnitKind(); |
| 2114 | StringRef Args[] = {LLDPath, |
| 2115 | "-flavor" , |
| 2116 | "gnu" , |
| 2117 | "--no-undefined" , |
| 2118 | "-shared" , |
| 2119 | MCPU, |
| 2120 | "-o" , |
| 2121 | LinkerOutputFilePath.data(), |
| 2122 | LinkerInputFilePath.data()}; |
| 2123 | |
| 2124 | std::string Error; |
| 2125 | int RC = sys::ExecuteAndWait(LLDPath, Args, std::nullopt, {}, 0, 0, &Error); |
| 2126 | if (RC) |
| 2127 | return Plugin::error(ErrorCode::LINK_FAILURE, |
| 2128 | "linking optimized bitcode failed: %s" , |
| 2129 | Error.c_str()); |
| 2130 | |
| 2131 | auto BufferOrErr = MemoryBuffer::getFileOrSTDIN(LinkerOutputFilePath); |
| 2132 | if (!BufferOrErr) |
| 2133 | return Plugin::error(ErrorCode::HOST_IO, |
| 2134 | "failed to open temporary file for lld" ); |
| 2135 | |
| 2136 | // Clean up the temporary files afterwards. |
| 2137 | if (sys::fs::remove(LinkerOutputFilePath)) |
| 2138 | return Plugin::error(ErrorCode::HOST_IO, |
| 2139 | "failed to remove temporary output file for lld" ); |
| 2140 | if (sys::fs::remove(LinkerInputFilePath)) |
| 2141 | return Plugin::error(ErrorCode::HOST_IO, |
| 2142 | "failed to remove temporary input file for lld" ); |
| 2143 | |
| 2144 | return std::move(*BufferOrErr); |
| 2145 | } |
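// Roughly equivalent command line to the ExecuteAndWait() invocation above
// (illustrative; the actual file names are temporaries created at runtime and
// "gfx90a" stands for whatever getComputeUnitKind() returns):
//
//   lld -flavor gnu --no-undefined -shared -plugin-opt=mcpu=gfx90a \
//       -o amdgpu-pre-link-jit-XXXX.so amdgpu-pre-link-jit-XXXX.o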
| 2146 | |
| 2147 | /// See GenericDeviceTy::getComputeUnitKind(). |
| 2148 | std::string getComputeUnitKind() const override { return ComputeUnitKind; } |
| 2149 | |
| 2150 | /// Returns the clock frequency for the given AMDGPU device. |
| 2151 | uint64_t getClockFrequency() const override { return ClockFrequency; } |
| 2152 | |
| 2153 | /// Allocate and construct an AMDGPU kernel. |
| 2154 | Expected<GenericKernelTy &> constructKernel(const char *Name) override { |
| 2155 | // Allocate and construct the AMDGPU kernel. |
| 2156 | AMDGPUKernelTy *AMDGPUKernel = Plugin.allocate<AMDGPUKernelTy>(); |
| 2157 | if (!AMDGPUKernel) |
| 2158 | return Plugin::error(ErrorCode::OUT_OF_RESOURCES, |
| 2159 | "failed to allocate memory for AMDGPU kernel" ); |
| 2160 | |
| 2161 | new (AMDGPUKernel) AMDGPUKernelTy(Name); |
| 2162 | |
| 2163 | return *AMDGPUKernel; |
| 2164 | } |
| 2165 | |
| 2166 | /// Set the current context to this device's context. Do nothing since the |
| 2167 | /// AMDGPU devices do not have the concept of contexts. |
| 2168 | Error setContext() override { return Plugin::success(); } |
| 2169 | |
| 2170 | /// AMDGPU returns the product of the number of compute units and the waves |
| 2171 | /// per compute unit. |
| 2172 | uint64_t getHardwareParallelism() const override { |
| 2173 | return HardwareParallelism; |
| 2174 | } |
| 2175 | |
| 2176 | /// We want to set up the RPC server for host services to the GPU if it is |
| 2177 | /// available. |
| 2178 | bool shouldSetupRPCServer() const override { return true; } |
| 2179 | |
| 2180 | /// The RPC interface should have enough space for all available parallelism. |
| 2181 | uint64_t requestedRPCPortCount() const override { |
| 2182 | return getHardwareParallelism(); |
| 2183 | } |
| 2184 | |
| 2185 | /// Get the stream of the asynchronous info structure or get a new one. |
| 2186 | Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper, |
| 2187 | AMDGPUStreamTy *&Stream) { |
| 2188 | // Get the stream (if any) from the async info. |
| 2189 | Stream = AsyncInfoWrapper.getQueueAs<AMDGPUStreamTy *>(); |
| 2190 | if (!Stream) { |
| 2191 | // There was no stream; get an idle one. |
| 2192 | if (auto Err = AMDGPUStreamManager.getResource(Stream)) |
| 2193 | return Err; |
| 2194 | |
| 2195 | // Modify the async info's stream. |
| 2196 | AsyncInfoWrapper.setQueueAs<AMDGPUStreamTy *>(Stream); |
| 2197 | } |
| 2198 | return Plugin::success(); |
| 2199 | } |
| 2200 | |
| 2201 | /// Load the binary image into the device and allocate an image object. |
| 2202 | Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage, |
| 2203 | int32_t ImageId) override { |
| 2204 | // Allocate and initialize the image object. |
| 2205 | AMDGPUDeviceImageTy *AMDImage = Plugin.allocate<AMDGPUDeviceImageTy>(); |
| 2206 | new (AMDImage) AMDGPUDeviceImageTy(ImageId, *this, TgtImage); |
| 2207 | |
| 2208 | // Load the HSA executable. |
| 2209 | if (Error Err = AMDImage->loadExecutable(*this))
| 2210 | return std::move(Err); |
| 2211 | |
| 2212 | return AMDImage; |
| 2213 | } |
| 2214 | |
| 2215 | /// Allocate memory on the device or related to the device. |
| 2216 | void *allocate(size_t Size, void *, TargetAllocTy Kind) override; |
| 2217 | |
| 2218 | /// Deallocate memory on the device or related to the device. |
| 2219 | int free(void *TgtPtr, TargetAllocTy Kind) override { |
| 2220 | if (TgtPtr == nullptr) |
| 2221 | return OFFLOAD_SUCCESS; |
| 2222 | |
| 2223 | AMDGPUMemoryPoolTy *MemoryPool = nullptr; |
| 2224 | switch (Kind) { |
| 2225 | case TARGET_ALLOC_DEFAULT: |
| 2226 | case TARGET_ALLOC_DEVICE: |
| 2227 | case TARGET_ALLOC_DEVICE_NON_BLOCKING: |
| 2228 | MemoryPool = CoarseGrainedMemoryPools[0]; |
| 2229 | break; |
| 2230 | case TARGET_ALLOC_HOST: |
| 2231 | MemoryPool = &HostDevice.getFineGrainedMemoryPool(); |
| 2232 | break; |
| 2233 | case TARGET_ALLOC_SHARED: |
| 2234 | MemoryPool = &HostDevice.getFineGrainedMemoryPool(); |
| 2235 | break; |
| 2236 | } |
| 2237 | |
| 2238 | if (!MemoryPool) { |
| 2239 | REPORT("No memory pool for the specified allocation kind\n" ); |
| 2240 | return OFFLOAD_FAIL; |
| 2241 | } |
| 2242 | |
| 2243 | if (Error Err = MemoryPool->deallocate(TgtPtr)) {
| 2244 | REPORT("%s\n" , toString(std::move(Err)).data());
| 2245 | return OFFLOAD_FAIL; |
| 2246 | } |
| 2247 | |
| 2248 | return OFFLOAD_SUCCESS; |
| 2249 | } |
| 2250 | |
| 2251 | /// Synchronize current thread with the pending operations on the async info. |
| 2252 | Error synchronizeImpl(__tgt_async_info &AsyncInfo) override { |
| 2253 | AMDGPUStreamTy *Stream = |
| 2254 | reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue); |
| 2255 | assert(Stream && "Invalid stream" ); |
| 2256 | |
| 2257 | if (auto Err = Stream->synchronize()) |
| 2258 | return Err; |
| 2259 | |
| 2260 | // Once the stream is synchronized, return it to stream pool and reset |
| 2261 | // AsyncInfo. This is to make sure the synchronization only works for its |
| 2262 | // own tasks. |
| 2263 | AsyncInfo.Queue = nullptr; |
| 2264 | return AMDGPUStreamManager.returnResource(Stream); |
| 2265 | } |
| 2266 | |
| 2267 | /// Query for the completion of the pending operations on the async info. |
| 2268 | Error queryAsyncImpl(__tgt_async_info &AsyncInfo) override { |
| 2269 | AMDGPUStreamTy *Stream = |
| 2270 | reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue); |
| 2271 | assert(Stream && "Invalid stream" ); |
| 2272 | |
| 2273 | auto CompletedOrErr = Stream->query(); |
| 2274 | if (!CompletedOrErr) |
| 2275 | return CompletedOrErr.takeError(); |
| 2276 | |
| 2277 | // Return if the stream did not complete yet.
| 2278 | if (!(*CompletedOrErr)) |
| 2279 | return Plugin::success(); |
| 2280 | |
| 2281 | // Once the stream is completed, return it to stream pool and reset |
| 2282 | // AsyncInfo. This is to make sure the synchronization only works for its |
| 2283 | // own tasks. |
| 2284 | AsyncInfo.Queue = nullptr; |
| 2285 | return AMDGPUStreamManager.returnResource(Stream); |
| 2286 | } |
| 2287 | |
| 2288 | /// Pin the host buffer and return the device pointer that should be used for |
| 2289 | /// device transfers. |
| 2290 | Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override { |
| 2291 | void *PinnedPtr = nullptr; |
| 2292 | |
| 2293 | hsa_status_t Status = |
| 2294 | hsa_amd_memory_lock(HstPtr, Size, nullptr, 0, &PinnedPtr); |
| 2295 | if (auto Err = Plugin::check(Status, "error in hsa_amd_memory_lock: %s\n" )) |
| 2296 | return std::move(Err); |
| 2297 | |
| 2298 | return PinnedPtr; |
| 2299 | } |
| 2300 | |
| 2301 | /// Unpin the host buffer. |
| 2302 | Error dataUnlockImpl(void *HstPtr) override { |
| 2303 | hsa_status_t Status = hsa_amd_memory_unlock(HstPtr); |
| 2304 | return Plugin::check(Status, "error in hsa_amd_memory_unlock: %s\n" ); |
| 2305 | } |
| 2306 | |
| 2307 | /// Check through the HSA runtime whether the \p HstPtr buffer is pinned. |
| 2308 | Expected<bool> isPinnedPtrImpl(void *HstPtr, void *&BaseHstPtr, |
| 2309 | void *&BaseDevAccessiblePtr, |
| 2310 | size_t &BaseSize) const override { |
| 2311 | hsa_amd_pointer_info_t Info; |
| 2312 | Info.size = sizeof(hsa_amd_pointer_info_t); |
| 2313 | |
| 2314 | hsa_status_t Status = hsa_amd_pointer_info( |
| 2315 | HstPtr, &Info, /*Allocator=*/nullptr, /*num_agents_accessible=*/nullptr, |
| 2316 | /*accessible=*/nullptr); |
| 2317 | if (auto Err = Plugin::check(Status, "error in hsa_amd_pointer_info: %s" )) |
| 2318 | return std::move(Err); |
| 2319 | |
| 2320 | // The buffer may be locked or allocated through HSA allocators. Assume that
| 2321 | // the buffer is host pinned if the runtime reports an HSA type.
| 2322 | if (Info.type != HSA_EXT_POINTER_TYPE_LOCKED && |
| 2323 | Info.type != HSA_EXT_POINTER_TYPE_HSA) |
| 2324 | return false; |
| 2325 | |
| 2326 | assert(Info.hostBaseAddress && "Invalid host pinned address" ); |
| 2327 | assert(Info.agentBaseAddress && "Invalid agent pinned address" ); |
| 2328 | assert(Info.sizeInBytes > 0 && "Invalid pinned allocation size" ); |
| 2329 | |
| 2330 | // Save the allocation info in the output parameters. |
| 2331 | BaseHstPtr = Info.hostBaseAddress; |
| 2332 | BaseDevAccessiblePtr = Info.agentBaseAddress; |
| 2333 | BaseSize = Info.sizeInBytes; |
| 2334 | |
| 2335 | return true; |
| 2336 | } |
| 2337 | |
| 2338 | /// Submit data to the device (host to device transfer). |
| 2339 | Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size, |
| 2340 | AsyncInfoWrapperTy &AsyncInfoWrapper) override { |
| 2341 | AMDGPUStreamTy *Stream = nullptr; |
| 2342 | void *PinnedPtr = nullptr; |
| 2343 | |
| 2344 | // Use one-step asynchronous operation when host memory is already pinned. |
| 2345 | if (void *PinnedPtr = |
| 2346 | PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) { |
| 2347 | if (auto Err = getStream(AsyncInfoWrapper, Stream)) |
| 2348 | return Err; |
| 2349 | return Stream->pushPinnedMemoryCopyAsync(TgtPtr, PinnedPtr, Size);
| 2350 | } |
| 2351 | |
| 2352 | // For large transfers use synchronous behavior. |
| 2353 | if (Size >= OMPX_MaxAsyncCopyBytes) { |
| 2354 | if (AsyncInfoWrapper.hasQueue()) |
| 2355 | if (auto Err = synchronize(AsyncInfoWrapper)) |
| 2356 | return Err; |
| 2357 | |
| 2358 | hsa_status_t Status; |
| 2359 | Status = hsa_amd_memory_lock(const_cast<void *>(HstPtr), Size, nullptr, 0, |
| 2360 | &PinnedPtr); |
| 2361 | if (auto Err = |
| 2362 | Plugin::check(Status, "error in hsa_amd_memory_lock: %s\n" )) |
| 2363 | return Err; |
| 2364 | |
| 2365 | AMDGPUSignalTy Signal; |
| 2366 | if (auto Err = Signal.init()) |
| 2367 | return Err; |
| 2368 | |
| 2369 | if (auto Err = hsa_utils::asyncMemCopy(useMultipleSdmaEngines(), TgtPtr, |
| 2370 | Agent, PinnedPtr, Agent, Size, 0, |
| 2371 | nullptr, Signal.get())) |
| 2372 | return Err; |
| 2373 | |
| 2374 | if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds())) |
| 2375 | return Err; |
| 2376 | |
| 2377 | if (auto Err = Signal.deinit()) |
| 2378 | return Err; |
| 2379 | |
| 2380 | Status = hsa_amd_memory_unlock(const_cast<void *>(HstPtr)); |
| 2381 | return Plugin::check(Status, "error in hsa_amd_memory_unlock: %s\n" ); |
| 2382 | } |
| 2383 | |
| 2384 | // Otherwise, use two-step copy with an intermediate pinned host buffer. |
| 2385 | AMDGPUMemoryManagerTy &PinnedMemoryManager = |
| 2386 | HostDevice.getPinnedMemoryManager(); |
| 2387 | if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedPtr))
| 2388 | return Err; |
| 2389 | |
| 2390 | if (auto Err = getStream(AsyncInfoWrapper, Stream)) |
| 2391 | return Err; |
| 2392 | |
| 2393 | return Stream->pushMemoryCopyH2DAsync(TgtPtr, HstPtr, PinnedPtr, Size,
| 2394 | PinnedMemoryManager);
| 2395 | } |
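// Summary of the three submission paths above (illustrative):
//   1. HstPtr already pinned (tracked by PinnedAllocs): a single asynchronous
//      pinned-to-device copy on the stream.
//   2. Size >= LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES (1 MB by default): a
//      synchronous path that drains queued work, locks HstPtr, copies with a
//      temporary signal and unlocks.
//   3. Otherwise: a two-step asynchronous copy through a pinned staging buffer
//      obtained from the host device's pinned memory manager.
// dataRetrieveImpl() below mirrors the same structure for the D2H direction.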
| 2396 | |
| 2397 | /// Retrieve data from the device (device to host transfer). |
| 2398 | Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size, |
| 2399 | AsyncInfoWrapperTy &AsyncInfoWrapper) override { |
| 2400 | AMDGPUStreamTy *Stream = nullptr; |
| 2401 | void *PinnedPtr = nullptr; |
| 2402 | |
| 2403 | // Use one-step asynchronous operation when host memory is already pinned. |
| 2404 | if (void *PinnedPtr = |
| 2405 | PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) { |
| 2406 | if (auto Err = getStream(AsyncInfoWrapper, Stream)) |
| 2407 | return Err; |
| 2408 | |
| 2409 | return Stream->pushPinnedMemoryCopyAsync(PinnedPtr, TgtPtr, Size);
| 2410 | } |
| 2411 | |
| 2412 | // For large transfers use synchronous behavior. |
| 2413 | if (Size >= OMPX_MaxAsyncCopyBytes) { |
| 2414 | if (AsyncInfoWrapper.hasQueue()) |
| 2415 | if (auto Err = synchronize(AsyncInfoWrapper)) |
| 2416 | return Err; |
| 2417 | |
| 2418 | hsa_status_t Status; |
| 2419 | Status = hsa_amd_memory_lock(const_cast<void *>(HstPtr), Size, nullptr, 0, |
| 2420 | &PinnedPtr); |
| 2421 | if (auto Err = |
| 2422 | Plugin::check(Status, "error in hsa_amd_memory_lock: %s\n" )) |
| 2423 | return Err; |
| 2424 | |
| 2425 | AMDGPUSignalTy Signal; |
| 2426 | if (auto Err = Signal.init()) |
| 2427 | return Err; |
| 2428 | |
| 2429 | if (auto Err = hsa_utils::asyncMemCopy(useMultipleSdmaEngines(), |
| 2430 | PinnedPtr, Agent, TgtPtr, Agent, |
| 2431 | Size, 0, nullptr, Signal.get())) |
| 2432 | return Err; |
| 2433 | |
| 2434 | if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds())) |
| 2435 | return Err; |
| 2436 | |
| 2437 | if (auto Err = Signal.deinit()) |
| 2438 | return Err; |
| 2439 | |
| 2440 | Status = hsa_amd_memory_unlock(const_cast<void *>(HstPtr)); |
| 2441 | return Plugin::check(Status, "error in hsa_amd_memory_unlock: %s\n" ); |
| 2442 | } |
| 2443 | |
| 2444 | // Otherwise, use two-step copy with an intermediate pinned host buffer. |
| 2445 | AMDGPUMemoryManagerTy &PinnedMemoryManager = |
| 2446 | HostDevice.getPinnedMemoryManager(); |
| 2447 | if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedPtr))
| 2448 | return Err; |
| 2449 | |
| 2450 | if (auto Err = getStream(AsyncInfoWrapper, Stream)) |
| 2451 | return Err; |
| 2452 | |
| 2453 | return Stream->pushMemoryCopyD2HAsync(HstPtr, TgtPtr, PinnedPtr, Size,
| 2454 | PinnedMemoryManager);
| 2455 | } |
| 2456 | |
| 2457 | /// Exchange data between two devices within the plugin. |
| 2458 | Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice, |
| 2459 | void *DstPtr, int64_t Size, |
| 2460 | AsyncInfoWrapperTy &AsyncInfoWrapper) override { |
| 2461 | AMDGPUDeviceTy &DstDevice = static_cast<AMDGPUDeviceTy &>(DstGenericDevice); |
| 2462 | |
| 2463 | // For large transfers use synchronous behavior. |
| 2464 | if (Size >= OMPX_MaxAsyncCopyBytes) { |
| 2465 | if (AsyncInfoWrapper.hasQueue()) |
| 2466 | if (auto Err = synchronize(AsyncInfoWrapper)) |
| 2467 | return Err; |
| 2468 | |
| 2469 | AMDGPUSignalTy Signal; |
| 2470 | if (auto Err = Signal.init()) |
| 2471 | return Err; |
| 2472 | |
| 2473 | if (auto Err = hsa_utils::asyncMemCopy( |
| 2474 | useMultipleSdmaEngines(), DstPtr, DstDevice.getAgent(), SrcPtr, |
| 2475 | getAgent(), (uint64_t)Size, 0, nullptr, Signal.get())) |
| 2476 | return Err; |
| 2477 | |
| 2478 | if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds())) |
| 2479 | return Err; |
| 2480 | |
| 2481 | return Signal.deinit(); |
| 2482 | } |
| 2483 | |
| 2484 | AMDGPUStreamTy *Stream = nullptr; |
| 2485 | if (auto Err = getStream(AsyncInfoWrapper, Stream)) |
| 2486 | return Err; |
| 2487 | if (Size <= 0) |
| 2488 | return Plugin::success(); |
| 2489 | |
| 2490 | return Stream->pushMemoryCopyD2DAsync(DstPtr, DstDevice.getAgent(), SrcPtr, |
| 2491 | getAgent(), (uint64_t)Size); |
| 2492 | } |
| 2493 | |
| 2494 | /// Initialize the async info for interoperability purposes. |
| 2495 | Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override { |
| 2496 | // TODO: Implement this function. |
| 2497 | return Plugin::success(); |
| 2498 | } |
| 2499 | |
| 2500 | /// Initialize the device info for interoperability purposes. |
| 2501 | Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override { |
| 2502 | DeviceInfo->Context = nullptr; |
| 2503 | |
| 2504 | if (!DeviceInfo->Device) |
| 2505 | DeviceInfo->Device = reinterpret_cast<void *>(Agent.handle); |
| 2506 | |
| 2507 | return Plugin::success(); |
| 2508 | } |
| 2509 | |
| 2510 | /// Create an event. |
| 2511 | Error createEventImpl(void **EventPtrStorage) override { |
| 2512 | AMDGPUEventTy **Event = reinterpret_cast<AMDGPUEventTy **>(EventPtrStorage); |
| 2513 | return AMDGPUEventManager.getResource(*Event); |
| 2514 | } |
| 2515 | |
| 2516 | /// Destroy a previously created event. |
| 2517 | Error destroyEventImpl(void *EventPtr) override { |
| 2518 | AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr); |
| 2519 | return AMDGPUEventManager.returnResource(Event); |
| 2520 | } |
| 2521 | |
| 2522 | /// Record the event. |
| 2523 | Error recordEventImpl(void *EventPtr, |
| 2524 | AsyncInfoWrapperTy &AsyncInfoWrapper) override { |
| 2525 | AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr); |
| 2526 | assert(Event && "Invalid event" ); |
| 2527 | |
| 2528 | AMDGPUStreamTy *Stream = nullptr; |
| 2529 | if (auto Err = getStream(AsyncInfoWrapper, Stream)) |
| 2530 | return Err; |
| 2531 | |
| 2532 | return Event->record(*Stream);
| 2533 | } |
| 2534 | |
| 2535 | /// Make the stream wait on the event. |
| 2536 | Error waitEventImpl(void *EventPtr, |
| 2537 | AsyncInfoWrapperTy &AsyncInfoWrapper) override { |
| 2538 | AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr); |
| 2539 | |
| 2540 | AMDGPUStreamTy *Stream = nullptr; |
| 2541 | if (auto Err = getStream(AsyncInfoWrapper, Stream)) |
| 2542 | return Err; |
| 2543 | |
| 2544 | return Event->wait(*Stream);
| 2545 | } |
| 2546 | |
| 2547 | /// Synchronize the current thread with the event. |
| 2548 | Error syncEventImpl(void *EventPtr) override { |
| 2549 | return Plugin::error(ErrorCode::UNIMPLEMENTED, |
| 2550 | "synchronize event not implemented" ); |
| 2551 | } |
| 2552 | |
| 2553 | /// Print information about the device. |
| 2554 | Error obtainInfoImpl(InfoQueueTy &Info) override { |
| 2555 | char TmpChar[1000]; |
| 2556 | const char *TmpCharPtr = "Unknown" ; |
| 2557 | uint16_t Major, Minor; |
| 2558 | uint32_t TmpUInt, TmpUInt2; |
| 2559 | uint32_t CacheSize[4]; |
| 2560 | size_t TmpSt; |
| 2561 | bool TmpBool; |
| 2562 | uint16_t WorkgrpMaxDim[3]; |
| 2563 | hsa_dim3_t GridMaxDim; |
| 2564 | hsa_status_t Status, Status2; |
| 2565 | |
| 2566 | Status = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major); |
| 2567 | Status2 = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor); |
| 2568 | if (Status == HSA_STATUS_SUCCESS && Status2 == HSA_STATUS_SUCCESS) |
| 2569 | Info.add("HSA Runtime Version" , |
| 2570 | std::to_string(Major) + "." + std::to_string(Minor));
| 2571 | |
| 2572 | Info.add("HSA OpenMP Device Number" , DeviceId); |
| 2573 | |
| 2574 | Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar); |
| 2575 | if (Status == HSA_STATUS_SUCCESS) |
| 2576 | Info.add("Product Name" , TmpChar); |
| 2577 | |
| 2578 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_NAME, TmpChar); |
| 2579 | if (Status == HSA_STATUS_SUCCESS) |
| 2580 | Info.add("Device Name" , TmpChar); |
| 2581 | |
| 2582 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_VENDOR_NAME, TmpChar); |
| 2583 | if (Status == HSA_STATUS_SUCCESS) |
| 2584 | Info.add("Vendor Name" , TmpChar); |
| 2585 | |
| 2586 | hsa_device_type_t DevType; |
| 2587 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_DEVICE, DevType); |
| 2588 | if (Status == HSA_STATUS_SUCCESS) { |
| 2589 | switch (DevType) { |
| 2590 | case HSA_DEVICE_TYPE_CPU: |
| 2591 | TmpCharPtr = "CPU" ; |
| 2592 | break; |
| 2593 | case HSA_DEVICE_TYPE_GPU: |
| 2594 | TmpCharPtr = "GPU" ; |
| 2595 | break; |
| 2596 | case HSA_DEVICE_TYPE_DSP: |
| 2597 | TmpCharPtr = "DSP" ; |
| 2598 | break; |
| 2599 | } |
| 2600 | Info.add("Device Type" , TmpCharPtr); |
| 2601 | } |
| 2602 | |
| 2603 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUES_MAX, TmpUInt); |
| 2604 | if (Status == HSA_STATUS_SUCCESS) |
| 2605 | Info.add("Max Queues" , TmpUInt); |
| 2606 | |
| 2607 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUE_MIN_SIZE, TmpUInt); |
| 2608 | if (Status == HSA_STATUS_SUCCESS) |
| 2609 | Info.add("Queue Min Size" , TmpUInt); |
| 2610 | |
| 2611 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUE_MAX_SIZE, TmpUInt); |
| 2612 | if (Status == HSA_STATUS_SUCCESS) |
| 2613 | Info.add("Queue Max Size" , TmpUInt); |
| 2614 | |
| 2615 | // FIXME: This is deprecated according to HSA documentation. But using |
| 2616 | // hsa_agent_iterate_caches and hsa_cache_get_info breaks execution during |
| 2617 | // runtime. |
| 2618 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_CACHE_SIZE, CacheSize); |
| 2619 | if (Status == HSA_STATUS_SUCCESS) { |
| 2620 | Info.add("Cache" ); |
| 2621 | |
| 2622 | for (int I = 0; I < 4; I++) |
| 2623 | if (CacheSize[I]) |
| 2624 | Info.add<InfoLevel2>("L" + std::to_string(I), CacheSize[I]); |
| 2625 | } |
| 2626 | |
| 2627 | Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_CACHELINE_SIZE, TmpUInt); |
| 2628 | if (Status == HSA_STATUS_SUCCESS) |
| 2629 | Info.add("Cacheline Size" , TmpUInt); |
| 2630 | |
| 2631 | Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, TmpUInt); |
| 2632 | if (Status == HSA_STATUS_SUCCESS) |
| 2633 | Info.add("Max Clock Freq" , TmpUInt, "MHz" ); |
| 2634 | |
| 2635 | Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, TmpUInt); |
| 2636 | if (Status == HSA_STATUS_SUCCESS) |
| 2637 | Info.add("Compute Units" , TmpUInt); |
| 2638 | |
| 2639 | Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU, TmpUInt); |
| 2640 | if (Status == HSA_STATUS_SUCCESS) |
| 2641 | Info.add("SIMD per CU" , TmpUInt); |
| 2642 | |
| 2643 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_FAST_F16_OPERATION, TmpBool); |
| 2644 | if (Status == HSA_STATUS_SUCCESS) |
| 2645 | Info.add("Fast F16 Operation" , TmpBool); |
| 2646 | |
| 2647 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_WAVEFRONT_SIZE, TmpUInt2); |
| 2648 | if (Status == HSA_STATUS_SUCCESS) |
| 2649 | Info.add("Wavefront Size" , TmpUInt2); |
| 2650 | |
| 2651 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, TmpUInt); |
| 2652 | if (Status == HSA_STATUS_SUCCESS) |
| 2653 | Info.add("Workgroup Max Size" , TmpUInt); |
| 2654 | |
| 2655 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgrpMaxDim); |
| 2656 | if (Status == HSA_STATUS_SUCCESS) { |
| 2657 | Info.add("Workgroup Max Size per Dimension" ); |
| 2658 | Info.add<InfoLevel2>("x" , WorkgrpMaxDim[0]); |
| 2659 | Info.add<InfoLevel2>("y" , WorkgrpMaxDim[1]); |
| 2660 | Info.add<InfoLevel2>("z" , WorkgrpMaxDim[2]); |
| 2661 | } |
| 2662 | |
| 2663 | Status = getDeviceAttrRaw( |
| 2664 | (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, TmpUInt); |
| 2665 | if (Status == HSA_STATUS_SUCCESS) { |
| 2666 | Info.add("Max Waves Per CU" , TmpUInt); |
| 2667 | Info.add("Max Work-item Per CU" , TmpUInt * TmpUInt2); |
| 2668 | } |
| 2669 | |
| 2670 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_SIZE, TmpUInt); |
| 2671 | if (Status == HSA_STATUS_SUCCESS) |
| 2672 | Info.add("Grid Max Size" , TmpUInt); |
| 2673 | |
| 2674 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim); |
| 2675 | if (Status == HSA_STATUS_SUCCESS) { |
| 2676 | Info.add("Grid Max Size per Dimension" ); |
| 2677 | Info.add<InfoLevel2>("x" , GridMaxDim.x); |
| 2678 | Info.add<InfoLevel2>("y" , GridMaxDim.y); |
| 2679 | Info.add<InfoLevel2>("z" , GridMaxDim.z); |
| 2680 | } |
| 2681 | |
| 2682 | Status = getDeviceAttrRaw(HSA_AGENT_INFO_FBARRIER_MAX_SIZE, TmpUInt); |
| 2683 | if (Status == HSA_STATUS_SUCCESS) |
| 2684 | Info.add("Max fbarriers/Workgrp" , TmpUInt); |
| 2685 | |
| 2686 | Info.add("Memory Pools" ); |
| 2687 | for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) { |
| 2688 | std::string TmpStr, TmpStr2; |
| 2689 | |
| 2690 | if (Pool->isGlobal()) |
| 2691 | TmpStr = "Global" ; |
| 2692 | else if (Pool->isReadOnly()) |
| 2693 | TmpStr = "ReadOnly" ; |
| 2694 | else if (Pool->isPrivate()) |
| 2695 | TmpStr = "Private" ; |
| 2696 | else if (Pool->isGroup()) |
| 2697 | TmpStr = "Group" ; |
| 2698 | else |
| 2699 | TmpStr = "Unknown" ; |
| 2700 | |
| 2701 | Info.add<InfoLevel2>(std::string("Pool " ) + TmpStr); |
| 2702 | |
| 2703 | if (Pool->isGlobal()) { |
| 2704 | if (Pool->isFineGrained()) |
| 2705 | TmpStr2 += "Fine Grained " ; |
| 2706 | if (Pool->isCoarseGrained()) |
| 2707 | TmpStr2 += "Coarse Grained " ; |
| 2708 | if (Pool->supportsKernelArgs()) |
| 2709 | TmpStr2 += "Kernarg " ; |
| 2710 | |
| 2711 | Info.add<InfoLevel3>("Flags" , TmpStr2); |
| 2712 | } |
| 2713 | |
| 2714 | Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, TmpSt); |
| 2715 | if (Status == HSA_STATUS_SUCCESS) |
| 2716 | Info.add<InfoLevel3>("Size" , TmpSt, "bytes" ); |
| 2717 | |
| 2718 | Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, |
| 2719 | TmpBool); |
| 2720 | if (Status == HSA_STATUS_SUCCESS) |
| 2721 | Info.add<InfoLevel3>("Allocatable" , TmpBool); |
| 2722 | |
| 2723 | Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, |
| 2724 | TmpSt); |
| 2725 | if (Status == HSA_STATUS_SUCCESS) |
| 2726 | Info.add<InfoLevel3>("Runtime Alloc Granule" , TmpSt, "bytes" ); |
| 2727 | |
| 2728 | Status = Pool->getAttrRaw( |
| 2729 | HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, TmpSt); |
| 2730 | if (Status == HSA_STATUS_SUCCESS) |
| 2731 | Info.add<InfoLevel3>("Runtime Alloc Alignment" , TmpSt, "bytes" ); |
| 2732 | |
| 2733 | Status = |
| 2734 | Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, TmpBool); |
| 2735 | if (Status == HSA_STATUS_SUCCESS) |
| 2736 | Info.add<InfoLevel3>("Accessible by all" , TmpBool); |
| 2737 | } |
| 2738 | |
| 2739 | Info.add("ISAs" ); |
| 2740 | auto Err = hsa_utils::iterateAgentISAs(getAgent(), [&](hsa_isa_t ISA) { |
| 2741 | Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, TmpChar); |
| 2742 | if (Status == HSA_STATUS_SUCCESS) |
| 2743 | Info.add<InfoLevel2>("Name" , TmpChar); |
| 2744 | |
| 2745 | return Status; |
| 2746 | }); |
| 2747 | |
| 2748 | // Silently consume the error. |
| 2749 | if (Err) |
| 2750 | consumeError(std::move(Err)); |
| 2751 | |
| 2752 | return Plugin::success(); |
| 2753 | } |
| 2754 | |
| 2755 | /// Returns true if auto zero-copy is the best configuration for the current
| 2756 | /// architecture.
| 2757 | /// On AMDGPUs, automatic zero-copy is turned on |
| 2758 | /// when running on an APU with XNACK (unified memory) support |
| 2759 | /// enabled. On discrete GPUs, automatic zero-copy is triggered |
| 2760 | /// if the user sets the environment variable OMPX_APU_MAPS=1 |
| 2761 | /// and if XNACK is enabled. The rationale is that zero-copy |
| 2762 | /// is the best configuration (performance, memory footprint) on APUs, |
| 2763 | /// while it is often not the best on discrete GPUs. |
| 2764 | /// XNACK can be enabled with a kernel boot parameter or with |
| 2765 | /// the HSA_XNACK environment variable. |
| 2766 | bool useAutoZeroCopyImpl() override { |
| 2767 | return ((IsAPU || OMPX_ApuMaps) && IsXnackEnabled); |
| 2768 | } |
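|      | 
|      | // Example (assuming a typical ROCm setup): forcing automatic zero-copy on a
|      | // discrete GPU by enabling XNACK and the OMPX override before launching an
|      | // offload program:
|      | //   HSA_XNACK=1 OMPX_APU_MAPS=1 ./offload_app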
| 2769 | |
| 2770 | /// Getters and setters for stack and heap sizes. |
| 2771 | Error getDeviceStackSize(uint64_t &Value) override { |
| 2772 | Value = StackSize; |
| 2773 | return Plugin::success(); |
| 2774 | } |
| 2775 | Error setDeviceStackSize(uint64_t Value) override { |
| 2776 | StackSize = Value; |
| 2777 | return Plugin::success(); |
| 2778 | } |
| 2779 | Error getDeviceHeapSize(uint64_t &Value) override { |
| 2780 | Value = DeviceMemoryPoolSize; |
| 2781 | return Plugin::success(); |
| 2782 | } |
| 2783 | Error setDeviceHeapSize(uint64_t Value) override { |
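|      | // Update the indirect heap used by device-side allocation in every image
|      | // already loaded, then remember the new size for images loaded later.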
| 2784 | for (DeviceImageTy *Image : LoadedImages) |
| 2785 | if (auto Err = setupDeviceMemoryPool(Plugin, *Image, Value)) |
| 2786 | return Err; |
| 2787 | DeviceMemoryPoolSize = Value; |
| 2788 | return Plugin::success(); |
| 2789 | } |
| 2790 | Error getDeviceMemorySize(uint64_t &Value) override { |
| 2791 | for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) { |
| 2792 | if (Pool->isGlobal()) { |
| 2793 | hsa_status_t Status = |
| 2794 | Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, Value); |
| 2795 | return Plugin::check(Status, "error in getting device memory size: %s" ); |
| 2796 | } |
| 2797 | } |
| 2798 | return Plugin::error(ErrorCode::UNSUPPORTED,
| 2799 | "getDeviceMemorySize: no global pool");
| 2800 | } |
| 2801 | |
| 2802 | /// AMDGPU-specific function to get device attributes. |
| 2803 | template <typename Ty> Error getDeviceAttr(uint32_t Kind, Ty &Value) { |
| 2804 | hsa_status_t Status = |
| 2805 | hsa_agent_get_info(Agent, (hsa_agent_info_t)Kind, &Value); |
| 2806 | return Plugin::check(Status, "Error in hsa_agent_get_info: %s" ); |
| 2807 | } |
| 2808 | |
| 2809 | template <typename Ty> |
| 2810 | hsa_status_t getDeviceAttrRaw(uint32_t Kind, Ty &Value) { |
| 2811 | return hsa_agent_get_info(Agent, (hsa_agent_info_t)Kind, &Value); |
| 2812 | } |
| 2813 | |
| 2814 | /// Get the device agent. |
| 2815 | hsa_agent_t getAgent() const override { return Agent; } |
| 2816 | |
| 2817 | /// Get the signal manager. |
| 2818 | AMDGPUSignalManagerTy &getSignalManager() { return AMDGPUSignalManager; } |
| 2819 | |
| 2820 | /// Retrieve and construct all memory pools of the device agent. |
| 2821 | Error retrieveAllMemoryPools() override { |
| 2822 | // Iterate through the available pools of the device agent. |
| 2823 | return hsa_utils::iterateAgentMemoryPools( |
| 2824 | Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) { |
| 2825 | AMDGPUMemoryPoolTy *MemoryPool = |
| 2826 | Plugin.allocate<AMDGPUMemoryPoolTy>(); |
| 2827 | new (MemoryPool) AMDGPUMemoryPoolTy(HSAMemoryPool); |
| 2828 | AllMemoryPools.push_back(MemoryPool); |
| 2829 | return HSA_STATUS_SUCCESS; |
| 2830 | }); |
| 2831 | } |
| 2832 | |
| 2833 | bool useMultipleSdmaEngines() const { return OMPX_UseMultipleSdmaEngines; } |
| 2834 | |
| 2835 | private: |
| 2836 | using AMDGPUEventRef = AMDGPUResourceRef<AMDGPUEventTy>; |
| 2837 | using AMDGPUEventManagerTy = GenericDeviceResourceManagerTy<AMDGPUEventRef>; |
| 2838 | |
| 2839 | /// Common method to invoke a single threaded constructor or destructor |
| 2840 | /// kernel by name. |
| 2841 | Error callGlobalCtorDtorCommon(GenericPluginTy &Plugin, DeviceImageTy &Image, |
| 2842 | bool IsCtor) { |
| 2843 | const char *KernelName = |
| 2844 | IsCtor ? "amdgcn.device.init" : "amdgcn.device.fini" ; |
| 2845 | // Perform a quick check for the named kernel in the image. The kernel |
| 2846 | // should be created by the 'amdgpu-lower-ctor-dtor' pass. |
| 2847 | GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler(); |
| 2848 | if (IsCtor && !Handler.isSymbolInImage(*this, Image, KernelName)) |
| 2849 | return Plugin::success(); |
| 2850 | |
| 2851 | // Allocate and construct the AMDGPU kernel. |
| 2852 | AMDGPUKernelTy AMDGPUKernel(KernelName); |
| 2853 | if (auto Err = AMDGPUKernel.init(*this, Image)) |
| 2854 | return Err; |
| 2855 | |
| 2856 | AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr); |
| 2857 | |
| 2858 | KernelArgsTy KernelArgs = {}; |
| 2859 | uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u}; |
| 2860 | if (auto Err = AMDGPUKernel.launchImpl( |
| 2861 | *this, NumBlocksAndThreads, NumBlocksAndThreads, KernelArgs, |
| 2862 | KernelLaunchParamsTy{}, AsyncInfoWrapper)) |
| 2863 | return Err; |
| 2864 | |
| 2865 | Error Err = Plugin::success(); |
| 2866 | AsyncInfoWrapper.finalize(Err); |
| 2867 | |
| 2868 | return Err; |
| 2869 | } |
| 2870 | |
| 2871 | /// Detect if current architecture is an APU. |
| 2872 | Error checkIfAPU() { |
| 2873 | // TODO: replace with ROCr API once it becomes available. |
| 2874 | llvm::StringRef StrGfxName(ComputeUnitKind); |
| 2875 | bool MayBeAPU = llvm::StringSwitch<bool>(StrGfxName) |
| 2876 | .Case("gfx942", true)
| 2877 | .Default(false);
| 2878 | if (!MayBeAPU) |
| 2879 | return Plugin::success(); |
| 2880 | |
| 2881 | // The gfx942 target can be either an MI300A (APU) or an MI300X (discrete).
| 2882 | uint32_t ChipID = 0; |
| 2883 | if (auto Err = getDeviceAttr(HSA_AMD_AGENT_INFO_CHIP_ID, ChipID)) |
| 2884 | return Err; |
| 2885 | |
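|      | // Heuristic based on the check below: an even chip ID is treated as the APU
|      | // variant (MI300A), an odd one as the discrete variant (MI300X).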
| 2886 | if (!(ChipID & 0x1)) { |
| 2887 | IsAPU = true; |
| 2888 | return Plugin::success(); |
| 2889 | } |
| 2890 | return Plugin::success(); |
| 2891 | } |
| 2892 | |
| 2893 | /// Envar for controlling the number of HSA queues per device. A high number
| 2894 | /// of queues may degrade performance.
| 2895 | UInt32Envar OMPX_NumQueues; |
| 2896 | |
| 2897 | /// Envar for controlling the size of each HSA queue. The size is the number |
| 2898 | /// of HSA packets a queue is expected to hold. It is also the number of HSA |
| 2899 | /// packets that can be pushed into each queue without waiting for the
| 2900 | /// driver to process them.
| 2901 | UInt32Envar OMPX_QueueSize; |
| 2902 | |
| 2903 | /// Envar for controlling the default number of teams relative to the number |
| 2904 | /// of compute units (CUs) the device has: |
| 2905 | /// #default_teams = OMPX_DefaultTeamsPerCU * #CUs. |
| 2906 | UInt32Envar OMPX_DefaultTeamsPerCU; |
| 2907 | |
| 2908 | /// Envar specifying the maximum transfer size in bytes for which memory
| 2909 | /// copies are issued asynchronously. Copies up to this size are pushed to the
| 2910 | /// corresponding stream as asynchronous operations; larger transfers are
| 2911 | /// performed synchronously.
| 2912 | UInt32Envar OMPX_MaxAsyncCopyBytes; |
| 2913 | |
| 2914 | /// Envar controlling the initial number of HSA signals per device. There is |
| 2915 | /// one manager of signals per device managing several pre-allocated signals. |
| 2916 | /// These signals are mainly used by AMDGPU streams. If needed, more signals |
| 2917 | /// will be created. |
| 2918 | UInt32Envar OMPX_InitialNumSignals; |
| 2919 | |
| 2920 | /// Envar to set the time to wait in active state before switching to a
| 2921 | /// blocked HSA wait state. The unit is microseconds; the default of 2000000
| 2922 | /// busy-waits for 2 seconds before blocking.
| 2924 | UInt32Envar OMPX_StreamBusyWait; |
| 2925 | |
| 2926 | /// Use ROCm 5.7 interface for multiple SDMA engines |
| 2927 | BoolEnvar OMPX_UseMultipleSdmaEngines; |
| 2928 | |
| 2929 | /// Value of OMPX_APU_MAPS env var used to force |
| 2930 | /// automatic zero-copy behavior on non-APU GPUs. |
| 2931 | BoolEnvar OMPX_ApuMaps; |
| 2932 | |
| 2933 | /// Stream manager for AMDGPU streams. |
| 2934 | AMDGPUStreamManagerTy AMDGPUStreamManager; |
| 2935 | |
| 2936 | /// Event manager for AMDGPU events. |
| 2937 | AMDGPUEventManagerTy AMDGPUEventManager; |
| 2938 | |
| 2939 | /// Signal manager for AMDGPU signals. |
| 2940 | AMDGPUSignalManagerTy AMDGPUSignalManager; |
| 2941 | |
| 2942 | /// The agent handle corresponding to the device.
| 2943 | hsa_agent_t Agent; |
| 2944 | |
| 2945 | /// The GPU architecture. |
| 2946 | std::string ComputeUnitKind; |
| 2947 | |
| 2948 | /// The frequency of the steady clock inside the device. |
| 2949 | uint64_t ClockFrequency; |
| 2950 | |
| 2951 | /// The total number of concurrent work items that can be running on the GPU. |
| 2952 | uint64_t HardwareParallelism; |
| 2953 | |
| 2954 | /// Reference to the host device. |
| 2955 | AMDHostDeviceTy &HostDevice; |
| 2956 | |
| 2957 | /// The current size of the global device memory pool (managed by us). |
| 2958 | uint64_t DeviceMemoryPoolSize = 1L << 29L /* 512 MB */;
| 2959 | |
| 2960 | /// The current size of the stack that will be used in cases where it could |
| 2961 | /// not be statically determined. |
| 2962 | uint64_t StackSize = 16 * 1024 /* 16 KB */; |
| 2963 | |
| 2964 | /// Is the plugin associated with an APU? |
| 2965 | bool IsAPU = false; |
| 2966 | |
| 2967 | /// True if the system is configured with XNACK enabled, false otherwise.
| 2969 | bool IsXnackEnabled = false; |
| 2970 | }; |
| 2971 | |
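|      | /// Load the image into an HSA executable for the given device: create a code
|      | /// object reader over the image memory, create an executable, load and freeze
|      | /// it for the device agent, validate it, and read the kernel metadata from the
|      | /// image.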
| 2972 | Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) { |
| 2973 | hsa_code_object_reader_t Reader; |
| 2974 | hsa_status_t Status = |
| 2975 | hsa_code_object_reader_create_from_memory(getStart(), getSize(), &Reader); |
| 2976 | if (auto Err = Plugin::check( |
| 2977 | Status, "error in hsa_code_object_reader_create_from_memory: %s" )) |
| 2978 | return Err; |
| 2979 | |
| 2980 | Status = hsa_executable_create_alt( |
| 2981 | HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, "" , &Executable); |
| 2982 | if (auto Err = |
| 2983 | Plugin::check(Status, "error in hsa_executable_create_alt: %s" )) |
| 2984 | return Err; |
| 2985 | |
| 2986 | hsa_loaded_code_object_t Object; |
| 2987 | Status = hsa_executable_load_agent_code_object(Executable, Device.getAgent(), |
| 2988 | Reader, "" , &Object); |
| 2989 | if (auto Err = Plugin::check( |
| 2990 | Status, "error in hsa_executable_load_agent_code_object: %s" )) |
| 2991 | return Err; |
| 2992 | |
| 2993 | Status = hsa_executable_freeze(Executable, "" ); |
| 2994 | if (auto Err = Plugin::check(Status, "error in hsa_executable_freeze: %s" )) |
| 2995 | return Err; |
| 2996 | |
| 2997 | uint32_t Result; |
| 2998 | Status = hsa_executable_validate(Executable, &Result); |
| 2999 | if (auto Err = Plugin::check(Status, "error in hsa_executable_validate: %s" )) |
| 3000 | return Err; |
| 3001 | |
| 3002 | if (Result) |
| 3003 | return Plugin::error(ErrorCode::INVALID_BINARY, |
| 3004 | "loaded HSA executable does not validate" ); |
| 3005 | |
| 3006 | Status = hsa_code_object_reader_destroy(Reader); |
| 3007 | if (auto Err = |
| 3008 | Plugin::check(Status, "error in hsa_code_object_reader_destroy: %s" )) |
| 3009 | return Err; |
| 3010 | |
| 3011 | if (auto Err = hsa_utils::readAMDGPUMetaDataFromImage( |
| 3012 | getMemoryBuffer(), KernelInfoMap, ELFABIVersion)) |
| 3013 | return Err; |
| 3014 | |
| 3015 | return Plugin::success(); |
| 3016 | } |
| 3017 | |
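|      | /// Look up a symbol by name in this image's frozen executable, restricted to
|      | /// the given device's agent.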
| 3018 | Expected<hsa_executable_symbol_t> |
| 3019 | AMDGPUDeviceImageTy::findDeviceSymbol(GenericDeviceTy &Device, |
| 3020 | StringRef SymbolName) const { |
| 3021 | |
| 3022 | AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(Device); |
| 3023 | hsa_agent_t Agent = AMDGPUDevice.getAgent(); |
| 3024 | |
| 3025 | hsa_executable_symbol_t Symbol; |
| 3026 | hsa_status_t Status = hsa_executable_get_symbol_by_name( |
| 3027 | Executable, SymbolName.data(), &Agent, &Symbol); |
| 3028 | if (auto Err = Plugin::check( |
| 3029 | Status, "error in hsa_executable_get_symbol_by_name(%s): %s" , |
| 3030 | SymbolName.data())) |
| 3031 | return std::move(Err); |
| 3032 | |
| 3033 | return Symbol; |
| 3034 | } |
| 3035 | |
| 3036 | template <typename ResourceTy> |
| 3037 | Error AMDGPUResourceRef<ResourceTy>::create(GenericDeviceTy &Device) { |
| 3038 | if (Resource) |
| 3039 | return Plugin::error(ErrorCode::INVALID_ARGUMENT, |
| 3040 | "creating an existing resource" ); |
| 3041 | |
| 3042 | AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(Device); |
| 3043 | |
| 3044 | Resource = new ResourceTy(AMDGPUDevice); |
| 3045 | |
| 3046 | return Resource->init(); |
| 3047 | } |
| 3048 | |
| 3049 | AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device) |
| 3050 | : Agent(Device.getAgent()), Queue(nullptr), |
| 3051 | SignalManager(Device.getSignalManager()), Device(Device), |
| 3052 | // Initialize the std::deque with some empty positions. |
| 3053 | Slots(32), NextSlot(0), SyncCycle(0), |
| 3054 | StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()), |
| 3055 | UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()) {} |
| 3056 | |
| 3057 | /// Class implementing the AMDGPU-specific functionalities of the global |
| 3058 | /// handler. |
| 3059 | struct AMDGPUGlobalHandlerTy final : public GenericGlobalHandlerTy { |
| 3060 | /// Get the metadata of a global from the device. The name and size of the |
| 3061 | /// global is read from DeviceGlobal and the address of the global is written |
| 3062 | /// to DeviceGlobal. |
| 3063 | Error getGlobalMetadataFromDevice(GenericDeviceTy &Device, |
| 3064 | DeviceImageTy &Image, |
| 3065 | GlobalTy &DeviceGlobal) override { |
| 3066 | AMDGPUDeviceImageTy &AMDImage = static_cast<AMDGPUDeviceImageTy &>(Image); |
| 3067 | |
| 3068 | // Find the symbol on the device executable. |
| 3069 | auto SymbolOrErr = |
| 3070 | AMDImage.findDeviceSymbol(Device, DeviceGlobal.getName()); |
| 3071 | if (!SymbolOrErr) |
| 3072 | return SymbolOrErr.takeError(); |
| 3073 | |
| 3074 | hsa_executable_symbol_t Symbol = *SymbolOrErr; |
| 3075 | hsa_symbol_kind_t SymbolType; |
| 3076 | hsa_status_t Status; |
| 3077 | uint64_t SymbolAddr; |
| 3078 | uint32_t SymbolSize; |
| 3079 | |
| 3080 | // Retrieve the type, address and size of the symbol. |
| 3081 | std::pair<hsa_executable_symbol_info_t, void *> RequiredInfos[] = { |
| 3082 | {HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &SymbolType}, |
| 3083 | {HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &SymbolAddr}, |
| 3084 | {HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &SymbolSize}}; |
| 3085 | |
| 3086 | for (auto &Info : RequiredInfos) { |
| 3087 | Status = hsa_executable_symbol_get_info(Symbol, Info.first, Info.second); |
| 3088 | if (auto Err = Plugin::check( |
| 3089 | Status, "error in hsa_executable_symbol_get_info: %s" )) |
| 3090 | return Err; |
| 3091 | } |
| 3092 | |
| 3093 | // Check the size of the symbol. |
| 3094 | if (SymbolSize != DeviceGlobal.getSize()) |
| 3095 | return Plugin::error( |
| 3096 | ErrorCode::INVALID_BINARY, |
| 3097 | "failed to load global '%s' due to size mismatch (%zu != %zu)" , |
| 3098 | DeviceGlobal.getName().data(), SymbolSize, |
| 3099 | (size_t)DeviceGlobal.getSize()); |
| 3100 | |
| 3101 | // Store the symbol address on the device global metadata. |
| 3102 | DeviceGlobal.setPtr(reinterpret_cast<void *>(SymbolAddr)); |
| 3103 | |
| 3104 | return Plugin::success(); |
| 3105 | } |
| 3106 | }; |
| 3107 | |
| 3108 | /// Class implementing the AMDGPU-specific functionalities of the plugin. |
| 3109 | struct AMDGPUPluginTy final : public GenericPluginTy { |
| 3110 | /// Create an AMDGPU plugin and initialize the AMDGPU driver. |
| 3111 | AMDGPUPluginTy() |
| 3112 | : GenericPluginTy(getTripleArch()), Initialized(false), |
| 3113 | HostDevice(nullptr) {} |
| 3114 | |
| 3115 | /// This class should not be copied. |
| 3116 | AMDGPUPluginTy(const AMDGPUPluginTy &) = delete; |
| 3117 | AMDGPUPluginTy(AMDGPUPluginTy &&) = delete; |
| 3118 | |
| 3119 | /// Initialize the plugin and return the number of devices. |
| 3120 | Expected<int32_t> initImpl() override { |
| 3121 | hsa_status_t Status = hsa_init(); |
| 3122 | if (Status != HSA_STATUS_SUCCESS) { |
| 3123 | // Cannot call hsa_status_string.
| 3124 | DP("Failed to initialize AMDGPU's HSA library\n" ); |
| 3125 | return 0; |
| 3126 | } |
| 3127 | |
| 3128 | // The initialization of HSA was successful. It should be safe to call |
| 3129 | // HSA functions from now on, e.g., hsa_shut_down. |
| 3130 | Initialized = true; |
| 3131 | |
| 3132 | // Register event handler to detect memory errors on the devices. |
| 3133 | Status = hsa_amd_register_system_event_handler(eventHandler, this); |
| 3134 | if (auto Err = Plugin::check( |
| 3135 | Status, "error in hsa_amd_register_system_event_handler: %s" )) |
| 3136 | return std::move(Err); |
| 3137 | |
| 3138 | // List of host (CPU) agents. |
| 3139 | llvm::SmallVector<hsa_agent_t> HostAgents; |
| 3140 | |
| 3141 | // Count the number of available agents. |
| 3142 | auto Err = hsa_utils::iterateAgents([&](hsa_agent_t Agent) {
| 3143 | // Get the device type of the agent. |
| 3144 | hsa_device_type_t DeviceType; |
| 3145 | hsa_status_t Status = |
| 3146 | hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType); |
| 3147 | if (Status != HSA_STATUS_SUCCESS) |
| 3148 | return Status; |
| 3149 | |
| 3150 | // Classify the agents into kernel (GPU) and host (CPU) agents.
| 3151 | if (DeviceType == HSA_DEVICE_TYPE_GPU) { |
| 3152 | // Ensure that the GPU agent supports kernel dispatch packets. |
| 3153 | hsa_agent_feature_t Features; |
| 3154 | Status = hsa_agent_get_info(Agent, HSA_AGENT_INFO_FEATURE, &Features); |
| 3155 | if (Features & HSA_AGENT_FEATURE_KERNEL_DISPATCH) |
| 3156 | KernelAgents.push_back(Agent); |
| 3157 | } else if (DeviceType == HSA_DEVICE_TYPE_CPU) { |
| 3158 | HostAgents.push_back(Agent); |
| 3159 | } |
| 3160 | return HSA_STATUS_SUCCESS; |
| 3161 | }); |
| 3162 | |
| 3163 | if (Err) |
| 3164 | return std::move(Err); |
| 3165 | |
| 3166 | int32_t NumDevices = KernelAgents.size(); |
| 3167 | if (NumDevices == 0) { |
| 3168 | // Do not initialize if there are no devices. |
| 3169 | DP("There are no devices supporting AMDGPU.\n" ); |
| 3170 | return 0; |
| 3171 | } |
| 3172 | |
| 3173 | // There are kernel agents but there is no host agent. That should be |
| 3174 | // treated as an error. |
| 3175 | if (HostAgents.empty()) |
| 3176 | return Plugin::error(ErrorCode::BACKEND_FAILURE, "no AMDGPU host agents" ); |
| 3177 | |
| 3178 | // Initialize the host device using host agents. |
| 3179 | HostDevice = allocate<AMDHostDeviceTy>(); |
| 3180 | new (HostDevice) AMDHostDeviceTy(*this, HostAgents); |
| 3181 | |
| 3182 | // Set up the memory pools available for the host.
| 3183 | if (auto Err = HostDevice->init()) |
| 3184 | return std::move(Err); |
| 3185 | |
| 3186 | return NumDevices; |
| 3187 | } |
| 3188 | |
| 3189 | /// Deinitialize the plugin. |
| 3190 | Error deinitImpl() override { |
| 3191 | // The HSA runtime was not initialized, so nothing from the plugin was |
| 3192 | // actually initialized. |
| 3193 | if (!Initialized) |
| 3194 | return Plugin::success(); |
| 3195 | |
| 3196 | if (HostDevice) |
| 3197 | if (auto Err = HostDevice->deinit()) |
| 3198 | return Err; |
| 3199 | |
| 3200 | // Finalize the HSA runtime. |
| 3201 | hsa_status_t Status = hsa_shut_down(); |
| 3202 | return Plugin::check(Status, "error in hsa_shut_down: %s" ); |
| 3203 | } |
| 3204 | |
| 3205 | /// Creates an AMDGPU device. |
| 3206 | GenericDeviceTy *createDevice(GenericPluginTy &Plugin, int32_t DeviceId, |
| 3207 | int32_t NumDevices) override { |
| 3208 | return new AMDGPUDeviceTy(Plugin, DeviceId, NumDevices, getHostDevice(), |
| 3209 | getKernelAgent(DeviceId)); |
| 3210 | } |
| 3211 | |
| 3212 | /// Creates an AMDGPU global handler. |
| 3213 | GenericGlobalHandlerTy *createGlobalHandler() override { |
| 3214 | return new AMDGPUGlobalHandlerTy(); |
| 3215 | } |
| 3216 | |
| 3217 | Triple::ArchType getTripleArch() const override { return Triple::amdgcn; } |
| 3218 | |
| 3219 | const char *getName() const override { return GETNAME(TARGET_NAME); } |
| 3220 | |
| 3221 | /// Get the ELF code for recognizing the compatible image binary. |
| 3222 | uint16_t getMagicElfBits() const override { return ELF::EM_AMDGPU; } |
| 3223 | |
| 3224 | /// Check whether the image is compatible with an AMDGPU device. |
| 3225 | Expected<bool> isELFCompatible(uint32_t DeviceId, |
| 3226 | StringRef Image) const override { |
| 3227 | // Get the associated architecture and flags from the ELF. |
| 3228 | auto ElfOrErr = ELF64LEObjectFile::create( |
| 3229 | MemoryBufferRef(Image, /*Identifier=*/"" ), /*InitContent=*/false); |
| 3230 | if (!ElfOrErr) |
| 3231 | return ElfOrErr.takeError(); |
| 3232 | std::optional<StringRef> Processor = ElfOrErr->tryGetCPUName(); |
| 3233 | if (!Processor) |
| 3234 | return false; |
| 3235 | |
| 3236 | SmallVector<SmallString<32>> Targets; |
| 3237 | if (auto Err = hsa_utils::getTargetTripleAndFeatures( |
| 3238 | getKernelAgent(DeviceId), Targets)) |
| 3239 | return Err; |
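|      | // Each target entry is an ISA string reported by the agent; an illustrative
|      | // (assumed) example is "amdgcn-amd-amdhsa--gfx942:xnack+".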
| 3240 | for (auto &Target : Targets) |
| 3241 | if (offloading::amdgpu::isImageCompatibleWithEnv( |
| 3242 | Processor ? *Processor : "" , ElfOrErr->getPlatformFlags(), |
| 3243 | Target.str())) |
| 3244 | return true; |
| 3245 | return false; |
| 3246 | } |
| 3247 | |
| 3248 | bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) override { |
| 3249 | return true; |
| 3250 | } |
| 3251 | |
| 3252 | /// Get the host device instance. |
| 3253 | AMDHostDeviceTy &getHostDevice() { |
| 3254 | assert(HostDevice && "Host device not initialized" ); |
| 3255 | return *HostDevice; |
| 3256 | } |
| 3257 | |
| 3258 | /// Get the kernel agent with the corresponding agent id. |
| 3259 | hsa_agent_t getKernelAgent(int32_t AgentId) const { |
| 3260 | assert((uint32_t)AgentId < KernelAgents.size() && "Invalid agent id" ); |
| 3261 | return KernelAgents[AgentId]; |
| 3262 | } |
| 3263 | |
| 3264 | /// Get the list of the available kernel agents. |
| 3265 | const llvm::SmallVector<hsa_agent_t> &getKernelAgents() const { |
| 3266 | return KernelAgents; |
| 3267 | } |
| 3268 | |
| 3269 | private: |
| 3270 | /// Event handler that will be called by ROCr if an event is detected. |
| 3271 | static hsa_status_t eventHandler(const hsa_amd_event_t *Event, |
| 3272 | void *PluginPtr) { |
| 3273 | if (Event->event_type != HSA_AMD_GPU_MEMORY_FAULT_EVENT) |
| 3274 | return HSA_STATUS_SUCCESS; |
| 3275 | |
| 3276 | SmallVector<std::string> Reasons; |
| 3277 | uint32_t ReasonsMask = Event->memory_fault.fault_reason_mask; |
| 3278 | if (ReasonsMask & HSA_AMD_MEMORY_FAULT_PAGE_NOT_PRESENT) |
| 3279 | Reasons.emplace_back("Page not present or supervisor privilege");
| 3280 | if (ReasonsMask & HSA_AMD_MEMORY_FAULT_READ_ONLY)
| 3281 | Reasons.emplace_back("Write access to a read-only page");
| 3282 | if (ReasonsMask & HSA_AMD_MEMORY_FAULT_NX)
| 3283 | Reasons.emplace_back("Execute access to a page marked NX");
| 3284 | if (ReasonsMask & HSA_AMD_MEMORY_FAULT_HOST_ONLY)
| 3285 | Reasons.emplace_back("GPU attempted access to a host only page");
| 3286 | if (ReasonsMask & HSA_AMD_MEMORY_FAULT_DRAMECC)
| 3287 | Reasons.emplace_back("DRAM ECC failure");
| 3288 | if (ReasonsMask & HSA_AMD_MEMORY_FAULT_IMPRECISE)
| 3289 | Reasons.emplace_back("Can't determine the exact fault address");
| 3290 | if (ReasonsMask & HSA_AMD_MEMORY_FAULT_SRAMECC)
| 3291 | Reasons.emplace_back("SRAM ECC failure (ie registers, no fault address)");
| 3292 | if (ReasonsMask & HSA_AMD_MEMORY_FAULT_HANG)
| 3293 | Reasons.emplace_back("GPU reset following unspecified hang");
| 3294 | |
| 3295 | // If we do not know the reason, say so.
| 3297 | if (Reasons.empty()) |
| 3298 | Reasons.emplace_back(Args: "Unknown (" + std::to_string(val: ReasonsMask) + ")" ); |
| 3299 | |
| 3300 | uint32_t Node = -1; |
| 3301 | hsa_agent_get_info(Event->memory_fault.agent, HSA_AGENT_INFO_NODE, &Node); |
| 3302 | |
| 3303 | AMDGPUPluginTy &Plugin = *reinterpret_cast<AMDGPUPluginTy *>(PluginPtr); |
| 3304 | for (uint32_t I = 0, E = Plugin.getNumDevices(); |
| 3305 | Node != uint32_t(-1) && I < E; ++I) { |
| 3306 | AMDGPUDeviceTy &AMDGPUDevice = |
| 3307 | reinterpret_cast<AMDGPUDeviceTy &>(Plugin.getDevice(I)); |
| 3308 | auto KernelTraceInfoRecord = |
| 3309 | AMDGPUDevice.KernelLaunchTraces.getExclusiveAccessor(); |
| 3310 | |
| 3311 | uint32_t DeviceNode = -1; |
| 3312 | if (auto Err = |
| 3313 | AMDGPUDevice.getDeviceAttr(HSA_AGENT_INFO_NODE, DeviceNode)) { |
| 3314 | consumeError(std::move(Err)); |
| 3315 | continue; |
| 3316 | } |
| 3317 | if (DeviceNode != Node) |
| 3318 | continue; |
| 3319 | void *DevicePtr = (void *)Event->memory_fault.virtual_address; |
| 3320 | std::string S; |
| 3321 | llvm::raw_string_ostream OS(S); |
| 3322 | OS << llvm::format("memory access fault by GPU %" PRIu32 |
| 3323 | " (agent 0x%" PRIx64 |
| 3324 | ") at virtual address %p. Reasons: %s" , |
| 3325 | Node, Event->memory_fault.agent.handle, |
| 3326 | (void *)Event->memory_fault.virtual_address, |
| 3327 | llvm::join(Reasons, ", " ).c_str()); |
| 3328 | ErrorReporter::reportKernelTraces(AMDGPUDevice, *KernelTraceInfoRecord); |
| 3329 | ErrorReporter::reportMemoryAccessError(AMDGPUDevice, DevicePtr, S, |
| 3330 | /*Abort*/ true); |
| 3331 | } |
| 3332 | |
| 3333 | // Abort the execution since we do not recover from this error. |
| 3334 | FATAL_MESSAGE(1, |
| 3335 | "memory access fault by GPU %" PRIu32 " (agent 0x%" PRIx64 |
| 3336 | ") at virtual address %p. Reasons: %s" , |
| 3337 | Node, Event->memory_fault.agent.handle, |
| 3338 | (void *)Event->memory_fault.virtual_address, |
| 3339 | llvm::join(Reasons, ", " ).c_str()); |
| 3340 | |
| 3341 | return HSA_STATUS_ERROR; |
| 3342 | } |
| 3343 | |
| 3344 | /// Indicate whether the HSA runtime was correctly initialized. Even if there |
| 3345 | /// are no available devices, this boolean will be true. It indicates whether
| 3346 | /// we can safely call HSA functions (e.g., hsa_shut_down). |
| 3347 | bool Initialized; |
| 3348 | |
| 3349 | /// Array of the available kernel (GPU) agents. These handles should not be
| 3350 | /// here but in the AMDGPUDeviceTy structures directly. However, the HSA
| 3351 | /// standard does not provide API functions to retrieve agents directly, only
| 3352 | /// iterating functions. We cache the agents here for convenience.
| 3353 | llvm::SmallVector<hsa_agent_t> KernelAgents; |
| 3354 | |
| 3355 | /// The device representing all HSA host agents. |
| 3356 | AMDHostDeviceTy *HostDevice; |
| 3357 | }; |
| 3358 | |
| 3359 | Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, |
| 3360 | uint32_t NumThreads[3], uint32_t NumBlocks[3], |
| 3361 | KernelArgsTy &KernelArgs, |
| 3362 | KernelLaunchParamsTy LaunchParams, |
| 3363 | AsyncInfoWrapperTy &AsyncInfoWrapper) const { |
| 3364 | if (ArgsSize != LaunchParams.Size && |
| 3365 | ArgsSize > LaunchParams.Size + getImplicitArgsSize()) |
| 3366 | return Plugin::error(ErrorCode::INVALID_ARGUMENT, |
| 3367 | "invalid kernel arguments size" ); |
| 3368 | |
| 3369 | AMDGPUPluginTy &AMDGPUPlugin = |
| 3370 | static_cast<AMDGPUPluginTy &>(GenericDevice.Plugin); |
| 3371 | AMDHostDeviceTy &HostDevice = AMDGPUPlugin.getHostDevice(); |
| 3372 | AMDGPUMemoryManagerTy &ArgsMemoryManager = HostDevice.getArgsMemoryManager(); |
| 3373 | |
| 3374 | void *AllArgs = nullptr; |
| 3375 | if (auto Err = ArgsMemoryManager.allocate(ArgsSize, &AllArgs)) |
| 3376 | return Err; |
| 3377 | |
| 3378 | // Account for user requested dynamic shared memory. |
| 3379 | uint32_t GroupSize = getGroupSize(); |
| 3380 | if (uint32_t MaxDynCGroupMem = std::max( |
| 3381 | KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize())) { |
| 3382 | GroupSize += MaxDynCGroupMem; |
| 3383 | } |
| 3384 | |
| 3385 | uint64_t StackSize; |
| 3386 | if (auto Err = GenericDevice.getDeviceStackSize(StackSize)) |
| 3387 | return Err; |
| 3388 | |
| 3389 | // Copy the explicit arguments. |
| 3390 | // TODO: We should expose the args memory manager alloc to the common part as |
| 3391 | // alternative to copying them twice. |
| 3392 | if (LaunchParams.Size) |
| 3393 | std::memcpy(AllArgs, LaunchParams.Data, LaunchParams.Size);
| 3394 | |
| 3395 | AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(GenericDevice); |
| 3396 | |
| 3397 | AMDGPUStreamTy *Stream = nullptr; |
| 3398 | if (auto Err = AMDGPUDevice.getStream(AsyncInfoWrapper, Stream)) |
| 3399 | return Err; |
| 3400 | |
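|      | // If the kernel expects implicit (COV5) arguments, they are placed directly
|      | // after the explicit ones, aligned to the implicit-argument struct.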
| 3401 | uint64_t ImplArgsOffset = utils::roundUp( |
| 3402 | LaunchParams.Size, alignof(hsa_utils::AMDGPUImplicitArgsTy)); |
| 3403 | if (ArgsSize > ImplArgsOffset) { |
| 3404 | hsa_utils::AMDGPUImplicitArgsTy *ImplArgs = |
| 3405 | reinterpret_cast<hsa_utils::AMDGPUImplicitArgsTy *>( |
| 3406 | utils::advancePtr(AllArgs, ImplArgsOffset)); |
| 3407 | |
| 3408 | // Set the COV5+ implicit arguments to the appropriate values if present. |
| 3409 | uint64_t ImplArgsSize = ArgsSize - ImplArgsOffset; |
| 3410 | std::memset(ImplArgs, 0, ImplArgsSize);
| 3411 | |
| 3412 | using ImplArgsTy = hsa_utils::AMDGPUImplicitArgsTy; |
| 3413 | hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::BlockCountX, ImplArgsSize, |
| 3414 | NumBlocks[0]); |
| 3415 | hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::BlockCountY, ImplArgsSize, |
| 3416 | NumBlocks[1]); |
| 3417 | hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::BlockCountZ, ImplArgsSize, |
| 3418 | NumBlocks[2]); |
| 3419 | |
| 3420 | hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::GroupSizeX, ImplArgsSize, |
| 3421 | NumThreads[0]); |
| 3422 | hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::GroupSizeY, ImplArgsSize, |
| 3423 | NumThreads[1]); |
| 3424 | hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::GroupSizeZ, ImplArgsSize, |
| 3425 | NumThreads[2]); |
| 3426 | |
| 3427 | hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::GridDims, ImplArgsSize, |
| 3428 | NumBlocks[2] * NumThreads[2] > 1 |
| 3429 | ? 3 |
| 3430 | : 1 + (NumBlocks[1] * NumThreads[1] != 1)); |
| 3431 | |
| 3432 | hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::DynamicLdsSize, ImplArgsSize, |
| 3433 | KernelArgs.DynCGroupMem); |
| 3434 | } |
| 3435 | |
| 3436 | // Push the kernel launch into the stream. |
| 3437 | return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
| 3438 | GroupSize, StackSize, ArgsMemoryManager);
| 3439 | } |
| 3440 | |
| 3441 | Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice, |
| 3442 | KernelArgsTy &KernelArgs, |
| 3443 | uint32_t NumThreads[3], |
| 3444 | uint32_t NumBlocks[3]) const { |
| 3445 | // Only do all this when the output is requested |
| 3446 | if (!(getInfoLevel() & OMP_INFOTYPE_PLUGIN_KERNEL)) |
| 3447 | return Plugin::success(); |
| 3448 | |
| 3449 | // We don't have data to print additional info, but no hard error |
| 3450 | if (!KernelInfo.has_value()) |
| 3451 | return Plugin::success(); |
| 3452 | |
| 3453 | // General Info |
| 3454 | auto NumGroups = NumBlocks; |
| 3455 | auto ThreadsPerGroup = NumThreads; |
| 3456 | |
| 3457 | // Kernel Arguments Info |
| 3458 | auto ArgNum = KernelArgs.NumArgs; |
| 3459 | auto LoopTripCount = KernelArgs.Tripcount; |
| 3460 | |
| 3461 | // Details for AMDGPU kernels (read from image) |
| 3462 | // https://www.llvm.org/docs/AMDGPUUsage.html#code-object-v4-metadata |
| 3463 | auto GroupSegmentSize = (*KernelInfo).GroupSegmentList; |
| 3464 | auto SGPRCount = (*KernelInfo).SGPRCount; |
| 3465 | auto VGPRCount = (*KernelInfo).VGPRCount; |
| 3466 | auto SGPRSpillCount = (*KernelInfo).SGPRSpillCount; |
| 3467 | auto VGPRSpillCount = (*KernelInfo).VGPRSpillCount; |
| 3468 | auto MaxFlatWorkgroupSize = (*KernelInfo).MaxFlatWorkgroupSize; |
| 3469 | |
| 3470 | // Prints additional launch info that contains the following. |
| 3471 | // Num Args: The number of kernel arguments |
| 3472 | // Teams x Thrds: The number of teams and the number of threads actually |
| 3473 | // running. |
| 3474 | // MaxFlatWorkgroupSize: Maximum flat work-group size supported by the |
| 3475 | // kernel in work-items |
| 3476 | // LDS Usage: Amount of bytes used in LDS storage |
| 3477 | // S/VGPR Count: the number of S/V GPRs occupied by the kernel |
| 3478 | // S/VGPR Spill Count: how many S/VGPRs are spilled by the kernel |
| 3479 | // Tripcount: loop tripcount for the kernel |
| 3480 | INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(), |
| 3481 | "#Args: %d Teams x Thrds: %4ux%4u (MaxFlatWorkGroupSize: %u) LDS " |
| 3482 | "Usage: %uB #SGPRs/VGPRs: %u/%u #SGPR/VGPR Spills: %u/%u Tripcount: " |
| 3483 | "%lu\n" , |
| 3484 | ArgNum, NumGroups[0] * NumGroups[1] * NumGroups[2], |
| 3485 | ThreadsPerGroup[0] * ThreadsPerGroup[1] * ThreadsPerGroup[2], |
| 3486 | MaxFlatWorkgroupSize, GroupSegmentSize, SGPRCount, VGPRCount, |
| 3487 | SGPRSpillCount, VGPRSpillCount, LoopTripCount); |
| 3488 | |
| 3489 | return Plugin::success(); |
| 3490 | } |
| 3491 | |
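|      | /// Translate an HSA status code into a plugin error. Success and INFO_BREAK
|      | /// are not errors; known HSA errors map to offload error codes and the HSA
|      | /// description string is appended to the formatted message.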
| 3492 | template <typename... ArgsTy> |
| 3493 | static Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) { |
| 3494 | hsa_status_t ResultCode = static_cast<hsa_status_t>(Code); |
| 3495 | if (ResultCode == HSA_STATUS_SUCCESS || ResultCode == HSA_STATUS_INFO_BREAK) |
| 3496 | return Plugin::success(); |
| 3497 | |
| 3498 | const char *Desc = "unknown error" ; |
| 3499 | hsa_status_t Ret = hsa_status_string(ResultCode, &Desc); |
| 3500 | if (Ret != HSA_STATUS_SUCCESS) |
| 3501 | REPORT("Unrecognized " GETNAME(TARGET_NAME) " error code %d\n" , Code); |
| 3502 | |
| 3503 | // TODO: Add more entries to this switch |
| 3504 | ErrorCode OffloadErrCode; |
| 3505 | switch (ResultCode) { |
| 3506 | case HSA_STATUS_ERROR_INVALID_SYMBOL_NAME: |
| 3507 | OffloadErrCode = ErrorCode::NOT_FOUND; |
| 3508 | break; |
| 3509 | default: |
| 3510 | OffloadErrCode = ErrorCode::UNKNOWN; |
| 3511 | } |
| 3512 | |
| 3513 | return Plugin::error(OffloadErrCode, ErrFmt, Args..., Desc); |
| 3514 | } |
| 3515 | |
| 3516 | void *AMDGPUMemoryManagerTy::allocate(size_t Size, void *HstPtr, |
| 3517 | TargetAllocTy Kind) { |
| 3518 | // Allocate memory from the pool. |
| 3519 | void *Ptr = nullptr; |
| 3520 | if (auto Err = MemoryPool->allocate(Size, &Ptr)) {
| 3521 | consumeError(std::move(Err));
| 3522 | return nullptr; |
| 3523 | } |
| 3524 | assert(Ptr && "Invalid pointer" ); |
| 3525 | |
| 3526 | // Get a list of agents that can access this memory pool. |
| 3527 | llvm::SmallVector<hsa_agent_t> Agents; |
| 3528 | llvm::copy_if( |
| 3529 | Plugin.getKernelAgents(), std::back_inserter(Agents), |
| 3530 | [&](hsa_agent_t Agent) { return MemoryPool->canAccess(Agent); }); |
| 3531 | |
| 3532 | // Allow all valid kernel agents to access the allocation. |
| 3533 | if (auto Err = MemoryPool->enableAccess(Ptr, Size, Agents)) { |
| 3534 | REPORT("%s\n" , toString(std::move(Err)).data()); |
| 3535 | return nullptr; |
| 3536 | } |
| 3537 | return Ptr; |
| 3538 | } |
| 3539 | |
| 3540 | void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) { |
| 3541 | if (Size == 0) |
| 3542 | return nullptr; |
| 3543 | |
| 3544 | // Find the correct memory pool. |
| 3545 | AMDGPUMemoryPoolTy *MemoryPool = nullptr; |
| 3546 | switch (Kind) { |
| 3547 | case TARGET_ALLOC_DEFAULT: |
| 3548 | case TARGET_ALLOC_DEVICE: |
| 3549 | case TARGET_ALLOC_DEVICE_NON_BLOCKING: |
| 3550 | MemoryPool = CoarseGrainedMemoryPools[0]; |
| 3551 | break; |
| 3552 | case TARGET_ALLOC_HOST: |
| 3553 | MemoryPool = &HostDevice.getFineGrainedMemoryPool(); |
| 3554 | break; |
| 3555 | case TARGET_ALLOC_SHARED: |
| 3556 | MemoryPool = &HostDevice.getFineGrainedMemoryPool(); |
| 3557 | break; |
| 3558 | } |
| 3559 | |
| 3560 | if (!MemoryPool) { |
| 3561 | REPORT("No memory pool for the specified allocation kind\n" ); |
| 3562 | return nullptr; |
| 3563 | } |
| 3564 | |
| 3565 | // Allocate from the corresponding memory pool. |
| 3566 | void *Alloc = nullptr; |
| 3567 | if (Error Err = MemoryPool->allocate(Size, &Alloc)) {
| 3568 | REPORT("%s\n", toString(std::move(Err)).data());
| 3569 | return nullptr; |
| 3570 | } |
| 3571 | |
| 3572 | if (Alloc) { |
| 3573 | // Get a list of agents that can access this memory pool. This is inherently
| 3574 | // necessary for host or shared allocations. It is also enabled for device
| 3575 | // memory to allow device-to-device memcpy.
| 3576 | llvm::SmallVector<hsa_agent_t> Agents; |
| 3577 | llvm::copy_if(static_cast<AMDGPUPluginTy &>(Plugin).getKernelAgents(), |
| 3578 | std::back_inserter(Agents), [&](hsa_agent_t Agent) { |
| 3579 | return MemoryPool->canAccess(Agent); |
| 3580 | }); |
| 3581 | |
| 3582 | // Enable all valid kernel agents to access the buffer. |
| 3583 | if (auto Err = MemoryPool->enableAccess(Alloc, Size, Agents)) { |
| 3584 | REPORT("%s\n" , toString(std::move(Err)).data()); |
| 3585 | return nullptr; |
| 3586 | } |
| 3587 | } |
| 3588 | |
| 3589 | return Alloc; |
| 3590 | } |
| 3591 | |
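|      | // HSA queue error callback: for kernel exceptions (traps), report the traced
|      | // kernel launches associated with the failing queue, then abort with the HSA
|      | // error description.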
| 3592 | void AMDGPUQueueTy::callbackError(hsa_status_t Status, hsa_queue_t *Source, |
| 3593 | void *Data) { |
| 3594 | auto &AMDGPUDevice = *reinterpret_cast<AMDGPUDeviceTy *>(Data); |
| 3595 | |
| 3596 | if (Status == HSA_STATUS_ERROR_EXCEPTION) { |
| 3597 | auto KernelTraceInfoRecord = |
| 3598 | AMDGPUDevice.KernelLaunchTraces.getExclusiveAccessor(); |
| 3599 | std::function<bool(__tgt_async_info &)> AsyncInfoWrapperMatcher = |
| 3600 | [=](__tgt_async_info &AsyncInfo) { |
| 3601 | auto *Stream = reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue); |
| 3602 | if (!Stream || !Stream->getQueue()) |
| 3603 | return false; |
| 3604 | return Stream->getQueue()->Queue == Source; |
| 3605 | }; |
| 3606 | ErrorReporter::reportTrapInKernel(AMDGPUDevice, *KernelTraceInfoRecord, |
| 3607 | AsyncInfoWrapperMatcher); |
| 3608 | } |
| 3609 | |
| 3610 | auto Err = Plugin::check(Status, "received error in queue %p: %s" , Source); |
| 3611 | FATAL_MESSAGE(1, "%s" , toString(std::move(Err)).data()); |
| 3612 | } |
| 3613 | |
| 3614 | } // namespace plugin |
| 3615 | } // namespace target |
| 3616 | } // namespace omp |
| 3617 | } // namespace llvm |
| 3618 | |
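|      | // Entry point used by the offload runtime to instantiate the AMDGPU plugin
|      | // (assumption: resolved by the plugin manager when loading this plugin).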
| 3619 | extern "C" { |
| 3620 | llvm::omp::target::plugin::GenericPluginTy *createPlugin_amdgpu() { |
| 3621 | return new llvm::omp::target::plugin::AMDGPUPluginTy(); |
| 3622 | } |
| 3623 | } |
| 3624 | |