//===- CudaRuntimeWrappers.cpp - MLIR CUDA API wrapper library ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Implements C wrappers around the CUDA library for easy linking in ORC jit.
// Also adds some debugging helpers that are helpful when writing MLIR code to
// run on GPUs.
//
//===----------------------------------------------------------------------===//

#include "mlir/ExecutionEngine/CRunnerUtils.h"

#include <cstdio>

#include "cuda.h"
#include "cuda_bf16.h"
#include "cuda_fp16.h"

#ifdef MLIR_ENABLE_CUDA_CUSPARSE
#include "cusparse.h"
#ifdef MLIR_ENABLE_CUDA_CUSPARSELT
#include "cusparseLt.h"
#endif // MLIR_ENABLE_CUDA_CUSPARSELT
#endif // MLIR_ENABLE_CUDA_CUSPARSE

#ifdef _WIN32
#include <malloc.h>
#define MLIR_CUDA_WRAPPERS_EXPORT __declspec(dllexport)
#else
#define MLIR_CUDA_WRAPPERS_EXPORT __attribute__((visibility("default")))
#endif // _WIN32

#define CUDA_REPORT_IF_ERROR(expr)                                            \
  [](CUresult result) {                                                       \
    if (!result)                                                              \
      return;                                                                 \
    const char *name = nullptr;                                               \
    cuGetErrorName(result, &name);                                            \
    if (!name)                                                                \
      name = "<unknown>";                                                     \
    fprintf(stderr, "'%s' failed with '%s'\n", #expr, name);                  \
  }(expr)
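
// Usage note (illustrative): wrap any CUresult-returning driver call, e.g.
//   CUDA_REPORT_IF_ERROR(cuCtxSynchronize());
// Failures are reported to stderr and otherwise ignored; no error is
// propagated to the caller.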

#define CUSPARSE_REPORT_IF_ERROR(expr)                                        \
  {                                                                           \
    cusparseStatus_t status = (expr);                                         \
    if (status != CUSPARSE_STATUS_SUCCESS) {                                  \
      fprintf(stderr, "cuSPARSE '%s' failed with '%s'\n", #expr,              \
              cusparseGetErrorString(status));                                \
    }                                                                         \
  }

thread_local static int32_t defaultDevice = 0;

/// Helper method that checks environment value for debugging.
bool isDebugEnabled() {
  const char *kDebugEnvironmentVariable = "MLIR_CUDA_DEBUG";
  static bool isEnabled = getenv(kDebugEnvironmentVariable) != nullptr;
  return isEnabled;
}

#define debug_print(fmt, ...)                                                 \
  do {                                                                        \
    if (isDebugEnabled())                                                     \
      fprintf(stderr, "%s:%d:%s(): " fmt, "CudaRuntimeWrappers.cpp",          \
              __LINE__, __func__, __VA_ARGS__);                               \
  } while (0)
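
// debug_print output is gated on the MLIR_CUDA_DEBUG environment variable
// checked above; setting it to any value (e.g. MLIR_CUDA_DEBUG=1) before
// running the JIT-compiled program enables the tracing below.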

// Returns default CUdevice
CUdevice getDefaultCuDevice() {
  CUdevice device;
  CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice));
  return device;
}

// Make the primary context of the current default device current for the
// duration of the instance and restore the previous context on destruction.
class ScopedContext {
public:
  ScopedContext() {
    // Static reference to CUDA primary context for device ordinal
    // defaultDevice.
    static CUcontext context = [] {
      CUDA_REPORT_IF_ERROR(cuInit(/*flags=*/0));
      CUcontext ctx;
      // Note: this does not affect the current context.
      CUDA_REPORT_IF_ERROR(
          cuDevicePrimaryCtxRetain(&ctx, getDefaultCuDevice()));
      return ctx;
    }();

    CUDA_REPORT_IF_ERROR(cuCtxPushCurrent(context));
  }

  ~ScopedContext() { CUDA_REPORT_IF_ERROR(cuCtxPopCurrent(nullptr)); }
};
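
// Illustrative usage (mirrors the wrappers below): instantiate a ScopedContext
// at the top of an entry point so all driver calls in its scope run against
// the primary context, e.g.
//   extern "C" void mgpuExample() { // hypothetical wrapper
//     ScopedContext scopedContext;
//     CUDA_REPORT_IF_ERROR(cuCtxSynchronize());
//   }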

#ifdef MLIR_ENABLE_CUDA_CUSPARSE
// Note that (1) Nvidia confirms it is safe to share the handle across multiple
// instances and streams, and (2) clients are responsible for calling the @mgpu
// environment initialization/destruction in a thread-safe manner, e.g., at the
// beginning of the program before multiple threads are created.
static cusparseHandle_t cusparse_env = nullptr;

#ifdef MLIR_ENABLE_CUDA_CUSPARSELT
// cusparseLtHandle_t is not a pointer type, so we need an additional flag to
// indicate whether it is initialized.
static cusparseLtHandle_t cusparseLt_env;
static bool cusparseLt_initiated = false;

#endif // MLIR_ENABLE_CUDA_CUSPARSELT
#endif // MLIR_ENABLE_CUDA_CUSPARSE

extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule
mgpuModuleLoad(void *data, size_t /*gpuBlobSize*/) {
  ScopedContext scopedContext;
  CUmodule module = nullptr;
  CUDA_REPORT_IF_ERROR(cuModuleLoadData(&module, data));
  return module;
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule mgpuModuleLoadJIT(void *data,
                                                                int optLevel) {
  ScopedContext scopedContext;
  CUmodule module = nullptr;
  char jitErrorBuffer[4096] = {0};
  CUjit_option jitOptions[] = {CU_JIT_ERROR_LOG_BUFFER,
                               CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
                               CU_JIT_OPTIMIZATION_LEVEL};
  void *jitOptionsVals[] = {jitErrorBuffer,
                            reinterpret_cast<void *>(sizeof(jitErrorBuffer)),
                            reinterpret_cast<void *>(optLevel)};

  CUresult result =
      cuModuleLoadDataEx(&module, data, 3, jitOptions, jitOptionsVals);
  if (result) {
    fprintf(stderr, "JIT compilation failed with: '%s'\n", jitErrorBuffer);
    CUDA_REPORT_IF_ERROR(result);
  }
  return module;
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuModuleUnload(CUmodule module) {
  CUDA_REPORT_IF_ERROR(cuModuleUnload(module));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUfunction
mgpuModuleGetFunction(CUmodule module, const char *name) {
  CUfunction function = nullptr;
  CUDA_REPORT_IF_ERROR(cuModuleGetFunction(&function, module, name));
  return function;
}

// The wrapper uses intptr_t instead of CUDA's unsigned int to match
// the type of MLIR's index type. This avoids the need for casts in the
// generated MLIR code.
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuLaunchKernel(CUfunction function, intptr_t gridX, intptr_t gridY,
                 intptr_t gridZ, intptr_t blockX, intptr_t blockY,
                 intptr_t blockZ, int32_t smem, CUstream stream, void **params,
                 void **extra, size_t /*paramsCount*/) {
  ScopedContext scopedContext;
  if (smem > 0) {
    // Avoid checking driver as it's more expensive than if statement
    int32_t maxShmem = 0;
    CUdevice device = getDefaultCuDevice();
    CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice));
    CUDA_REPORT_IF_ERROR(cuDeviceGetAttribute(
        &maxShmem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
        device));
    if (maxShmem < smem) {
      fprintf(stderr,
              "Requested shared memory (%dkb) is larger than maximum allowed "
              "shared memory (%dkb) for this device\n",
              smem, maxShmem);
    }
    CUDA_REPORT_IF_ERROR(cuFuncSetAttribute(
        function, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, smem));
  }
  debug_print("Launching kernel, grid=%ld,%ld,%ld, "
              "threads: %ld, %ld, %ld, "
              "smem: %dkb\n",
              gridX, gridY, gridZ, blockX, blockY, blockZ, smem);
  CUDA_REPORT_IF_ERROR(cuLaunchKernel(function, gridX, gridY, gridZ, blockX,
                                      blockY, blockZ, smem, stream, params,
                                      extra));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUstream mgpuStreamCreate() {
  ScopedContext scopedContext;
  CUstream stream = nullptr;
  CUDA_REPORT_IF_ERROR(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));
  return stream;
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuStreamDestroy(CUstream stream) {
  CUDA_REPORT_IF_ERROR(cuStreamDestroy(stream));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuStreamSynchronize(CUstream stream) {
  CUDA_REPORT_IF_ERROR(cuStreamSynchronize(stream));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuStreamWaitEvent(CUstream stream,
                                                              CUevent event) {
  CUDA_REPORT_IF_ERROR(cuStreamWaitEvent(stream, event, /*flags=*/0));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUevent mgpuEventCreate() {
  ScopedContext scopedContext;
  CUevent event = nullptr;
  CUDA_REPORT_IF_ERROR(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING));
  return event;
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuEventDestroy(CUevent event) {
  CUDA_REPORT_IF_ERROR(cuEventDestroy(event));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuEventSynchronize(CUevent event) {
  CUDA_REPORT_IF_ERROR(cuEventSynchronize(event));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuEventRecord(CUevent event,
                                                          CUstream stream) {
  CUDA_REPORT_IF_ERROR(cuEventRecord(event, stream));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
mgpuMemAlloc(uint64_t sizeBytes, CUstream stream, bool isHostShared) {
  ScopedContext scopedContext;
  CUdeviceptr ptr = 0;
  if (sizeBytes == 0)
    return reinterpret_cast<void *>(ptr);

  if (isHostShared) {
    CUDA_REPORT_IF_ERROR(
        cuMemAllocManaged(&ptr, sizeBytes, CU_MEM_ATTACH_GLOBAL));
    return reinterpret_cast<void *>(ptr);
  }
  CUDA_REPORT_IF_ERROR(cuMemAlloc(&ptr, sizeBytes));
  return reinterpret_cast<void *>(ptr);
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuMemFree(void *ptr,
                                                      CUstream /*stream*/) {
  CUDA_REPORT_IF_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(ptr)));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuMemcpy(void *dst, void *src, size_t sizeBytes, CUstream stream) {
  CUDA_REPORT_IF_ERROR(cuMemcpyAsync(reinterpret_cast<CUdeviceptr>(dst),
                                     reinterpret_cast<CUdeviceptr>(src),
                                     sizeBytes, stream));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuMemset32(void *dst, unsigned int value, size_t count, CUstream stream) {
  CUDA_REPORT_IF_ERROR(cuMemsetD32Async(reinterpret_cast<CUdeviceptr>(dst),
                                        value, count, stream));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuMemset16(void *dst, unsigned short value, size_t count, CUstream stream) {
  CUDA_REPORT_IF_ERROR(cuMemsetD16Async(reinterpret_cast<CUdeviceptr>(dst),
                                        value, count, stream));
}

///
/// Helper functions for writing mlir example code
///

// Registers a byte array with the CUDA runtime. Helpful until we have
// transfer functions implemented.
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuMemHostRegister(void *ptr, uint64_t sizeBytes) {
  ScopedContext scopedContext;
  CUDA_REPORT_IF_ERROR(cuMemHostRegister(ptr, sizeBytes, /*flags=*/0));
}

/// Registers a memref with the CUDA runtime. `descriptor` is a pointer to a
/// ranked memref descriptor struct of rank `rank`. Helpful until we have
/// transfer functions implemented.
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuMemHostRegisterMemRef(int64_t rank, StridedMemRefType<char, 1> *descriptor,
                          int64_t elementSizeBytes) {
  // Only densely packed tensors are currently supported.
#ifdef _WIN32
  int64_t *denseStrides = (int64_t *)_alloca(rank * sizeof(int64_t));
#else
  int64_t *denseStrides = (int64_t *)alloca(rank * sizeof(int64_t));
#endif // _WIN32
  int64_t *sizes = descriptor->sizes;
  for (int64_t i = rank - 1, runningStride = 1; i >= 0; i--) {
    denseStrides[i] = runningStride;
    runningStride *= sizes[i];
  }
  uint64_t sizeBytes = sizes[0] * denseStrides[0] * elementSizeBytes;
  int64_t *strides = &sizes[rank];
  (void)strides;
  for (unsigned i = 0; i < rank; ++i)
    assert(strides[i] == denseStrides[i] &&
           "Mismatch in computed dense strides");

  auto *ptr = descriptor->data + descriptor->offset * elementSizeBytes;
  mgpuMemHostRegister(ptr, sizeBytes);
}

// Unregisters a byte array with the CUDA runtime.
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuMemHostUnregister(void *ptr) {
  ScopedContext scopedContext;
  CUDA_REPORT_IF_ERROR(cuMemHostUnregister(ptr));
}

/// Unregisters a memref with the CUDA runtime. `descriptor` is a pointer to a
/// ranked memref descriptor struct of rank `rank`.
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuMemHostUnregisterMemRef(int64_t rank,
                            StridedMemRefType<char, 1> *descriptor,
                            int64_t elementSizeBytes) {
  auto *ptr = descriptor->data + descriptor->offset * elementSizeBytes;
  mgpuMemHostUnregister(ptr);
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSetDefaultDevice(int32_t device) {
  defaultDevice = device;
}

///
/// Runtime methods using CUDA 12.0+ driver
///

#if (CUDA_VERSION >= 12000)

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuLaunchClusterKernel(
    CUfunction function, intptr_t clusterX, intptr_t clusterY,
    intptr_t clusterZ, intptr_t gridX, intptr_t gridY, intptr_t gridZ,
    intptr_t blockX, intptr_t blockY, intptr_t blockZ, int32_t smem,
    CUstream stream, void **params, void **extra, size_t /*paramsCount*/) {
  ScopedContext scopedContext;
  if (smem > 0) {
    // Avoid checking driver as it's more expensive than if statement
    int32_t maxShmem = 0;
    CUdevice device = getDefaultCuDevice();
    CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice));
    CUDA_REPORT_IF_ERROR(cuDeviceGetAttribute(
        &maxShmem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
        device));
    if (maxShmem < smem) {
      fprintf(stderr,
              "Requested shared memory (%dkb) is larger than maximum allowed "
              "shared memory (%dkb) for this device\n",
              smem, maxShmem);
    }
    CUDA_REPORT_IF_ERROR(cuFuncSetAttribute(
        function, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, smem));
  }
  CUlaunchConfig config;
  config.gridDimX = gridX;
  config.gridDimY = gridY;
  config.gridDimZ = gridZ;
  config.blockDimX = blockX;
  config.blockDimY = blockY;
  config.blockDimZ = blockZ;
  config.sharedMemBytes = smem;
  config.hStream = stream;
  CUlaunchAttribute launchAttr[2];
  launchAttr[0].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
  launchAttr[0].value.clusterDim.x = clusterX;
  launchAttr[0].value.clusterDim.y = clusterY;
  launchAttr[0].value.clusterDim.z = clusterZ;
  launchAttr[1].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE;
  launchAttr[1].value.clusterSchedulingPolicyPreference =
      CU_CLUSTER_SCHEDULING_POLICY_SPREAD;
  config.numAttrs = 2;
  config.attrs = launchAttr;

  debug_print("Launching kernel,"
              "cluster: %ld, %ld, %ld, "
              "grid=%ld,%ld,%ld, "
              "threads: %ld, %ld, %ld, "
              "smem: %dkb\n",
              clusterX, clusterY, clusterZ, gridX, gridY, gridZ, blockX, blockY,
              blockZ, smem);

  CUDA_REPORT_IF_ERROR(cuLaunchKernelEx(&config, function, params, extra));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuTensorMapEncodeTiled(
    CUtensorMap *tensorMap,             // Tensor map object
    CUtensorMapDataType tensorDataType, // Tensor data type
    cuuint32_t tensorRank,              // Dimensionality of tensor
    void *globalAddress,                // Starting address
    const cuuint64_t *globalDim,        // Tensor size (number of elements)
    const cuuint64_t *globalStrides,    // Stride size (in bytes)
    const cuuint32_t *boxDim,           // Traversal box (number of elements)
    const cuuint32_t *elementStrides,   // Traversal stride
    CUtensorMapInterleave interleave,   // Type of interleaved layout
    CUtensorMapSwizzle swizzle,         // Bank swizzling pattern
    CUtensorMapL2promotion l2Promotion, // L2 promotion size
    CUtensorMapFloatOOBfill oobFill     // Padding zfill or NaN fill
) {
  ScopedContext scopedContext;
  CUDA_REPORT_IF_ERROR(cuTensorMapEncodeTiled(
      tensorMap, tensorDataType, tensorRank, globalAddress, globalDim,
      globalStrides, boxDim, elementStrides, interleave, swizzle, l2Promotion,
      oobFill));
  debug_print("Created TMA descriptor\n Addr: %p\n"
              "data type : %d\n"
              "rank : %d\n"
              "globalDim[5]: %zu, %zu, %zu, %zu, %zu\n"
              "globalStrides[5]: %zu, %zu, %zu, %zu, %zu\n"
              "boxDim[5]: %u, %u, %u, %u, %u\n"
              "elementStrides[5]: %u, %u, %u, %u, %u\n"
              "interleave: %u \n"
              "swizzle: %u \n"
              "l2Promotion: %u \n"
              "oobFill: %u \n",
              (void *)&tensorMap, tensorDataType, tensorRank, globalDim[0],
              globalDim[1], globalDim[2], globalDim[3], globalDim[4],
              globalStrides[0], globalStrides[1], globalStrides[2],
              globalStrides[3], globalStrides[4], boxDim[0], boxDim[1],
              boxDim[2], boxDim[3], boxDim[4], elementStrides[0],
              elementStrides[1], elementStrides[2], elementStrides[3],
              elementStrides[4], interleave, swizzle, l2Promotion, oobFill);
}

template <int Rank>
void mgpuGetMemRefDataAndShape(void *rawDescriptor, char **addr,
                               uint64_t *globalDim, uint64_t *globalStrides,
                               const CUtensorMapDataType tensorDataType) {
  auto descriptor =
      reinterpret_cast<StridedMemRefType<char, Rank> *>(rawDescriptor);
  *addr = descriptor->data;
  for (int i = 0; i < Rank; ++i) {
    globalDim[i] = static_cast<uint64_t>(descriptor->sizes[Rank - i - 1]);
  }
  static constexpr int elementSizeInBytes[] = {1, 2, 4, 4, 8, 8, 2,
                                               4, 8, 2, 4, 4, 4};
  for (int i = 0; i < Rank - 1; ++i) {
    globalStrides[i] = static_cast<uint64_t>(
        descriptor->strides[Rank - i - 2] * elementSizeInBytes[tensorDataType]);
  }
}
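
// Note: the elementSizeInBytes table above is indexed by CUtensorMapDataType;
// the entries are assumed to follow the driver's enum order (UINT8, UINT16,
// UINT32, INT32, UINT64, INT64, FLOAT16, FLOAT32, FLOAT64, BFLOAT16,
// FLOAT32_FTZ, TFLOAT32, TFLOAT32_FTZ), i.e. 1, 2, 4, 4, 8, 8, 2, 4, 8, 2, 4,
// 4, 4 bytes respectively.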

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *mgpuTensorMapEncodeTiledMemref(
    int64_t tensorRank,                       // Dimensionality of tensor
    void *rankedDescriptor,                   // Ranked MemRef descriptor
    const CUtensorMapDataType tensorDataType, // Tensor data type
    CUtensorMapInterleave interleave,         // Type of interleaved layout
    CUtensorMapSwizzle swizzle,               // Bank swizzling pattern
    CUtensorMapL2promotion l2Promotion,       // L2 promotion size
    CUtensorMapFloatOOBfill oobFill,          // Padding zfill or NaN fill
    int64_t *inputBoxDims                     // Box size (number of elements)
) {
  CUtensorMap tensorMap;

  uint32_t boxDim[5] = {1, 1, 1, 1, 1}, elementStrides[5] = {1, 1, 1, 1, 1};
  uint64_t globalDim[5] = {1, 1, 1, 1, 1}, globalStrides[5] = {0};
  uint32_t tensorRank32 = uint32_t(tensorRank);

  char *globalAddress = nullptr;
  switch (tensorRank) {
  case 1:
    mgpuGetMemRefDataAndShape<1>(rankedDescriptor, &globalAddress, globalDim,
                                 globalStrides, tensorDataType);
    break;
  case 2:
    mgpuGetMemRefDataAndShape<2>(rankedDescriptor, &globalAddress, globalDim,
                                 globalStrides, tensorDataType);
    break;
  case 3:
    mgpuGetMemRefDataAndShape<3>(rankedDescriptor, &globalAddress, globalDim,
                                 globalStrides, tensorDataType);
    break;
  case 4:
    mgpuGetMemRefDataAndShape<4>(rankedDescriptor, &globalAddress, globalDim,
                                 globalStrides, tensorDataType);
    break;
  case 5:
    mgpuGetMemRefDataAndShape<5>(rankedDescriptor, &globalAddress, globalDim,
                                 globalStrides, tensorDataType);
    break;
  default:
    fprintf(
        stderr,
        "'mgpuTensorMapEncodeTiledMemref' failed with 'rank is too high'\n");
    return nullptr;
  }

  for (int64_t r = 0; r < tensorRank; ++r) {
    boxDim[r] = static_cast<uint32_t>(inputBoxDims[tensorRank - r - 1]);
  }

  ScopedContext scopedContext;
  mgpuTensorMapEncodeTiled(&tensorMap, tensorDataType, tensorRank32,
                           globalAddress, globalDim, globalStrides, boxDim,
                           elementStrides, interleave, swizzle, l2Promotion,
                           oobFill);
  // Copy created tensor map to device
  CUdeviceptr dTensorMap;
  CUDA_REPORT_IF_ERROR(cuMemAlloc(&dTensorMap, sizeof(CUtensorMap)));
  CUDA_REPORT_IF_ERROR(cuMemcpy(dTensorMap,
                                reinterpret_cast<CUdeviceptr>(&tensorMap),
                                sizeof(CUtensorMap)));
  return reinterpret_cast<void *>(dTensorMap);
}
#endif

#ifdef MLIR_ENABLE_CUDA_CUSPARSE

///
/// Wrapper methods for the cuSparse library.
///

// Some macro magic to get float/double alpha and beta on host.
// TODO: add support to passing alpha and beta as arguments
#define ALPHABETA(dtp, alpha, beta)                                           \
  __nv_bfloat16(alpha##16bf) = 1.0f;                                          \
  __nv_bfloat16(beta##16bf) = 1.0f;                                           \
  __half(alpha##16f) = 1.0f;                                                  \
  __half(beta##16f) = 1.0f;                                                   \
  float(alpha##f) = 1.0f;                                                     \
  float(beta##f) = 1.0f;                                                      \
  double(alpha##d) = 1.0;                                                     \
  double(beta##d) = 1.0;                                                      \
  const void *(alpha##p) = nullptr;                                           \
  const void *(beta##p) = nullptr;                                            \
  if (dtp == CUDA_R_16BF || dtp == CUDA_C_16BF) {                             \
    (alpha##p) = reinterpret_cast<void *>(&(alpha##16bf));                    \
    (beta##p) = reinterpret_cast<void *>(&(beta##16bf));                      \
  } else if (dtp == CUDA_R_16F || dtp == CUDA_C_16F) {                        \
    (alpha##p) = reinterpret_cast<void *>(&(alpha##16f));                     \
    (beta##p) = reinterpret_cast<void *>(&(beta##16f));                       \
  } else if (dtp == CUDA_R_32F || dtp == CUDA_C_32F) {                        \
    (alpha##p) = reinterpret_cast<void *>(&(alpha##f));                       \
    (beta##p) = reinterpret_cast<void *>(&(beta##f));                         \
  } else {                                                                    \
    (alpha##p) = reinterpret_cast<void *>(&(alpha##d));                       \
    (beta##p) = reinterpret_cast<void *>(&(beta##d));                         \
  }
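
// For illustration: ALPHABETA(dtp, alpha, beta) declares host-side scalars
// (alpha16bf, alpha16f, alphaf, alphad and their beta counterparts), all set
// to one, plus `const void *alphap` / `betap` pointing at the variant that
// matches the runtime data type `dtp`; those two pointers are what the
// cuSPARSE calls below consume.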

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuCreateSparseEnv() {
  // ScopedContext is for cuda initialization.
  ScopedContext scopedContext;
  assert(!cusparse_env && "client called mgpuCreateSparseEnv() twice");
  CUSPARSE_REPORT_IF_ERROR(cusparseCreate(&cusparse_env));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuDestroySparseEnv() {
  assert(cusparse_env && "client did not call mgpuCreateSparseEnv()");
  CUSPARSE_REPORT_IF_ERROR(cusparseDestroy(cusparse_env));
  cusparse_env = nullptr;
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
mgpuCreateDnVec(intptr_t size, void *values, int32_t dtp, CUstream /*stream*/) {
  cusparseDnVecDescr_t vec = nullptr;
  auto dTp = static_cast<cudaDataType_t>(dtp);
  CUSPARSE_REPORT_IF_ERROR(cusparseCreateDnVec(&vec, size, values, dTp))
  return reinterpret_cast<void *>(vec);
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuDestroyDnVec(void *v, CUstream /*stream*/) {
  cusparseDnVecDescr_t vec = reinterpret_cast<cusparseDnVecDescr_t>(v);
  CUSPARSE_REPORT_IF_ERROR(cusparseDestroyDnVec(vec))
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
mgpuCreateDnMat(intptr_t rows, intptr_t cols, void *values, int32_t dtp,
                CUstream /*stream*/) {
  cusparseDnMatDescr_t mat = nullptr;
  auto dTp = static_cast<cudaDataType_t>(dtp);
  CUSPARSE_REPORT_IF_ERROR(cusparseCreateDnMat(&mat, rows, cols, /*ld=*/cols,
                                               values, dTp, CUSPARSE_ORDER_ROW))
  return reinterpret_cast<void *>(mat);
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuDestroyDnMat(void *m, CUstream /*stream*/) {
  cusparseDnMatDescr_t mat = reinterpret_cast<cusparseDnMatDescr_t>(m);
  CUSPARSE_REPORT_IF_ERROR(cusparseDestroyDnMat(mat))
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
mgpuCreateCoo(intptr_t rows, intptr_t cols, intptr_t nnz, void *rowIdxs,
              void *colIdxs, void *values, int32_t itp, int32_t dtp,
              CUstream /*stream*/) {
  cusparseSpMatDescr_t mat = nullptr;
  auto iTp = static_cast<cusparseIndexType_t>(itp);
  auto dTp = static_cast<cudaDataType_t>(dtp);
  CUSPARSE_REPORT_IF_ERROR(cusparseCreateCoo(&mat, rows, cols, nnz, rowIdxs,
                                             colIdxs, values, iTp,
                                             CUSPARSE_INDEX_BASE_ZERO, dTp))
  return reinterpret_cast<void *>(mat);
}

#ifdef CUSPARSE_COO_AOS // deprecated in cuSPARSE 11.2
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
mgpuCreateCooAoS(intptr_t rows, intptr_t cols, intptr_t nnz, void *idxs,
                 void *values, int32_t itp, int32_t dtp, CUstream /*stream*/) {
  cusparseSpMatDescr_t mat = nullptr;
  auto iTp = static_cast<cusparseIndexType_t>(itp);
  auto dTp = static_cast<cudaDataType_t>(dtp);
  CUSPARSE_REPORT_IF_ERROR(cusparseCreateCooAoS(
      &mat, rows, cols, nnz, idxs, values, iTp, CUSPARSE_INDEX_BASE_ZERO, dTp))
  return reinterpret_cast<void *>(mat);
}
#endif // CUSPARSE_COO_AOS

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
mgpuCreateCsr(intptr_t rows, intptr_t cols, intptr_t nnz, void *rowPos,
              void *colIdxs, void *values, int32_t ptp, int32_t itp,
              int32_t dtp, CUstream /*stream*/) {
  cusparseSpMatDescr_t mat = nullptr;
  auto pTp = static_cast<cusparseIndexType_t>(ptp);
  auto iTp = static_cast<cusparseIndexType_t>(itp);
  auto dTp = static_cast<cudaDataType_t>(dtp);
  CUSPARSE_REPORT_IF_ERROR(cusparseCreateCsr(&mat, rows, cols, nnz, rowPos,
                                             colIdxs, values, pTp, iTp,
                                             CUSPARSE_INDEX_BASE_ZERO, dTp))
  return reinterpret_cast<void *>(mat);
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
mgpuCreateCsc(intptr_t rows, intptr_t cols, intptr_t nnz, void *colPos,
              void *rowIdxs, void *values, int32_t ptp, int32_t itp,
              int32_t dtp, CUstream /*stream*/) {
  cusparseSpMatDescr_t mat = nullptr;
  auto pTp = static_cast<cusparseIndexType_t>(ptp);
  auto iTp = static_cast<cusparseIndexType_t>(itp);
  auto dTp = static_cast<cudaDataType_t>(dtp);
  CUSPARSE_REPORT_IF_ERROR(cusparseCreateCsc(&mat, rows, cols, nnz, colPos,
                                             rowIdxs, values, pTp, iTp,
                                             CUSPARSE_INDEX_BASE_ZERO, dTp))
  return reinterpret_cast<void *>(mat);
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
mgpuCreateBsr(intptr_t brows, intptr_t bcols, intptr_t bnnz, intptr_t rBsz,
              intptr_t cBsz, void *rowPos, void *colIdxs, void *values,
              int32_t ptp, int32_t itp, int32_t dtp, CUstream /*stream*/) {
  cusparseSpMatDescr_t mat = nullptr;
#if CUSPARSE_VERSION >= 12100
  auto pTp = static_cast<cusparseIndexType_t>(ptp);
  auto iTp = static_cast<cusparseIndexType_t>(itp);
  auto dTp = static_cast<cudaDataType_t>(dtp);
  CUSPARSE_REPORT_IF_ERROR(cusparseCreateBsr(
      &mat, brows, bcols, bnnz, rBsz, cBsz, rowPos, colIdxs, values, pTp, iTp,
      CUSPARSE_INDEX_BASE_ZERO, dTp, CUSPARSE_ORDER_ROW))
#endif
  return reinterpret_cast<void *>(mat);
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuDestroySpMat(void *m, CUstream /*stream*/) {
  cusparseSpMatDescr_t mat = reinterpret_cast<cusparseSpMatDescr_t>(m);
  CUSPARSE_REPORT_IF_ERROR(cusparseDestroySpMat(mat))
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t mgpuSpMVBufferSize(
    int32_t ma, void *a, void *x, void *y, int32_t ctp, CUstream /*stream*/) {
  assert(cusparse_env && "client did not call mgpuCreateSparseEnv()");
  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
  cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
  cusparseDnVecDescr_t vecX = reinterpret_cast<cusparseDnVecDescr_t>(x);
  cusparseDnVecDescr_t vecY = reinterpret_cast<cusparseDnVecDescr_t>(y);
  cudaDataType_t cTp = static_cast<cudaDataType_t>(ctp);
  ALPHABETA(cTp, alpha, beta)
  size_t bufferSize = 0;
  CUSPARSE_REPORT_IF_ERROR(cusparseSpMV_bufferSize(
      cusparse_env, modeA, alphap, matA, vecX, betap, vecY, cTp,
      CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize))
  return bufferSize;
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSpMV(int32_t ma, void *a, void *x,
                                                   void *y, int32_t ctp,
                                                   void *buf,
                                                   CUstream /*stream*/) {
  assert(cusparse_env && "client did not call mgpuCreateSparseEnv()");
  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
  cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
  cusparseDnVecDescr_t vecX = reinterpret_cast<cusparseDnVecDescr_t>(x);
  cusparseDnVecDescr_t vecY = reinterpret_cast<cusparseDnVecDescr_t>(y);
  cudaDataType_t cTp = static_cast<cudaDataType_t>(ctp);
  ALPHABETA(cTp, alpha, beta)
  CUSPARSE_REPORT_IF_ERROR(cusparseSpMV(cusparse_env, modeA, alphap, matA, vecX,
                                        betap, vecY, cTp,
                                        CUSPARSE_SPMV_ALG_DEFAULT, buf))
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t
mgpuSpMMBufferSize(int32_t ma, int32_t mb, void *a, void *b, void *c,
                   int32_t ctp, CUstream /*stream*/) {
  assert(cusparse_env && "client did not call mgpuCreateSparseEnv()");
  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
  cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
  cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
  cusparseDnMatDescr_t matB = reinterpret_cast<cusparseDnMatDescr_t>(b);
  cusparseDnMatDescr_t matC = reinterpret_cast<cusparseDnMatDescr_t>(c);
  cudaDataType_t cTp = static_cast<cudaDataType_t>(ctp);
  ALPHABETA(cTp, alpha, beta)
  size_t bufferSize = 0;
  CUSPARSE_REPORT_IF_ERROR(cusparseSpMM_bufferSize(
      cusparse_env, modeA, modeB, alphap, matA, matB, betap, matC, cTp,
      CUSPARSE_SPMM_ALG_DEFAULT, &bufferSize))
  return bufferSize;
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSpMM(int32_t ma, int32_t mb,
                                                   void *a, void *b, void *c,
                                                   int32_t ctp, void *buf,
                                                   CUstream /*stream*/) {
  assert(cusparse_env && "client did not call mgpuCreateSparseEnv()");
  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
  cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
  cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
  cusparseDnMatDescr_t matB = reinterpret_cast<cusparseDnMatDescr_t>(b);
  cusparseDnMatDescr_t matC = reinterpret_cast<cusparseDnMatDescr_t>(c);
  cudaDataType_t cTp = static_cast<cudaDataType_t>(ctp);
  ALPHABETA(cTp, alpha, beta)
  CUSPARSE_REPORT_IF_ERROR(cusparseSpMM(cusparse_env, modeA, modeB, alphap,
                                        matA, matB, betap, matC, cTp,
                                        CUSPARSE_SPMM_ALG_DEFAULT, buf))
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t
mgpuSDDMMBufferSize(int32_t ma, int32_t mb, void *a, void *b, void *c,
                    int32_t ctp, CUstream /*stream*/) {
  assert(cusparse_env && "client did not call mgpuCreateSparseEnv()");
  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
  cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
  cusparseDnMatDescr_t matA = reinterpret_cast<cusparseDnMatDescr_t>(a);
  cusparseDnMatDescr_t matB = reinterpret_cast<cusparseDnMatDescr_t>(b);
  cusparseSpMatDescr_t matC = reinterpret_cast<cusparseSpMatDescr_t>(c);
  auto cTp = static_cast<cudaDataType_t>(ctp);
  ALPHABETA(cTp, alpha, beta)
  size_t bufferSize = 0;
  CUSPARSE_REPORT_IF_ERROR(cusparseSDDMM_bufferSize(
      cusparse_env, modeA, modeB, alphap, matA, matB, betap, matC, cTp,
      CUSPARSE_SDDMM_ALG_DEFAULT, &bufferSize))
  return bufferSize;
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSDDMM(int32_t ma, int32_t mb,
                                                    void *a, void *b, void *c,
                                                    int32_t ctp, void *buf,
                                                    CUstream /*stream*/) {
  assert(cusparse_env && "client did not call mgpuCreateSparseEnv()");
  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
  cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
  cusparseDnMatDescr_t matA = reinterpret_cast<cusparseDnMatDescr_t>(a);
  cusparseDnMatDescr_t matB = reinterpret_cast<cusparseDnMatDescr_t>(b);
  cusparseSpMatDescr_t matC = reinterpret_cast<cusparseSpMatDescr_t>(c);
  auto cTp = static_cast<cudaDataType_t>(ctp);
  ALPHABETA(cTp, alpha, beta)
  CUSPARSE_REPORT_IF_ERROR(cusparseSDDMM(cusparse_env, modeA, modeB, alphap,
                                         matA, matB, betap, matC, cTp,
                                         CUSPARSE_SDDMM_ALG_DEFAULT, buf))
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
mgpuSpGEMMCreateDescr(CUstream /*stream*/) {
  cusparseSpGEMMDescr_t spgemmDesc = nullptr;
  CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_createDescr(&spgemmDesc))
  return reinterpret_cast<void *>(spgemmDesc);
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuSpGEMMDestroyDescr(void *s, CUstream /*stream*/) {
  cusparseSpGEMMDescr_t spgemmDesc = reinterpret_cast<cusparseSpGEMMDescr_t>(s);
  CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_destroyDescr(spgemmDesc))
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t mgpuSpGEMMWorkEstimation(
    void *s, int32_t ma, int32_t mb, void *a, void *b, void *c, int32_t ctp,
    intptr_t bs, void *buf, CUstream /*stream*/) {
  cusparseSpGEMMDescr_t spgemmDesc = reinterpret_cast<cusparseSpGEMMDescr_t>(s);
  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
  cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
  cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
  cusparseSpMatDescr_t matB = reinterpret_cast<cusparseSpMatDescr_t>(b);
  cusparseSpMatDescr_t matC = reinterpret_cast<cusparseSpMatDescr_t>(c);
  auto cTp = static_cast<cudaDataType_t>(ctp);
  ALPHABETA(cTp, alpha, beta)
  size_t newBufferSize = bs;
  CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_workEstimation(
      cusparse_env, modeA, modeB, alphap, matA, matB, betap, matC, cTp,
      CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, &newBufferSize, buf))
  return newBufferSize;
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t
mgpuSpGEMMCompute(void *s, int32_t ma, int32_t mb, void *a, void *b, void *c,
                  int32_t ctp, intptr_t bsz2, void *buf2, CUstream /*stream*/) {
  cusparseSpGEMMDescr_t spgemmDesc = reinterpret_cast<cusparseSpGEMMDescr_t>(s);
  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
  cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
  cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
  cusparseSpMatDescr_t matB = reinterpret_cast<cusparseSpMatDescr_t>(b);
  cusparseSpMatDescr_t matC = reinterpret_cast<cusparseSpMatDescr_t>(c);
  auto cTp = static_cast<cudaDataType_t>(ctp);
  ALPHABETA(cTp, alpha, beta)
  size_t newBufferSize2 = bsz2;
  CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_compute(
      cusparse_env, modeA, modeB, alphap, matA, matB, betap, matC, cTp,
      CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, &newBufferSize2, buf2))
  return newBufferSize2;
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuSpGEMMCopy(void *s, int32_t ma, int32_t mb, void *a, void *b, void *c,
               int32_t ctp, CUstream /*stream*/) {
  cusparseSpGEMMDescr_t spgemmDesc = reinterpret_cast<cusparseSpGEMMDescr_t>(s);
  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
  cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
  cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
  cusparseSpMatDescr_t matB = reinterpret_cast<cusparseSpMatDescr_t>(b);
  cusparseSpMatDescr_t matC = reinterpret_cast<cusparseSpMatDescr_t>(c);
  auto cTp = static_cast<cudaDataType_t>(ctp);
  ALPHABETA(cTp, alpha, beta)
  CUSPARSE_REPORT_IF_ERROR(
      cusparseSpGEMM_copy(cusparse_env, modeA, modeB, alphap, matA, matB, betap,
                          matC, cTp, CUSPARSE_SPGEMM_DEFAULT, spgemmDesc))
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuSpMatGetSize(void *m, void *r, void *c, void *n, CUstream /*stream*/) {
  cusparseConstSpMatDescr_t matDescr =
      reinterpret_cast<cusparseConstSpMatDescr_t>(m);
  int64_t *rows = reinterpret_cast<int64_t *>(r);
  int64_t *cols = reinterpret_cast<int64_t *>(c);
  int64_t *nnz = reinterpret_cast<int64_t *>(n);
  CUSPARSE_REPORT_IF_ERROR(cusparseSpMatGetSize(matDescr, rows, cols, nnz));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuSetCsrPointers(void *m, void *p, void *c, void *v, CUstream /*stream*/) {
  cusparseSpMatDescr_t matDescr = reinterpret_cast<cusparseSpMatDescr_t>(m);
  CUSPARSE_REPORT_IF_ERROR(cusparseCsrSetPointers(matDescr, p, c, v));
}

#ifdef MLIR_ENABLE_CUDA_CUSPARSELT

///
/// Wrapper methods for the cuSparseLt library.
///

struct cusparseLtSpMatHandleAndData {
  cusparseLtMatDescriptor_t mat;
  // TODO: the following three are associated with the SpMM operator rather
  // than the sparse matrix. Create workspace buffers and pass them to the
  // SpMM execution.
  cusparseLtMatmulAlgSelection_t alg_sel;
  cusparseLtMatmulPlan_t plan;
  cusparseLtMatmulDescriptor_t matmul;
  void *values{nullptr};
};

struct cusparseLtDnMatHandleAndData {
  cusparseLtMatDescriptor_t mat;
  void *values{nullptr};
};

static_assert(sizeof(cusparseLtHandle_t) == 11024,
              "Unexpected cusparseLt handle size");
static_assert(sizeof(cusparseLtSpMatHandleAndData) == 44104,
              "Unexpected cusparseLt sparse matrix handle size");
static_assert(sizeof(cusparseLtDnMatHandleAndData) == 11032,
              "Unexpected cusparseLt dense matrix handle size");

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuCreateSparseLtEnv() {
  // ScopedContext is for cuda initialization.
  ScopedContext scopedContext;
  assert(!cusparseLt_initiated &&
         "client called mgpuCreateSparseLtEnv() twice");
  // Note that cuSparseLt still uses cusparseStatus_t.
  CUSPARSE_REPORT_IF_ERROR(cusparseLtInit(&cusparseLt_env));
  cusparseLt_initiated = true;
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuDestroySparseLtEnv() {
  assert(cusparseLt_initiated && "client did not call mgpuCreateSparseLtEnv()");
  CUSPARSE_REPORT_IF_ERROR(cusparseLtDestroy(&cusparseLt_env));
  cusparseLt_initiated = false;
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuCreateCuSparseLtDnMat(void *dh, intptr_t rows, intptr_t cols, void *values,
                          int32_t dtp, CUstream /*stream*/) {
  assert(cusparseLt_initiated && "client did not call mgpuCreateSparseLtEnv()");
  auto dnmat_handle = reinterpret_cast<cusparseLtDnMatHandleAndData *>(dh);
  dnmat_handle->values = values;
  auto dTp = static_cast<cudaDataType_t>(dtp);
  // Assume row-major when deciding lda.
  const uint32_t alignment = 16;
  CUSPARSE_REPORT_IF_ERROR(cusparseLtDenseDescriptorInit(
      &cusparseLt_env, &(dnmat_handle->mat), rows, cols, /*lda=*/cols,
      alignment, dTp, CUSPARSE_ORDER_ROW))
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuDestroyCuSparseLtDnMat(void *dh, CUstream /*stream*/) {
  auto dnmat_handle = reinterpret_cast<cusparseLtDnMatHandleAndData *>(dh);
  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(dnmat_handle->mat)))
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuCusparseLtCreate2To4SpMat(void *sh, intptr_t rows, intptr_t cols,
                              void *values, int32_t dtp, CUstream /*stream*/) {
  assert(cusparseLt_initiated && "client did not call mgpuCreateSparseLtEnv()");
  auto spmat_handle = reinterpret_cast<cusparseLtSpMatHandleAndData *>(sh);
  spmat_handle->values = values;
  auto dTp = static_cast<cudaDataType_t>(dtp);
  // Assume row-major when deciding lda.
  const uint32_t alignment = 16;
  CUSPARSE_REPORT_IF_ERROR(cusparseLtStructuredDescriptorInit(
      &cusparseLt_env, &(spmat_handle->mat), rows, cols, /*ld=*/cols, alignment,
      dTp, CUSPARSE_ORDER_ROW, CUSPARSELT_SPARSITY_50_PERCENT))
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuDestroyCuSparseLtSpMat(void *sh, CUstream /*stream*/) {
  auto spmat_handle = reinterpret_cast<cusparseLtSpMatHandleAndData *>(sh);
  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(spmat_handle->mat)))
}

// Several things are done in this stage: algorithm selection, planning, and
// returning the workspace and compressed-matrices data buffer sizes.
// The parameter prune_flag indicates whether pruning and a pruning check will
// happen: 0 means no pruning or prune check, 1 means prune, and 2 means prune
// and prune check.
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuCuSparseLtSpMMBufferSize(void *bs, int32_t ma, int32_t mb, void *a, void *b,
                             void *c, int32_t ctp, int32_t prune_flag,
                             CUstream stream) {
  assert(cusparseLt_initiated && "client did not call mgpuCreateSparseLtEnv()");
  // TODO: support more advanced settings, e.g., the input right operand is a
  // sparse matrix assuming matA is the sparse matrix
  auto matA = reinterpret_cast<cusparseLtSpMatHandleAndData *>(a);
  auto matB = reinterpret_cast<cusparseLtDnMatHandleAndData *>(b);
  auto matC = reinterpret_cast<cusparseLtDnMatHandleAndData *>(c);
  auto workspace_size = reinterpret_cast<size_t *>(bs);
  auto compressed_size = &(reinterpret_cast<size_t *>(bs)[1]);
  auto compressed_buffer_size = &(reinterpret_cast<size_t *>(bs)[2]);
  auto cTp = static_cast<cusparseComputeType>(ctp);

  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
  cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulDescriptorInit(
      &cusparseLt_env, &(matA->matmul), modeA, modeB, &(matA->mat),
      &(matB->mat), &(matC->mat), &(matC->mat), cTp))
  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulAlgSelectionInit(
      &cusparseLt_env, &(matA->alg_sel), &(matA->matmul),
      CUSPARSELT_MATMUL_ALG_DEFAULT))
  int alg = 0;
  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulAlgSetAttribute(
      &cusparseLt_env, &(matA->alg_sel), CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg,
      sizeof(alg)))

  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulPlanInit(
      &cusparseLt_env, &(matA->plan), &(matA->matmul), &(matA->alg_sel)))

  // Pruning step (in-place).
  if (prune_flag > 0)
    CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMAPrune(
        &cusparseLt_env, &(matA->matmul), matA->values, matA->values,
        CUSPARSELT_PRUNE_SPMMA_STRIP, stream))

  // Check structure of A.
  // Note that this adds a synchronization on the stream.
  // TODO: Do we want that?
  if (prune_flag == 2) {
    int *dvalid = (int *)mgpuMemAlloc(sizeof(int), stream, false);
    CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMAPruneCheck(
        &cusparseLt_env, &(matA->matmul), matA->values, dvalid, stream))
    int valid = 0;
    mgpuMemcpy(&valid, dvalid, sizeof(int), stream);
    mgpuStreamSynchronize(stream);
    mgpuMemFree(dvalid, stream);
    if (valid != 0)
      fprintf(stderr, "CUSPARSE-LT: sparse matrix is not 2:4; computed results "
                      "will be invalid\n");
  }

  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulGetWorkspace(
      &cusparseLt_env, &(matA->plan), workspace_size))
  CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMACompressedSize(
      &cusparseLt_env, &(matA->plan), compressed_size, compressed_buffer_size))
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuCuSparseLtSpMM(void *a, void *b, void *c, void *d_workspace,
                   void *dA_compressed, void *dA_compressedBuffer,
                   CUstream stream) {
  assert(cusparseLt_initiated && "client did not call mgpuCreateSparseLtEnv()");
  auto matA = reinterpret_cast<cusparseLtSpMatHandleAndData *>(a);
  auto matB = reinterpret_cast<cusparseLtDnMatHandleAndData *>(b);
  auto matC = reinterpret_cast<cusparseLtDnMatHandleAndData *>(c);

  ALPHABETA(CUDA_R_32F, alpha, beta)
  CUSPARSE_REPORT_IF_ERROR(
      cusparseLtSpMMACompress(&cusparseLt_env, &(matA->plan), (matA->values),
                              dA_compressed, dA_compressedBuffer, stream))

  // TODO: add support to multi-stream execution
  // Perform the matrix multiplication. D = A*B+C using C==D for now
  CUSPARSE_REPORT_IF_ERROR(
      cusparseLtMatmul(&cusparseLt_env, &(matA->plan), alphap, dA_compressed,
                       matB->values, betap, matC->values,
                       /*dD*/ matC->values, d_workspace, nullptr, 0))

  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(matA->mat)))
  // destroy the plan associated with the sparse matrix
  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulPlanDestroy(&(matA->plan)))
}

#endif // MLIR_ENABLE_CUDA_CUSPARSELT
#endif // MLIR_ENABLE_CUDA_CUSPARSE