//===- CudaRuntimeWrappers.cpp - MLIR CUDA API wrapper library ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Implements C wrappers around the CUDA library for easy linking in ORC jit.
// Also adds some debugging helpers that are helpful when writing MLIR code to
// run on GPUs.
//
//===----------------------------------------------------------------------===//

#include "mlir/ExecutionEngine/CRunnerUtils.h"

#include <stdio.h>
#include <stdlib.h>

#include "cuda.h"
#include "cuda_bf16.h"
#include "cuda_fp16.h"

#ifdef MLIR_ENABLE_CUDA_CUSPARSE
#include "cusparse.h"
#ifdef MLIR_ENABLE_CUDA_CUSPARSELT
#include "cusparseLt.h"
#endif // MLIR_ENABLE_CUDA_CUSPARSELT
#endif // MLIR_ENABLE_CUDA_CUSPARSE

#ifdef _WIN32
#include <malloc.h>
#define MLIR_CUDA_WRAPPERS_EXPORT __declspec(dllexport)
#else
#define MLIR_CUDA_WRAPPERS_EXPORT __attribute__((visibility("default")))
#endif // _WIN32

#define CUDA_REPORT_IF_ERROR(expr)                                             \
  [](CUresult result) {                                                        \
    if (!result)                                                               \
      return;                                                                  \
    const char *name = nullptr;                                                \
    cuGetErrorName(result, &name);                                             \
    if (!name)                                                                 \
      name = "<unknown>";                                                      \
    fprintf(stderr, "'%s' failed with '%s'\n", #expr, name);                   \
  }(expr)
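
// Usage note (a sketch): wrapping a driver call such as
//   CUDA_REPORT_IF_ERROR(cuCtxSynchronize());
// expands to an immediately-invoked lambda that, on failure, maps the
// CUresult to a readable name via cuGetErrorName and reports it on stderr
// without aborting the program.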

#define CUSPARSE_REPORT_IF_ERROR(expr)                                         \
  {                                                                            \
    cusparseStatus_t status = (expr);                                          \
    if (status != CUSPARSE_STATUS_SUCCESS) {                                   \
      fprintf(stderr, "cuSPARSE '%s' failed with '%s'\n", #expr,               \
              cusparseGetErrorString(status));                                 \
    }                                                                          \
  }

thread_local static int32_t defaultDevice = 0;

const char *kDebugEnvironmentVariable = "MLIR_CUDA_DEBUG";

/// Helper method that checks the debugging environment variable.
bool isDebugEnabled() {
  static bool isInitialized = false;
  static bool isEnabled = false;
  if (!isInitialized) {
    isEnabled = getenv(kDebugEnvironmentVariable) != nullptr;
    isInitialized = true;
  }
  return isEnabled;
}
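
// Debug output below is enabled by defining the variable in the environment
// of the process that loads this library, e.g. (hypothetical invocation):
//   MLIR_CUDA_DEBUG=1 ./my-mlir-gpu-app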

#define debug_print(fmt, ...)                                                  \
  do {                                                                         \
    if (isDebugEnabled())                                                      \
      fprintf(stderr, "%s:%d:%s(): " fmt, "CudaRuntimeWrappers.cpp", __LINE__, \
              __func__, __VA_ARGS__);                                          \
  } while (0)
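
// Example (a sketch): debug_print("allocated %zu bytes\n", sizeBytes) prints
// "CudaRuntimeWrappers.cpp:<line>:<function>(): allocated ... bytes" when
// MLIR_CUDA_DEBUG is set, and is skipped at run time otherwise.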

// Returns the default CUdevice.
CUdevice getDefaultCuDevice() {
  CUdevice device;
  CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice));
  return device;
}

// Make the primary context of the current default device current for the
// duration of the instance and restore the previous context on destruction.
class ScopedContext {
public:
  ScopedContext() {
    // Static reference to CUDA primary context for device ordinal
    // defaultDevice.
    static CUcontext context = [] {
      CUDA_REPORT_IF_ERROR(cuInit(/*flags=*/0));
      CUcontext ctx;
      // Note: this does not affect the current context.
      CUDA_REPORT_IF_ERROR(
          cuDevicePrimaryCtxRetain(&ctx, getDefaultCuDevice()));
      return ctx;
    }();

    CUDA_REPORT_IF_ERROR(cuCtxPushCurrent(context));
  }

  ~ScopedContext() { CUDA_REPORT_IF_ERROR(cuCtxPopCurrent(nullptr)); }
};
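
// RAII usage sketch: wrappers that need a current CUDA context construct a
// ScopedContext on the stack, e.g.
//   {
//     ScopedContext scopedContext; // pushes the primary context
//     // ... driver calls ...
//   }                              // destructor pops the context again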

#ifdef MLIR_ENABLE_CUDA_CUSPARSE
// Note that (1) Nvidia confirms it is safe to share a handle across multiple
// instances and streams, and (2) clients are responsible for calling the
// @mgpu environment initialization/destruction in a thread-safe manner, e.g.,
// at the beginning of the program before multiple threads are created.
static cusparseHandle_t cusparse_env = nullptr;

#ifdef MLIR_ENABLE_CUDA_CUSPARSELT
// cusparseLtHandle_t is not a pointer type, so we need an additional flag to
// indicate whether it is initialized.
static cusparseLtHandle_t cusparseLt_env;
static bool cusparseLt_initiated = false;

#endif // MLIR_ENABLE_CUDA_CUSPARSELT
#endif // MLIR_ENABLE_CUDA_CUSPARSE

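// Loads a GPU module from `data`. cuModuleLoadData accepts a cubin, a fatbin,
// or a NUL-terminated PTX image; the size parameter is unused because the
// driver determines the image size itself.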
extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule
mgpuModuleLoad(void *data, size_t /*gpuBlobSize*/) {
  ScopedContext scopedContext;
  CUmodule module = nullptr;
  CUDA_REPORT_IF_ERROR(cuModuleLoadData(&module, data));
  return module;
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule mgpuModuleLoadJIT(void *data,
                                                                int optLevel) {
  ScopedContext scopedContext;
  CUmodule module = nullptr;
  char jitErrorBuffer[4096] = {0};
  CUjit_option jitOptions[] = {CU_JIT_ERROR_LOG_BUFFER,
                               CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
                               CU_JIT_OPTIMIZATION_LEVEL};
  void *jitOptionsVals[] = {jitErrorBuffer,
                            reinterpret_cast<void *>(sizeof(jitErrorBuffer)),
                            reinterpret_cast<void *>(optLevel)};

  CUresult result =
      cuModuleLoadDataEx(&module, data, 3, jitOptions, jitOptionsVals);
  if (result) {
    fprintf(stderr, "JIT compilation failed with: '%s'\n", jitErrorBuffer);
    CUDA_REPORT_IF_ERROR(result);
  }
  return module;
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuModuleUnload(CUmodule module) {
  CUDA_REPORT_IF_ERROR(cuModuleUnload(module));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUfunction
mgpuModuleGetFunction(CUmodule module, const char *name) {
  CUfunction function = nullptr;
  CUDA_REPORT_IF_ERROR(cuModuleGetFunction(&function, module, name));
  return function;
}

// The wrapper uses intptr_t instead of CUDA's unsigned int to match
// the type of MLIR's index type. This avoids the need for casts in the
// generated MLIR code.
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuLaunchKernel(CUfunction function, intptr_t gridX, intptr_t gridY,
                 intptr_t gridZ, intptr_t blockX, intptr_t blockY,
                 intptr_t blockZ, int32_t smem, CUstream stream, void **params,
                 void **extra, size_t /*paramsCount*/) {
  ScopedContext scopedContext;
  if (smem > 0) {
    // Only query the driver when dynamic shared memory is requested; the
    // query is more expensive than this check.
    int32_t maxShmem = 0;
    CUdevice device = getDefaultCuDevice();
    CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice));
    CUDA_REPORT_IF_ERROR(cuDeviceGetAttribute(
        &maxShmem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
        device));
    if (maxShmem < smem) {
      fprintf(stderr,
              "Requested shared memory (%d bytes) is larger than maximum "
              "allowed shared memory (%d bytes) for this device\n",
              smem, maxShmem);
    }
    CUDA_REPORT_IF_ERROR(cuFuncSetAttribute(
        function, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, smem));
  }
  debug_print("Launching kernel, grid=%ld,%ld,%ld, "
              "threads: %ld, %ld, %ld, "
              "smem: %d bytes\n",
              gridX, gridY, gridZ, blockX, blockY, blockZ, smem);
  CUDA_REPORT_IF_ERROR(cuLaunchKernel(function, gridX, gridY, gridZ, blockX,
                                      blockY, blockZ, smem, stream, params,
                                      extra));
}
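
// Call-shape sketch (hypothetical values): MLIR's GPU-to-LLVM lowering emits
// calls such as
//   mgpuLaunchKernel(fn, /*grid=*/gx, gy, gz, /*block=*/bx, by, bz,
//                    /*smem=*/0, stream, kernelParams, /*extra=*/nullptr,
//                    /*paramsCount=*/n);
// where kernelParams is an array of pointers to the individual kernel
// arguments, in the form cuLaunchKernel expects.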

extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUstream mgpuStreamCreate() {
  ScopedContext scopedContext;
  CUstream stream = nullptr;
  CUDA_REPORT_IF_ERROR(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));
  return stream;
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuStreamDestroy(CUstream stream) {
  CUDA_REPORT_IF_ERROR(cuStreamDestroy(stream));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuStreamSynchronize(CUstream stream) {
  CUDA_REPORT_IF_ERROR(cuStreamSynchronize(stream));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuStreamWaitEvent(CUstream stream,
                                                              CUevent event) {
  CUDA_REPORT_IF_ERROR(cuStreamWaitEvent(stream, event, /*flags=*/0));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUevent mgpuEventCreate() {
  ScopedContext scopedContext;
  CUevent event = nullptr;
  CUDA_REPORT_IF_ERROR(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING));
  return event;
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuEventDestroy(CUevent event) {
  CUDA_REPORT_IF_ERROR(cuEventDestroy(event));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuEventSynchronize(CUevent event) {
  CUDA_REPORT_IF_ERROR(cuEventSynchronize(event));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuEventRecord(CUevent event,
                                                          CUstream stream) {
  CUDA_REPORT_IF_ERROR(cuEventRecord(event, stream));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
mgpuMemAlloc(uint64_t sizeBytes, CUstream /*stream*/, bool /*isHostShared*/) {
  ScopedContext scopedContext;
  CUdeviceptr ptr = 0;
  if (sizeBytes != 0)
    CUDA_REPORT_IF_ERROR(cuMemAlloc(&ptr, sizeBytes));
  return reinterpret_cast<void *>(ptr);
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuMemFree(void *ptr,
                                                      CUstream /*stream*/) {
  CUDA_REPORT_IF_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(ptr)));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuMemcpy(void *dst, void *src, size_t sizeBytes, CUstream stream) {
  CUDA_REPORT_IF_ERROR(cuMemcpyAsync(reinterpret_cast<CUdeviceptr>(dst),
                                     reinterpret_cast<CUdeviceptr>(src),
                                     sizeBytes, stream));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuMemset32(void *dst, unsigned int value, size_t count, CUstream stream) {
  CUDA_REPORT_IF_ERROR(cuMemsetD32Async(reinterpret_cast<CUdeviceptr>(dst),
                                        value, count, stream));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuMemset16(void *dst, unsigned short value, size_t count, CUstream stream) {
  CUDA_REPORT_IF_ERROR(cuMemsetD16Async(reinterpret_cast<CUdeviceptr>(dst),
                                        value, count, stream));
}

///
/// Helper functions for writing MLIR example code
///

// Registers a byte array with the CUDA runtime. Helpful until we have
// transfer functions implemented.
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuMemHostRegister(void *ptr, uint64_t sizeBytes) {
  ScopedContext scopedContext;
  CUDA_REPORT_IF_ERROR(cuMemHostRegister(ptr, sizeBytes, /*flags=*/0));
}

/// Registers a memref with the CUDA runtime. `descriptor` is a pointer to a
/// ranked memref descriptor struct of rank `rank`. Helpful until we have
/// transfer functions implemented.
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuMemHostRegisterMemRef(int64_t rank, StridedMemRefType<char, 1> *descriptor,
                          int64_t elementSizeBytes) {
  // Only densely packed tensors are currently supported.
#ifdef _WIN32
  int64_t *denseStrides = (int64_t *)_alloca(rank * sizeof(int64_t));
#else
  int64_t *denseStrides = (int64_t *)alloca(rank * sizeof(int64_t));
#endif // _WIN32
  int64_t *sizes = descriptor->sizes;
  for (int64_t i = rank - 1, runningStride = 1; i >= 0; i--) {
    denseStrides[i] = runningStride;
    runningStride *= sizes[i];
  }
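  // Worked example (a sketch): for sizes = {2, 3, 4} the loop above yields
  // denseStrides = {12, 4, 1}, so the registered size below is
  // 2 * 12 * elementSizeBytes, i.e. all 24 elements.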
  uint64_t sizeBytes = sizes[0] * denseStrides[0] * elementSizeBytes;
  int64_t *strides = &sizes[rank];
  (void)strides;
  for (unsigned i = 0; i < rank; ++i)
    assert(strides[i] == denseStrides[i] &&
           "Mismatch in computed dense strides");

  auto *ptr = descriptor->data + descriptor->offset * elementSizeBytes;
  mgpuMemHostRegister(ptr, sizeBytes);
}

// Unregisters a byte array with the CUDA runtime.
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuMemHostUnregister(void *ptr) {
  ScopedContext scopedContext;
  CUDA_REPORT_IF_ERROR(cuMemHostUnregister(ptr));
}

/// Unregisters a memref with the CUDA runtime. `descriptor` is a pointer to a
/// ranked memref descriptor struct of rank `rank`.
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuMemHostUnregisterMemRef(int64_t rank,
                            StridedMemRefType<char, 1> *descriptor,
                            int64_t elementSizeBytes) {
  auto *ptr = descriptor->data + descriptor->offset * elementSizeBytes;
  mgpuMemHostUnregister(ptr);
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSetDefaultDevice(int32_t device) {
  defaultDevice = device;
}

///
/// Runtime methods using CUDA 12.0+ driver
///

#if (CUDA_VERSION >= 12000)

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuLaunchClusterKernel(
    CUfunction function, intptr_t clusterX, intptr_t clusterY,
    intptr_t clusterZ, intptr_t gridX, intptr_t gridY, intptr_t gridZ,
    intptr_t blockX, intptr_t blockY, intptr_t blockZ, int32_t smem,
    CUstream stream, void **params, void **extra, size_t /*paramsCount*/) {
  ScopedContext scopedContext;
  if (smem > 0) {
    // Only query the driver when dynamic shared memory is requested; the
    // query is more expensive than this check.
    int32_t maxShmem = 0;
    CUdevice device = getDefaultCuDevice();
    CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice));
    CUDA_REPORT_IF_ERROR(cuDeviceGetAttribute(
        &maxShmem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
        device));
    if (maxShmem < smem) {
      fprintf(stderr,
              "Requested shared memory (%d bytes) is larger than maximum "
              "allowed shared memory (%d bytes) for this device\n",
              smem, maxShmem);
    }
    CUDA_REPORT_IF_ERROR(cuFuncSetAttribute(
        function, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, smem));
  }
  CUlaunchConfig config;
  config.gridDimX = gridX;
  config.gridDimY = gridY;
  config.gridDimZ = gridZ;
  config.blockDimX = blockX;
  config.blockDimY = blockY;
  config.blockDimZ = blockZ;
  config.sharedMemBytes = smem;
  config.hStream = stream;
  CUlaunchAttribute launchAttr[2];
  launchAttr[0].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
  launchAttr[0].value.clusterDim.x = clusterX;
  launchAttr[0].value.clusterDim.y = clusterY;
  launchAttr[0].value.clusterDim.z = clusterZ;
  launchAttr[1].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE;
  launchAttr[1].value.clusterSchedulingPolicyPreference =
      CU_CLUSTER_SCHEDULING_POLICY_SPREAD;
  config.numAttrs = 2;
  config.attrs = launchAttr;

  debug_print("Launching kernel, "
              "cluster: %ld, %ld, %ld, "
              "grid=%ld,%ld,%ld, "
              "threads: %ld, %ld, %ld, "
              "smem: %d bytes\n",
              clusterX, clusterY, clusterZ, gridX, gridY, gridZ, blockX, blockY,
              blockZ, smem);

  CUDA_REPORT_IF_ERROR(cuLaunchKernelEx(&config, function, params, extra));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuTensorMapEncodeTiled(
    CUtensorMap *tensorMap,             // Tensor map object
    CUtensorMapDataType tensorDataType, // Tensor data type
    cuuint32_t tensorRank,              // Dimensionality of tensor
    void *globalAddress,                // Starting address
    const cuuint64_t *globalDim,        // Tensor size (number of elements)
    const cuuint64_t *globalStrides,    // Stride size (in bytes)
    const cuuint32_t *boxDim,           // Traversal box (number of elements)
    const cuuint32_t *elementStrides,   // Traversal stride
    CUtensorMapInterleave interleave,   // Type of interleaved layout
    CUtensorMapSwizzle swizzle,         // Bank swizzling pattern
    CUtensorMapL2promotion l2Promotion, // L2 promotion size
    CUtensorMapFloatOOBfill oobFill     // Padding zfill or NaN fill
) {
  ScopedContext scopedContext;
  CUDA_REPORT_IF_ERROR(cuTensorMapEncodeTiled(
      tensorMap, tensorDataType, tensorRank, globalAddress, globalDim,
      globalStrides, boxDim, elementStrides, interleave, swizzle, l2Promotion,
      oobFill));
  debug_print("Created TMA descriptor\n Addr: %p\n"
              "data type : %d\n"
              "rank : %d\n"
              "globalDim[5]: %zu, %zu, %zu, %zu, %zu\n"
              "globalStrides[5]: %zu, %zu, %zu, %zu, %zu\n"
              "boxDim[5]: %u, %u, %u, %u, %u\n"
              "elementStrides[5]: %u, %u, %u, %u, %u\n"
              "interleave: %u \n"
              "swizzle: %u \n"
              "l2Promotion: %u \n"
              "oobFill: %u \n",
              (void *)&tensorMap, tensorDataType, tensorRank, globalDim[0],
              globalDim[1], globalDim[2], globalDim[3], globalDim[4],
              globalStrides[0], globalStrides[1], globalStrides[2],
              globalStrides[3], globalStrides[4], boxDim[0], boxDim[1],
              boxDim[2], boxDim[3], boxDim[4], elementStrides[0],
              elementStrides[1], elementStrides[2], elementStrides[3],
              elementStrides[4], interleave, swizzle, l2Promotion, oobFill);
}

template <int Rank>
void mgpuGetMemRefDataAndShape(void *rawDescriptor, char **addr,
                               uint64_t *globalDim, uint64_t *globalStrides,
                               const CUtensorMapDataType tensorDataType) {
  auto descriptor =
      reinterpret_cast<StridedMemRefType<char, Rank> *>(rawDescriptor);
  *addr = descriptor->data;
  for (int i = 0; i < Rank; ++i) {
    globalDim[i] = static_cast<uint64_t>(descriptor->sizes[Rank - i - 1]);
  }
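  // Note: dimensions (and the strides below) are reversed because
  // cuTensorMapEncodeTiled expects the fastest-moving dimension first.
  // Worked example (a sketch): a 2x3 f32 memref yields globalDim = {3, 2}
  // and globalStrides = {3 * 4} = {12} bytes.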
  static constexpr int elementSizeInBytes[] = {1, 2, 4, 4, 8, 8, 2,
                                               4, 8, 2, 4, 4, 4};
  for (int i = 0; i < Rank - 1; ++i) {
    globalStrides[i] = static_cast<uint64_t>(
        descriptor->strides[Rank - i - 2] * elementSizeInBytes[tensorDataType]);
  }
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *mgpuTensorMapEncodeTiledMemref(
    int64_t tensorRank,                       // Dimensionality of tensor
    void *rankedDescriptor,                   // Ranked MemRef descriptor
    const CUtensorMapDataType tensorDataType, // Tensor data type
    CUtensorMapInterleave interleave,         // Type of interleaved layout
    CUtensorMapSwizzle swizzle,               // Bank swizzling pattern
    CUtensorMapL2promotion l2Promotion,       // L2 promotion size
    CUtensorMapFloatOOBfill oobFill,          // Padding zfill or NaN fill
    int64_t *inputBoxDims // Box size (number of elements per dimension)
) {
  CUtensorMap tensorMap;

  uint32_t boxDim[5] = {1, 1, 1, 1, 1}, elementStrides[5] = {1, 1, 1, 1, 1};
  uint64_t globalDim[5] = {1, 1, 1, 1, 1}, globalStrides[5] = {0};
  uint32_t tensorRank32 = uint32_t(tensorRank);

  char *globalAddress = nullptr;
  switch (tensorRank) {
  case 1:
    mgpuGetMemRefDataAndShape<1>(rankedDescriptor, &globalAddress, globalDim,
                                 globalStrides, tensorDataType);
    break;
  case 2:
    mgpuGetMemRefDataAndShape<2>(rankedDescriptor, &globalAddress, globalDim,
                                 globalStrides, tensorDataType);
    break;
  case 3:
    mgpuGetMemRefDataAndShape<3>(rankedDescriptor, &globalAddress, globalDim,
                                 globalStrides, tensorDataType);
    break;
  case 4:
    mgpuGetMemRefDataAndShape<4>(rankedDescriptor, &globalAddress, globalDim,
                                 globalStrides, tensorDataType);
    break;
  case 5:
    mgpuGetMemRefDataAndShape<5>(rankedDescriptor, &globalAddress, globalDim,
                                 globalStrides, tensorDataType);
    break;
  default:
    fprintf(
        stderr,
        "'mgpuTensorMapEncodeTiledMemref' failed with 'rank is too high'\n");
    return nullptr;
  }

  for (int64_t r = 0; r < tensorRank; ++r) {
    boxDim[r] = static_cast<uint32_t>(inputBoxDims[tensorRank - r - 1]);
  }

  ScopedContext scopedContext;
  mgpuTensorMapEncodeTiled(&tensorMap, tensorDataType, tensorRank32,
                           globalAddress, globalDim, globalStrides, boxDim,
                           elementStrides, interleave, swizzle, l2Promotion,
                           oobFill);
  // Copy the created tensor map to the device.
  CUdeviceptr dTensorMap;
  CUDA_REPORT_IF_ERROR(cuMemAlloc(&dTensorMap, sizeof(CUtensorMap)));
  CUDA_REPORT_IF_ERROR(cuMemcpy(dTensorMap,
                                reinterpret_cast<CUdeviceptr>(&tensorMap),
                                sizeof(CUtensorMap)));
  return reinterpret_cast<void *>(dTensorMap);
}
#endif

#ifdef MLIR_ENABLE_CUDA_CUSPARSE

///
/// Wrapper methods for the cuSparse library.
///

// Some macro magic to get float/double alpha and beta on host.
// TODO: add support for passing alpha and beta as arguments
#define ALPHABETA(dtp, alpha, beta)                                            \
  __nv_bfloat16(alpha##16bf) = 1.0f;                                           \
  __nv_bfloat16(beta##16bf) = 1.0f;                                            \
  __half(alpha##16f) = 1.0f;                                                   \
  __half(beta##16f) = 1.0f;                                                    \
  float(alpha##f) = 1.0f;                                                      \
  float(beta##f) = 1.0f;                                                       \
  double(alpha##d) = 1.0;                                                      \
  double(beta##d) = 1.0;                                                       \
  const void *(alpha##p) = nullptr;                                            \
  const void *(beta##p) = nullptr;                                             \
  if (dtp == CUDA_R_16BF || dtp == CUDA_C_16BF) {                              \
    (alpha##p) = reinterpret_cast<void *>(&(alpha##16bf));                     \
    (beta##p) = reinterpret_cast<void *>(&(beta##16bf));                       \
  } else if (dtp == CUDA_R_16F || dtp == CUDA_C_16F) {                         \
    (alpha##p) = reinterpret_cast<void *>(&(alpha##16f));                      \
    (beta##p) = reinterpret_cast<void *>(&(beta##16f));                        \
  } else if (dtp == CUDA_R_32F || dtp == CUDA_C_32F) {                         \
    (alpha##p) = reinterpret_cast<void *>(&(alpha##f));                        \
    (beta##p) = reinterpret_cast<void *>(&(beta##f));                          \
  } else {                                                                     \
    (alpha##p) = reinterpret_cast<void *>(&(alpha##d));                        \
    (beta##p) = reinterpret_cast<void *>(&(beta##d));                          \
  }
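
// Expansion sketch: ALPHABETA(cTp, alpha, beta) declares host-side constants
// (alpha16bf, alphaf, alphad, ... all equal to 1) plus two pointers, alphap
// and betap, which are set to the constant whose type matches the
// cudaDataType_t in cTp. Callers pass alphap/betap to the cuSPARSE generic
// API, which expects host pointers of the compute type.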

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuCreateSparseEnv() {
  // ScopedContext is for cuda initialization.
  ScopedContext scopedContext;
  assert(!cusparse_env && "client called mgpuCreateSparseEnv() twice");
  CUSPARSE_REPORT_IF_ERROR(cusparseCreate(&cusparse_env));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuDestroySparseEnv() {
  assert(cusparse_env && "client did not call mgpuCreateSparseEnv()");
  CUSPARSE_REPORT_IF_ERROR(cusparseDestroy(cusparse_env));
  cusparse_env = nullptr;
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
mgpuCreateDnVec(intptr_t size, void *values, int32_t dtp, CUstream /*stream*/) {
  cusparseDnVecDescr_t vec = nullptr;
  auto dTp = static_cast<cudaDataType_t>(dtp);
  CUSPARSE_REPORT_IF_ERROR(cusparseCreateDnVec(&vec, size, values, dTp))
  return reinterpret_cast<void *>(vec);
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuDestroyDnVec(void *v, CUstream /*stream*/) {
  cusparseDnVecDescr_t vec = reinterpret_cast<cusparseDnVecDescr_t>(v);
  CUSPARSE_REPORT_IF_ERROR(cusparseDestroyDnVec(vec))
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
mgpuCreateDnMat(intptr_t rows, intptr_t cols, void *values, int32_t dtp,
                CUstream /*stream*/) {
  cusparseDnMatDescr_t mat = nullptr;
  auto dTp = static_cast<cudaDataType_t>(dtp);
  CUSPARSE_REPORT_IF_ERROR(cusparseCreateDnMat(&mat, rows, cols, /*ld=*/cols,
                                               values, dTp, CUSPARSE_ORDER_ROW))
  return reinterpret_cast<void *>(mat);
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuDestroyDnMat(void *m, CUstream /*stream*/) {
  cusparseDnMatDescr_t mat = reinterpret_cast<cusparseDnMatDescr_t>(m);
  CUSPARSE_REPORT_IF_ERROR(cusparseDestroyDnMat(mat))
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
mgpuCreateCoo(intptr_t rows, intptr_t cols, intptr_t nnz, void *rowIdxs,
              void *colIdxs, void *values, int32_t itp, int32_t dtp,
              CUstream /*stream*/) {
  cusparseSpMatDescr_t mat = nullptr;
  auto iTp = static_cast<cusparseIndexType_t>(itp);
  auto dTp = static_cast<cudaDataType_t>(dtp);
  CUSPARSE_REPORT_IF_ERROR(cusparseCreateCoo(&mat, rows, cols, nnz, rowIdxs,
                                             colIdxs, values, iTp,
                                             CUSPARSE_INDEX_BASE_ZERO, dTp))
  return reinterpret_cast<void *>(mat);
}

#ifdef CUSPARSE_COO_AOS // deprecated in cuSPARSE 11.2
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
mgpuCreateCooAoS(intptr_t rows, intptr_t cols, intptr_t nnz, void *idxs,
                 void *values, int32_t itp, int32_t dtp, CUstream /*stream*/) {
  cusparseSpMatDescr_t mat = nullptr;
  auto iTp = static_cast<cusparseIndexType_t>(itp);
  auto dTp = static_cast<cudaDataType_t>(dtp);
  CUSPARSE_REPORT_IF_ERROR(cusparseCreateCooAoS(
      &mat, rows, cols, nnz, idxs, values, iTp, CUSPARSE_INDEX_BASE_ZERO, dTp))
  return reinterpret_cast<void *>(mat);
}
#endif // CUSPARSE_COO_AOS

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
mgpuCreateCsr(intptr_t rows, intptr_t cols, intptr_t nnz, void *rowPos,
              void *colIdxs, void *values, int32_t ptp, int32_t itp,
              int32_t dtp, CUstream /*stream*/) {
  cusparseSpMatDescr_t mat = nullptr;
  auto pTp = static_cast<cusparseIndexType_t>(ptp);
  auto iTp = static_cast<cusparseIndexType_t>(itp);
  auto dTp = static_cast<cudaDataType_t>(dtp);
  CUSPARSE_REPORT_IF_ERROR(cusparseCreateCsr(&mat, rows, cols, nnz, rowPos,
                                             colIdxs, values, pTp, iTp,
                                             CUSPARSE_INDEX_BASE_ZERO, dTp))
  return reinterpret_cast<void *>(mat);
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
mgpuCreateCsc(intptr_t rows, intptr_t cols, intptr_t nnz, void *colPos,
              void *rowIdxs, void *values, int32_t ptp, int32_t itp,
              int32_t dtp, CUstream /*stream*/) {
  cusparseSpMatDescr_t mat = nullptr;
  auto pTp = static_cast<cusparseIndexType_t>(ptp);
  auto iTp = static_cast<cusparseIndexType_t>(itp);
  auto dTp = static_cast<cudaDataType_t>(dtp);
  CUSPARSE_REPORT_IF_ERROR(cusparseCreateCsc(&mat, rows, cols, nnz, colPos,
                                             rowIdxs, values, pTp, iTp,
                                             CUSPARSE_INDEX_BASE_ZERO, dTp))
  return reinterpret_cast<void *>(mat);
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
mgpuCreateBsr(intptr_t brows, intptr_t bcols, intptr_t bnnz, intptr_t rBsz,
              intptr_t cBsz, void *rowPos, void *colIdxs, void *values,
              int32_t ptp, int32_t itp, int32_t dtp, CUstream /*stream*/) {
  cusparseSpMatDescr_t mat = nullptr;
#if CUSPARSE_VERSION >= 12100
  auto pTp = static_cast<cusparseIndexType_t>(ptp);
  auto iTp = static_cast<cusparseIndexType_t>(itp);
  auto dTp = static_cast<cudaDataType_t>(dtp);
  CUSPARSE_REPORT_IF_ERROR(cusparseCreateBsr(
      &mat, brows, bcols, bnnz, rBsz, cBsz, rowPos, colIdxs, values, pTp, iTp,
      CUSPARSE_INDEX_BASE_ZERO, dTp, CUSPARSE_ORDER_ROW))
#endif
  return reinterpret_cast<void *>(mat);
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuDestroySpMat(void *m, CUstream /*stream*/) {
  cusparseSpMatDescr_t mat = reinterpret_cast<cusparseSpMatDescr_t>(m);
  CUSPARSE_REPORT_IF_ERROR(cusparseDestroySpMat(mat))
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t mgpuSpMVBufferSize(
    int32_t ma, void *a, void *x, void *y, int32_t ctp, CUstream /*stream*/) {
  assert(cusparse_env && "client did not call mgpuCreateSparseEnv()");
  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
  cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
  cusparseDnVecDescr_t vecX = reinterpret_cast<cusparseDnVecDescr_t>(x);
  cusparseDnVecDescr_t vecY = reinterpret_cast<cusparseDnVecDescr_t>(y);
  cudaDataType_t cTp = static_cast<cudaDataType_t>(ctp);
  ALPHABETA(cTp, alpha, beta)
  size_t bufferSize = 0;
  CUSPARSE_REPORT_IF_ERROR(cusparseSpMV_bufferSize(
      cusparse_env, modeA, alphap, matA, vecX, betap, vecY, cTp,
      CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize))
  return bufferSize;
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSpMV(int32_t ma, void *a, void *x,
                                                   void *y, int32_t ctp,
                                                   void *buf,
                                                   CUstream /*stream*/) {
  assert(cusparse_env && "client did not call mgpuCreateSparseEnv()");
  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
  cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
  cusparseDnVecDescr_t vecX = reinterpret_cast<cusparseDnVecDescr_t>(x);
  cusparseDnVecDescr_t vecY = reinterpret_cast<cusparseDnVecDescr_t>(y);
  cudaDataType_t cTp = static_cast<cudaDataType_t>(ctp);
  ALPHABETA(cTp, alpha, beta)
  CUSPARSE_REPORT_IF_ERROR(cusparseSpMV(cusparse_env, modeA, alphap, matA, vecX,
                                        betap, vecY, cTp,
                                        CUSPARSE_SPMV_ALG_DEFAULT, buf))
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t
mgpuSpMMBufferSize(int32_t ma, int32_t mb, void *a, void *b, void *c,
                   int32_t ctp, CUstream /*stream*/) {
  assert(cusparse_env && "client did not call mgpuCreateSparseEnv()");
  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
  cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
  cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
  cusparseDnMatDescr_t matB = reinterpret_cast<cusparseDnMatDescr_t>(b);
  cusparseDnMatDescr_t matC = reinterpret_cast<cusparseDnMatDescr_t>(c);
  cudaDataType_t cTp = static_cast<cudaDataType_t>(ctp);
  ALPHABETA(cTp, alpha, beta)
  size_t bufferSize = 0;
  CUSPARSE_REPORT_IF_ERROR(cusparseSpMM_bufferSize(
      cusparse_env, modeA, modeB, alphap, matA, matB, betap, matC, cTp,
      CUSPARSE_SPMM_ALG_DEFAULT, &bufferSize))
  return bufferSize;
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSpMM(int32_t ma, int32_t mb,
                                                   void *a, void *b, void *c,
                                                   int32_t ctp, void *buf,
                                                   CUstream /*stream*/) {
  assert(cusparse_env && "client did not call mgpuCreateSparseEnv()");
  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
  cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
  cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
  cusparseDnMatDescr_t matB = reinterpret_cast<cusparseDnMatDescr_t>(b);
  cusparseDnMatDescr_t matC = reinterpret_cast<cusparseDnMatDescr_t>(c);
  cudaDataType_t cTp = static_cast<cudaDataType_t>(ctp);
  ALPHABETA(cTp, alpha, beta)
  CUSPARSE_REPORT_IF_ERROR(cusparseSpMM(cusparse_env, modeA, modeB, alphap,
                                        matA, matB, betap, matC, cTp,
                                        CUSPARSE_SPMM_ALG_DEFAULT, buf))
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t
mgpuSDDMMBufferSize(int32_t ma, int32_t mb, void *a, void *b, void *c,
                    int32_t ctp, CUstream /*stream*/) {
  assert(cusparse_env && "client did not call mgpuCreateSparseEnv()");
  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
  cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
  cusparseDnMatDescr_t matA = reinterpret_cast<cusparseDnMatDescr_t>(a);
  cusparseDnMatDescr_t matB = reinterpret_cast<cusparseDnMatDescr_t>(b);
  cusparseSpMatDescr_t matC = reinterpret_cast<cusparseSpMatDescr_t>(c);
  auto cTp = static_cast<cudaDataType_t>(ctp);
  ALPHABETA(cTp, alpha, beta)
  size_t bufferSize = 0;
  CUSPARSE_REPORT_IF_ERROR(cusparseSDDMM_bufferSize(
      cusparse_env, modeA, modeB, alphap, matA, matB, betap, matC, cTp,
      CUSPARSE_SDDMM_ALG_DEFAULT, &bufferSize))
  return bufferSize;
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSDDMM(int32_t ma, int32_t mb,
                                                    void *a, void *b, void *c,
                                                    int32_t ctp, void *buf,
                                                    CUstream /*stream*/) {
  assert(cusparse_env && "client did not call mgpuCreateSparseEnv()");
  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
  cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
  cusparseDnMatDescr_t matA = reinterpret_cast<cusparseDnMatDescr_t>(a);
  cusparseDnMatDescr_t matB = reinterpret_cast<cusparseDnMatDescr_t>(b);
  cusparseSpMatDescr_t matC = reinterpret_cast<cusparseSpMatDescr_t>(c);
  auto cTp = static_cast<cudaDataType_t>(ctp);
  ALPHABETA(cTp, alpha, beta)
  CUSPARSE_REPORT_IF_ERROR(cusparseSDDMM(cusparse_env, modeA, modeB, alphap,
                                         matA, matB, betap, matC, cTp,
                                         CUSPARSE_SDDMM_ALG_DEFAULT, buf))
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
mgpuSpGEMMCreateDescr(CUstream /*stream*/) {
  cusparseSpGEMMDescr_t spgemmDesc = nullptr;
  CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_createDescr(&spgemmDesc))
  return reinterpret_cast<void *>(spgemmDesc);
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuSpGEMMDestroyDescr(void *s, CUstream /*stream*/) {
  cusparseSpGEMMDescr_t spgemmDesc = reinterpret_cast<cusparseSpGEMMDescr_t>(s);
  CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_destroyDescr(spgemmDesc))
}

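// cuSPARSE SpGEMM is a multi-phase protocol: work estimation first reports a
// buffer size, computation then reports and uses a second buffer, and a final
// copy materializes the result into matC. Usage sketch (hypothetical buffer
// management on the caller's side):
//   size_t bs1 = mgpuSpGEMMWorkEstimation(desc, ma, mb, a, b, c, ctp,
//                                         /*bs=*/0, /*buf=*/nullptr, stream);
//   void *buf1 = mgpuMemAlloc(bs1, stream, /*isHostShared=*/false);
//   mgpuSpGEMMWorkEstimation(desc, ma, mb, a, b, c, ctp, bs1, buf1, stream);
//   // ... same two-call pattern with mgpuSpGEMMCompute and a second buffer,
//   // then mgpuSpGEMMCopy ...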
extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t mgpuSpGEMMWorkEstimation(
    void *s, int32_t ma, int32_t mb, void *a, void *b, void *c, int32_t ctp,
    intptr_t bs, void *buf, CUstream /*stream*/) {
  cusparseSpGEMMDescr_t spgemmDesc = reinterpret_cast<cusparseSpGEMMDescr_t>(s);
  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
  cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
  cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
  cusparseSpMatDescr_t matB = reinterpret_cast<cusparseSpMatDescr_t>(b);
  cusparseSpMatDescr_t matC = reinterpret_cast<cusparseSpMatDescr_t>(c);
  auto cTp = static_cast<cudaDataType_t>(ctp);
  ALPHABETA(cTp, alpha, beta)
  size_t newBufferSize = bs;
  CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_workEstimation(
      cusparse_env, modeA, modeB, alphap, matA, matB, betap, matC, cTp,
      CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, &newBufferSize, buf))
  return newBufferSize;
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t
mgpuSpGEMMCompute(void *s, int32_t ma, int32_t mb, void *a, void *b, void *c,
                  int32_t ctp, intptr_t bsz2, void *buf2, CUstream /*stream*/) {
  cusparseSpGEMMDescr_t spgemmDesc = reinterpret_cast<cusparseSpGEMMDescr_t>(s);
  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
  cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
  cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
  cusparseSpMatDescr_t matB = reinterpret_cast<cusparseSpMatDescr_t>(b);
  cusparseSpMatDescr_t matC = reinterpret_cast<cusparseSpMatDescr_t>(c);
  auto cTp = static_cast<cudaDataType_t>(ctp);
  ALPHABETA(cTp, alpha, beta)
  size_t newBufferSize2 = bsz2;
  CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_compute(
      cusparse_env, modeA, modeB, alphap, matA, matB, betap, matC, cTp,
      CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, &newBufferSize2, buf2))
  return newBufferSize2;
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuSpGEMMCopy(void *s, int32_t ma, int32_t mb, void *a, void *b, void *c,
               int32_t ctp, CUstream /*stream*/) {
  cusparseSpGEMMDescr_t spgemmDesc = reinterpret_cast<cusparseSpGEMMDescr_t>(s);
  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
  cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
  cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
  cusparseSpMatDescr_t matB = reinterpret_cast<cusparseSpMatDescr_t>(b);
  cusparseSpMatDescr_t matC = reinterpret_cast<cusparseSpMatDescr_t>(c);
  auto cTp = static_cast<cudaDataType_t>(ctp);
  ALPHABETA(cTp, alpha, beta)
  CUSPARSE_REPORT_IF_ERROR(
      cusparseSpGEMM_copy(cusparse_env, modeA, modeB, alphap, matA, matB, betap,
                          matC, cTp, CUSPARSE_SPGEMM_DEFAULT, spgemmDesc))
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuSpMatGetSize(void *m, void *r, void *c, void *n, CUstream /*stream*/) {
  cusparseConstSpMatDescr_t matDescr =
      reinterpret_cast<cusparseConstSpMatDescr_t>(m);
  int64_t *rows = reinterpret_cast<int64_t *>(r);
  int64_t *cols = reinterpret_cast<int64_t *>(c);
  int64_t *nnz = reinterpret_cast<int64_t *>(n);
  CUSPARSE_REPORT_IF_ERROR(cusparseSpMatGetSize(matDescr, rows, cols, nnz));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuSetCsrPointers(void *m, void *p, void *c, void *v, CUstream /*stream*/) {
  cusparseSpMatDescr_t matDescr = reinterpret_cast<cusparseSpMatDescr_t>(m);
  CUSPARSE_REPORT_IF_ERROR(cusparseCsrSetPointers(matDescr, p, c, v));
}

#ifdef MLIR_ENABLE_CUDA_CUSPARSELT

///
/// Wrapper methods for the cuSparseLt library.
///

struct cusparseLtSpMatHandleAndData {
  cusparseLtMatDescriptor_t mat;
  // TODO: the following three are associated with the SpMM operator rather
  // than the sparse matrix. Create workspace buffers and pass them to the
  // SpMM execution.
  cusparseLtMatmulAlgSelection_t alg_sel;
  cusparseLtMatmulPlan_t plan;
  cusparseLtMatmulDescriptor_t matmul;
  void *values{nullptr};
};

struct cusparseLtDnMatHandleAndData {
  cusparseLtMatDescriptor_t mat;
  void *values{nullptr};
};

static_assert(sizeof(cusparseLtHandle_t) == 11024,
              "Unexpected cusparseLt handle size");
static_assert(sizeof(cusparseLtSpMatHandleAndData) == 44104,
              "Unexpected cusparseLt sparse matrix handle size");
static_assert(sizeof(cusparseLtDnMatHandleAndData) == 11032,
              "Unexpected cusparseLt dense matrix handle size");

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuCreateSparseLtEnv() {
  // ScopedContext is for cuda initialization.
  ScopedContext scopedContext;
  assert(!cusparseLt_initiated &&
         "client called mgpuCreateSparseLtEnv() twice");
  // Note that cuSparseLt still uses cusparseStatus_t.
  CUSPARSE_REPORT_IF_ERROR(cusparseLtInit(&cusparseLt_env));
  cusparseLt_initiated = true;
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuDestroySparseLtEnv() {
  assert(cusparseLt_initiated && "client did not call mgpuCreateSparseLtEnv()");
  CUSPARSE_REPORT_IF_ERROR(cusparseLtDestroy(&cusparseLt_env));
  cusparseLt_initiated = false;
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuCreateCuSparseLtDnMat(void *dh, intptr_t rows, intptr_t cols, void *values,
                          int32_t dtp, CUstream /*stream*/) {
  assert(cusparseLt_initiated && "client did not call mgpuCreateSparseLtEnv()");
  auto dnmat_handle = reinterpret_cast<cusparseLtDnMatHandleAndData *>(dh);
  dnmat_handle->values = values;
  auto dTp = static_cast<cudaDataType_t>(dtp);
  // Assume row-major when deciding lda.
  const uint32_t alignment = 16;
  CUSPARSE_REPORT_IF_ERROR(cusparseLtDenseDescriptorInit(
      &cusparseLt_env, &(dnmat_handle->mat), rows, cols, /*lda=*/cols,
      alignment, dTp, CUSPARSE_ORDER_ROW))
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuDestroyCuSparseLtDnMat(void *dh, CUstream /*stream*/) {
  auto dnmat_handle = reinterpret_cast<cusparseLtDnMatHandleAndData *>(dh);
  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(dnmat_handle->mat)))
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuCusparseLtCreate2To4SpMat(void *sh, intptr_t rows, intptr_t cols,
                              void *values, int32_t dtp, CUstream /*stream*/) {
  assert(cusparseLt_initiated && "client did not call mgpuCreateSparseLtEnv()");
  auto spmat_handle = reinterpret_cast<cusparseLtSpMatHandleAndData *>(sh);
  spmat_handle->values = values;
  auto dTp = static_cast<cudaDataType_t>(dtp);
  // Assume row-major when deciding lda.
  const uint32_t alignment = 16;
  CUSPARSE_REPORT_IF_ERROR(cusparseLtStructuredDescriptorInit(
      &cusparseLt_env, &(spmat_handle->mat), rows, cols, /*ld=*/cols, alignment,
      dTp, CUSPARSE_ORDER_ROW, CUSPARSELT_SPARSITY_50_PERCENT))
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuDestroyCuSparseLtSpMat(void *sh, CUstream /*stream*/) {
  auto spmat_handle = reinterpret_cast<cusparseLtSpMatHandleAndData *>(sh);
  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(spmat_handle->mat)))
}

// Several things are done in this stage: algorithm selection, planning, and
// returning the workspace and compressed matrix data buffer sizes.
// The parameter prune_flag indicates whether pruning and a pruning check
// should happen: 0 means neither prune nor prune-check, 1 means prune, and
// 2 means prune and prune-check.
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuCuSparseLtSpMMBufferSize(void *bs, int32_t ma, int32_t mb, void *a, void *b,
                             void *c, int32_t ctp, int32_t prune_flag,
                             CUstream stream) {
  assert(cusparseLt_initiated && "client did not call mgpuCreateSparseLtEnv()");
  // TODO: support more advanced settings, e.g., the right operand being a
  // sparse matrix; for now we assume matA is the sparse matrix.
  auto matA = reinterpret_cast<cusparseLtSpMatHandleAndData *>(a);
  auto matB = reinterpret_cast<cusparseLtDnMatHandleAndData *>(b);
  auto matC = reinterpret_cast<cusparseLtDnMatHandleAndData *>(c);
  auto workspace_size = reinterpret_cast<size_t *>(bs);
  auto compressed_size = &(reinterpret_cast<size_t *>(bs)[1]);
  auto compressed_buffer_size = &(reinterpret_cast<size_t *>(bs)[2]);
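  // `bs` is assumed to point at three consecutive size_t slots, filled in
  // below: [0] workspace size, [1] compressed matrix size, [2] compression
  // scratch buffer size.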
  auto cTp = static_cast<cusparseComputeType>(ctp);

  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
  cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulDescriptorInit(
      &cusparseLt_env, &(matA->matmul), modeA, modeB, &(matA->mat),
      &(matB->mat), &(matC->mat), &(matC->mat), cTp))
  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulAlgSelectionInit(
      &cusparseLt_env, &(matA->alg_sel), &(matA->matmul),
      CUSPARSELT_MATMUL_ALG_DEFAULT))
  int alg = 0;
  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulAlgSetAttribute(
      &cusparseLt_env, &(matA->alg_sel), CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg,
      sizeof(alg)))

  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulPlanInit(
      &cusparseLt_env, &(matA->plan), &(matA->matmul), &(matA->alg_sel)))

  // Pruning step (in-place).
  if (prune_flag > 0)
    CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMAPrune(
        &cusparseLt_env, &(matA->matmul), matA->values, matA->values,
        CUSPARSELT_PRUNE_SPMMA_STRIP, stream))

  // Check structure of A.
  // Note that this adds a synchronization on the stream.
  // TODO: Do we want that?
  if (prune_flag == 2) {
    int *dvalid = (int *)mgpuMemAlloc(sizeof(int), stream, false);
    CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMAPruneCheck(
        &cusparseLt_env, &(matA->matmul), matA->values, dvalid, stream))
    int valid = 0;
    mgpuMemcpy(&valid, dvalid, sizeof(int), stream);
    mgpuStreamSynchronize(stream);
    mgpuMemFree(dvalid, stream);
    if (valid != 0)
      fprintf(stderr, "CUSPARSE-LT: sparse matrix is not 2:4; computed results "
                      "will be invalid\n");
  }

  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulGetWorkspace(
      &cusparseLt_env, &(matA->plan), workspace_size))
  CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMACompressedSize(
      &cusparseLt_env, &(matA->plan), compressed_size, compressed_buffer_size))
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuCuSparseLtSpMM(void *a, void *b, void *c, void *d_workspace,
                   void *dA_compressed, void *dA_compressedBuffer,
                   CUstream stream) {
  assert(cusparseLt_initiated && "client did not call mgpuCreateSparseLtEnv()");
  auto matA = reinterpret_cast<cusparseLtSpMatHandleAndData *>(a);
  auto matB = reinterpret_cast<cusparseLtDnMatHandleAndData *>(b);
  auto matC = reinterpret_cast<cusparseLtDnMatHandleAndData *>(c);

  ALPHABETA(CUDA_R_32F, alpha, beta)
  CUSPARSE_REPORT_IF_ERROR(
      cusparseLtSpMMACompress(&cusparseLt_env, &(matA->plan), (matA->values),
                              dA_compressed, dA_compressedBuffer, stream))

  // TODO: add support for multi-stream execution.
  // Perform the matrix multiplication D = A*B+C, using C==D for now.
  CUSPARSE_REPORT_IF_ERROR(
      cusparseLtMatmul(&cusparseLt_env, &(matA->plan), alphap, dA_compressed,
                       matB->values, betap, matC->values,
                       /*dD*/ matC->values, d_workspace, nullptr, 0))

  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(matA->mat)))
  // Destroy the plan associated with the sparse matrix.
  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulPlanDestroy(&(matA->plan)))
}

#endif // MLIR_ENABLE_CUDA_CUSPARSELT
#endif // MLIR_ENABLE_CUDA_CUSPARSE
