1 | //===- CudaRuntimeWrappers.cpp - MLIR CUDA API wrapper library ------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // Implements C wrappers around the CUDA library for easy linking in ORC jit. |
// Also adds some debugging helpers that are useful when writing MLIR code to
// run on GPUs.
12 | // |
13 | //===----------------------------------------------------------------------===// |
14 | |
15 | #include "mlir/ExecutionEngine/CRunnerUtils.h" |
16 | |
17 | #include <stdio.h> |
18 | |
19 | #include "cuda.h" |
20 | #include "cuda_bf16.h" |
21 | #include "cuda_fp16.h" |
22 | |
23 | #ifdef MLIR_ENABLE_CUDA_CUSPARSE |
24 | #include "cusparse.h" |
25 | #ifdef MLIR_ENABLE_CUDA_CUSPARSELT |
26 | #include "cusparseLt.h" |
27 | #endif // MLIR_ENABLE_CUDA_CUSPARSELT |
28 | #endif // MLIR_ENABLE_CUDA_CUSPARSE |
29 | |
30 | #ifdef _WIN32 |
31 | #include <malloc.h> |
32 | #define MLIR_CUDA_WRAPPERS_EXPORT __declspec(dllexport) |
33 | #else |
34 | #define MLIR_CUDA_WRAPPERS_EXPORT __attribute__((visibility("default"))) |
35 | #endif // _WIN32 |
36 | |
37 | #define CUDA_REPORT_IF_ERROR(expr) \ |
38 | [](CUresult result) { \ |
39 | if (!result) \ |
40 | return; \ |
41 | const char *name = nullptr; \ |
42 | cuGetErrorName(result, &name); \ |
43 | if (!name) \ |
44 | name = "<unknown>"; \ |
45 | fprintf(stderr, "'%s' failed with '%s'\n", #expr, name); \ |
46 | }(expr) |
47 | |
48 | #define CUSPARSE_REPORT_IF_ERROR(expr) \ |
49 | { \ |
50 | cusparseStatus_t status = (expr); \ |
51 | if (status != CUSPARSE_STATUS_SUCCESS) { \ |
52 | fprintf(stderr, "cuSPARSE '%s' failed with '%s'\n", #expr, \ |
53 | cusparseGetErrorString(status)); \ |
54 | } \ |
55 | } |
56 | |
57 | thread_local static int32_t defaultDevice = 0; |
58 | |
const char *kDebugEnvironmentVariable = "MLIR_CUDA_DEBUG";
60 | |
/// Helper method that checks whether the debug environment variable is set.
bool isDebugEnabled() {
  static bool isInitialized = false;
  static bool isEnabled = false;
  if (!isInitialized) {
    isEnabled = getenv(kDebugEnvironmentVariable) != nullptr;
    isInitialized = true;
  }
  return isEnabled;
}
69 | |
70 | #define debug_print(fmt, ...) \ |
71 | do { \ |
72 | if (isDebugEnabled()) \ |
73 | fprintf(stderr, "%s:%d:%s(): " fmt, "CudaRuntimeWrappers.cpp", __LINE__, \ |
74 | __func__, __VA_ARGS__); \ |
75 | } while (0) |
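
// Example (hypothetical invocation, not part of this file): with the
// environment variable set, e.g.
//   MLIR_CUDA_DEBUG=1 ./my-mlir-app
// a call such as
//   debug_print("allocated %llu bytes\n", sizeBytes);
// prints "CudaRuntimeWrappers.cpp:<line>:<func>(): allocated ... bytes" to
// stderr; without the variable set, the call prints nothing.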
76 | |
77 | // Returns default CUdevice |
78 | CUdevice getDefaultCuDevice() { |
79 | CUdevice device; |
80 | CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice)); |
81 | return device; |
82 | } |
83 | |
// Make the primary context of the current default device current for the
// duration of the instance and restore the previous context on destruction.
87 | class ScopedContext { |
88 | public: |
89 | ScopedContext() { |
90 | // Static reference to CUDA primary context for device ordinal |
91 | // defaultDevice. |
92 | static CUcontext context = [] { |
93 | CUDA_REPORT_IF_ERROR(cuInit(/*flags=*/0)); |
94 | CUcontext ctx; |
95 | // Note: this does not affect the current context. |
96 | CUDA_REPORT_IF_ERROR( |
97 | cuDevicePrimaryCtxRetain(&ctx, getDefaultCuDevice())); |
98 | return ctx; |
99 | }(); |
100 | |
101 | CUDA_REPORT_IF_ERROR(cuCtxPushCurrent(context)); |
102 | } |
103 | |
104 | ~ScopedContext() { CUDA_REPORT_IF_ERROR(cuCtxPopCurrent(nullptr)); } |
105 | }; |
106 | |
107 | #ifdef MLIR_ENABLE_CUDA_CUSPARSE |
// Note that (1) NVIDIA confirms it is safe to share a handle across multiple
// instances and streams, and (2) clients are responsible for calling the
// @mgpu environment initialization/destruction in a thread-safe manner, e.g.,
// at the beginning of the program before multiple threads are created.
112 | static cusparseHandle_t cusparse_env = nullptr; |
113 | |
114 | #ifdef MLIR_ENABLE_CUDA_CUSPARSELT |
115 | // cusparseLtHandle_t is not a pointer type, so we need an additional flag to |
116 | // indicate whether it is initialized. |
117 | static cusparseLtHandle_t cusparseLt_env; |
118 | static bool cusparseLt_initiated = false; |
119 | |
120 | #endif // MLIR_ENABLE_CUDA_CUSPARSELT |
121 | #endif // MLIR_ENABLE_CUDA_CUSPARSE |
122 | |
123 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule |
124 | mgpuModuleLoad(void *data, size_t /*gpuBlobSize*/) { |
125 | ScopedContext scopedContext; |
126 | CUmodule module = nullptr; |
127 | CUDA_REPORT_IF_ERROR(cuModuleLoadData(&module, data)); |
128 | return module; |
129 | } |
130 | |
131 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule mgpuModuleLoadJIT(void *data, |
132 | int optLevel) { |
133 | ScopedContext scopedContext; |
134 | CUmodule module = nullptr; |
135 | char jitErrorBuffer[4096] = {0}; |
136 | CUjit_option jitOptions[] = {CU_JIT_ERROR_LOG_BUFFER, |
137 | CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, |
138 | CU_JIT_OPTIMIZATION_LEVEL}; |
139 | void *jitOptionsVals[] = {jitErrorBuffer, |
140 | reinterpret_cast<void *>(sizeof(jitErrorBuffer)), |
141 | reinterpret_cast<void *>(optLevel)}; |
142 | |
143 | CUresult result = |
144 | cuModuleLoadDataEx(&module, data, 3, jitOptions, jitOptionsVals); |
145 | if (result) { |
    fprintf(stderr, "JIT compilation failed with: '%s'\n", jitErrorBuffer);
147 | CUDA_REPORT_IF_ERROR(result); |
148 | } |
149 | return module; |
150 | } |
151 | |
152 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuModuleUnload(CUmodule module) { |
153 | CUDA_REPORT_IF_ERROR(cuModuleUnload(module)); |
154 | } |
155 | |
156 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUfunction |
157 | mgpuModuleGetFunction(CUmodule module, const char *name) { |
158 | CUfunction function = nullptr; |
159 | CUDA_REPORT_IF_ERROR(cuModuleGetFunction(&function, module, name)); |
160 | return function; |
161 | } |
162 | |
163 | // The wrapper uses intptr_t instead of CUDA's unsigned int to match |
164 | // the type of MLIR's index type. This avoids the need for casts in the |
165 | // generated MLIR code. |
166 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void |
167 | mgpuLaunchKernel(CUfunction function, intptr_t gridX, intptr_t gridY, |
168 | intptr_t gridZ, intptr_t blockX, intptr_t blockY, |
169 | intptr_t blockZ, int32_t smem, CUstream stream, void **params, |
                 void **extra, size_t /*paramsCount*/) {
171 | ScopedContext scopedContext; |
172 | if (smem > 0) { |
    // Avoid querying the driver when no dynamic shared memory is requested;
    // the attribute query is more expensive than this check.
174 | int32_t maxShmem = 0; |
175 | CUdevice device = getDefaultCuDevice(); |
176 | CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice)); |
177 | CUDA_REPORT_IF_ERROR(cuDeviceGetAttribute( |
178 | &maxShmem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, |
179 | device)); |
180 | if (maxShmem < smem) { |
      fprintf(stderr,
              "Requested shared memory (%dkb) is larger than maximum allowed "
              "shared memory (%dkb) for this device\n",
              smem, maxShmem);
185 | } |
186 | CUDA_REPORT_IF_ERROR(cuFuncSetAttribute( |
187 | function, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, smem)); |
188 | } |
189 | debug_print("Launching kernel, grid=%ld,%ld,%ld, " |
190 | "threads: %ld, %ld, %ld, " |
191 | "smem: %dkb\n" , |
192 | gridX, gridY, gridZ, blockX, blockY, blockZ, smem); |
193 | CUDA_REPORT_IF_ERROR(cuLaunchKernel(function, gridX, gridY, gridZ, blockX, |
194 | blockY, blockZ, smem, stream, params, |
195 | extra)); |
196 | } |
197 | |
198 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUstream mgpuStreamCreate() { |
199 | ScopedContext scopedContext; |
200 | CUstream stream = nullptr; |
201 | CUDA_REPORT_IF_ERROR(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)); |
202 | return stream; |
203 | } |
204 | |
205 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuStreamDestroy(CUstream stream) { |
206 | CUDA_REPORT_IF_ERROR(cuStreamDestroy(stream)); |
207 | } |
208 | |
209 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void |
210 | mgpuStreamSynchronize(CUstream stream) { |
211 | CUDA_REPORT_IF_ERROR(cuStreamSynchronize(stream)); |
212 | } |
213 | |
214 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuStreamWaitEvent(CUstream stream, |
215 | CUevent event) { |
216 | CUDA_REPORT_IF_ERROR(cuStreamWaitEvent(stream, event, /*flags=*/0)); |
217 | } |
218 | |
219 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUevent mgpuEventCreate() { |
220 | ScopedContext scopedContext; |
221 | CUevent event = nullptr; |
222 | CUDA_REPORT_IF_ERROR(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING)); |
223 | return event; |
224 | } |
225 | |
226 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuEventDestroy(CUevent event) { |
227 | CUDA_REPORT_IF_ERROR(cuEventDestroy(event)); |
228 | } |
229 | |
230 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuEventSynchronize(CUevent event) { |
231 | CUDA_REPORT_IF_ERROR(cuEventSynchronize(event)); |
232 | } |
233 | |
234 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuEventRecord(CUevent event, |
235 | CUstream stream) { |
236 | CUDA_REPORT_IF_ERROR(cuEventRecord(event, stream)); |
237 | } |
238 | |
239 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void * |
240 | mgpuMemAlloc(uint64_t sizeBytes, CUstream /*stream*/, bool /*isHostShared*/) { |
241 | ScopedContext scopedContext; |
242 | CUdeviceptr ptr = 0; |
243 | if (sizeBytes != 0) |
244 | CUDA_REPORT_IF_ERROR(cuMemAlloc(&ptr, sizeBytes)); |
245 | return reinterpret_cast<void *>(ptr); |
246 | } |
247 | |
248 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuMemFree(void *ptr, |
249 | CUstream /*stream*/) { |
250 | CUDA_REPORT_IF_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(ptr))); |
251 | } |
252 | |
253 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void |
254 | mgpuMemcpy(void *dst, void *src, size_t sizeBytes, CUstream stream) { |
255 | CUDA_REPORT_IF_ERROR(cuMemcpyAsync(reinterpret_cast<CUdeviceptr>(dst), |
256 | reinterpret_cast<CUdeviceptr>(src), |
257 | sizeBytes, stream)); |
258 | } |
259 | |
260 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void |
261 | mgpuMemset32(void *dst, unsigned int value, size_t count, CUstream stream) { |
262 | CUDA_REPORT_IF_ERROR(cuMemsetD32Async(reinterpret_cast<CUdeviceptr>(dst), |
263 | value, count, stream)); |
264 | } |
265 | |
266 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void |
267 | mgpuMemset16(void *dst, unsigned short value, size_t count, CUstream stream) { |
268 | CUDA_REPORT_IF_ERROR(cuMemsetD16Async(reinterpret_cast<CUdeviceptr>(dst), |
269 | value, count, stream)); |
270 | } |
271 | |
272 | /// |
/// Helper functions for writing MLIR example code.
274 | /// |
275 | |
// Registers a byte array with the CUDA runtime. Helpful until we have
// transfer functions implemented.
278 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void |
279 | mgpuMemHostRegister(void *ptr, uint64_t sizeBytes) { |
280 | ScopedContext scopedContext; |
281 | CUDA_REPORT_IF_ERROR(cuMemHostRegister(ptr, sizeBytes, /*flags=*/0)); |
282 | } |
283 | |
284 | /// Registers a memref with the CUDA runtime. `descriptor` is a pointer to a |
285 | /// ranked memref descriptor struct of rank `rank`. Helpful until we have |
286 | /// transfer functions implemented. |
287 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void |
288 | mgpuMemHostRegisterMemRef(int64_t rank, StridedMemRefType<char, 1> *descriptor, |
289 | int64_t elementSizeBytes) { |
290 | // Only densely packed tensors are currently supported. |
291 | #ifdef _WIN32 |
292 | int64_t *denseStrides = (int64_t *)_alloca(rank * sizeof(int64_t)); |
293 | #else |
294 | int64_t *denseStrides = (int64_t *)alloca(rank * sizeof(int64_t)); |
295 | #endif // _WIN32 |
296 | int64_t *sizes = descriptor->sizes; |
297 | for (int64_t i = rank - 1, runningStride = 1; i >= 0; i--) { |
298 | denseStrides[i] = runningStride; |
299 | runningStride *= sizes[i]; |
300 | } |
301 | uint64_t sizeBytes = sizes[0] * denseStrides[0] * elementSizeBytes; |
302 | int64_t *strides = &sizes[rank]; |
303 | (void)strides; |
304 | for (unsigned i = 0; i < rank; ++i) |
305 | assert(strides[i] == denseStrides[i] && |
306 | "Mismatch in computed dense strides" ); |
307 | |
308 | auto *ptr = descriptor->data + descriptor->offset * elementSizeBytes; |
309 | mgpuMemHostRegister(ptr, sizeBytes); |
310 | } |
311 | |
// Unregisters a byte array with the CUDA runtime.
313 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuMemHostUnregister(void *ptr) { |
314 | ScopedContext scopedContext; |
315 | CUDA_REPORT_IF_ERROR(cuMemHostUnregister(ptr)); |
316 | } |
317 | |
318 | /// Unregisters a memref with the CUDA runtime. `descriptor` is a pointer to a |
/// ranked memref descriptor struct of rank `rank`.
320 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void |
321 | mgpuMemHostUnregisterMemRef(int64_t rank, |
322 | StridedMemRefType<char, 1> *descriptor, |
323 | int64_t elementSizeBytes) { |
324 | auto *ptr = descriptor->data + descriptor->offset * elementSizeBytes; |
325 | mgpuMemHostUnregister(ptr); |
326 | } |
327 | |
328 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSetDefaultDevice(int32_t device) { |
329 | defaultDevice = device; |
330 | } |
331 | |
332 | /// |
333 | /// Runtime methods using CUDA 12.0+ driver |
334 | /// |
335 | |
336 | #if (CUDA_VERSION >= 12000) |
337 | |
338 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuLaunchClusterKernel( |
339 | CUfunction function, intptr_t clusterX, intptr_t clusterY, |
340 | intptr_t clusterZ, intptr_t gridX, intptr_t gridY, intptr_t gridZ, |
341 | intptr_t blockX, intptr_t blockY, intptr_t blockZ, int32_t smem, |
342 | CUstream stream, void **params, void **extra, size_t /*paramsCount*/) { |
343 | ScopedContext scopedContext; |
344 | if (smem > 0) { |
    // Avoid querying the driver when no dynamic shared memory is requested;
    // the attribute query is more expensive than this check.
346 | int32_t maxShmem = 0; |
347 | CUdevice device = getDefaultCuDevice(); |
348 | CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice)); |
349 | CUDA_REPORT_IF_ERROR(cuDeviceGetAttribute( |
350 | &maxShmem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, |
351 | device)); |
352 | if (maxShmem < smem) { |
353 | fprintf(stderr, |
354 | "Requested shared memory (%dkb) is larger than maximum allowed " |
355 | "shared memory (%dkb) for this device\n" , |
356 | smem, maxShmem); |
357 | } |
358 | CUDA_REPORT_IF_ERROR(cuFuncSetAttribute( |
359 | function, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, smem)); |
360 | } |
361 | CUlaunchConfig config; |
362 | config.gridDimX = gridX; |
363 | config.gridDimY = gridY; |
364 | config.gridDimZ = gridZ; |
365 | config.blockDimX = blockX; |
366 | config.blockDimY = blockY; |
367 | config.blockDimZ = blockZ; |
368 | config.sharedMemBytes = smem; |
369 | config.hStream = stream; |
370 | CUlaunchAttribute launchAttr[2]; |
371 | launchAttr[0].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION; |
372 | launchAttr[0].value.clusterDim.x = clusterX; |
373 | launchAttr[0].value.clusterDim.y = clusterY; |
374 | launchAttr[0].value.clusterDim.z = clusterZ; |
375 | launchAttr[1].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE; |
376 | launchAttr[1].value.clusterSchedulingPolicyPreference = |
377 | CU_CLUSTER_SCHEDULING_POLICY_SPREAD; |
378 | config.numAttrs = 2; |
379 | config.attrs = launchAttr; |
380 | |
381 | debug_print("Launching kernel," |
382 | "cluster: %ld, %ld, %ld, " |
383 | "grid=%ld,%ld,%ld, " |
384 | "threads: %ld, %ld, %ld, " |
385 | "smem: %dkb\n" , |
386 | clusterX, clusterY, clusterZ, gridX, gridY, gridZ, blockX, blockY, |
387 | blockZ, smem); |
388 | |
389 | CUDA_REPORT_IF_ERROR(cuLaunchKernelEx(&config, function, params, extra)); |
390 | } |
391 | |
392 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuTensorMapEncodeTiled( |
393 | CUtensorMap *tensorMap, // Tensor map object |
394 | CUtensorMapDataType tensorDataType, // Tensor data type |
395 | cuuint32_t tensorRank, // Dimensionality of tensor |
396 | void *globalAddress, // Starting address |
397 | const cuuint64_t *globalDim, // Tensor size (number of elements) |
398 | const cuuint64_t *globalStrides, // Stride size (in bytes) |
    const cuuint32_t *boxDim,              // Traversal box (number of elements)
400 | const cuuint32_t *elementStrides, // Traversal stride |
401 | CUtensorMapInterleave interleave, // Type of interleaved layout |
402 | CUtensorMapSwizzle swizzle, // Bank swizzling pattern |
403 | CUtensorMapL2promotion l2Promotion, // L2 promotion size |
404 | CUtensorMapFloatOOBfill oobFill // Padding zfill or NaN fill |
405 | ) { |
406 | ScopedContext scopedContext; |
407 | CUDA_REPORT_IF_ERROR(cuTensorMapEncodeTiled( |
408 | tensorMap, tensorDataType, tensorRank, globalAddress, globalDim, |
409 | globalStrides, boxDim, elementStrides, interleave, swizzle, l2Promotion, |
410 | oobFill)); |
411 | debug_print("Created TMA descriptor\n Addr: %p\n" |
412 | "data type : %d\n" |
413 | "rank : %d\n" |
414 | "globalDim[5]: %zu, %zu, %zu, %zu, %zu\n" |
415 | "globalStrides[5]: %zu, %zu, %zu, %zu, %zu\n" |
416 | "boxDim[5]: %u, %u, %u, %u, %u\n" |
417 | "elementStrides[5]: %u, %u, %u, %u, %u\n" |
418 | "interleave: %u \n" |
419 | "swizzle: %u \n" |
420 | "l2Promotion: %u \n" |
421 | "oobFill: %u \n" , |
422 | (void *)&tensorMap, tensorDataType, tensorRank, globalDim[0], |
423 | globalDim[1], globalDim[2], globalDim[3], globalDim[4], |
424 | globalStrides[0], globalStrides[1], globalStrides[2], |
425 | globalStrides[3], globalStrides[4], boxDim[0], boxDim[1], |
426 | boxDim[2], boxDim[3], boxDim[4], elementStrides[0], |
427 | elementStrides[1], elementStrides[2], elementStrides[3], |
428 | elementStrides[4], interleave, swizzle, l2Promotion, oobFill); |
429 | } |
430 | |
431 | template <int Rank> |
432 | void mgpuGetMemRefDataAndShape(void *rawDescriptor, char **addr, |
433 | uint64_t *globalDim, uint64_t *globalStrides, |
434 | const CUtensorMapDataType tensorDataType) { |
435 | auto descriptor = |
436 | reinterpret_cast<StridedMemRefType<char, Rank> *>(rawDescriptor); |
437 | *addr = descriptor->data; |
438 | for (int i = 0; i < Rank; ++i) { |
439 | globalDim[i] = static_cast<uint64_t>(descriptor->sizes[Rank - i - 1]); |
440 | } |
441 | static constexpr int elementSizeInBytes[] = {1, 2, 4, 4, 8, 8, 2, |
442 | 4, 8, 2, 4, 4, 4}; |
443 | for (int i = 0; i < Rank - 1; ++i) { |
444 | globalStrides[i] = static_cast<uint64_t>( |
445 | descriptor->strides[Rank - i - 2] * elementSizeInBytes[tensorDataType]); |
446 | } |
447 | } |
448 | |
449 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *mgpuTensorMapEncodeTiledMemref( |
450 | int64_t tensorRank, // Dimensionality of tensor |
451 | void *rankedDescriptor, // Ranked MemRef descriptor |
    const CUtensorMapDataType tensorDataType, // Tensor data type
453 | CUtensorMapInterleave interleave, // Type of interleaved layout |
454 | CUtensorMapSwizzle swizzle, // Bank swizzling pattern |
455 | CUtensorMapL2promotion l2Promotion, // L2 promotion size |
456 | CUtensorMapFloatOOBfill oobFill, // Padding zfill or NaN fill |
    int64_t *inputBoxDims // Box dimensions (number of elements)
458 | ) { |
459 | CUtensorMap tensorMap; |
460 | |
461 | uint32_t boxDim[5] = {1, 1, 1, 1, 1}, elementStrides[5] = {1, 1, 1, 1, 1}; |
462 | uint64_t globalDim[5] = {1, 1, 1, 1, 1}, globalStrides[5] = {0}; |
463 | uint32_t tensorRank32 = uint32_t(tensorRank); |
464 | |
465 | char *globalAddress = nullptr; |
466 | switch (tensorRank) { |
467 | case 1: |
468 | mgpuGetMemRefDataAndShape<1>(rankedDescriptor, &globalAddress, globalDim, |
469 | globalStrides, tensorDataType); |
470 | break; |
471 | case 2: |
472 | mgpuGetMemRefDataAndShape<2>(rankedDescriptor, &globalAddress, globalDim, |
473 | globalStrides, tensorDataType); |
474 | break; |
475 | case 3: |
476 | mgpuGetMemRefDataAndShape<3>(rankedDescriptor, &globalAddress, globalDim, |
477 | globalStrides, tensorDataType); |
478 | break; |
479 | case 4: |
480 | mgpuGetMemRefDataAndShape<4>(rankedDescriptor, &globalAddress, globalDim, |
481 | globalStrides, tensorDataType); |
482 | break; |
483 | case 5: |
484 | mgpuGetMemRefDataAndShape<5>(rankedDescriptor, &globalAddress, globalDim, |
485 | globalStrides, tensorDataType); |
486 | break; |
487 | default: |
488 | fprintf( |
489 | stderr, |
490 | "'mgpuTensorMapEncodeTiledMemref' failed with 'rank is too high'\n" ); |
491 | return nullptr; |
492 | } |
493 | |
494 | for (int64_t r = 0; r < tensorRank; ++r) { |
495 | boxDim[r] = static_cast<uint32_t>(inputBoxDims[tensorRank - r - 1]); |
496 | } |
497 | |
498 | ScopedContext scopedContext; |
499 | mgpuTensorMapEncodeTiled(&tensorMap, tensorDataType, tensorRank32, |
500 | globalAddress, globalDim, globalStrides, boxDim, |
501 | elementStrides, interleave, swizzle, l2Promotion, |
502 | oobFill); |
503 | // Copy created tensor map to device |
504 | CUdeviceptr dTensorMap; |
505 | CUDA_REPORT_IF_ERROR(cuMemAlloc(&dTensorMap, sizeof(CUtensorMap))); |
506 | CUDA_REPORT_IF_ERROR(cuMemcpy(dTensorMap, |
507 | reinterpret_cast<CUdeviceptr>(&tensorMap), |
508 | sizeof(CUtensorMap))); |
509 | return reinterpret_cast<void *>(dTensorMap); |
510 | } |
511 | #endif |
512 | |
513 | #ifdef MLIR_ENABLE_CUDA_CUSPARSE |
514 | |
515 | /// |
516 | /// Wrapper methods for the cuSparse library. |
517 | /// |
518 | |
519 | // Some macro magic to get float/double alpha and beta on host. |
// TODO: add support for passing alpha and beta as arguments
521 | #define ALPHABETA(dtp, alpha, beta) \ |
522 | __nv_bfloat16(alpha##16bf) = 1.0f; \ |
523 | __nv_bfloat16(beta##16bf) = 1.0f; \ |
524 | __half(alpha##16f) = 1.0f; \ |
525 | __half(beta##16f) = 1.0f; \ |
526 | float(alpha##f) = 1.0f; \ |
527 | float(beta##f) = 1.0f; \ |
528 | double(alpha##d) = 1.0; \ |
529 | double(beta##d) = 1.0; \ |
530 | const void *(alpha##p) = nullptr; \ |
531 | const void *(beta##p) = nullptr; \ |
532 | if (dtp == CUDA_R_16BF || dtp == CUDA_C_16BF) { \ |
533 | (alpha##p) = reinterpret_cast<void *>(&(alpha##16bf)); \ |
534 | (beta##p) = reinterpret_cast<void *>(&(beta##16bf)); \ |
535 | } else if (dtp == CUDA_R_16F || dtp == CUDA_C_16F) { \ |
536 | (alpha##p) = reinterpret_cast<void *>(&(alpha##16f)); \ |
537 | (beta##p) = reinterpret_cast<void *>(&(beta##16f)); \ |
538 | } else if (dtp == CUDA_R_32F || dtp == CUDA_C_32F) { \ |
539 | (alpha##p) = reinterpret_cast<void *>(&(alpha##f)); \ |
540 | (beta##p) = reinterpret_cast<void *>(&(beta##f)); \ |
541 | } else { \ |
542 | (alpha##p) = reinterpret_cast<void *>(&(alpha##d)); \ |
543 | (beta##p) = reinterpret_cast<void *>(&(beta##d)); \ |
544 | } |
545 | |
546 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuCreateSparseEnv() { |
547 | // ScopedContext is for cuda initialization. |
548 | ScopedContext scopedContext; |
  assert(!cusparse_env && "client called mgpuCreateSparseEnv() twice");
550 | CUSPARSE_REPORT_IF_ERROR(cusparseCreate(&cusparse_env)); |
551 | } |
552 | |
553 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuDestroySparseEnv() { |
  assert(cusparse_env && "client did not call mgpuCreateSparseEnv()");
555 | CUSPARSE_REPORT_IF_ERROR(cusparseDestroy(cusparse_env)); |
556 | cusparse_env = nullptr; |
557 | } |
558 | |
559 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void * |
560 | mgpuCreateDnVec(intptr_t size, void *values, int32_t dtp, CUstream /*stream*/) { |
561 | cusparseDnVecDescr_t vec = nullptr; |
562 | auto dTp = static_cast<cudaDataType_t>(dtp); |
563 | CUSPARSE_REPORT_IF_ERROR(cusparseCreateDnVec(&vec, size, values, dTp)) |
564 | return reinterpret_cast<void *>(vec); |
565 | } |
566 | |
567 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void |
568 | mgpuDestroyDnVec(void *v, CUstream /*stream*/) { |
569 | cusparseDnVecDescr_t vec = reinterpret_cast<cusparseDnVecDescr_t>(v); |
570 | CUSPARSE_REPORT_IF_ERROR(cusparseDestroyDnVec(vec)) |
571 | } |
572 | |
573 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void * |
574 | mgpuCreateDnMat(intptr_t rows, intptr_t cols, void *values, int32_t dtp, |
575 | CUstream /*stream*/) { |
576 | cusparseDnMatDescr_t mat = nullptr; |
577 | auto dTp = static_cast<cudaDataType_t>(dtp); |
578 | CUSPARSE_REPORT_IF_ERROR(cusparseCreateDnMat(&mat, rows, cols, /*ld=*/cols, |
579 | values, dTp, CUSPARSE_ORDER_ROW)) |
580 | return reinterpret_cast<void *>(mat); |
581 | } |
582 | |
583 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void |
584 | mgpuDestroyDnMat(void *m, CUstream /*stream*/) { |
585 | cusparseDnMatDescr_t mat = reinterpret_cast<cusparseDnMatDescr_t>(m); |
586 | CUSPARSE_REPORT_IF_ERROR(cusparseDestroyDnMat(mat)) |
587 | } |
588 | |
589 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void * |
590 | mgpuCreateCoo(intptr_t rows, intptr_t cols, intptr_t nnz, void *rowIdxs, |
591 | void *colIdxs, void *values, int32_t itp, int32_t dtp, |
592 | CUstream /*stream*/) { |
593 | cusparseSpMatDescr_t mat = nullptr; |
594 | auto iTp = static_cast<cusparseIndexType_t>(itp); |
595 | auto dTp = static_cast<cudaDataType_t>(dtp); |
596 | CUSPARSE_REPORT_IF_ERROR(cusparseCreateCoo(&mat, rows, cols, nnz, rowIdxs, |
597 | colIdxs, values, iTp, |
598 | CUSPARSE_INDEX_BASE_ZERO, dTp)) |
599 | return reinterpret_cast<void *>(mat); |
600 | } |
601 | |
602 | #ifdef CUSPARSE_COO_AOS // deprecated in cuSPARSE 11.2 |
603 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void * |
604 | mgpuCreateCooAoS(intptr_t rows, intptr_t cols, intptr_t nnz, void *idxs, |
605 | void *values, int32_t itp, int32_t dtp, CUstream /*stream*/) { |
606 | cusparseSpMatDescr_t mat = nullptr; |
607 | auto iTp = static_cast<cusparseIndexType_t>(itp); |
608 | auto dTp = static_cast<cudaDataType_t>(dtp); |
609 | CUSPARSE_REPORT_IF_ERROR(cusparseCreateCooAoS( |
610 | &mat, rows, cols, nnz, idxs, values, iTp, CUSPARSE_INDEX_BASE_ZERO, dTp)) |
611 | return reinterpret_cast<void *>(mat); |
612 | } |
613 | #endif // CUSPARSE_COO_AOS |
614 | |
615 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void * |
616 | mgpuCreateCsr(intptr_t rows, intptr_t cols, intptr_t nnz, void *rowPos, |
617 | void *colIdxs, void *values, int32_t ptp, int32_t itp, |
618 | int32_t dtp, CUstream /*stream*/) { |
619 | cusparseSpMatDescr_t mat = nullptr; |
620 | auto pTp = static_cast<cusparseIndexType_t>(ptp); |
621 | auto iTp = static_cast<cusparseIndexType_t>(itp); |
622 | auto dTp = static_cast<cudaDataType_t>(dtp); |
623 | CUSPARSE_REPORT_IF_ERROR(cusparseCreateCsr(&mat, rows, cols, nnz, rowPos, |
624 | colIdxs, values, pTp, iTp, |
625 | CUSPARSE_INDEX_BASE_ZERO, dTp)) |
626 | return reinterpret_cast<void *>(mat); |
627 | } |
628 | |
629 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void * |
630 | mgpuCreateCsc(intptr_t rows, intptr_t cols, intptr_t nnz, void *colPos, |
631 | void *rowIdxs, void *values, int32_t ptp, int32_t itp, |
632 | int32_t dtp, CUstream /*stream*/) { |
633 | cusparseSpMatDescr_t mat = nullptr; |
634 | auto pTp = static_cast<cusparseIndexType_t>(ptp); |
635 | auto iTp = static_cast<cusparseIndexType_t>(itp); |
636 | auto dTp = static_cast<cudaDataType_t>(dtp); |
637 | CUSPARSE_REPORT_IF_ERROR(cusparseCreateCsc(&mat, rows, cols, nnz, colPos, |
638 | rowIdxs, values, pTp, iTp, |
639 | CUSPARSE_INDEX_BASE_ZERO, dTp)) |
640 | return reinterpret_cast<void *>(mat); |
641 | } |
642 | |
643 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void * |
644 | mgpuCreateBsr(intptr_t brows, intptr_t bcols, intptr_t bnnz, intptr_t rBsz, |
645 | intptr_t cBsz, void *rowPos, void *colIdxs, void *values, |
646 | int32_t ptp, int32_t itp, int32_t dtp, CUstream /*stream*/) { |
647 | cusparseSpMatDescr_t mat = nullptr; |
648 | #if CUSPARSE_VERSION >= 12100 |
649 | auto pTp = static_cast<cusparseIndexType_t>(ptp); |
650 | auto iTp = static_cast<cusparseIndexType_t>(itp); |
651 | auto dTp = static_cast<cudaDataType_t>(dtp); |
652 | CUSPARSE_REPORT_IF_ERROR(cusparseCreateBsr( |
653 | &mat, brows, bcols, bnnz, rBsz, cBsz, rowPos, colIdxs, values, pTp, iTp, |
654 | CUSPARSE_INDEX_BASE_ZERO, dTp, CUSPARSE_ORDER_ROW)) |
655 | #endif |
656 | return reinterpret_cast<void *>(mat); |
657 | } |
658 | |
659 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void |
660 | mgpuDestroySpMat(void *m, CUstream /*stream*/) { |
661 | cusparseSpMatDescr_t mat = reinterpret_cast<cusparseSpMatDescr_t>(m); |
662 | CUSPARSE_REPORT_IF_ERROR(cusparseDestroySpMat(mat)) |
663 | } |
664 | |
665 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t mgpuSpMVBufferSize( |
666 | int32_t ma, void *a, void *x, void *y, int32_t ctp, CUstream /*stream*/) { |
  assert(cusparse_env && "client did not call mgpuCreateSparseEnv()");
668 | cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma); |
669 | cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a); |
670 | cusparseDnVecDescr_t vecX = reinterpret_cast<cusparseDnVecDescr_t>(x); |
671 | cusparseDnVecDescr_t vecY = reinterpret_cast<cusparseDnVecDescr_t>(y); |
672 | cudaDataType_t cTp = static_cast<cudaDataType_t>(ctp); |
673 | ALPHABETA(cTp, alpha, beta) |
674 | size_t bufferSize = 0; |
675 | CUSPARSE_REPORT_IF_ERROR(cusparseSpMV_bufferSize( |
676 | cusparse_env, modeA, alphap, matA, vecX, betap, vecY, cTp, |
677 | CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize)) |
678 | return bufferSize; |
679 | } |
680 | |
681 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSpMV(int32_t ma, void *a, void *x, |
682 | void *y, int32_t ctp, |
683 | void *buf, |
684 | CUstream /*stream*/) { |
  assert(cusparse_env && "client did not call mgpuCreateSparseEnv()");
686 | cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma); |
687 | cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a); |
688 | cusparseDnVecDescr_t vecX = reinterpret_cast<cusparseDnVecDescr_t>(x); |
689 | cusparseDnVecDescr_t vecY = reinterpret_cast<cusparseDnVecDescr_t>(y); |
690 | cudaDataType_t cTp = static_cast<cudaDataType_t>(ctp); |
691 | ALPHABETA(cTp, alpha, beta) |
692 | CUSPARSE_REPORT_IF_ERROR(cusparseSpMV(cusparse_env, modeA, alphap, matA, vecX, |
693 | betap, vecY, cTp, |
694 | CUSPARSE_SPMV_ALG_DEFAULT, buf)) |
695 | } |
696 | |
697 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t |
698 | mgpuSpMMBufferSize(int32_t ma, int32_t mb, void *a, void *b, void *c, |
699 | int32_t ctp, CUstream /*stream*/) { |
  assert(cusparse_env && "client did not call mgpuCreateSparseEnv()");
701 | cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma); |
702 | cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb); |
703 | cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a); |
704 | cusparseDnMatDescr_t matB = reinterpret_cast<cusparseDnMatDescr_t>(b); |
705 | cusparseDnMatDescr_t matC = reinterpret_cast<cusparseDnMatDescr_t>(c); |
706 | cudaDataType_t cTp = static_cast<cudaDataType_t>(ctp); |
707 | ALPHABETA(cTp, alpha, beta) |
708 | size_t bufferSize = 0; |
709 | CUSPARSE_REPORT_IF_ERROR(cusparseSpMM_bufferSize( |
710 | cusparse_env, modeA, modeB, alphap, matA, matB, betap, matC, cTp, |
711 | CUSPARSE_SPMM_ALG_DEFAULT, &bufferSize)) |
712 | return bufferSize; |
713 | } |
714 | |
715 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSpMM(int32_t ma, int32_t mb, |
716 | void *a, void *b, void *c, |
717 | int32_t ctp, void *buf, |
718 | CUstream /*stream*/) { |
  assert(cusparse_env && "client did not call mgpuCreateSparseEnv()");
720 | cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma); |
721 | cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb); |
722 | cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a); |
723 | cusparseDnMatDescr_t matB = reinterpret_cast<cusparseDnMatDescr_t>(b); |
724 | cusparseDnMatDescr_t matC = reinterpret_cast<cusparseDnMatDescr_t>(c); |
725 | cudaDataType_t cTp = static_cast<cudaDataType_t>(ctp); |
726 | ALPHABETA(cTp, alpha, beta) |
727 | CUSPARSE_REPORT_IF_ERROR(cusparseSpMM(cusparse_env, modeA, modeB, alphap, |
728 | matA, matB, betap, matC, cTp, |
729 | CUSPARSE_SPMM_ALG_DEFAULT, buf)) |
730 | } |
731 | |
732 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t |
733 | mgpuSDDMMBufferSize(int32_t ma, int32_t mb, void *a, void *b, void *c, |
734 | int32_t ctp, CUstream /*stream*/) { |
  assert(cusparse_env && "client did not call mgpuCreateSparseEnv()");
736 | cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma); |
737 | cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb); |
738 | cusparseDnMatDescr_t matA = reinterpret_cast<cusparseDnMatDescr_t>(a); |
739 | cusparseDnMatDescr_t matB = reinterpret_cast<cusparseDnMatDescr_t>(b); |
740 | cusparseSpMatDescr_t matC = reinterpret_cast<cusparseSpMatDescr_t>(c); |
741 | auto cTp = static_cast<cudaDataType_t>(ctp); |
742 | ALPHABETA(cTp, alpha, beta) |
743 | size_t bufferSize = 0; |
744 | CUSPARSE_REPORT_IF_ERROR(cusparseSDDMM_bufferSize( |
745 | cusparse_env, modeA, modeB, alphap, matA, matB, betap, matC, cTp, |
746 | CUSPARSE_SDDMM_ALG_DEFAULT, &bufferSize)) |
747 | return bufferSize; |
748 | } |
749 | |
750 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSDDMM(int32_t ma, int32_t mb, |
751 | void *a, void *b, void *c, |
752 | int32_t ctp, void *buf, |
753 | CUstream /*stream*/) { |
  assert(cusparse_env && "client did not call mgpuCreateSparseEnv()");
755 | cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma); |
756 | cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb); |
757 | cusparseDnMatDescr_t matA = reinterpret_cast<cusparseDnMatDescr_t>(a); |
758 | cusparseDnMatDescr_t matB = reinterpret_cast<cusparseDnMatDescr_t>(b); |
759 | cusparseSpMatDescr_t matC = reinterpret_cast<cusparseSpMatDescr_t>(c); |
760 | auto cTp = static_cast<cudaDataType_t>(ctp); |
761 | ALPHABETA(cTp, alpha, beta) |
762 | CUSPARSE_REPORT_IF_ERROR(cusparseSDDMM(cusparse_env, modeA, modeB, alphap, |
763 | matA, matB, betap, matC, cTp, |
764 | CUSPARSE_SDDMM_ALG_DEFAULT, buf)) |
765 | } |
766 | |
767 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void * |
768 | mgpuSpGEMMCreateDescr(CUstream /*stream*/) { |
769 | cusparseSpGEMMDescr_t spgemmDesc = nullptr; |
770 | CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_createDescr(&spgemmDesc)) |
771 | return reinterpret_cast<void *>(spgemmDesc); |
772 | } |
773 | |
774 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void |
775 | mgpuSpGEMMDestroyDescr(void *s, CUstream /*stream*/) { |
776 | cusparseSpGEMMDescr_t spgemmDesc = reinterpret_cast<cusparseSpGEMMDescr_t>(s); |
777 | CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_destroyDescr(spgemmDesc)) |
778 | } |
779 | |
780 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t mgpuSpGEMMWorkEstimation( |
781 | void *s, int32_t ma, int32_t mb, void *a, void *b, void *c, int32_t ctp, |
782 | intptr_t bs, void *buf, CUstream /*stream*/) { |
783 | cusparseSpGEMMDescr_t spgemmDesc = reinterpret_cast<cusparseSpGEMMDescr_t>(s); |
784 | cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma); |
785 | cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb); |
786 | cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a); |
787 | cusparseSpMatDescr_t matB = reinterpret_cast<cusparseSpMatDescr_t>(b); |
788 | cusparseSpMatDescr_t matC = reinterpret_cast<cusparseSpMatDescr_t>(c); |
789 | auto cTp = static_cast<cudaDataType_t>(ctp); |
790 | ALPHABETA(cTp, alpha, beta) |
791 | size_t newBufferSize = bs; |
792 | CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_workEstimation( |
793 | cusparse_env, modeA, modeB, alphap, matA, matB, betap, matC, cTp, |
794 | CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, &newBufferSize, buf)) |
795 | return newBufferSize; |
796 | } |
797 | |
798 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t |
799 | mgpuSpGEMMCompute(void *s, int32_t ma, int32_t mb, void *a, void *b, void *c, |
800 | int32_t ctp, intptr_t bsz2, void *buf2, CUstream /*stream*/) { |
801 | cusparseSpGEMMDescr_t spgemmDesc = reinterpret_cast<cusparseSpGEMMDescr_t>(s); |
802 | cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma); |
803 | cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb); |
804 | cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a); |
805 | cusparseSpMatDescr_t matB = reinterpret_cast<cusparseSpMatDescr_t>(b); |
806 | cusparseSpMatDescr_t matC = reinterpret_cast<cusparseSpMatDescr_t>(c); |
807 | auto cTp = static_cast<cudaDataType_t>(ctp); |
808 | ALPHABETA(cTp, alpha, beta) |
809 | size_t newBufferSize2 = bsz2; |
810 | CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_compute( |
811 | cusparse_env, modeA, modeB, alphap, matA, matB, betap, matC, cTp, |
812 | CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, &newBufferSize2, buf2)) |
813 | return newBufferSize2; |
814 | } |
815 | |
816 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void |
817 | mgpuSpGEMMCopy(void *s, int32_t ma, int32_t mb, void *a, void *b, void *c, |
818 | int32_t ctp, CUstream /*stream*/) { |
819 | cusparseSpGEMMDescr_t spgemmDesc = reinterpret_cast<cusparseSpGEMMDescr_t>(s); |
820 | cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma); |
821 | cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb); |
822 | cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a); |
823 | cusparseSpMatDescr_t matB = reinterpret_cast<cusparseSpMatDescr_t>(b); |
824 | cusparseSpMatDescr_t matC = reinterpret_cast<cusparseSpMatDescr_t>(c); |
825 | auto cTp = static_cast<cudaDataType_t>(ctp); |
826 | ALPHABETA(cTp, alpha, beta) |
827 | CUSPARSE_REPORT_IF_ERROR( |
828 | cusparseSpGEMM_copy(cusparse_env, modeA, modeB, alphap, matA, matB, betap, |
829 | matC, cTp, CUSPARSE_SPGEMM_DEFAULT, spgemmDesc)) |
830 | } |
831 | |
832 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void |
833 | mgpuSpMatGetSize(void *m, void *r, void *c, void *n, CUstream /*stream*/) { |
834 | cusparseConstSpMatDescr_t matDescr = |
835 | reinterpret_cast<cusparseConstSpMatDescr_t>(m); |
836 | int64_t *rows = reinterpret_cast<int64_t *>(r); |
837 | int64_t *cols = reinterpret_cast<int64_t *>(c); |
838 | int64_t *nnz = reinterpret_cast<int64_t *>(n); |
839 | CUSPARSE_REPORT_IF_ERROR(cusparseSpMatGetSize(matDescr, rows, cols, nnz)); |
840 | } |
841 | |
842 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void |
843 | mgpuSetCsrPointers(void *m, void *p, void *c, void *v, CUstream /*stream*/) { |
844 | cusparseSpMatDescr_t matDescr = reinterpret_cast<cusparseSpMatDescr_t>(m); |
845 | CUSPARSE_REPORT_IF_ERROR(cusparseCsrSetPointers(matDescr, p, c, v)); |
846 | } |
847 | |
848 | #ifdef MLIR_ENABLE_CUDA_CUSPARSELT |
849 | |
850 | /// |
851 | /// Wrapper methods for the cuSparseLt library. |
852 | /// |
853 | |
854 | struct cusparseLtSpMatHandleAndData { |
855 | cusparseLtMatDescriptor_t mat; |
856 | // TODO: the following three are associated with the SpMM operator rather than |
857 | // the sparse matrix. Create workspace buffers and pass them to the SpMM |
858 | // execution. |
859 | cusparseLtMatmulAlgSelection_t alg_sel; |
860 | cusparseLtMatmulPlan_t plan; |
861 | cusparseLtMatmulDescriptor_t matmul; |
862 | void *values{nullptr}; |
863 | }; |
864 | |
865 | struct cusparseLtDnMatHandleAndData { |
866 | cusparseLtMatDescriptor_t mat; |
867 | void *values{nullptr}; |
868 | }; |
869 | |
static_assert(sizeof(cusparseLtHandle_t) == 11024,
              "Unexpected cusparseLt handle size");
static_assert(sizeof(cusparseLtSpMatHandleAndData) == 44104,
              "Unexpected cusparseLt sparse matrix handle size");
static_assert(sizeof(cusparseLtDnMatHandleAndData) == 11032,
              "Unexpected cusparseLt dense matrix handle size");
876 | |
877 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuCreateSparseLtEnv() { |
878 | // ScopedContext is for cuda initialization. |
879 | ScopedContext scopedContext; |
880 | assert(!cusparseLt_initiated && |
881 | "client called mgpuCreateSparseLtEnv() twice" ); |
882 | // Note that cuSparseLt still uses cusparseStatus_t. |
883 | CUSPARSE_REPORT_IF_ERROR(cusparseLtInit(&cusparseLt_env)); |
884 | cusparseLt_initiated = true; |
885 | } |
886 | |
887 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuDestroySparseLtEnv() { |
  assert(cusparseLt_initiated && "client did not call mgpuCreateSparseLtEnv()");
889 | CUSPARSE_REPORT_IF_ERROR(cusparseLtDestroy(&cusparseLt_env)); |
890 | cusparseLt_initiated = false; |
891 | } |
892 | |
893 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void |
894 | mgpuCreateCuSparseLtDnMat(void *dh, intptr_t rows, intptr_t cols, void *values, |
895 | int32_t dtp, CUstream /*stream*/) { |
  assert(cusparseLt_initiated && "client did not call mgpuCreateSparseLtEnv()");
897 | auto dnmat_handle = reinterpret_cast<cusparseLtDnMatHandleAndData *>(dh); |
898 | dnmat_handle->values = values; |
899 | auto dTp = static_cast<cudaDataType_t>(dtp); |
900 | // Assume row-major when deciding lda. |
901 | const uint32_t alignment = 16; |
902 | CUSPARSE_REPORT_IF_ERROR(cusparseLtDenseDescriptorInit( |
903 | &cusparseLt_env, &(dnmat_handle->mat), rows, cols, /*lda=*/cols, |
904 | alignment, dTp, CUSPARSE_ORDER_ROW)) |
905 | } |
906 | |
907 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void |
908 | mgpuDestroyCuSparseLtDnMat(void *dh, CUstream /*stream*/) { |
909 | auto dnmat_handle = reinterpret_cast<cusparseLtDnMatHandleAndData *>(dh); |
910 | CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(dnmat_handle->mat))) |
911 | } |
912 | |
913 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void |
914 | mgpuCusparseLtCreate2To4SpMat(void *sh, intptr_t rows, intptr_t cols, |
915 | void *values, int32_t dtp, CUstream /*stream*/) { |
  assert(cusparseLt_initiated && "client did not call mgpuCreateSparseLtEnv()");
917 | auto spmat_handle = reinterpret_cast<cusparseLtSpMatHandleAndData *>(sh); |
918 | spmat_handle->values = values; |
919 | auto dTp = static_cast<cudaDataType_t>(dtp); |
920 | // Assume row-major when deciding lda. |
921 | const uint32_t alignment = 16; |
922 | CUSPARSE_REPORT_IF_ERROR(cusparseLtStructuredDescriptorInit( |
923 | &cusparseLt_env, &(spmat_handle->mat), rows, cols, /*ld=*/cols, alignment, |
924 | dTp, CUSPARSE_ORDER_ROW, CUSPARSELT_SPARSITY_50_PERCENT)) |
925 | } |
926 | |
927 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void |
928 | mgpuDestroyCuSparseLtSpMat(void *sh, CUstream /*stream*/) { |
929 | auto spmat_handle = reinterpret_cast<cusparseLtSpMatHandleAndData *>(sh); |
930 | CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(spmat_handle->mat))) |
931 | } |
932 | |
// Several things happen in this stage: algorithm selection, planning, and
// returning the workspace and compressed-matrix data buffer sizes.
// The parameter prune_flag indicates whether pruning and a pruning check
// should happen: 0 means neither prune nor prune check, 1 means prune, and
// 2 means prune and prune check.
938 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void |
939 | mgpuCuSparseLtSpMMBufferSize(void *bs, int32_t ma, int32_t mb, void *a, void *b, |
940 | void *c, int32_t ctp, int32_t prune_flag, |
941 | CUstream stream) { |
  assert(cusparseLt_initiated && "client did not call mgpuCreateSparseLtEnv()");
  // TODO: support more advanced settings, e.g., a sparse right-hand operand;
  // for now, assume that matA is the sparse matrix.
945 | auto matA = reinterpret_cast<cusparseLtSpMatHandleAndData *>(a); |
946 | auto matB = reinterpret_cast<cusparseLtDnMatHandleAndData *>(b); |
947 | auto matC = reinterpret_cast<cusparseLtDnMatHandleAndData *>(c); |
948 | auto workspace_size = reinterpret_cast<size_t *>(bs); |
949 | auto compressed_size = &(reinterpret_cast<size_t *>(bs)[1]); |
950 | auto compressed_buffer_size = &(reinterpret_cast<size_t *>(bs)[2]); |
951 | auto cTp = static_cast<cusparseComputeType>(ctp); |
952 | |
953 | cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma); |
954 | cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb); |
955 | CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulDescriptorInit( |
956 | &cusparseLt_env, &(matA->matmul), modeA, modeB, &(matA->mat), |
957 | &(matB->mat), &(matC->mat), &(matC->mat), cTp)) |
958 | CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulAlgSelectionInit( |
959 | &cusparseLt_env, &(matA->alg_sel), &(matA->matmul), |
960 | CUSPARSELT_MATMUL_ALG_DEFAULT)) |
961 | int alg = 0; |
962 | CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulAlgSetAttribute( |
963 | &cusparseLt_env, &(matA->alg_sel), CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, |
964 | sizeof(alg))) |
965 | |
966 | CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulPlanInit( |
967 | &cusparseLt_env, &(matA->plan), &(matA->matmul), &(matA->alg_sel))) |
968 | |
969 | // Pruning step (in-place). |
970 | if (prune_flag > 0) |
971 | CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMAPrune( |
972 | &cusparseLt_env, &(matA->matmul), matA->values, matA->values, |
973 | CUSPARSELT_PRUNE_SPMMA_STRIP, stream)) |
974 | |
975 | // Check structure of A. |
976 | // Note that this adds a synchronization on the stream. |
977 | // TODO: Do we want that? |
978 | if (prune_flag == 2) { |
979 | int *dvalid = (int *)mgpuMemAlloc(sizeof(int), stream, false); |
980 | CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMAPruneCheck( |
981 | &cusparseLt_env, &(matA->matmul), matA->values, dvalid, stream)) |
982 | int valid = 0; |
983 | mgpuMemcpy(&valid, dvalid, sizeof(int), stream); |
984 | mgpuStreamSynchronize(stream); |
985 | mgpuMemFree(dvalid, stream); |
986 | if (valid != 0) |
987 | fprintf(stderr, "CUPARSE-LT: sparse matrix is not 2:4; computed results " |
988 | "will be invalid\n" ); |
989 | } |
990 | |
991 | CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulGetWorkspace( |
992 | &cusparseLt_env, &(matA->plan), workspace_size)) |
993 | CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMACompressedSize( |
994 | &cusparseLt_env, &(matA->plan), compressed_size, compressed_buffer_size)) |
995 | } |
996 | |
997 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void |
998 | mgpuCuSparseLtSpMM(void *a, void *b, void *c, void *d_workspace, |
999 | void *dA_compressed, void *dA_compressedBuffer, |
1000 | CUstream stream) { |
  assert(cusparseLt_initiated && "client did not call mgpuCreateSparseLtEnv()");
1002 | auto matA = reinterpret_cast<cusparseLtSpMatHandleAndData *>(a); |
1003 | auto matB = reinterpret_cast<cusparseLtDnMatHandleAndData *>(b); |
1004 | auto matC = reinterpret_cast<cusparseLtDnMatHandleAndData *>(c); |
1005 | |
1006 | ALPHABETA(CUDA_R_32F, alpha, beta) |
1007 | CUSPARSE_REPORT_IF_ERROR( |
1008 | cusparseLtSpMMACompress(&cusparseLt_env, &(matA->plan), (matA->values), |
1009 | dA_compressed, dA_compressedBuffer, stream)) |
1010 | |
  // TODO: add support for multi-stream execution.
  // Perform the matrix multiplication D = A*B+C, using C == D for now.
1013 | CUSPARSE_REPORT_IF_ERROR( |
1014 | cusparseLtMatmul(&cusparseLt_env, &(matA->plan), alphap, dA_compressed, |
1015 | matB->values, betap, matC->values, |
1016 | /*dD*/ matC->values, d_workspace, nullptr, 0)) |
1017 | |
1018 | CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(matA->mat))) |
1019 | // destroy the plan associated with the sparse matrix |
1020 | CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulPlanDestroy(&(matA->plan))) |
1021 | } |
1022 | |
1023 | #endif // MLIR_ENABLE_CUDA_CUSPARSELT |
1024 | #endif // MLIR_ENABLE_CUDA_CUSPARSE |
1025 | |