//===- CudaRuntimeWrappers.cpp - MLIR CUDA API wrapper library ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Implements C wrappers around the CUDA library for easy linking in ORC jit.
// Also adds some debugging helpers that are helpful when writing MLIR code to
// run on GPUs.
//
//===----------------------------------------------------------------------===//

#include "mlir/ExecutionEngine/CRunnerUtils.h"

#include <cstdio>

#include "cuda.h"
#include "cuda_bf16.h"
#include "cuda_fp16.h"

#ifdef MLIR_ENABLE_CUDA_CUSPARSE
#include "cusparse.h"
#ifdef MLIR_ENABLE_CUDA_CUSPARSELT
#include "cusparseLt.h"
#endif // MLIR_ENABLE_CUDA_CUSPARSELT
#endif // MLIR_ENABLE_CUDA_CUSPARSE

#ifdef _WIN32
#include <malloc.h>
#define MLIR_CUDA_WRAPPERS_EXPORT __declspec(dllexport)
#else
#define MLIR_CUDA_WRAPPERS_EXPORT __attribute__((visibility("default")))
#endif // _WIN32

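// Reports the name of the failing call and the CUDA error name to stderr and
// otherwise continues. Illustrative usage with an arbitrary driver API call:
//   CUDA_REPORT_IF_ERROR(cuCtxSynchronize());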
#define CUDA_REPORT_IF_ERROR(expr) \
  [](CUresult result) { \
    if (!result) \
      return; \
    const char *name = nullptr; \
    cuGetErrorName(result, &name); \
    if (!name) \
      name = "<unknown>"; \
    fprintf(stderr, "'%s' failed with '%s'\n", #expr, name); \
  }(expr)

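// Same idea for cuSPARSE calls returning cusparseStatus_t: the error string is
// printed to stderr and execution continues.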
#define CUSPARSE_REPORT_IF_ERROR(expr) \
  { \
    cusparseStatus_t status = (expr); \
    if (status != CUSPARSE_STATUS_SUCCESS) { \
      fprintf(stderr, "cuSPARSE '%s' failed with '%s'\n", #expr, \
              cusparseGetErrorString(status)); \
    } \
  }

thread_local static int32_t defaultDevice = 0;

/// Helper method that checks whether debugging is enabled, i.e. whether the
/// MLIR_CUDA_DEBUG environment variable is set (to any value).
bool isDebugEnabled() {
  const char *kDebugEnvironmentVariable = "MLIR_CUDA_DEBUG";
  static bool isEnabled = getenv(kDebugEnvironmentVariable) != nullptr;
  return isEnabled;
}

#define debug_print(fmt, ...) \
  do { \
    if (isDebugEnabled()) \
      fprintf(stderr, "%s:%d:%s(): " fmt, "CudaRuntimeWrappers.cpp", __LINE__, \
              __func__, __VA_ARGS__); \
  } while (0)
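// Only prints when MLIR_CUDA_DEBUG is set. Illustrative usage (the format
// string takes at least one argument):
//   debug_print("using device %d\n", defaultDevice);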

// Returns the default CUdevice.
CUdevice getDefaultCuDevice() {
  CUdevice device;
  CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice));
  return device;
}

// Make the primary context of the current default device current for the
// duration of the instance and restore the previous context on destruction.
class ScopedContext {
public:
  ScopedContext() {
    // Static reference to CUDA primary context for device ordinal
    // defaultDevice.
    static CUcontext context = [] {
      CUDA_REPORT_IF_ERROR(cuInit(/*flags=*/0));
      CUcontext ctx;
      // Note: this does not affect the current context.
      CUDA_REPORT_IF_ERROR(
          cuDevicePrimaryCtxRetain(&ctx, getDefaultCuDevice()));
      return ctx;
    }();

    CUDA_REPORT_IF_ERROR(cuCtxPushCurrent(context));
  }

  ~ScopedContext() { CUDA_REPORT_IF_ERROR(cuCtxPopCurrent(nullptr)); }
};
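// Illustrative usage: wrappers that issue context-dependent driver calls
// create a ScopedContext on the stack first, e.g.
//   {
//     ScopedContext scopedContext;
//     CUDA_REPORT_IF_ERROR(cuCtxSynchronize());
//   } // The previously current context is restored here.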

#ifdef MLIR_ENABLE_CUDA_CUSPARSE
// Note that (1) Nvidia confirms it is safe to share a handle across multiple
// instances and streams, and (2) clients are responsible for calling the @mgpu
// environment initialization/destruction in a thread-safe manner, e.g., at the
// beginning of the program before multiple threads are created.
static cusparseHandle_t cusparse_env = nullptr;

#ifdef MLIR_ENABLE_CUDA_CUSPARSELT
// cusparseLtHandle_t is not a pointer type, so we need an additional flag to
// indicate whether it is initialized.
static cusparseLtHandle_t cusparseLt_env;
static bool cusparseLt_initiated = false;

#endif // MLIR_ENABLE_CUDA_CUSPARSELT
#endif // MLIR_ENABLE_CUDA_CUSPARSE

extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule
mgpuModuleLoad(void *data, size_t /*gpuBlobSize*/) {
  ScopedContext scopedContext;
  CUmodule module = nullptr;
  CUDA_REPORT_IF_ERROR(cuModuleLoadData(&module, data));
  return module;
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule mgpuModuleLoadJIT(void *data,
                                                                int optLevel) {
  ScopedContext scopedContext;
  CUmodule module = nullptr;
  char jitErrorBuffer[4096] = {0};
  CUjit_option jitOptions[] = {CU_JIT_ERROR_LOG_BUFFER,
                               CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
                               CU_JIT_OPTIMIZATION_LEVEL};
  void *jitOptionsVals[] = {jitErrorBuffer,
                            reinterpret_cast<void *>(sizeof(jitErrorBuffer)),
                            reinterpret_cast<void *>(optLevel)};

  CUresult result =
      cuModuleLoadDataEx(&module, data, 3, jitOptions, jitOptionsVals);
  if (result) {
    fprintf(stderr, "JIT compilation failed with: '%s'\n", jitErrorBuffer);
    CUDA_REPORT_IF_ERROR(result);
  }
  return module;
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuModuleUnload(CUmodule module) {
  CUDA_REPORT_IF_ERROR(cuModuleUnload(module));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUfunction
mgpuModuleGetFunction(CUmodule module, const char *name) {
  CUfunction function = nullptr;
  CUDA_REPORT_IF_ERROR(cuModuleGetFunction(&function, module, name));
  return function;
}

// The wrapper uses intptr_t instead of CUDA's unsigned int to match
// the type of MLIR's index type. This avoids the need for casts in the
// generated MLIR code.
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuLaunchKernel(CUfunction function, intptr_t gridX, intptr_t gridY,
                 intptr_t gridZ, intptr_t blockX, intptr_t blockY,
                 intptr_t blockZ, int32_t smem, CUstream stream, void **params,
                 void **extra, size_t /*paramsCount*/) {
  ScopedContext scopedContext;
  if (smem > 0) {
    // Only query the device when dynamic shared memory is requested; the
    // attribute lookup is more expensive than this branch.
    int32_t maxShmem = 0;
    CUdevice device = getDefaultCuDevice();
    CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice));
    CUDA_REPORT_IF_ERROR(cuDeviceGetAttribute(
        &maxShmem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
        device));
    if (maxShmem < smem) {
      fprintf(stderr,
              "Requested shared memory (%d bytes) is larger than maximum "
              "allowed shared memory (%d bytes) for this device\n",
              smem, maxShmem);
    }
    CUDA_REPORT_IF_ERROR(cuFuncSetAttribute(
        function, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, smem));
  }
  debug_print("Launching kernel, grid=%ld,%ld,%ld, "
              "threads: %ld, %ld, %ld, "
              "smem: %d bytes\n",
              gridX, gridY, gridZ, blockX, blockY, blockZ, smem);
  CUDA_REPORT_IF_ERROR(cuLaunchKernel(function, gridX, gridY, gridZ, blockX,
                                      blockY, blockZ, smem, stream, params,
                                      extra));
}
193 | |
194 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUstream mgpuStreamCreate() { |
195 | ScopedContext scopedContext; |
196 | CUstream stream = nullptr; |
197 | CUDA_REPORT_IF_ERROR(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)); |
198 | return stream; |
199 | } |
200 | |
201 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuStreamDestroy(CUstream stream) { |
202 | CUDA_REPORT_IF_ERROR(cuStreamDestroy(stream)); |
203 | } |
204 | |
205 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void |
206 | mgpuStreamSynchronize(CUstream stream) { |
207 | CUDA_REPORT_IF_ERROR(cuStreamSynchronize(stream)); |
208 | } |
209 | |
210 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuStreamWaitEvent(CUstream stream, |
211 | CUevent event) { |
212 | CUDA_REPORT_IF_ERROR(cuStreamWaitEvent(stream, event, /*flags=*/0)); |
213 | } |
214 | |
215 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUevent mgpuEventCreate() { |
216 | ScopedContext scopedContext; |
217 | CUevent event = nullptr; |
218 | CUDA_REPORT_IF_ERROR(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING)); |
219 | return event; |
220 | } |
221 | |
222 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuEventDestroy(CUevent event) { |
223 | CUDA_REPORT_IF_ERROR(cuEventDestroy(event)); |
224 | } |
225 | |
226 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuEventSynchronize(CUevent event) { |
227 | CUDA_REPORT_IF_ERROR(cuEventSynchronize(event)); |
228 | } |
229 | |
230 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuEventRecord(CUevent event, |
231 | CUstream stream) { |
232 | CUDA_REPORT_IF_ERROR(cuEventRecord(event, stream)); |
233 | } |
234 | |
235 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void * |
236 | mgpuMemAlloc(uint64_t sizeBytes, CUstream stream, bool isHostShared) { |
237 | ScopedContext scopedContext; |
238 | CUdeviceptr ptr = 0; |
239 | if (sizeBytes == 0) |
240 | return reinterpret_cast<void *>(ptr); |
241 | |
242 | if (isHostShared) { |
243 | CUDA_REPORT_IF_ERROR( |
244 | cuMemAllocManaged(&ptr, sizeBytes, CU_MEM_ATTACH_GLOBAL)); |
245 | return reinterpret_cast<void *>(ptr); |
246 | } |
247 | CUDA_REPORT_IF_ERROR(cuMemAlloc(&ptr, sizeBytes)); |
248 | return reinterpret_cast<void *>(ptr); |
249 | } |
250 | |
251 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuMemFree(void *ptr, |
252 | CUstream /*stream*/) { |
253 | CUDA_REPORT_IF_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(ptr))); |
254 | } |
255 | |
256 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void |
257 | mgpuMemcpy(void *dst, void *src, size_t sizeBytes, CUstream stream) { |
258 | CUDA_REPORT_IF_ERROR(cuMemcpyAsync(reinterpret_cast<CUdeviceptr>(dst), |
259 | reinterpret_cast<CUdeviceptr>(src), |
260 | sizeBytes, stream)); |
261 | } |
262 | |
263 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void |
264 | mgpuMemset32(void *dst, unsigned int value, size_t count, CUstream stream) { |
265 | CUDA_REPORT_IF_ERROR(cuMemsetD32Async(reinterpret_cast<CUdeviceptr>(dst), |
266 | value, count, stream)); |
267 | } |
268 | |
269 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void |
270 | mgpuMemset16(void *dst, unsigned short value, size_t count, CUstream stream) { |
271 | CUDA_REPORT_IF_ERROR(cuMemsetD16Async(reinterpret_cast<CUdeviceptr>(dst), |
272 | value, count, stream)); |
273 | } |
274 | |
///
/// Helper functions for writing MLIR example code
///

// Registers a byte array with the CUDA runtime. Helpful until we have
// transfer functions implemented.
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuMemHostRegister(void *ptr, uint64_t sizeBytes) {
  ScopedContext scopedContext;
  CUDA_REPORT_IF_ERROR(cuMemHostRegister(ptr, sizeBytes, /*flags=*/0));
}

/// Registers a memref with the CUDA runtime. `descriptor` is a pointer to a
/// ranked memref descriptor struct of rank `rank`. Helpful until we have
/// transfer functions implemented.
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuMemHostRegisterMemRef(int64_t rank, StridedMemRefType<char, 1> *descriptor,
                          int64_t elementSizeBytes) {
  // Only densely packed tensors are currently supported.
#ifdef _WIN32
  int64_t *denseStrides = (int64_t *)_alloca(rank * sizeof(int64_t));
#else
  int64_t *denseStrides = (int64_t *)alloca(rank * sizeof(int64_t));
#endif // _WIN32
  int64_t *sizes = descriptor->sizes;
  for (int64_t i = rank - 1, runningStride = 1; i >= 0; i--) {
    denseStrides[i] = runningStride;
    runningStride *= sizes[i];
  }
  uint64_t sizeBytes = sizes[0] * denseStrides[0] * elementSizeBytes;
  int64_t *strides = &sizes[rank];
  (void)strides;
  for (unsigned i = 0; i < rank; ++i)
    assert(strides[i] == denseStrides[i] &&
           "Mismatch in computed dense strides");

  auto *ptr = descriptor->data + descriptor->offset * elementSizeBytes;
  mgpuMemHostRegister(ptr, sizeBytes);
}

// Unregisters a byte array with the CUDA runtime.
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuMemHostUnregister(void *ptr) {
  ScopedContext scopedContext;
  CUDA_REPORT_IF_ERROR(cuMemHostUnregister(ptr));
}

/// Unregisters a memref with the CUDA runtime. `descriptor` is a pointer to a
/// ranked memref descriptor struct of rank `rank`.
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuMemHostUnregisterMemRef(int64_t rank,
                            StridedMemRefType<char, 1> *descriptor,
                            int64_t elementSizeBytes) {
  auto *ptr = descriptor->data + descriptor->offset * elementSizeBytes;
  mgpuMemHostUnregister(ptr);
}

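// Note: ScopedContext retains the primary context once (statically) for the
// device that is the default at first use, so this should be called before any
// other wrapper if a non-zero default device is desired.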
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSetDefaultDevice(int32_t device) {
  defaultDevice = device;
}

///
/// Runtime methods using CUDA 12.0+ driver
///

#if (CUDA_VERSION >= 12000)

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuLaunchClusterKernel(
    CUfunction function, intptr_t clusterX, intptr_t clusterY,
    intptr_t clusterZ, intptr_t gridX, intptr_t gridY, intptr_t gridZ,
    intptr_t blockX, intptr_t blockY, intptr_t blockZ, int32_t smem,
    CUstream stream, void **params, void **extra, size_t /*paramsCount*/) {
  ScopedContext scopedContext;
  if (smem > 0) {
    // Only query the device when dynamic shared memory is requested; the
    // attribute lookup is more expensive than this branch.
    int32_t maxShmem = 0;
    CUdevice device = getDefaultCuDevice();
    CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice));
    CUDA_REPORT_IF_ERROR(cuDeviceGetAttribute(
        &maxShmem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
        device));
    if (maxShmem < smem) {
      fprintf(stderr,
              "Requested shared memory (%d bytes) is larger than maximum "
              "allowed shared memory (%d bytes) for this device\n",
              smem, maxShmem);
    }
    CUDA_REPORT_IF_ERROR(cuFuncSetAttribute(
        function, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, smem));
  }
  CUlaunchConfig config;
  config.gridDimX = gridX;
  config.gridDimY = gridY;
  config.gridDimZ = gridZ;
  config.blockDimX = blockX;
  config.blockDimY = blockY;
  config.blockDimZ = blockZ;
  config.sharedMemBytes = smem;
  config.hStream = stream;
  CUlaunchAttribute launchAttr[2];
  launchAttr[0].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
  launchAttr[0].value.clusterDim.x = clusterX;
  launchAttr[0].value.clusterDim.y = clusterY;
  launchAttr[0].value.clusterDim.z = clusterZ;
  launchAttr[1].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE;
  launchAttr[1].value.clusterSchedulingPolicyPreference =
      CU_CLUSTER_SCHEDULING_POLICY_SPREAD;
  config.numAttrs = 2;
  config.attrs = launchAttr;

  debug_print("Launching kernel, "
              "cluster: %ld, %ld, %ld, "
              "grid=%ld,%ld,%ld, "
              "threads: %ld, %ld, %ld, "
              "smem: %d bytes\n",
              clusterX, clusterY, clusterZ, gridX, gridY, gridZ, blockX, blockY,
              blockZ, smem);

  CUDA_REPORT_IF_ERROR(cuLaunchKernelEx(&config, function, params, extra));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuTensorMapEncodeTiled(
    CUtensorMap *tensorMap,             // Tensor map object
    CUtensorMapDataType tensorDataType, // Tensor data type
    cuuint32_t tensorRank,              // Dimensionality of tensor
    void *globalAddress,                // Starting address
    const cuuint64_t *globalDim,        // Tensor size (number of elements)
    const cuuint64_t *globalStrides,    // Stride size (in bytes)
    const cuuint32_t *boxDim,           // Traversal box (number of elements)
    const cuuint32_t *elementStrides,   // Traversal stride
    CUtensorMapInterleave interleave,   // Type of interleaved layout
    CUtensorMapSwizzle swizzle,         // Bank swizzling pattern
    CUtensorMapL2promotion l2Promotion, // L2 promotion size
    CUtensorMapFloatOOBfill oobFill     // Padding zfill or NaN fill
) {
  ScopedContext scopedContext;
  CUDA_REPORT_IF_ERROR(cuTensorMapEncodeTiled(
      tensorMap, tensorDataType, tensorRank, globalAddress, globalDim,
      globalStrides, boxDim, elementStrides, interleave, swizzle, l2Promotion,
      oobFill));
  debug_print("Created TMA descriptor\n Addr: %p\n"
              "data type : %d\n"
              "rank : %d\n"
              "globalDim[5]: %zu, %zu, %zu, %zu, %zu\n"
              "globalStrides[5]: %zu, %zu, %zu, %zu, %zu\n"
              "boxDim[5]: %u, %u, %u, %u, %u\n"
              "elementStrides[5]: %u, %u, %u, %u, %u\n"
              "interleave: %u \n"
              "swizzle: %u \n"
              "l2Promotion: %u \n"
              "oobFill: %u \n",
              (void *)&tensorMap, tensorDataType, tensorRank, globalDim[0],
              globalDim[1], globalDim[2], globalDim[3], globalDim[4],
              globalStrides[0], globalStrides[1], globalStrides[2],
              globalStrides[3], globalStrides[4], boxDim[0], boxDim[1],
              boxDim[2], boxDim[3], boxDim[4], elementStrides[0],
              elementStrides[1], elementStrides[2], elementStrides[3],
              elementStrides[4], interleave, swizzle, l2Promotion, oobFill);
}

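// Extracts the base pointer, sizes, and strides from a ranked memref
// descriptor in the form expected by cuTensorMapEncodeTiled: dimensions are
// reversed so the fastest-varying one comes first, strides are converted to
// bytes, and the innermost (contiguous) stride is left implicit.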
template <int Rank>
void mgpuGetMemRefDataAndShape(void *rawDescriptor, char **addr,
                               uint64_t *globalDim, uint64_t *globalStrides,
                               const CUtensorMapDataType tensorDataType) {
  auto descriptor =
      reinterpret_cast<StridedMemRefType<char, Rank> *>(rawDescriptor);
  *addr = descriptor->data;
  for (int i = 0; i < Rank; ++i) {
    globalDim[i] = static_cast<uint64_t>(descriptor->sizes[Rank - i - 1]);
  }
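  // Element size in bytes, indexed by CUtensorMapDataType enumerator value
  // (UINT8, UINT16, UINT32, INT32, UINT64, INT64, FLOAT16, FLOAT32, FLOAT64,
  // BFLOAT16, FLOAT32_FTZ, TFLOAT32, TFLOAT32_FTZ).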
  static constexpr int elementSizeInBytes[] = {1, 2, 4, 4, 8, 8, 2,
                                               4, 8, 2, 4, 4, 4};
  for (int i = 0; i < Rank - 1; ++i) {
    globalStrides[i] = static_cast<uint64_t>(
        descriptor->strides[Rank - i - 2] * elementSizeInBytes[tensorDataType]);
  }
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *mgpuTensorMapEncodeTiledMemref(
    int64_t tensorRank,                       // Dimensionality of tensor
    void *rankedDescriptor,                   // Ranked MemRef descriptor
    const CUtensorMapDataType tensorDataType, // Tensor data type
    CUtensorMapInterleave interleave,         // Type of interleaved layout
    CUtensorMapSwizzle swizzle,               // Bank swizzling pattern
    CUtensorMapL2promotion l2Promotion,       // L2 promotion size
    CUtensorMapFloatOOBfill oobFill,          // Padding zfill or NaN fill
    int64_t *inputBoxDims // Traversal box dimensions (number of elements)
) {
  CUtensorMap tensorMap;

  uint32_t boxDim[5] = {1, 1, 1, 1, 1}, elementStrides[5] = {1, 1, 1, 1, 1};
  uint64_t globalDim[5] = {1, 1, 1, 1, 1}, globalStrides[5] = {0};
  uint32_t tensorRank32 = uint32_t(tensorRank);

  char *globalAddress = nullptr;
  switch (tensorRank) {
  case 1:
    mgpuGetMemRefDataAndShape<1>(rankedDescriptor, &globalAddress, globalDim,
                                 globalStrides, tensorDataType);
    break;
  case 2:
    mgpuGetMemRefDataAndShape<2>(rankedDescriptor, &globalAddress, globalDim,
                                 globalStrides, tensorDataType);
    break;
  case 3:
    mgpuGetMemRefDataAndShape<3>(rankedDescriptor, &globalAddress, globalDim,
                                 globalStrides, tensorDataType);
    break;
  case 4:
    mgpuGetMemRefDataAndShape<4>(rankedDescriptor, &globalAddress, globalDim,
                                 globalStrides, tensorDataType);
    break;
  case 5:
    mgpuGetMemRefDataAndShape<5>(rankedDescriptor, &globalAddress, globalDim,
                                 globalStrides, tensorDataType);
    break;
  default:
    fprintf(
        stderr,
        "'mgpuTensorMapEncodeTiledMemref' failed with 'rank is too high'\n");
    return nullptr;
  }

  for (int64_t r = 0; r < tensorRank; ++r) {
    boxDim[r] = static_cast<uint32_t>(inputBoxDims[tensorRank - r - 1]);
  }

  ScopedContext scopedContext;
  mgpuTensorMapEncodeTiled(&tensorMap, tensorDataType, tensorRank32,
                           globalAddress, globalDim, globalStrides, boxDim,
                           elementStrides, interleave, swizzle, l2Promotion,
                           oobFill);
  // Copy the created tensor map to the device.
  CUdeviceptr dTensorMap;
  CUDA_REPORT_IF_ERROR(cuMemAlloc(&dTensorMap, sizeof(CUtensorMap)));
  CUDA_REPORT_IF_ERROR(cuMemcpy(dTensorMap,
                                reinterpret_cast<CUdeviceptr>(&tensorMap),
                                sizeof(CUtensorMap)));
  return reinterpret_cast<void *>(dTensorMap);
}
#endif // (CUDA_VERSION >= 12000)

#ifdef MLIR_ENABLE_CUDA_CUSPARSE

///
/// Wrapper methods for the cuSparse library.
///

// Some macro magic to get bf16/half/float/double alpha and beta on host.
// The macro declares host-side constants equal to one for each supported type
// and points alpha##p / beta##p at the variant matching the data type dtp, so
// the pointers can be passed straight to the cuSPARSE generic API.
// TODO: add support for passing alpha and beta as arguments
#define ALPHABETA(dtp, alpha, beta) \
  __nv_bfloat16(alpha##16bf) = 1.0f; \
  __nv_bfloat16(beta##16bf) = 1.0f; \
  __half(alpha##16f) = 1.0f; \
  __half(beta##16f) = 1.0f; \
  float(alpha##f) = 1.0f; \
  float(beta##f) = 1.0f; \
  double(alpha##d) = 1.0; \
  double(beta##d) = 1.0; \
  const void *(alpha##p) = nullptr; \
  const void *(beta##p) = nullptr; \
  if (dtp == CUDA_R_16BF || dtp == CUDA_C_16BF) { \
    (alpha##p) = reinterpret_cast<void *>(&(alpha##16bf)); \
    (beta##p) = reinterpret_cast<void *>(&(beta##16bf)); \
  } else if (dtp == CUDA_R_16F || dtp == CUDA_C_16F) { \
    (alpha##p) = reinterpret_cast<void *>(&(alpha##16f)); \
    (beta##p) = reinterpret_cast<void *>(&(beta##16f)); \
  } else if (dtp == CUDA_R_32F || dtp == CUDA_C_32F) { \
    (alpha##p) = reinterpret_cast<void *>(&(alpha##f)); \
    (beta##p) = reinterpret_cast<void *>(&(beta##f)); \
  } else { \
    (alpha##p) = reinterpret_cast<void *>(&(alpha##d)); \
    (beta##p) = reinterpret_cast<void *>(&(beta##d)); \
  }

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuCreateSparseEnv() {
  // ScopedContext is for cuda initialization.
  ScopedContext scopedContext;
  assert(!cusparse_env && "client called mgpuCreateSparseEnv() twice");
  CUSPARSE_REPORT_IF_ERROR(cusparseCreate(&cusparse_env));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuDestroySparseEnv() {
  assert(cusparse_env && "client did not call mgpuCreateSparseEnv()");
  CUSPARSE_REPORT_IF_ERROR(cusparseDestroy(cusparse_env));
  cusparse_env = nullptr;
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
mgpuCreateDnVec(intptr_t size, void *values, int32_t dtp, CUstream /*stream*/) {
  cusparseDnVecDescr_t vec = nullptr;
  auto dTp = static_cast<cudaDataType_t>(dtp);
  CUSPARSE_REPORT_IF_ERROR(cusparseCreateDnVec(&vec, size, values, dTp))
  return reinterpret_cast<void *>(vec);
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuDestroyDnVec(void *v, CUstream /*stream*/) {
  cusparseDnVecDescr_t vec = reinterpret_cast<cusparseDnVecDescr_t>(v);
  CUSPARSE_REPORT_IF_ERROR(cusparseDestroyDnVec(vec))
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
mgpuCreateDnMat(intptr_t rows, intptr_t cols, void *values, int32_t dtp,
                CUstream /*stream*/) {
  cusparseDnMatDescr_t mat = nullptr;
  auto dTp = static_cast<cudaDataType_t>(dtp);
  CUSPARSE_REPORT_IF_ERROR(cusparseCreateDnMat(&mat, rows, cols, /*ld=*/cols,
                                               values, dTp, CUSPARSE_ORDER_ROW))
  return reinterpret_cast<void *>(mat);
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuDestroyDnMat(void *m, CUstream /*stream*/) {
  cusparseDnMatDescr_t mat = reinterpret_cast<cusparseDnMatDescr_t>(m);
  CUSPARSE_REPORT_IF_ERROR(cusparseDestroyDnMat(mat))
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
mgpuCreateCoo(intptr_t rows, intptr_t cols, intptr_t nnz, void *rowIdxs,
              void *colIdxs, void *values, int32_t itp, int32_t dtp,
              CUstream /*stream*/) {
  cusparseSpMatDescr_t mat = nullptr;
  auto iTp = static_cast<cusparseIndexType_t>(itp);
  auto dTp = static_cast<cudaDataType_t>(dtp);
  CUSPARSE_REPORT_IF_ERROR(cusparseCreateCoo(&mat, rows, cols, nnz, rowIdxs,
                                             colIdxs, values, iTp,
                                             CUSPARSE_INDEX_BASE_ZERO, dTp))
  return reinterpret_cast<void *>(mat);
}

#ifdef CUSPARSE_COO_AOS // deprecated in cuSPARSE 11.2
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
mgpuCreateCooAoS(intptr_t rows, intptr_t cols, intptr_t nnz, void *idxs,
                 void *values, int32_t itp, int32_t dtp, CUstream /*stream*/) {
  cusparseSpMatDescr_t mat = nullptr;
  auto iTp = static_cast<cusparseIndexType_t>(itp);
  auto dTp = static_cast<cudaDataType_t>(dtp);
  CUSPARSE_REPORT_IF_ERROR(cusparseCreateCooAoS(
      &mat, rows, cols, nnz, idxs, values, iTp, CUSPARSE_INDEX_BASE_ZERO, dTp))
  return reinterpret_cast<void *>(mat);
}
#endif // CUSPARSE_COO_AOS

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
mgpuCreateCsr(intptr_t rows, intptr_t cols, intptr_t nnz, void *rowPos,
              void *colIdxs, void *values, int32_t ptp, int32_t itp,
              int32_t dtp, CUstream /*stream*/) {
  cusparseSpMatDescr_t mat = nullptr;
  auto pTp = static_cast<cusparseIndexType_t>(ptp);
  auto iTp = static_cast<cusparseIndexType_t>(itp);
  auto dTp = static_cast<cudaDataType_t>(dtp);
  CUSPARSE_REPORT_IF_ERROR(cusparseCreateCsr(&mat, rows, cols, nnz, rowPos,
                                             colIdxs, values, pTp, iTp,
                                             CUSPARSE_INDEX_BASE_ZERO, dTp))
  return reinterpret_cast<void *>(mat);
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
mgpuCreateCsc(intptr_t rows, intptr_t cols, intptr_t nnz, void *colPos,
              void *rowIdxs, void *values, int32_t ptp, int32_t itp,
              int32_t dtp, CUstream /*stream*/) {
  cusparseSpMatDescr_t mat = nullptr;
  auto pTp = static_cast<cusparseIndexType_t>(ptp);
  auto iTp = static_cast<cusparseIndexType_t>(itp);
  auto dTp = static_cast<cudaDataType_t>(dtp);
  CUSPARSE_REPORT_IF_ERROR(cusparseCreateCsc(&mat, rows, cols, nnz, colPos,
                                             rowIdxs, values, pTp, iTp,
                                             CUSPARSE_INDEX_BASE_ZERO, dTp))
  return reinterpret_cast<void *>(mat);
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
mgpuCreateBsr(intptr_t brows, intptr_t bcols, intptr_t bnnz, intptr_t rBsz,
              intptr_t cBsz, void *rowPos, void *colIdxs, void *values,
              int32_t ptp, int32_t itp, int32_t dtp, CUstream /*stream*/) {
  cusparseSpMatDescr_t mat = nullptr;
#if CUSPARSE_VERSION >= 12100
  auto pTp = static_cast<cusparseIndexType_t>(ptp);
  auto iTp = static_cast<cusparseIndexType_t>(itp);
  auto dTp = static_cast<cudaDataType_t>(dtp);
  CUSPARSE_REPORT_IF_ERROR(cusparseCreateBsr(
      &mat, brows, bcols, bnnz, rBsz, cBsz, rowPos, colIdxs, values, pTp, iTp,
      CUSPARSE_INDEX_BASE_ZERO, dTp, CUSPARSE_ORDER_ROW))
#endif
  return reinterpret_cast<void *>(mat);
}


extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuDestroySpMat(void *m, CUstream /*stream*/) {
  cusparseSpMatDescr_t mat = reinterpret_cast<cusparseSpMatDescr_t>(m);
  CUSPARSE_REPORT_IF_ERROR(cusparseDestroySpMat(mat))
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t mgpuSpMVBufferSize(
    int32_t ma, void *a, void *x, void *y, int32_t ctp, CUstream /*stream*/) {
  assert(cusparse_env && "client did not call mgpuCreateSparseEnv()");
  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
  cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
  cusparseDnVecDescr_t vecX = reinterpret_cast<cusparseDnVecDescr_t>(x);
  cusparseDnVecDescr_t vecY = reinterpret_cast<cusparseDnVecDescr_t>(y);
  cudaDataType_t cTp = static_cast<cudaDataType_t>(ctp);
  ALPHABETA(cTp, alpha, beta)
  size_t bufferSize = 0;
  CUSPARSE_REPORT_IF_ERROR(cusparseSpMV_bufferSize(
      cusparse_env, modeA, alphap, matA, vecX, betap, vecY, cTp,
      CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize))
  return bufferSize;
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSpMV(int32_t ma, void *a, void *x,
                                                   void *y, int32_t ctp,
                                                   void *buf,
                                                   CUstream /*stream*/) {
  assert(cusparse_env && "client did not call mgpuCreateSparseEnv()");
  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
  cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
  cusparseDnVecDescr_t vecX = reinterpret_cast<cusparseDnVecDescr_t>(x);
  cusparseDnVecDescr_t vecY = reinterpret_cast<cusparseDnVecDescr_t>(y);
  cudaDataType_t cTp = static_cast<cudaDataType_t>(ctp);
  ALPHABETA(cTp, alpha, beta)
  CUSPARSE_REPORT_IF_ERROR(cusparseSpMV(cusparse_env, modeA, alphap, matA, vecX,
                                        betap, vecY, cTp,
                                        CUSPARSE_SPMV_ALG_DEFAULT, buf))
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t
mgpuSpMMBufferSize(int32_t ma, int32_t mb, void *a, void *b, void *c,
                   int32_t ctp, CUstream /*stream*/) {
  assert(cusparse_env && "client did not call mgpuCreateSparseEnv()");
  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
  cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
  cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
  cusparseDnMatDescr_t matB = reinterpret_cast<cusparseDnMatDescr_t>(b);
  cusparseDnMatDescr_t matC = reinterpret_cast<cusparseDnMatDescr_t>(c);
  cudaDataType_t cTp = static_cast<cudaDataType_t>(ctp);
  ALPHABETA(cTp, alpha, beta)
  size_t bufferSize = 0;
  CUSPARSE_REPORT_IF_ERROR(cusparseSpMM_bufferSize(
      cusparse_env, modeA, modeB, alphap, matA, matB, betap, matC, cTp,
      CUSPARSE_SPMM_ALG_DEFAULT, &bufferSize))
  return bufferSize;
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSpMM(int32_t ma, int32_t mb,
                                                   void *a, void *b, void *c,
                                                   int32_t ctp, void *buf,
                                                   CUstream /*stream*/) {
  assert(cusparse_env && "client did not call mgpuCreateSparseEnv()");
  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
  cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
  cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
  cusparseDnMatDescr_t matB = reinterpret_cast<cusparseDnMatDescr_t>(b);
  cusparseDnMatDescr_t matC = reinterpret_cast<cusparseDnMatDescr_t>(c);
  cudaDataType_t cTp = static_cast<cudaDataType_t>(ctp);
  ALPHABETA(cTp, alpha, beta)
  CUSPARSE_REPORT_IF_ERROR(cusparseSpMM(cusparse_env, modeA, modeB, alphap,
                                        matA, matB, betap, matC, cTp,
                                        CUSPARSE_SPMM_ALG_DEFAULT, buf))
}


extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t
mgpuSDDMMBufferSize(int32_t ma, int32_t mb, void *a, void *b, void *c,
                    int32_t ctp, CUstream /*stream*/) {
  assert(cusparse_env && "client did not call mgpuCreateSparseEnv()");
  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
  cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
  cusparseDnMatDescr_t matA = reinterpret_cast<cusparseDnMatDescr_t>(a);
  cusparseDnMatDescr_t matB = reinterpret_cast<cusparseDnMatDescr_t>(b);
  cusparseSpMatDescr_t matC = reinterpret_cast<cusparseSpMatDescr_t>(c);
  auto cTp = static_cast<cudaDataType_t>(ctp);
  ALPHABETA(cTp, alpha, beta)
  size_t bufferSize = 0;
  CUSPARSE_REPORT_IF_ERROR(cusparseSDDMM_bufferSize(
      cusparse_env, modeA, modeB, alphap, matA, matB, betap, matC, cTp,
      CUSPARSE_SDDMM_ALG_DEFAULT, &bufferSize))
  return bufferSize;
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSDDMM(int32_t ma, int32_t mb,
                                                    void *a, void *b, void *c,
                                                    int32_t ctp, void *buf,
                                                    CUstream /*stream*/) {
  assert(cusparse_env && "client did not call mgpuCreateSparseEnv()");
  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
  cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
  cusparseDnMatDescr_t matA = reinterpret_cast<cusparseDnMatDescr_t>(a);
  cusparseDnMatDescr_t matB = reinterpret_cast<cusparseDnMatDescr_t>(b);
  cusparseSpMatDescr_t matC = reinterpret_cast<cusparseSpMatDescr_t>(c);
  auto cTp = static_cast<cudaDataType_t>(ctp);
  ALPHABETA(cTp, alpha, beta)
  CUSPARSE_REPORT_IF_ERROR(cusparseSDDMM(cusparse_env, modeA, modeB, alphap,
                                         matA, matB, betap, matC, cTp,
                                         CUSPARSE_SDDMM_ALG_DEFAULT, buf))
}

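// The SpGEMM wrappers below follow the usual cuSPARSE SpGEMM flow (a sketch of
// the expected call order, not enforced here):
//   mgpuSpGEMMCreateDescr -> mgpuSpGEMMWorkEstimation -> mgpuSpGEMMCompute ->
//   mgpuSpMatGetSize / mgpuSetCsrPointers -> mgpuSpGEMMCopy ->
//   mgpuSpGEMMDestroyDescr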
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
mgpuSpGEMMCreateDescr(CUstream /*stream*/) {
  cusparseSpGEMMDescr_t spgemmDesc = nullptr;
  CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_createDescr(&spgemmDesc))
  return reinterpret_cast<void *>(spgemmDesc);
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuSpGEMMDestroyDescr(void *s, CUstream /*stream*/) {
  cusparseSpGEMMDescr_t spgemmDesc = reinterpret_cast<cusparseSpGEMMDescr_t>(s);
  CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_destroyDescr(spgemmDesc))
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t mgpuSpGEMMWorkEstimation(
    void *s, int32_t ma, int32_t mb, void *a, void *b, void *c, int32_t ctp,
    intptr_t bs, void *buf, CUstream /*stream*/) {
  cusparseSpGEMMDescr_t spgemmDesc = reinterpret_cast<cusparseSpGEMMDescr_t>(s);
  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
  cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
  cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
  cusparseSpMatDescr_t matB = reinterpret_cast<cusparseSpMatDescr_t>(b);
  cusparseSpMatDescr_t matC = reinterpret_cast<cusparseSpMatDescr_t>(c);
  auto cTp = static_cast<cudaDataType_t>(ctp);
  ALPHABETA(cTp, alpha, beta)
  size_t newBufferSize = bs;
  CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_workEstimation(
      cusparse_env, modeA, modeB, alphap, matA, matB, betap, matC, cTp,
      CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, &newBufferSize, buf))
  return newBufferSize;
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t
mgpuSpGEMMCompute(void *s, int32_t ma, int32_t mb, void *a, void *b, void *c,
                  int32_t ctp, intptr_t bsz2, void *buf2, CUstream /*stream*/) {
  cusparseSpGEMMDescr_t spgemmDesc = reinterpret_cast<cusparseSpGEMMDescr_t>(s);
  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
  cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
  cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
  cusparseSpMatDescr_t matB = reinterpret_cast<cusparseSpMatDescr_t>(b);
  cusparseSpMatDescr_t matC = reinterpret_cast<cusparseSpMatDescr_t>(c);
  auto cTp = static_cast<cudaDataType_t>(ctp);
  ALPHABETA(cTp, alpha, beta)
  size_t newBufferSize2 = bsz2;
  CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_compute(
      cusparse_env, modeA, modeB, alphap, matA, matB, betap, matC, cTp,
      CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, &newBufferSize2, buf2))
  return newBufferSize2;
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuSpGEMMCopy(void *s, int32_t ma, int32_t mb, void *a, void *b, void *c,
               int32_t ctp, CUstream /*stream*/) {
  cusparseSpGEMMDescr_t spgemmDesc = reinterpret_cast<cusparseSpGEMMDescr_t>(s);
  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
  cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
  cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
  cusparseSpMatDescr_t matB = reinterpret_cast<cusparseSpMatDescr_t>(b);
  cusparseSpMatDescr_t matC = reinterpret_cast<cusparseSpMatDescr_t>(c);
  auto cTp = static_cast<cudaDataType_t>(ctp);
  ALPHABETA(cTp, alpha, beta)
  CUSPARSE_REPORT_IF_ERROR(
      cusparseSpGEMM_copy(cusparse_env, modeA, modeB, alphap, matA, matB, betap,
                          matC, cTp, CUSPARSE_SPGEMM_DEFAULT, spgemmDesc))
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuSpMatGetSize(void *m, void *r, void *c, void *n, CUstream /*stream*/) {
  cusparseConstSpMatDescr_t matDescr =
      reinterpret_cast<cusparseConstSpMatDescr_t>(m);
  int64_t *rows = reinterpret_cast<int64_t *>(r);
  int64_t *cols = reinterpret_cast<int64_t *>(c);
  int64_t *nnz = reinterpret_cast<int64_t *>(n);
  CUSPARSE_REPORT_IF_ERROR(cusparseSpMatGetSize(matDescr, rows, cols, nnz));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuSetCsrPointers(void *m, void *p, void *c, void *v, CUstream /*stream*/) {
  cusparseSpMatDescr_t matDescr = reinterpret_cast<cusparseSpMatDescr_t>(m);
  CUSPARSE_REPORT_IF_ERROR(cusparseCsrSetPointers(matDescr, p, c, v));
}

#ifdef MLIR_ENABLE_CUDA_CUSPARSELT

///
/// Wrapper methods for the cuSparseLt library.
///

struct cusparseLtSpMatHandleAndData {
  cusparseLtMatDescriptor_t mat;
  // TODO: the following three are associated with the SpMM operator rather
  // than the sparse matrix. Create workspace buffers and pass them to the
  // SpMM execution.
  cusparseLtMatmulAlgSelection_t alg_sel;
  cusparseLtMatmulPlan_t plan;
  cusparseLtMatmulDescriptor_t matmul;
  void *values{nullptr};
};

struct cusparseLtDnMatHandleAndData {
  cusparseLtMatDescriptor_t mat;
  void *values{nullptr};
};

static_assert(sizeof(cusparseLtHandle_t) == 11024,
              "Unexpected cusparseLt handle size");
static_assert(sizeof(cusparseLtSpMatHandleAndData) == 44104,
              "Unexpected cusparseLt sparse matrix handle size");
static_assert(sizeof(cusparseLtDnMatHandleAndData) == 11032,
              "Unexpected cusparseLt dense matrix handle size");

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuCreateSparseLtEnv() {
  // ScopedContext is for cuda initialization.
  ScopedContext scopedContext;
  assert(!cusparseLt_initiated &&
         "client called mgpuCreateSparseLtEnv() twice");
  // Note that cuSparseLt still uses cusparseStatus_t.
  CUSPARSE_REPORT_IF_ERROR(cusparseLtInit(&cusparseLt_env));
  cusparseLt_initiated = true;
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuDestroySparseLtEnv() {
  assert(cusparseLt_initiated && "client did not call mgpuCreateSparseLtEnv()");
  CUSPARSE_REPORT_IF_ERROR(cusparseLtDestroy(&cusparseLt_env));
  cusparseLt_initiated = false;
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuCreateCuSparseLtDnMat(void *dh, intptr_t rows, intptr_t cols, void *values,
                          int32_t dtp, CUstream /*stream*/) {
  assert(cusparseLt_initiated && "client did not call mgpuCreateSparseLtEnv()");
  auto dnmat_handle = reinterpret_cast<cusparseLtDnMatHandleAndData *>(dh);
  dnmat_handle->values = values;
  auto dTp = static_cast<cudaDataType_t>(dtp);
  // Assume row-major when deciding lda.
  const uint32_t alignment = 16;
  CUSPARSE_REPORT_IF_ERROR(cusparseLtDenseDescriptorInit(
      &cusparseLt_env, &(dnmat_handle->mat), rows, cols, /*lda=*/cols,
      alignment, dTp, CUSPARSE_ORDER_ROW))
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuDestroyCuSparseLtDnMat(void *dh, CUstream /*stream*/) {
  auto dnmat_handle = reinterpret_cast<cusparseLtDnMatHandleAndData *>(dh);
  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(dnmat_handle->mat)))
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuCusparseLtCreate2To4SpMat(void *sh, intptr_t rows, intptr_t cols,
                              void *values, int32_t dtp, CUstream /*stream*/) {
  assert(cusparseLt_initiated && "client did not call mgpuCreateSparseLtEnv()");
  auto spmat_handle = reinterpret_cast<cusparseLtSpMatHandleAndData *>(sh);
  spmat_handle->values = values;
  auto dTp = static_cast<cudaDataType_t>(dtp);
  // Assume row-major when deciding lda.
  const uint32_t alignment = 16;
  CUSPARSE_REPORT_IF_ERROR(cusparseLtStructuredDescriptorInit(
      &cusparseLt_env, &(spmat_handle->mat), rows, cols, /*ld=*/cols, alignment,
      dTp, CUSPARSE_ORDER_ROW, CUSPARSELT_SPARSITY_50_PERCENT))
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuDestroyCuSparseLtSpMat(void *sh, CUstream /*stream*/) {
  auto spmat_handle = reinterpret_cast<cusparseLtSpMatHandleAndData *>(sh);
  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(spmat_handle->mat)))
}

// Several things are done in this stage: algorithm selection, planning, and
// computing the workspace and compressed-matrix buffer sizes, which are
// written into the three size_t slots pointed to by bs (workspace size,
// compressed size, compressed buffer size).
// The parameter prune_flag indicates whether pruning and a pruning check are
// performed: 0 means no pruning or check, 1 means prune, and 2 means prune
// and check.
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuCuSparseLtSpMMBufferSize(void *bs, int32_t ma, int32_t mb, void *a, void *b,
                             void *c, int32_t ctp, int32_t prune_flag,
                             CUstream stream) {
  assert(cusparseLt_initiated && "client did not call mgpuCreateSparseLtEnv()");
  // TODO: support more advanced settings, e.g., a sparse right operand;
  // currently matA is assumed to be the sparse matrix.
  auto matA = reinterpret_cast<cusparseLtSpMatHandleAndData *>(a);
  auto matB = reinterpret_cast<cusparseLtDnMatHandleAndData *>(b);
  auto matC = reinterpret_cast<cusparseLtDnMatHandleAndData *>(c);
  auto workspace_size = reinterpret_cast<size_t *>(bs);
  auto compressed_size = &(reinterpret_cast<size_t *>(bs)[1]);
  auto compressed_buffer_size = &(reinterpret_cast<size_t *>(bs)[2]);
  auto cTp = static_cast<cusparseComputeType>(ctp);

  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
  cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulDescriptorInit(
      &cusparseLt_env, &(matA->matmul), modeA, modeB, &(matA->mat),
      &(matB->mat), &(matC->mat), &(matC->mat), cTp))
  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulAlgSelectionInit(
      &cusparseLt_env, &(matA->alg_sel), &(matA->matmul),
      CUSPARSELT_MATMUL_ALG_DEFAULT))
  int alg = 0;
  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulAlgSetAttribute(
      &cusparseLt_env, &(matA->alg_sel), CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg,
      sizeof(alg)))

  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulPlanInit(
      &cusparseLt_env, &(matA->plan), &(matA->matmul), &(matA->alg_sel)))

  // Pruning step (in-place).
  if (prune_flag > 0)
    CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMAPrune(
        &cusparseLt_env, &(matA->matmul), matA->values, matA->values,
        CUSPARSELT_PRUNE_SPMMA_STRIP, stream))

  // Check structure of A.
  // Note that this adds a synchronization on the stream.
  // TODO: Do we want that?
  if (prune_flag == 2) {
    int *dvalid = (int *)mgpuMemAlloc(sizeof(int), stream, false);
    CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMAPruneCheck(
        &cusparseLt_env, &(matA->matmul), matA->values, dvalid, stream))
    int valid = 0;
    mgpuMemcpy(&valid, dvalid, sizeof(int), stream);
    mgpuStreamSynchronize(stream);
    mgpuMemFree(dvalid, stream);
    if (valid != 0)
      fprintf(stderr,
              "CUSPARSE-LT: sparse matrix is not 2:4; computed results "
              "will be invalid\n");
  }

  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulGetWorkspace(
      &cusparseLt_env, &(matA->plan), workspace_size))
  CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMACompressedSize(
      &cusparseLt_env, &(matA->plan), compressed_size, compressed_buffer_size))
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuCuSparseLtSpMM(void *a, void *b, void *c, void *d_workspace,
                   void *dA_compressed, void *dA_compressedBuffer,
                   CUstream stream) {
  assert(cusparseLt_initiated && "client did not call mgpuCreateSparseLtEnv()");
  auto matA = reinterpret_cast<cusparseLtSpMatHandleAndData *>(a);
  auto matB = reinterpret_cast<cusparseLtDnMatHandleAndData *>(b);
  auto matC = reinterpret_cast<cusparseLtDnMatHandleAndData *>(c);

  ALPHABETA(CUDA_R_32F, alpha, beta)
  CUSPARSE_REPORT_IF_ERROR(
      cusparseLtSpMMACompress(&cusparseLt_env, &(matA->plan), (matA->values),
                              dA_compressed, dA_compressedBuffer, stream))

  // TODO: add support for multi-stream execution.
  // Perform the matrix multiplication D = A*B+C, using C==D for now.
  CUSPARSE_REPORT_IF_ERROR(
      cusparseLtMatmul(&cusparseLt_env, &(matA->plan), alphap, dA_compressed,
                       matB->values, betap, matC->values,
                       /*dD*/ matC->values, d_workspace, nullptr, 0))

  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(matA->mat)))
  // Destroy the plan associated with the sparse matrix.
  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulPlanDestroy(&(matA->plan)))
}

#endif // MLIR_ENABLE_CUDA_CUSPARSELT
#endif // MLIR_ENABLE_CUDA_CUSPARSE